Diffstat (limited to 'net')
492 files changed, 23260 insertions, 33133 deletions
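Much of the churn across these 492 files comes from a few mechanical conversions that repeat through the hunks below: the sk_buff header pointers (skb->mac.raw, skb->nh.raw, skb->h.raw) become offset-based accessor helpers, and the __constant_htons()/__cpu_to_le16() spellings become plain htons()/cpu_to_le16(). A minimal sketch of the accessor style the tree converges on; the receive path shown is hypothetical, not taken from any hunk in this diff:

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
#include <linux/ip.h>

static void example_rx(struct sk_buff *skb, struct net_device *dev)
{
	skb->dev = dev;
	skb_reset_mac_header(skb);      /* was: skb->mac.raw = skb->data */
	skb_pull(skb, ETH_HLEN);        /* advance past the MAC header */
	skb_reset_network_header(skb);  /* was: skb->nh.raw = skb->data */
	/* fixed 20-byte IPv4 header assumed for this sketch */
	skb_set_transport_header(skb, sizeof(struct iphdr));
}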
diff --git a/net/802/fddi.c b/net/802/fddi.c index ace6386384b..91dde41b548 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -100,7 +100,7 @@ static int fddi_rebuild_header(struct sk_buff *skb) struct fddihdr *fddi = (struct fddihdr *)skb->data; #ifdef CONFIG_INET - if (fddi->hdr.llc_snap.ethertype == __constant_htons(ETH_P_IP)) + if (fddi->hdr.llc_snap.ethertype == htons(ETH_P_IP)) /* Try to get ARP to resolve the header and fill destination address */ return arp_find(fddi->daddr, skb); else @@ -130,12 +130,13 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev) * to start of packet data. Assume 802.2 SNAP frames for now. */ - skb->mac.raw = skb->data; /* point to frame control (FC) */ + skb->dev = dev; + skb_reset_mac_header(skb); /* point to frame control (FC) */ if(fddi->hdr.llc_8022_1.dsap==0xe0) { skb_pull(skb, FDDI_K_8022_HLEN-3); - type = __constant_htons(ETH_P_802_2); + type = htons(ETH_P_802_2); } else { diff --git a/net/802/hippi.c b/net/802/hippi.c index 578f2a3d692..87ffc12b689 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -60,7 +60,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, * Due to the stupidity of the little endian byte-order we * have to set the fp field this way. */ - hip->fp.fixed = __constant_htonl(0x04800018); + hip->fp.fixed = htonl(0x04800018); hip->fp.d2_size = htonl(len + 8); hip->le.fc = 0; hip->le.double_wide = 0; /* only HIPPI 800 for the time being */ @@ -104,7 +104,7 @@ static int hippi_rebuild_header(struct sk_buff *skb) * Only IP is currently supported */ - if(hip->snap.ethertype != __constant_htons(ETH_P_IP)) + if(hip->snap.ethertype != htons(ETH_P_IP)) { printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype)); return 0; @@ -126,14 +126,14 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) { struct hippi_hdr *hip; - hip = (struct hippi_hdr *) skb->data; - /* * This is actually wrong ... question is if we really should * set the raw address here. */ - skb->mac.raw = skb->data; - skb_pull(skb, HIPPI_HLEN); + skb->dev = dev; + skb_reset_mac_header(skb); + hip = (struct hippi_hdr *)skb_mac_header(skb); + skb_pull(skb, HIPPI_HLEN); /* * No fancy promisc stuff here now. diff --git a/net/802/psnap.c b/net/802/psnap.c index 6e7c2120b83..04ee43e7538 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -56,10 +56,10 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, }; rcu_read_lock(); - proto = find_snap_client(skb->h.raw); + proto = find_snap_client(skb_transport_header(skb)); if (proto) { /* Pass the frame on. 
*/ - skb->h.raw += 5; + skb->transport_header += 5; skb_pull_rcsum(skb, 5); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } else { diff --git a/net/802/tr.c b/net/802/tr.c index 96bd14452c5..0ba1946211c 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -189,11 +189,13 @@ static int tr_rebuild_header(struct sk_buff *skb) __be16 tr_type_trans(struct sk_buff *skb, struct net_device *dev) { - struct trh_hdr *trh=(struct trh_hdr *)skb->data; + struct trh_hdr *trh; struct trllc *trllc; unsigned riflen=0; - skb->mac.raw = skb->data; + skb->dev = dev; + skb_reset_mac_header(skb); + trh = tr_hdr(skb); if(trh->saddr[0] & TR_RII) riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8; @@ -552,7 +554,8 @@ static int rif_seq_show(struct seq_file *seq, void *v) if(j==1) { segment=ntohs(entry->rseg[j-1])>>4; seq_printf(seq," %03X",segment); - }; + } + segment=ntohs(entry->rseg[j])>>4; brdgnmb=ntohs(entry->rseg[j-1])&0x00f; seq_printf(seq,"-%01X-%03X",brdgnmb,segment); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index eb1c71ed7df..bd93c45778d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -117,8 +117,7 @@ static void __exit vlan_cleanup_devices(void) struct net_device *dev, *nxt; rtnl_lock(); - for (dev = dev_base; dev; dev = nxt) { - nxt = dev->next; + for_each_netdev_safe(dev, nxt) { if (dev->priv_flags & IFF_802_1Q_VLAN) { unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev, VLAN_DEV_INFO(dev)->vlan_id); @@ -470,7 +469,7 @@ static struct net_device *register_vlan_device(const char *eth_IF_name, */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", VLAN_ID); - }; + } new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name, vlan_setup); @@ -685,7 +684,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, break; } break; - }; + } out: return NOTIFY_DONE; @@ -819,7 +818,7 @@ static int vlan_ioctl_handler(void __user *arg) printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n", __FUNCTION__, args.cmd); return -EINVAL; - }; + } out: return err; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b6e0eea1e39..ec46084f44b 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -66,7 +66,7 @@ int vlan_dev_rebuild_header(struct sk_buff *skb) memcpy(veth->h_source, dev->dev_addr, ETH_ALEN); break; - }; + } return 0; } @@ -83,7 +83,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) /* Lifted from Gleb's VLAN code... */ memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 12); - skb->mac.raw += VLAN_HLEN; + skb->mac_header += VLAN_HLEN; } } @@ -219,7 +219,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, break; default: break; - }; + } /* Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. @@ -258,7 +258,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, * won't work for fault tolerant netware but does for the rest. */ if (*(unsigned short *)rawp == 0xFFFF) { - skb->protocol = __constant_htons(ETH_P_802_3); + skb->protocol = htons(ETH_P_802_3); /* place it back on the queue to be handled by true layer 3 protocols. */ @@ -281,7 +281,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, /* * Real 802.2 LLC */ - skb->protocol = __constant_htons(ETH_P_802_2); + skb->protocol = htons(ETH_P_802_2); /* place it back on the queue to be handled by upper layer protocols. 
*/ @@ -382,7 +382,7 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, } skb->protocol = htons(ETH_P_8021Q); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); } /* Before delegating work to the lower layer, enter our MAC-address */ @@ -448,7 +448,7 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ - if (veth->h_vlan_proto != __constant_htons(ETH_P_8021Q)) { + if (veth->h_vlan_proto != htons(ETH_P_8021Q)) { int orig_headroom = skb_headroom(skb); unsigned short veth_TCI; diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 5e24f72602a..d216a64421c 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -237,13 +237,9 @@ int vlan_proc_rem_dev(struct net_device *vlandev) * The following few functions build the content of /proc/net/vlan/config */ -/* starting at dev, find a VLAN device */ -static struct net_device *vlan_skip(struct net_device *dev) +static inline int is_vlan_dev(struct net_device *dev) { - while (dev && !(dev->priv_flags & IFF_802_1Q_VLAN)) - dev = dev->next; - - return dev; + return dev->priv_flags & IFF_802_1Q_VLAN; } /* start read of /proc/net/vlan/config */ @@ -257,19 +253,35 @@ static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - for (dev = vlan_skip(dev_base); dev && i < *pos; - dev = vlan_skip(dev->next), ++i); + for_each_netdev(dev) { + if (!is_vlan_dev(dev)) + continue; + + if (i++ == *pos) + return dev; + } - return (i == *pos) ? dev : NULL; + return NULL; } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; + ++*pos; - return vlan_skip((v == SEQ_START_TOKEN) - ? dev_base - : ((struct net_device *)v)->next); + dev = (struct net_device *)v; + if (v == SEQ_START_TOKEN) + dev = net_device_entry(&dev_base_head); + + for_each_netdev_continue(dev) { + if (!is_vlan_dev(dev)) + continue; + + return dev; + } + + return NULL; } static void vlan_seq_stop(struct seq_file *seq, void *v) diff --git a/net/Kconfig b/net/Kconfig index 915657832d9..2fc8e77b1e6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -27,13 +27,6 @@ if NET menu "Networking options" -config NETDEBUG - bool "Network packet debugging" - help - You can say Y here if you want to get additional messages useful in - debugging bad packets, but can overwhelm logs under denial of service - attacks. 
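The vlan.c and vlanproc.c hunks above drop open-coded walks of the dev_base->next chain in favour of the for_each_netdev() family. A minimal sketch of the replacement pattern, assuming the single-namespace macros this kernel generation uses; the function name is illustrative:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if.h>

static void example_flush_vlans(void)
{
	struct net_device *dev, *nxt;

	rtnl_lock();
	/* _safe form: nxt is fetched before dev can be unregistered */
	for_each_netdev_safe(dev, nxt) {
		if (dev->priv_flags & IFF_802_1Q_VLAN)
			unregister_netdevice(dev);
	}
	rtnl_unlock();
}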
- source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" @@ -219,14 +212,18 @@ endmenu source "net/ax25/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" -source "net/ieee80211/Kconfig" - -config WIRELESS_EXT - bool +source "net/rxrpc/Kconfig" config FIB_RULES bool +menu "Wireless" + +source "net/wireless/Kconfig" +source "net/ieee80211/Kconfig" + +endmenu + endif # if NET endmenu # Networking diff --git a/net/Makefile b/net/Makefile index 4854ac50631..6b74d4118c5 100644 --- a/net/Makefile +++ b/net/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_IRDA) += irda/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_RXRPC) += rxrpc/ +obj-$(CONFIG_AF_RXRPC) += rxrpc/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_ECONET) += econet/ @@ -52,3 +53,5 @@ obj-$(CONFIG_IUCV) += iucv/ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif + +obj-y += wireless/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index d89d62f3702..5ef6a238bdb 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -118,7 +118,9 @@ static void __aarp_send_query(struct aarp_entry *a) /* Set up the buffer */ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); - skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); skb->protocol = htons(ETH_P_ATALK); skb->dev = dev; eah = aarp_hdr(skb); @@ -163,7 +165,9 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, /* Set up the buffer */ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); - skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); skb->protocol = htons(ETH_P_ATALK); skb->dev = dev; eah = aarp_hdr(skb); @@ -212,7 +216,9 @@ static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us) /* Set up the buffer */ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); - skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); skb->protocol = htons(ETH_P_ATALK); skb->dev = dev; eah = aarp_hdr(skb); @@ -539,7 +545,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb, int hash; struct aarp_entry *a; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* Check for LocalTalk first */ if (dev->type == ARPHRD_LOCALTLK) { diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index c8b7dc2c325..f6a92a0b7aa 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1275,7 +1275,7 @@ static int handle_ip_over_ddp(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb_pull(skb, 13); skb->dev = dev; - skb->h.raw = skb->data; + skb_reset_transport_header(skb); stats = dev->priv; stats->rx_packets++; @@ -1383,10 +1383,10 @@ free_it: * @pt - packet type * * Receive a packet (in skb) from device dev. This has come from the SNAP - * decoder, and on entry skb->h.raw is the DDP header, skb->len is the DDP - * header, skb->len is the DDP length. The physical headers have been - * extracted. PPP should probably pass frames marked as for this layer. - * [ie ARPHRD_ETHERTALK] + * decoder, and on entry skb->transport_header is the DDP header, skb->len + * is the DDP header, skb->len is the DDP length. The physical headers + * have been extracted. PPP should probably pass frames marked as for this + * layer. 
[ie ARPHRD_ETHERTALK] */ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) @@ -1484,7 +1484,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { /* Expand any short form frames */ - if (skb->mac.raw[2] == 1) { + if (skb_mac_header(skb)[2] == 1) { struct ddpehdr *ddp; /* Find our address */ struct atalk_addr *ap = atalk_find_dev_addr(dev); @@ -1510,8 +1510,8 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, * we write the network numbers ! */ - ddp->deh_dnode = skb->mac.raw[0]; /* From physical header */ - ddp->deh_snode = skb->mac.raw[1]; /* From physical header */ + ddp->deh_dnode = skb_mac_header(skb)[0]; /* From physical header */ + ddp->deh_snode = skb_mac_header(skb)[1]; /* From physical header */ ddp->deh_dnet = ap->s_net; /* Network number */ ddp->deh_snet = ap->s_net; @@ -1522,7 +1522,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, /* Non routable, so force a drop if we slip up later */ ddp->deh_len_hops = htons(skb->len + (DDP_MAXHOPS << 10)); } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); return atalk_rcv(skb, dev, pt, orig_dev); freeit: @@ -1771,6 +1771,9 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: rc = sock_get_timestamp(sk, argp); break; + case SIOCGSTAMPNS: + rc = sock_get_timestampns(sk, argp); + break; /* Routing */ case SIOCADDRT: case SIOCDELRT: diff --git a/net/atm/br2684.c b/net/atm/br2684.c index ec4ebd3299e..0e9f00c5c89 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -173,7 +173,7 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, } skb_push(skb, minheadroom); if (brvcc->encaps == e_llc) - memcpy(skb->data, llc_oui_pid_pad, 10); + skb_copy_to_linear_data(skb, llc_oui_pid_pad, 10); else memset(skb->data, 0, 2); #endif /* FASTER_VERSION */ @@ -375,11 +375,11 @@ packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb) { if (brvcc->filter.netmask == 0) return 0; /* no filter in place */ - if (type == __constant_htons(ETH_P_IP) && + if (type == htons(ETH_P_IP) && (((struct iphdr *) (skb->data))->daddr & brvcc->filter. netmask) == brvcc->filter.prefix) return 0; - if (type == __constant_htons(ETH_P_ARP)) + if (type == htons(ETH_P_ARP)) return 0; /* TODO: we should probably filter ARPs too.. don't want to have * them returning values that don't make sense, or is that ok? @@ -458,7 +458,7 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) /* FIXME: tcpdump shows that pointer to mac header is 2 bytes earlier, than should be. What else should I set? 
*/ skb_pull(skb, plen); - skb->mac.raw = ((char *) (skb->data)) - ETH_HLEN; + skb_set_mac_header(skb, -ETH_HLEN); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_BR2684_FAST_TRANS skb->protocol = ((u16 *) skb->data)[-1]; diff --git a/net/atm/clip.c b/net/atm/clip.c index 8c382581608..876b77f1474 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -213,7 +213,7 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) return; } ATM_SKB(skb)->vcc = vcc; - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, llc_oui, sizeof (llc_oui))) @@ -702,7 +702,7 @@ static struct atm_dev atmarpd_dev = { .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 8ccee4591f6..7afd8e7754f 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -82,6 +82,9 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: /* borrowed from IP */ error = sock_get_timestamp(sk, argp); goto done; + case SIOCGSTAMPNS: /* borrowed from IP */ + error = sock_get_timestampns(sk, argp); + goto done; case ATM_SETSC: printk(KERN_WARNING "ATM_SETSC is obsolete\n"); error = 0; diff --git a/net/atm/lec.c b/net/atm/lec.c index 3d804d61f65..4dc5f2b8c43 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -283,8 +283,8 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev) } DPRINTK("skbuff head:%lx data:%lx tail:%lx end:%lx\n", - (long)skb->head, (long)skb->data, (long)skb->tail, - (long)skb->end); + (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), + (long)skb_end_pointer(skb)); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) lec_handle_bridge(skb, dev); @@ -576,8 +576,8 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) break; } skb2->len = sizeof(struct atmlec_msg); - memcpy(skb2->data, mesg, - sizeof(struct atmlec_msg)); + skb_copy_to_linear_data(skb2, mesg, + sizeof(*mesg)); atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); @@ -630,7 +630,7 @@ static struct atm_dev lecatm_dev = { .ops = &lecdev_ops, .type = "lec", .number = 999, /* dummy device number */ - .lock = SPIN_LOCK_UNLOCKED + .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock) }; /* @@ -825,7 +825,6 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) if (!hlist_empty(&priv->lec_arp_empty_ones)) { lec_arp_check_empties(priv, vcc, skb); } - skb->dev = dev; skb_pull(skb, 2); /* skip lec_id */ #ifdef CONFIG_TR if (priv->is_trdev) @@ -1338,7 +1337,7 @@ static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force, if (skb == NULL) return -1; skb->len = *sizeoftlvs; - memcpy(skb->data, *tlvs, *sizeoftlvs); + skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs); retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb); } return retval; @@ -1372,7 +1371,7 @@ static int lane2_associate_req(struct net_device *dev, u8 *lan_dst, if (skb == NULL) return 0; skb->len = sizeoftlvs; - memcpy(skb->data, tlvs, sizeoftlvs); + skb_copy_to_linear_data(skb, tlvs, sizeoftlvs); retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb); if (retval != 0) printk("lec.c: lane2_associate_req() failed\n"); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index cb3c004ff02..7c85aa551d5 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -504,11 +504,13 @@ static 
int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) tagged_llc_snap_hdr.tag = entry->ctrl_info.tag; skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ skb_push(skb, sizeof(tagged_llc_snap_hdr)); /* add LLC/SNAP header */ - memcpy(skb->data, &tagged_llc_snap_hdr, sizeof(tagged_llc_snap_hdr)); + skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr, + sizeof(tagged_llc_snap_hdr)); } else { skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ skb_push(skb, sizeof(struct llc_snap_hdr)); /* add LLC/SNAP header + tag */ - memcpy(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)); + skb_copy_to_linear_data(skb, &llc_snap_mpoa_data, + sizeof(struct llc_snap_hdr)); } atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc); @@ -711,11 +713,12 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) return; } skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */ - memcpy(new_skb->data, eg->ctrl_info.DLL_header, eg->ctrl_info.DH_length); + skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header, + eg->ctrl_info.DH_length); new_skb->protocol = eth_type_trans(new_skb, dev); - new_skb->nh.raw = new_skb->data; + skb_reset_network_header(new_skb); - eg->latest_ip_addr = new_skb->nh.iph->saddr; + eg->latest_ip_addr = ip_hdr(new_skb)->saddr; eg->packets_rcvd++; mpc->eg_ops->put(eg); @@ -734,7 +737,7 @@ static struct atm_dev mpc_dev = { .ops = &mpc_ops, .type = "mpc", .number = 42, - .lock = SPIN_LOCK_UNLOCKED + .lock = __SPIN_LOCK_UNLOCKED(mpc_dev.lock) /* members not explicitly initialised will be 0 */ }; @@ -936,7 +939,7 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) if (skb == NULL) return -ENOMEM; skb_put(skb, sizeof(struct k_message)); - memcpy(skb->data, mesg, sizeof(struct k_message)); + skb_copy_to_linear_data(skb, mesg, sizeof(*mesg)); atm_force_charge(mpc->mpoad_vcc, skb->truesize); sk = sk_atm(mpc->mpoad_vcc); diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 31d98b57e1d..d14baaf1f4c 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -256,7 +256,7 @@ static struct atm_dev sigd_dev = { .ops = &sigd_dev_ops, .type = "sig", .number = 999, - .lock = SPIN_LOCK_UNLOCKED + .lock = __SPIN_LOCK_UNLOCKED(sigd_dev.lock) }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1c07c6a50eb..6ded95272a5 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1127,22 +1127,22 @@ static int __must_check ax25_connect(struct socket *sock, switch (sk->sk_state) { case TCP_SYN_SENT: /* still trying */ err = -EINPROGRESS; - goto out; + goto out_release; case TCP_ESTABLISHED: /* connection established */ sock->state = SS_CONNECTED; - goto out; + goto out_release; case TCP_CLOSE: /* connection refused */ sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; - goto out; + goto out_release; } } if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) { err = -EISCONN; /* No reconnect on a seqpacket socket */ - goto out; + goto out_release; } sk->sk_state = TCP_CLOSE; @@ -1159,12 +1159,12 @@ static int __must_check ax25_connect(struct socket *sock, /* Valid number of digipeaters ? 
*/ if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) { err = -EINVAL; - goto out; + goto out_release; } if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { err = -ENOBUFS; - goto out; + goto out_release; } digi->ndigi = fsa->fsa_ax25.sax25_ndigis; @@ -1194,7 +1194,7 @@ static int __must_check ax25_connect(struct socket *sock, current->comm); if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { kfree(digi); - goto out; + goto out_release; } ax25_fillin_cb(ax25, ax25->ax25_dev); @@ -1203,7 +1203,7 @@ static int __must_check ax25_connect(struct socket *sock, if (ax25->ax25_dev == NULL) { kfree(digi); err = -EHOSTUNREACH; - goto out; + goto out_release; } } @@ -1213,7 +1213,7 @@ static int __must_check ax25_connect(struct socket *sock, kfree(digi); err = -EADDRINUSE; /* Already such a connection */ ax25_cb_put(ax25t); - goto out; + goto out_release; } ax25->dest_addr = fsa->fsa_ax25.sax25_call; @@ -1223,7 +1223,7 @@ static int __must_check ax25_connect(struct socket *sock, if (sk->sk_type != SOCK_SEQPACKET) { sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; - goto out; + goto out_release; } /* Move to connecting socket, ax.25 lapb WAIT_UA.. */ @@ -1255,55 +1255,53 @@ static int __must_check ax25_connect(struct socket *sock, /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; - goto out; + goto out_release; } if (sk->sk_state == TCP_SYN_SENT) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); - add_wait_queue(sk->sk_sleep, &wait); for (;;) { + prepare_to_wait(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; - set_current_state(TASK_INTERRUPTIBLE); - release_sock(sk); - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; + break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + + if (err) + goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { /* Not in ABM, not in WAIT_UA -> failed */ sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ - goto out; + goto out_release; } sock->state = SS_CONNECTED; - err=0; -out: + err = 0; +out_release: release_sock(sk); return err; } - static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); struct sk_buff *skb; struct sock *newsk; + DEFINE_WAIT(wait); struct sock *sk; int err = 0; @@ -1328,30 +1326,29 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) * The read queue this time is holding sockets ready to use * hooked into the SABM we saved */ - add_wait_queue(sk->sk_sleep, &wait); for (;;) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; - release_sock(sk); - current->state = TASK_INTERRUPTIBLE; if (flags & O_NONBLOCK) { - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -EWOULDBLOCK; + err = -EWOULDBLOCK; + break; } - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; 
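/*
 * Condensed, illustrative sketch (not taken from the patch) of the wait
 * loop these ax25_connect()/ax25_accept() hunks convert to: the
 * prepare_to_wait()/finish_wait() idiom re-checks the condition after
 * queueing on the wait queue, closing the missed-wakeup window left by
 * the old DECLARE_WAITQUEUE()/set_current_state() sequence.
 */
#include <linux/errno.h>
#include <linux/wait.h>
#include <net/sock.h>
#include <net/tcp_states.h>

static int example_wait_established(struct sock *sk)
{
	DEFINE_WAIT(wait);
	int err = 0;

	for (;;) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (sk->sk_state != TCP_SYN_SENT)
			break;			/* condition met */
		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}
		release_sock(sk);		/* drop lock while sleeping */
		schedule();
		lock_sock(sk);
	}
	finish_wait(sk->sk_sleep, &wait);
	return err;
}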
+ break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + + if (err) + goto out; newsk = skb->sk; newsk->sk_socket = newsock; @@ -1425,7 +1422,6 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, struct sockaddr_ax25 sax; struct sk_buff *skb; ax25_digi dtmp, *dp; - unsigned char *asmptr; ax25_cb *ax25; size_t size; int lv, err, addr_len = msg->msg_namelen; @@ -1548,13 +1544,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, goto out; } - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* Add the PID if one is not supplied by the user in the skb */ - if (!ax25->pidincl) { - asmptr = skb_push(skb, 1); - *asmptr = sk->sk_protocol; - } + if (!ax25->pidincl) + *skb_push(skb, 1) = sk->sk_protocol; SOCK_DEBUG(sk, "AX.25: Transmitting buffer\n"); @@ -1573,7 +1567,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, goto out; } - asmptr = skb_push(skb, 1 + ax25_addr_size(dp)); + skb_push(skb, 1 + ax25_addr_size(dp)); SOCK_DEBUG(sk, "Building AX.25 Header (dp=%p).\n", dp); @@ -1581,17 +1575,17 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, SOCK_DEBUG(sk, "Num digipeaters=%d\n", dp->ndigi); /* Build an AX.25 header */ - asmptr += (lv = ax25_addr_build(asmptr, &ax25->source_addr, - &sax.sax25_call, dp, - AX25_COMMAND, AX25_MODULUS)); + lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call, + dp, AX25_COMMAND, AX25_MODULUS); SOCK_DEBUG(sk, "Built header (%d bytes)\n",lv); - skb->h.raw = asmptr; + skb_set_transport_header(skb, lv); - SOCK_DEBUG(sk, "base=%p pos=%p\n", skb->data, asmptr); + SOCK_DEBUG(sk, "base=%p pos=%p\n", + skb->data, skb_transport_header(skb)); - *asmptr = AX25_UI; + *skb_transport_header(skb) = AX25_UI; /* Datagram frames go straight out of the door as UI */ ax25_queue_xmit(skb, ax25->ax25_dev->dev); @@ -1631,8 +1625,8 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, if (!ax25_sk(sk)->pidincl) skb_pull(skb, 1); /* Remove PID */ - skb->h.raw = skb->data; - copied = skb->len; + skb_reset_transport_header(skb); + copied = skb->len; if (copied > size) { copied = size; @@ -1645,9 +1639,10 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; ax25_digi digi; ax25_address src; + const unsigned char *mac = skb_mac_header(skb); - ax25_addr_parse(skb->mac.raw+1, skb->data-skb->mac.raw-1, &src, NULL, &digi, NULL, NULL); - + ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL, + &digi, NULL, NULL); sax->sax25_family = AF_AX25; /* We set this correctly, even though we may not let the application know the digi calls further down (because it @@ -1711,6 +1706,10 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) res = sock_get_timestamp(sk, argp); break; + case SIOCGSTAMPNS: + res = sock_get_timestampns(sk, argp); + break; + case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ case SIOCAX25GETUID: { diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 9569dd3fa46..a49773ff2b9 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -136,7 +136,7 @@ static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char p if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL) return; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); p = skb_put(skb, 2); *p++ = cmd; diff --git 
a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 4a6b26becad..0ddaff0df21 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -61,12 +61,14 @@ static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) skb_reserve(skbn, AX25_MAX_HEADER_LEN); skbn->dev = ax25->ax25_dev->dev; - skbn->h.raw = skbn->data; - skbn->nh.raw = skbn->data; + skb_reset_network_header(skbn); + skb_reset_transport_header(skbn); /* Copy data from the fragments */ while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) { - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); kfree_skb(skbo); } @@ -122,8 +124,8 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) } skb_pull(skb, 1); /* Remove PID */ - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); skb->dev = ax25->ax25_dev->dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); @@ -196,7 +198,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, * Process the AX.25/LAPB frame. */ - skb->h.raw = skb->data; + skb_reset_transport_header(skb); if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { kfree_skb(skb); @@ -233,7 +235,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, /* UI frame - bypass LAPB processing */ if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) { - skb->h.raw = skb->data + 2; /* skip control and pid */ + skb_set_transport_header(skb, 2); /* skip control and pid */ ax25_send_to_raw(&dest, skb, skb->data[1]); @@ -246,8 +248,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, switch (skb->data[1]) { case AX25_P_IP: skb_pull(skb,2); /* drop PID/CTRL */ - skb->h.raw = skb->data; - skb->nh.raw = skb->data; + skb_reset_transport_header(skb); + skb_reset_network_header(skb); skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); @@ -256,8 +258,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, case AX25_P_ARP: skb_pull(skb,2); - skb->h.raw = skb->data; - skb->nh.raw = skb->data; + skb_reset_transport_header(skb); + skb_reset_network_header(skb); skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_ARP); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 7f818bbcd1c..930e4918037 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -121,7 +121,7 @@ int ax25_rebuild_header(struct sk_buff *skb) digipeat = route->digipeat; dev = route->dev; ip_mode = route->ip_mode; - }; + } if (dev == NULL) dev = skb->dev; @@ -171,7 +171,7 @@ int ax25_rebuild_header(struct sk_buff *skb) src_c = *(ax25_address *)(bp + 8); skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */ - ourskb->nh.raw = ourskb->data; + skb_reset_network_header(ourskb); ax25=ax25_send_frame( ourskb, diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 223835092b7..92b517af726 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -148,8 +148,9 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) if (ka9qfrag == 1) { skb_reserve(skbn, frontlen + 2); - skbn->nh.raw = skbn->data + (skb->nh.raw - skb->data); - memcpy(skb_put(skbn, len), skb->data, len); + skb_set_network_header(skbn, + skb_network_offset(skb)); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); p = skb_push(skbn, 2); *p++ = AX25_P_SEGMENT; @@ -161,8 +162,9 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) } } else { skb_reserve(skbn, frontlen + 1); - skbn->nh.raw = skbn->data 
+ (skb->nh.raw - skb->data); - memcpy(skb_put(skbn, len), skb->data, len); + skb_set_network_header(skbn, + skb_network_offset(skb)); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); p = skb_push(skbn, 1); *p = AX25_P_TEXT; } @@ -205,7 +207,7 @@ static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit) if (skb == NULL) return; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); if (ax25->modulus == AX25_MODULUS) { frame = skb_push(skb, 1); diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index b6c577e3c91..5fe9b2a6697 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -162,7 +162,7 @@ void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type) skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* Assume a response - address structure for DTE */ if (ax25->modulus == AX25_MODULUS) { @@ -205,7 +205,7 @@ void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *des return; /* Next SABM will get DM'd */ skb_reserve(skb, dev->hard_header_len); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); ax25_digi_invert(digi, &retdigi); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index c7228cfc621..d942b946ba0 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -221,7 +221,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); skb_free_datagram(sk, skb); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index b85d1492c35..ab2db55982c 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -326,7 +326,7 @@ static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) return 0; } - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); /* Verify and pull out header */ if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) @@ -364,26 +364,28 @@ static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) case BNEP_COMPRESSED_SRC_ONLY: memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN); - memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), ETH_ALEN); put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); break; case BNEP_COMPRESSED_DST_ONLY: - memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN); - memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source, ETH_ALEN + 2); + memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), + ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source, + ETH_ALEN + 2); break; case BNEP_GENERAL: - memcpy(__skb_put(nskb, ETH_ALEN * 2), skb->mac.raw, ETH_ALEN * 2); + memcpy(__skb_put(nskb, ETH_ALEN * 2), skb_mac_header(skb), + ETH_ALEN * 2); put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); break; } - memcpy(__skb_put(nskb, skb->len), skb->data, skb->len); + skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len); kfree_skb(skb); s->stats.rx_packets++; - nskb->dev = dev; nskb->ip_summed = CHECKSUM_NONE; nskb->protocol = eth_type_trans(nskb, dev); netif_rx_ni(nskb); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 3933608a929..66bef1ccee2 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -124,7 +124,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const } if 
(skb && (skb->len > 0)) - memcpy(skb_put(nskb, skb->len), skb->data, skb->len); + skb_copy_from_linear_data(skb, skb_put(nskb, skb->len), skb->len); memcpy(skb_put(nskb, count), buf, count); @@ -256,7 +256,7 @@ static void cmtp_process_transmit(struct cmtp_session *session) hdr[2] = size >> 8; } - memcpy(skb_put(nskb, size), skb->data, size); + skb_copy_from_linear_data(skb, skb_put(nskb, size), size); skb_pull(skb, size); if (skb->len > 0) { diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f3403fdb59f..63980bd6b5f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -72,11 +72,11 @@ void hci_acl_connect(struct hci_conn *conn) inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; - cp.clock_offset = ie->data.clock_offset | __cpu_to_le16(0x8000); + cp.clock_offset = ie->data.clock_offset | cpu_to_le16(0x8000); memcpy(conn->dev_class, ie->data.dev_class, 3); } - cp.pkt_type = __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK); + cp.pkt_type = cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK); if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) cp.role_switch = 0x01; else @@ -107,7 +107,7 @@ void hci_acl_disconn(struct hci_conn *conn, __u8 reason) conn->state = BT_DISCONN; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_DISCONNECT, sizeof(cp), &cp); @@ -123,8 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle) conn->state = BT_CONNECT; conn->out = 1; - cp.pkt_type = __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); - cp.handle = __cpu_to_le16(handle); + cp.pkt_type = cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); + cp.handle = cpu_to_le16(handle); hci_send_cmd(hdev, OGF_LINK_CTL, OCF_ADD_SCO, sizeof(cp), &cp); } @@ -348,7 +348,7 @@ int hci_conn_auth(struct hci_conn *conn) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_AUTH_REQUESTED, sizeof(cp), &cp); } return 0; @@ -368,7 +368,7 @@ int hci_conn_encrypt(struct hci_conn *conn) if (hci_conn_auth(conn)) { struct hci_cp_set_conn_encrypt cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 1; hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp); } @@ -383,7 +383,7 @@ int hci_conn_change_link_key(struct hci_conn *conn) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_change_conn_link_key cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp); } return 0; @@ -423,7 +423,7 @@ void hci_conn_enter_active_mode(struct hci_conn *conn) if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { struct hci_cp_exit_sniff_mode cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_EXIT_SNIFF_MODE, sizeof(cp), &cp); } @@ -452,21 +452,21 @@ void hci_conn_enter_sniff_mode(struct hci_conn *conn) if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { struct hci_cp_sniff_subrate cp; - cp.handle = __cpu_to_le16(conn->handle); - cp.max_latency = __constant_cpu_to_le16(0); - cp.min_remote_timeout = __constant_cpu_to_le16(0); - cp.min_local_timeout = __constant_cpu_to_le16(0); 
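/*
 * Illustrative sketch (hypothetical command struct, not from the patch)
 * of the byte-order cleanup running through these Bluetooth hunks:
 * __cpu_to_le16() and __constant_cpu_to_le16() are legacy spellings;
 * plain cpu_to_le16() handles constant and variable arguments alike and
 * folds constants at compile time.
 */
#include <linux/types.h>
#include <asm/byteorder.h>

struct example_cp {
	__le16 handle;
	__le16 timeout;
};

static void example_fill(struct example_cp *cp, __u16 handle)
{
	cp->handle  = cpu_to_le16(handle);   /* variable argument */
	cp->timeout = cpu_to_le16(0x7d00);   /* constant: folded at build time */
}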
+ cp.handle = cpu_to_le16(conn->handle); + cp.max_latency = cpu_to_le16(0); + cp.min_remote_timeout = cpu_to_le16(0); + cp.min_local_timeout = cpu_to_le16(0); hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_SNIFF_SUBRATE, sizeof(cp), &cp); } if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { struct hci_cp_sniff_mode cp; - cp.handle = __cpu_to_le16(conn->handle); - cp.max_interval = __cpu_to_le16(hdev->sniff_max_interval); - cp.min_interval = __cpu_to_le16(hdev->sniff_min_interval); - cp.attempt = __constant_cpu_to_le16(4); - cp.timeout = __constant_cpu_to_le16(1); + cp.handle = cpu_to_le16(conn->handle); + cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); + cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); + cp.attempt = cpu_to_le16(4); + cp.timeout = cpu_to_le16(1); hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_SNIFF_MODE, sizeof(cp), &cp); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4917919d86a..aa4b56a8c3e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -149,7 +149,7 @@ static int __hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, default: err = -ETIMEDOUT; break; - }; + } hdev->req_status = hdev->req_result = 0; @@ -216,10 +216,10 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) /* Host buffer size */ { struct hci_cp_host_buffer_size cp; - cp.acl_mtu = __cpu_to_le16(HCI_MAX_ACL_SIZE); + cp.acl_mtu = cpu_to_le16(HCI_MAX_ACL_SIZE); cp.sco_mtu = HCI_MAX_SCO_SIZE; - cp.acl_max_pkt = __cpu_to_le16(0xffff); - cp.sco_max_pkt = __cpu_to_le16(0xffff); + cp.acl_max_pkt = cpu_to_le16(0xffff); + cp.sco_max_pkt = cpu_to_le16(0xffff); hci_send_cmd(hdev, OGF_HOST_CTL, OCF_HOST_BUFFER_SIZE, sizeof(cp), &cp); } #endif @@ -240,11 +240,11 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) } /* Page timeout ~20 secs */ - param = __cpu_to_le16(0x8000); + param = cpu_to_le16(0x8000); hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_PG_TIMEOUT, 2, &param); /* Connection accept timeout ~20 secs */ - param = __cpu_to_le16(0x7d00); + param = cpu_to_le16(0x7d00); hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_CA_TIMEOUT, 2, &param); } @@ -1034,7 +1034,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p } hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE); - hdr->opcode = __cpu_to_le16(hci_opcode_pack(ogf, ocf)); + hdr->opcode = cpu_to_le16(hci_opcode_pack(ogf, ocf)); hdr->plen = plen; if (plen) @@ -1060,7 +1060,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf) hdr = (void *) hdev->sent_cmd->data; - if (hdr->opcode != __cpu_to_le16(hci_opcode_pack(ogf, ocf))) + if (hdr->opcode != cpu_to_le16(hci_opcode_pack(ogf, ocf))) return NULL; BT_DBG("%s ogf 0x%x ocf 0x%x", hdev->name, ogf, ocf); @@ -1074,11 +1074,11 @@ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) struct hci_acl_hdr *hdr; int len = skb->len; - hdr = (struct hci_acl_hdr *) skb_push(skb, HCI_ACL_HDR_SIZE); - hdr->handle = __cpu_to_le16(hci_handle_pack(handle, flags)); - hdr->dlen = __cpu_to_le16(len); - - skb->h.raw = (void *) hdr; + skb_push(skb, HCI_ACL_HDR_SIZE); + skb_reset_transport_header(skb); + hdr = (struct hci_acl_hdr *)skb_transport_header(skb); + hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); + hdr->dlen = cpu_to_le16(len); } int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) @@ -1140,11 +1140,12 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) return -EINVAL; } - hdr.handle =
__cpu_to_le16(conn->handle); + hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; - skb->h.raw = skb_push(skb, HCI_SCO_HDR_SIZE); - memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); + skb_push(skb, HCI_SCO_HDR_SIZE); + skb_reset_transport_header(skb); + memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); skb->dev = (void *) hdev; bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; @@ -1387,7 +1388,7 @@ static void hci_rx_task(unsigned long arg) case HCI_SCODATA_PKT: kfree_skb(skb); continue; - }; + } } /* Process frame */ diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 936d3fc479c..447ba713122 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -783,7 +783,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn->type == ACL_LINK && hdev->link_policy) { struct hci_cp_write_link_policy cp; cp.handle = ev->handle; - cp.policy = __cpu_to_le16(hdev->link_policy); + cp.policy = cpu_to_le16(hdev->link_policy); hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_WRITE_LINK_POLICY, sizeof(cp), &cp); } @@ -793,8 +793,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s struct hci_cp_change_conn_ptype cp; cp.handle = ev->handle; cp.pkt_type = (conn->type == ACL_LINK) ? - __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK): - __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); + cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK): + cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); hci_send_cmd(hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_PTYPE, sizeof(cp), &cp); @@ -970,7 +970,7 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { if (!ev->status) { struct hci_cp_set_conn_encrypt cp; - cp.handle = __cpu_to_le16(conn->handle); + cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 1; hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 71f5cfbbebb..832b5f44be5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -375,7 +375,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); hci_sock_cmsg(sk, msg, skb); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e83ee82440d..a5867879b61 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -459,8 +459,8 @@ static void __l2cap_sock_close(struct sock *sk, int reason) sk->sk_state = BT_DISCONN; l2cap_sock_set_timer(sk, sk->sk_sndtimeo); - req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid); - req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.dcid = cpu_to_le16(l2cap_pi(sk)->dcid); + req.scid = cpu_to_le16(l2cap_pi(sk)->scid); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ, sizeof(req), &req); } else { @@ -652,7 +652,7 @@ static int l2cap_do_connect(struct sock *sk) if (sk->sk_type == SOCK_SEQPACKET) { struct l2cap_conn_req req; l2cap_pi(sk)->ident = l2cap_get_ident(conn); - req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); @@ -868,8 +868,8 @@ static inline int l2cap_do_send(struct sock *sk, struct msghdr *msg, int len) /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); - lh->cid = __cpu_to_le16(l2cap_pi(sk)->dcid); - lh->len 
= __cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + lh->cid = cpu_to_le16(l2cap_pi(sk)->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); if (sk->sk_type == SOCK_DGRAM) put_unaligned(l2cap_pi(sk)->psm, (u16 *) skb_put(skb, 2)); @@ -1096,7 +1096,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) } else if (sk->sk_state == BT_CONNECT) { struct l2cap_conn_req req; l2cap_pi(sk)->ident = l2cap_get_ident(conn); - req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); } @@ -1192,13 +1192,13 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, return NULL; lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); - lh->len = __cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen); - lh->cid = __cpu_to_le16(0x0001); + lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen); + lh->cid = cpu_to_le16(0x0001); cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE); cmd->code = code; cmd->ident = ident; - cmd->len = __cpu_to_le16(dlen); + cmd->len = cpu_to_le16(dlen); if (dlen) { count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE; @@ -1316,11 +1316,11 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) break; case 2: - *((u16 *) opt->val) = __cpu_to_le16(val); + *((u16 *) opt->val) = cpu_to_le16(val); break; case 4: - *((u32 *) opt->val) = __cpu_to_le32(val); + *((u32 *) opt->val) = cpu_to_le32(val); break; default: @@ -1346,8 +1346,8 @@ static int l2cap_build_conf_req(struct sock *sk, void *data) //if (flush_to != L2CAP_DEFAULT_FLUSH_TO) // l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, 2, pi->flush_to); - req->dcid = __cpu_to_le16(pi->dcid); - req->flags = __cpu_to_le16(0); + req->dcid = cpu_to_le16(pi->dcid); + req->flags = cpu_to_le16(0); return ptr - data; } @@ -1383,9 +1383,9 @@ static int l2cap_build_conf_rsp(struct sock *sk, void *data, int *result) else flags = 0x0001; - rsp->scid = __cpu_to_le16(l2cap_pi(sk)->dcid); - rsp->result = __cpu_to_le16(result ? *result : 0); - rsp->flags = __cpu_to_le16(flags); + rsp->scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp->result = cpu_to_le16(result ? 
*result : 0); + rsp->flags = cpu_to_le16(flags); return ptr - data; } @@ -1470,10 +1470,10 @@ response: bh_unlock_sock(parent); sendresp: - rsp.scid = __cpu_to_le16(scid); - rsp.dcid = __cpu_to_le16(dcid); - rsp.result = __cpu_to_le16(result); - rsp.status = __cpu_to_le16(status); + rsp.scid = cpu_to_le16(scid); + rsp.dcid = cpu_to_le16(dcid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(status); l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); return 0; } @@ -1613,8 +1613,8 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr l2cap_sock_set_timer(sk, HZ * 5); { struct l2cap_disconn_req req; - req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid); - req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.dcid = cpu_to_le16(l2cap_pi(sk)->dcid); + req.scid = cpu_to_le16(l2cap_pi(sk)->scid); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ, sizeof(req), &req); } @@ -1652,8 +1652,8 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, dcid))) return 0; - rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); - rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); sk->sk_shutdown = SHUTDOWN_MASK; @@ -1696,8 +1696,8 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm BT_DBG("type 0x%4.4x", type); - rsp.type = __cpu_to_le16(type); - rsp.result = __cpu_to_le16(L2CAP_IR_NOTSUPP); + rsp.type = cpu_to_le16(type); + rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp), &rsp); return 0; @@ -1794,7 +1794,7 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn, struct sk_buff *sk BT_DBG("error %d", err); /* FIXME: Map err to a valid reason */ - rej.reason = __cpu_to_le16(0); + rej.reason = cpu_to_le16(0); l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); } @@ -1993,10 +1993,10 @@ static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) result = L2CAP_CR_SEC_BLOCK; } - rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); - rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); - rsp.result = __cpu_to_le16(result); - rsp.status = __cpu_to_le16(0); + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(0); l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); @@ -2041,10 +2041,10 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status) result = L2CAP_CR_SEC_BLOCK; } - rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); - rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); - rsp.result = __cpu_to_le16(result); - rsp.status = __cpu_to_le16(0); + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(0); l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); @@ -2107,7 +2107,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl if (!(conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC))) goto drop; - memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len); + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), + skb->len); conn->rx_len = len - skb->len; } else { BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); @@ -2128,7 
+2129,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl goto drop; } - memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len); + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), + skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 94f45736056..fe7df90eb70 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1567,7 +1567,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) /* Trim FCS */ skb->len--; skb->tail--; - fcs = *(u8 *) skb->tail; + fcs = *(u8 *)skb_tail_pointer(skb); if (__check_fcs(skb->data, type, fcs)) { BT_ERR("bad checksum in packet"); @@ -1851,18 +1851,18 @@ static void rfcomm_worker(void) BT_DBG(""); while (!atomic_read(&terminate)) { + set_current_state(TASK_INTERRUPTIBLE); if (!test_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event)) { /* No pending events. Let's sleep. * Incoming connections and data will wake us up. */ - set_current_state(TASK_INTERRUPTIBLE); schedule(); } + set_current_state(TASK_RUNNING); /* Process stuff */ clear_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event); rfcomm_process_sessions(); } - set_current_state(TASK_RUNNING); return; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ae439144095..3f5163e725e 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -393,7 +393,7 @@ static void sco_sock_close(struct sock *sk) default: sock_set_flag(sk, SOCK_ZAPPED); break; - }; + } release_sock(sk); diff --git a/net/bridge/br.c b/net/bridge/br.c index 2994387999a..848b8fa8bed 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -37,7 +37,9 @@ static int __init br_init(void) return -EADDRINUSE; } - br_fdb_init(); + err = br_fdb_init(); + if (err) + goto err_out1; err = br_netfilter_init(); if (err) @@ -47,7 +49,10 @@ static int __init br_init(void) if (err) goto err_out2; - br_netlink_init(); + err = br_netlink_init(); + if (err) + goto err_out3; + brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; @@ -55,7 +60,8 @@ static int __init br_init(void) br_fdb_put_hook = br_fdb_put; return 0; - +err_out3: + unregister_netdevice_notifier(&br_device_notifier); err_out2: br_netfilter_fini(); err_out1: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 905a39c33a1..5e1892d8d87 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -37,7 +37,7 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br->statistics.tx_packets++; br->statistics.tx_bytes += skb->len; - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); if (dest[0] & 1) @@ -83,27 +83,21 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) return 0; } -/* Allow setting mac address of pseudo-bridge to be same as - * any of the bound interfaces - */ +/* Allow setting mac address to any valid ethernet address. 
*/ static int br_set_mac_address(struct net_device *dev, void *p) { struct net_bridge *br = netdev_priv(dev); struct sockaddr *addr = p; - struct net_bridge_port *port; - int err = -EADDRNOTAVAIL; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EINVAL; spin_lock_bh(&br->lock); - list_for_each_entry(port, &br->port_list, list) { - if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) { - br_stp_change_bridge_id(br, addr->sa_data); - err = 0; - break; - } - } + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + br_stp_change_bridge_id(br, addr->sa_data); spin_unlock_bh(&br->lock); - return err; + return 0; } static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 8d566c13cc7..91b017016d5 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -20,19 +20,28 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/jhash.h> +#include <linux/random.h> #include <asm/atomic.h> +#include <asm/unaligned.h> #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr); -void __init br_fdb_init(void) +static u32 fdb_salt __read_mostly; + +int __init br_fdb_init(void) { br_fdb_cache = kmem_cache_create("bridge_fdb_cache", sizeof(struct net_bridge_fdb_entry), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!br_fdb_cache) + return -ENOMEM; + + get_random_bytes(&fdb_salt, sizeof(fdb_salt)); + return 0; } void __exit br_fdb_fini(void) @@ -44,24 +53,26 @@ void __exit br_fdb_fini(void) /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) */ -static __inline__ unsigned long hold_time(const struct net_bridge *br) +static inline unsigned long hold_time(const struct net_bridge *br) { return br->topology_change ? br->forward_delay : br->ageing_time; } -static __inline__ int has_expired(const struct net_bridge *br, +static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { return !fdb->is_static && time_before_eq(fdb->ageing_timer + hold_time(br), jiffies); } -static __inline__ int br_mac_hash(const unsigned char *mac) +static inline int br_mac_hash(const unsigned char *mac) { - return jhash(mac, ETH_ALEN, 0) & (BR_HASH_SIZE - 1); + /* use 1 byte of OUI cnd 3 bytes of NIC */ + u32 key = get_unaligned((u32 *)(mac + 2)); + return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1); } -static __inline__ void fdb_delete(struct net_bridge_fdb_entry *f) +static inline void fdb_delete(struct net_bridge_fdb_entry *f) { hlist_del_rcu(&f->hlist); br_fdb_put(f); @@ -128,7 +139,26 @@ void br_fdb_cleanup(unsigned long _data) mod_timer(&br->gc_timer, jiffies + HZ/10); } +/* Completely flush all dynamic entries in forwarding database.*/ +void br_fdb_flush(struct net_bridge *br) +{ + int i; + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct net_bridge_fdb_entry *f; + struct hlist_node *h, *n; + hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + if (!f->is_static) + fdb_delete(f); + } + } + spin_unlock_bh(&br->hash_lock); +} + +/* Flush all entries refering to a specific port. 
+ * if do_all is set also flush static entries + */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, int do_all) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 3e45c1a1aa9..ada7f495445 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -71,7 +71,7 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) indev = skb->dev; skb->dev = to->dev; - skb->ip_summed = CHECKSUM_NONE; + skb_forward_csum(skb); NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, br_forward_finish); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3a2e29be40..849deaf1410 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -152,6 +152,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_ifinfo_notify(RTM_DELLINK, p); + br_fdb_delete_by_port(br, p, 1); list_del_rcu(&p->list); @@ -203,7 +205,7 @@ static struct net_device *new_bridge_dev(const char *name) memcpy(br->group_addr, br_group_address, ETH_ALEN); br->feature_mask = dev->features; - br->stp_enabled = 0; + br->stp_enabled = BR_NO_STP; br->designated_root = br->bridge_id; br->root_path_cost = 0; br->root_port = 0; @@ -434,6 +436,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) br_stp_enable_port(p); spin_unlock_bh(&br->lock); + br_ifinfo_notify(RTM_NEWLINK, p); + dev_set_mtu(br->dev, br_min_mtu(br)); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -471,11 +475,9 @@ void __exit br_cleanup_bridges(void) struct net_device *dev, *nxt; rtnl_lock(); - for (dev = dev_base; dev; dev = nxt) { - nxt = dev->next; + for_each_netdev_safe(dev, nxt) if (dev->priv_flags & IFF_EBRIDGE) del_br(dev->priv); - } rtnl_unlock(); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 35b94f9a1ac..420bbb9955e 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -112,46 +112,59 @@ static int br_handle_local_finish(struct sk_buff *skb) */ static inline int is_link_local(const unsigned char *dest) { - return memcmp(dest, br_group_address, 5) == 0 && (dest[5] & 0xf0) == 0; + const u16 *a = (const u16 *) dest; + static const u16 *const b = (const u16 *const ) br_group_address; + static const u16 m = __constant_cpu_to_be16(0xfff0); + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; } /* * Called via br_handle_frame_hook. 
- * Return 0 if *pskb should be processed furthur - * 1 if *pskb is handled + * Return NULL if skb is handled * note: already called with rcu_read_lock (preempt_disabled) */ -int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) +struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) { - struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) - goto err; + goto drop; if (unlikely(is_link_local(dest))) { - skb->pkt_type = PACKET_HOST; - return NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, - NULL, br_handle_local_finish) != 0; + /* Pause frames shouldn't be passed up by driver anyway */ + if (skb->protocol == htons(ETH_P_PAUSE)) + goto drop; + + /* Process STP BPDU's through normal netif_receive_skb() path */ + if (p->br->stp_enabled != BR_NO_STP) { + if (NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, + NULL, br_handle_local_finish)) + return NULL; + else + return skb; + } } - if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { + switch (p->state) { + case BR_STATE_FORWARDING: + if (br_should_route_hook) { - if (br_should_route_hook(pskb)) - return 0; - skb = *pskb; + if (br_should_route_hook(&skb)) + return skb; dest = eth_hdr(skb)->h_dest; } - + /* fall through */ + case BR_STATE_LEARNING: if (!compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_handle_frame_finish); - return 1; + break; + default: +drop: + kfree_skb(skb); } - -err: - kfree_skb(skb); - return 1; + return NULL; } diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 147015fe5c7..bb15e9e259b 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -27,7 +27,9 @@ static int get_bridge_ifindices(int *indices, int num) struct net_device *dev; int i = 0; - for (dev = dev_base; dev && i < num; dev = dev->next) { + for_each_netdev(dev) { + if (i >= num) + break; if (dev->priv_flags & IFF_EBRIDGE) indices[i++] = dev->ifindex; } @@ -137,7 +139,8 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) b.topology_change = br->topology_change; b.topology_change_detected = br->topology_change_detected; b.root_port = br->root_port; - b.stp_enabled = br->stp_enabled; + + b.stp_enabled = (br->stp_enabled != BR_NO_STP); b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); @@ -251,7 +254,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - br->stp_enabled = args[1]?1:0; + br_stp_set_enabled(br, args[1]); return 0; case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 5439a3c46c3..fa779874b9d 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -29,6 +29,8 @@ #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> @@ -48,8 +50,8 @@ #define skb_origaddr(skb) (((struct bridge_skb_cb *) \ (skb->nf_bridge->data))->daddr.ipv4) -#define store_orig_dstaddr(skb) (skb_origaddr(skb) = (skb)->nh.iph->daddr) -#define dnat_took_place(skb) (skb_origaddr(skb) != (skb)->nh.iph->daddr) +#define store_orig_dstaddr(skb) (skb_origaddr(skb) = ip_hdr(skb)->daddr) 
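/* These two macros work as a pair: store_orig_dstaddr() saves the IPv4
 * destination into the nf_bridge scratch area before the bridge's
 * PREROUTING processing, and dnat_took_place() (next line) checks
 * afterwards whether iptables DNAT rewrote it, in which case the frame
 * must be re-routed rather than bridged (see the bridged_dnat path in
 * br_nf_pre_routing_finish() further down). */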
+#define dnat_took_place(skb) (skb_origaddr(skb) != ip_hdr(skb)->daddr) #ifdef CONFIG_SYSCTL static struct ctl_table_header *brnf_sysctl_header; @@ -57,8 +59,10 @@ static int brnf_call_iptables __read_mostly = 1; static int brnf_call_ip6tables __read_mostly = 1; static int brnf_call_arptables __read_mostly = 1; static int brnf_filter_vlan_tagged __read_mostly = 1; +static int brnf_filter_pppoe_tagged __read_mostly = 1; #else #define brnf_filter_vlan_tagged 1 +#define brnf_filter_pppoe_tagged 1 #endif static inline __be16 vlan_proto(const struct sk_buff *skb) @@ -81,6 +85,22 @@ static inline __be16 vlan_proto(const struct sk_buff *skb) vlan_proto(skb) == htons(ETH_P_ARP) && \ brnf_filter_vlan_tagged) +static inline __be16 pppoe_proto(const struct sk_buff *skb) +{ + return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); +} + +#define IS_PPPOE_IP(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IP) && \ + brnf_filter_pppoe_tagged) + +#define IS_PPPOE_IPV6(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IPV6) && \ + brnf_filter_pppoe_tagged) + /* We need these fake structures to make netfilter happy -- * lots of places assume that skb->dst != NULL, which isn't * all that unreasonable. @@ -122,14 +142,36 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) return skb->nf_bridge; } -static inline void nf_bridge_save_header(struct sk_buff *skb) +static inline void nf_bridge_push_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_push(skb, len); + skb->network_header -= len; +} + +static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) { - int header_size = ETH_HLEN; + unsigned int len = nf_bridge_encap_header_len(skb); - if (skb->protocol == htons(ETH_P_8021Q)) - header_size += VLAN_HLEN; + skb_pull(skb, len); + skb->network_header += len; +} + +static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull_rcsum(skb, len); + skb->network_header += len; +} - memcpy(skb->nf_bridge->data, skb->data - header_size, header_size); +static inline void nf_bridge_save_header(struct sk_buff *skb) +{ + int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + + skb_copy_from_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); } /* @@ -139,19 +181,15 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) int nf_bridge_copy_header(struct sk_buff *skb) { int err; - int header_size = ETH_HLEN; - - if (skb->protocol == htons(ETH_P_8021Q)) - header_size += VLAN_HLEN; + int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); err = skb_cow(skb, header_size); if (err) return err; - memcpy(skb->data - header_size, skb->nf_bridge->data, header_size); - - if (skb->protocol == htons(ETH_P_8021Q)) - __skb_push(skb, VLAN_HLEN); + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); return 0; } @@ -172,10 +210,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) dst_hold(skb->dst); skb->dev = nf_bridge->physindev; - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_push(skb, VLAN_HLEN); - skb->nh.raw -= VLAN_HLEN; - } + nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_handle_frame_finish, 1); @@ -253,10 +288,7 @@ static int br_nf_pre_routing_finish_bridge(struct 
sk_buff *skb) if (!skb->dev) kfree_skb(skb); else { - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_pull(skb, VLAN_HLEN); - skb->nh.raw += VLAN_HLEN; - } + nf_bridge_pull_encap_header(skb); skb->dst->output(skb); } return 0; @@ -265,7 +297,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) static int br_nf_pre_routing_finish(struct sk_buff *skb) { struct net_device *dev = skb->dev; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = skb->nf_bridge; int err; @@ -322,11 +354,7 @@ bridged_dnat: * bridged frame */ nf_bridge->mask |= BRNF_BRIDGED_DNAT; skb->dev = nf_bridge->physindev; - if (skb->protocol == - htons(ETH_P_8021Q)) { - skb_push(skb, VLAN_HLEN); - skb->nh.raw -= VLAN_HLEN; - } + nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge, @@ -342,10 +370,7 @@ bridged_dnat: } skb->dev = nf_bridge->physindev; - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_push(skb, VLAN_HLEN); - skb->nh.raw -= VLAN_HLEN; - } + nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_handle_frame_finish, 1); @@ -372,9 +397,10 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb) /* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ static int check_hbh_len(struct sk_buff *skb) { - unsigned char *raw = (u8 *) (skb->nh.ipv6h + 1); + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); u32 pkt_len; - int off = raw - skb->nh.raw; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; int len = (raw[1] + 1) << 3; if ((raw + len) - skb->data > skb_headlen(skb)) @@ -384,9 +410,9 @@ static int check_hbh_len(struct sk_buff *skb) len -= 2; while (len > 0) { - int optlen = skb->nh.raw[off + 1] + 2; + int optlen = nh[off + 1] + 2; - switch (skb->nh.raw[off]) { + switch (nh[off]) { case IPV6_TLV_PAD0: optlen = 1; break; @@ -395,17 +421,18 @@ static int check_hbh_len(struct sk_buff *skb) break; case IPV6_TLV_JUMBO: - if (skb->nh.raw[off + 1] != 4 || (off & 3) != 2) + if (nh[off + 1] != 4 || (off & 3) != 2) goto bad; - pkt_len = ntohl(*(__be32 *) (skb->nh.raw + off + 2)); + pkt_len = ntohl(*(__be32 *) (nh + off + 2)); if (pkt_len <= IPV6_MAXPLEN || - skb->nh.ipv6h->payload_len) + ipv6_hdr(skb)->payload_len) goto bad; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) goto bad; if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto bad; + nh = skb_network_header(skb); break; default: if (optlen > len) @@ -439,7 +466,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto inhdr_error; - hdr = skb->nh.ipv6h; + hdr = ipv6_hdr(skb); if (hdr->version != 6) goto inhdr_error; @@ -485,18 +512,15 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, __u32 len; struct sk_buff *skb = *pskb; - if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb)) { + if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || + IS_PPPOE_IPV6(skb)) { #ifdef CONFIG_SYSCTL if (!brnf_call_ip6tables) return NF_ACCEPT; #endif if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL) goto out; - - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_pull_rcsum(skb, VLAN_HLEN); - skb->nh.raw += VLAN_HLEN; - } + nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn); } #ifdef CONFIG_SYSCTL @@ -504,28 +528,25 @@ static unsigned int br_nf_pre_routing(unsigned int 
hook, struct sk_buff **pskb, return NF_ACCEPT; #endif - if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb)) + if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) && + !IS_PPPOE_IP(skb)) return NF_ACCEPT; if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL) goto out; - - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_pull_rcsum(skb, VLAN_HLEN); - skb->nh.raw += VLAN_HLEN; - } + nf_bridge_pull_encap_header_rcsum(skb); if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; if (!pskb_may_pull(skb, 4 * iph->ihl)) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0) goto inhdr_error; @@ -591,10 +612,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) } else { in = *((struct net_device **)(skb->cb)); } - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_push(skb, VLAN_HLEN); - skb->nh.raw -= VLAN_HLEN; - } + nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(PF_BRIDGE, NF_BR_FORWARD, skb, in, skb->dev, br_forward_finish, 1); return 0; @@ -622,15 +640,13 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb, if (!parent) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) + if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || + IS_PPPOE_IP(skb)) pf = PF_INET; else pf = PF_INET6; - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_pull(*pskb, VLAN_HLEN); - (*pskb)->nh.raw += VLAN_HLEN; - } + nf_bridge_pull_encap_header(*pskb); nf_bridge = skb->nf_bridge; if (skb->pkt_type == PACKET_OTHERHOST) { @@ -664,15 +680,12 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb, if (skb->protocol != htons(ETH_P_ARP)) { if (!IS_VLAN_ARP(skb)) return NF_ACCEPT; - skb_pull(*pskb, VLAN_HLEN); - (*pskb)->nh.raw += VLAN_HLEN; + nf_bridge_pull_encap_header(*pskb); } - if (skb->nh.arph->ar_pln != 4) { - if (IS_VLAN_ARP(skb)) { - skb_push(*pskb, VLAN_HLEN); - (*pskb)->nh.raw -= VLAN_HLEN; - } + if (arp_hdr(skb)->ar_pln != 4) { + if (IS_VLAN_ARP(skb)) + nf_bridge_push_encap_header(*pskb); return NF_ACCEPT; } *d = (struct net_device *)in; @@ -719,10 +732,7 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb, skb->pkt_type = PACKET_OTHERHOST; nf_bridge->mask ^= BRNF_PKT_TYPE; } - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_push(skb, VLAN_HLEN); - skb->nh.raw -= VLAN_HLEN; - } + nf_bridge_push_encap_header(skb); NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, realindev, skb->dev, br_forward_finish); @@ -753,7 +763,8 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, #ifdef CONFIG_NETFILTER_DEBUG /* Be very paranoid. This probably won't happen anymore, but let's * keep the check just to be sure... */ - if (skb->mac.raw < skb->head || skb->mac.raw + ETH_HLEN > skb->data) { + if (skb_mac_header(skb) < skb->head || + skb_mac_header(skb) + ETH_HLEN > skb->data) { printk(KERN_CRIT "br_netfilter: Argh!! 
br_nf_post_routing: " "bad mac.raw pointer.\n"); goto print_error; @@ -766,7 +777,8 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, if (!realoutdev) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) + if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || + IS_PPPOE_IP(skb)) pf = PF_INET; else pf = PF_INET6; @@ -785,11 +797,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, nf_bridge->mask |= BRNF_PKT_TYPE; } - if (skb->protocol == htons(ETH_P_8021Q)) { - skb_pull(skb, VLAN_HLEN); - skb->nh.raw += VLAN_HLEN; - } - + nf_bridge_pull_encap_header(skb); nf_bridge_save_header(skb); #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) @@ -808,7 +816,7 @@ print_error: if (realoutdev) printk("[%s]", realoutdev->name); } - printk(" head:%p, raw:%p, data:%p\n", skb->head, skb->mac.raw, + printk(" head:%p, raw:%p, data:%p\n", skb->head, skb_mac_header(skb), skb->data); dump_stack(); return NF_ACCEPT; @@ -925,6 +933,14 @@ static ctl_table brnf_table[] = { .mode = 0644, .proc_handler = &brnf_sysctl_call_tables, }, + { + .ctl_name = NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, + .procname = "bridge-nf-filter-pppoe-tagged", + .data = &brnf_filter_pppoe_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &brnf_sysctl_call_tables, + }, { .ctl_name = 0 } }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7d68b24b565..0fcf6f07306 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -11,8 +11,7 @@ */ #include <linux/kernel.h> -#include <linux/rtnetlink.h> -#include <net/netlink.h> +#include <net/rtnetlink.h> #include "br_private.h" static inline size_t br_nlmsg_size(void) @@ -110,8 +109,8 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; int idx; - read_lock(&dev_base_lock); - for (dev = dev_base, idx = 0; dev; dev = dev->next) { + idx = 0; + for_each_netdev(dev) { /* not a bridge port */ if (dev->br_port == NULL || idx < cb->args[0]) goto skip; @@ -123,7 +122,6 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) skip: ++idx; } - read_unlock(&dev_base_lock); cb->args[0] = idx; @@ -166,7 +164,7 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -EINVAL; /* if kernel STP is running, don't allow changes */ - if (p->br->stp_enabled) + if (p->br->stp_enabled == BR_KERNEL_STP) return -EBUSY; if (!netif_running(dev) || @@ -179,18 +177,19 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } -static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = { - [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, }, - [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, }, -}; - -void __init br_netlink_init(void) +int __init br_netlink_init(void) { - rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table; + if (__rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo)) + return -ENOBUFS; + + /* Only the first call to __rtnl_register can fail */ + __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL); + + return 0; } void __exit br_netlink_fini(void) { - rtnetlink_links[PF_BRIDGE] = NULL; + rtnl_unregister_all(PF_BRIDGE); } diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 37357ed2149..c8451d3a070 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -50,7 +50,6 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v case NETDEV_CHANGEADDR: 
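/* A port's MAC address changed: refresh its fdb entry and recompute the
 * bridge id here; the RTM_NEWLINK notification now happens once, in the
 * common "events that may cause spanning tree to refresh" block added at
 * the end of br_device_event() below. */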
spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); - br_ifinfo_notify(RTM_NEWLINK, p); br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); break; @@ -74,10 +73,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; case NETDEV_UP: - spin_lock_bh(&br->lock); - if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) + if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) { + spin_lock_bh(&br->lock); br_stp_enable_port(p); - spin_unlock_bh(&br->lock); + spin_unlock_bh(&br->lock); + } break; case NETDEV_UNREGISTER: @@ -85,5 +85,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; } + /* Events that may cause spanning tree to refresh */ + if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || + event == NETDEV_CHANGE || event == NETDEV_DOWN) + br_ifinfo_notify(RTM_NEWLINK, p); + return NOTIFY_DONE; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index cc3f1c99261..21bf3a9a03f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -26,7 +26,10 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) -#define BR_VERSION "2.2" +#define BR_VERSION "2.3" + +/* Path to usermode spanning tree program */ +#define BR_STP_PROG "/sbin/bridge-stp" typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; @@ -107,7 +110,13 @@ struct net_bridge u8 group_addr[ETH_ALEN]; u16 root_port; - unsigned char stp_enabled; + + enum { + BR_NO_STP, /* no spanning tree */ + BR_KERNEL_STP, /* old STP in kernel */ + BR_USER_STP, /* new RSTP in userspace */ + } stp_enabled; + unsigned char topology_change; unsigned char topology_change_detected; @@ -127,14 +136,14 @@ static inline int br_is_root_bridge(const struct net_bridge *br) return !memcmp(&br->bridge_id, &br->designated_root, 8); } - /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev); /* br_fdb.c */ -extern void br_fdb_init(void); +extern int br_fdb_init(void); extern void br_fdb_fini(void); +extern void br_fdb_flush(struct net_bridge *br); extern void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); extern void br_fdb_cleanup(unsigned long arg); @@ -182,7 +191,8 @@ extern void br_features_recompute(struct net_bridge *br); /* br_input.c */ extern int br_handle_frame_finish(struct sk_buff *skb); -extern int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb); +extern struct sk_buff *br_handle_frame(struct net_bridge_port *p, + struct sk_buff *skb); /* br_ioctl.c */ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); @@ -207,6 +217,7 @@ extern void br_become_designated_port(struct net_bridge_port *p); /* br_stp_if.c */ extern void br_stp_enable_bridge(struct net_bridge *br); extern void br_stp_disable_bridge(struct net_bridge *br); +extern void br_stp_set_enabled(struct net_bridge *br, unsigned long val); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); extern void br_stp_recalculate_bridge_id(struct net_bridge *br); @@ -235,7 +246,7 @@ extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); /* br_netlink.c */ -extern void br_netlink_init(void); +extern int br_netlink_init(void); extern void br_netlink_fini(void); extern void br_ifinfo_notify(int event, struct net_bridge_port *port); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index f9ff4d57b0d..ebb0861e9bd 100644 --- 
a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -370,11 +370,11 @@ static void br_make_blocking(struct net_bridge_port *p) static void br_make_forwarding(struct net_bridge_port *p) { if (p->state == BR_STATE_BLOCKING) { - if (p->br->stp_enabled) { + if (p->br->stp_enabled == BR_KERNEL_STP) p->state = BR_STATE_LISTENING; - } else { + else p->state = BR_STATE_LEARNING; - } + br_log_state(p); mod_timer(&p->forward_delay_timer, jiffies + p->br->forward_delay); } } @@ -384,6 +384,10 @@ void br_port_state_selection(struct net_bridge *br) { struct net_bridge_port *p; + /* Don't change port states if userspace is handling STP */ + if (br->stp_enabled == BR_USER_STP) + return; + list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED) { if (p->port_no == br->root_port) { diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index b9fb0dc4ab1..60112bce669 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -33,9 +33,6 @@ static void br_send_bpdu(struct net_bridge_port *p, { struct sk_buff *skb; - if (!p->br->stp_enabled) - return; - skb = dev_alloc_skb(length+LLC_RESERVE); if (!skb) return; @@ -75,6 +72,9 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) { unsigned char buf[35]; + if (p->br->stp_enabled != BR_KERNEL_STP) + return; + buf[0] = 0; buf[1] = 0; buf[2] = 0; @@ -117,6 +117,9 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) { unsigned char buf[4]; + if (p->br->stp_enabled != BR_KERNEL_STP) + return; + buf[0] = 0; buf[1] = 0; buf[2] = 0; @@ -157,9 +160,13 @@ int br_stp_rcv(struct sk_buff *skb, struct net_device *dev, br = p->br; spin_lock(&br->lock); - if (p->state == BR_STATE_DISABLED || !br->stp_enabled || !(br->dev->flags & IFF_UP)) + if (br->stp_enabled != BR_KERNEL_STP) + goto out; + + if (!(br->dev->flags & IFF_UP)) + goto out; + + if (p->state == BR_STATE_DISABLED) goto out; if (compare_ether_addr(dest, br->group_addr) != 0) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a285897a2fb..3e246b37020 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -87,7 +87,6 @@ void br_stp_disable_bridge(struct net_bridge *br) void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); - br_ifinfo_notify(RTM_NEWLINK, p); br_port_state_selection(p->br); } @@ -101,8 +100,6 @@ void br_stp_disable_port(struct net_bridge_port *p) printk(KERN_INFO "%s: port %i(%s) entering %s state\n", br->dev->name, p->port_no, p->dev->name, "disabled"); - br_ifinfo_notify(RTM_DELLINK, p); - wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; @@ -123,6 +120,62 @@ void br_stp_disable_port(struct net_bridge_port *p) br_become_root_bridge(br); } +static void br_stp_start(struct net_bridge *br) +{ + int r; + char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; + char *envp[] = { NULL }; + + r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); + if (r == 0) { + br->stp_enabled = BR_USER_STP; + printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); + } else { + br->stp_enabled = BR_KERNEL_STP; + printk(KERN_INFO "%s: starting userspace STP failed, " + "starting kernel STP\n", br->dev->name); + + /* To start timers on any ports left in blocking */ + spin_lock_bh(&br->lock); + br_port_state_selection(br); + spin_unlock_bh(&br->lock); + } +} + +static void br_stp_stop(struct net_bridge *br) +{ + int r; + char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; + char *envp[] = { NULL }; + + if (br->stp_enabled == BR_USER_STP) {
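/* Ask the userspace STP daemon to stop; whatever the helper returns, the
 * result is logged and the bridge reverts to BR_NO_STP below. */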
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); + printk(KERN_INFO "%s: userspace STP stopped, return code %d\n", + br->dev->name, r); + + + /* To start timers on any ports left in blocking */ + spin_lock_bh(&br->lock); + br_port_state_selection(br); + spin_unlock_bh(&br->lock); + } + + br->stp_enabled = BR_NO_STP; +} + +void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +{ + ASSERT_RTNL(); + + if (val) { + if (br->stp_enabled == BR_NO_STP) + br_stp_start(br); + } else { + if (br->stp_enabled != BR_NO_STP) + br_stp_stop(br); + } +} + /* called under bridge lock */ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 01a22ad0cc7..33c6c4a7c68 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -149,7 +149,11 @@ static ssize_t show_stp_state(struct device *d, static void set_stp_state(struct net_bridge *br, unsigned long val) { - br->stp_enabled = val; + rtnl_lock(); + spin_unlock_bh(&br->lock); + br_stp_set_enabled(br, val); + spin_lock_bh(&br->lock); + rtnl_unlock(); } static ssize_t store_stp_state(struct device *d, @@ -309,6 +313,19 @@ static ssize_t store_group_addr(struct device *d, static DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR, show_group_addr, store_group_addr); +static ssize_t store_flush(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_bridge *br = to_bridge(d); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br_fdb_flush(br); + return len; +} +static DEVICE_ATTR(flush, S_IWUSR, NULL, store_flush); static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, @@ -328,6 +345,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, + &dev_attr_flush.attr, NULL }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 0bc2aef8f9f..2da22927d8d 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -137,6 +137,13 @@ static ssize_t show_hold_timer(struct net_bridge_port *p, } static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); +static ssize_t store_flush(struct net_bridge_port *p, unsigned long v) +{ + br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry + return 0; +} +static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); + static struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, @@ -152,6 +159,7 @@ static struct brport_attribute *brport_attrs[] = { &brport_attr_message_age_timer, &brport_attr_forward_delay_timer, &brport_attr_hold_timer, + &brport_attr_flush, NULL }; diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index 9c599800a90..1a46952a56d 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -35,40 +35,36 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in return EBT_NOMATCH; if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP)) { - __be32 _addr, *ap; + __be32 saddr, daddr, *sap, *dap; - /* IPv4 addresses are always 4 bytes */ - if (ah->ar_pln != sizeof(__be32)) + if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP)) + return EBT_NOMATCH; + sap = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln, sizeof(saddr), + &saddr); + if (sap == NULL) + return EBT_NOMATCH; + dap = skb_header_pointer(skb, sizeof(struct arphdr) + + 2*ah->ar_hln+sizeof(saddr), + sizeof(daddr), &daddr); + if (dap 
== NULL) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_SRC_IP && + FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP)) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_DST_IP && + FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP)) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_GRAT && + FWINV(*dap != *sap, EBT_ARP_GRAT)) return EBT_NOMATCH; - if (info->bitmask & EBT_ARP_SRC_IP) { - ap = skb_header_pointer(skb, sizeof(struct arphdr) + - ah->ar_hln, sizeof(_addr), - &_addr); - if (ap == NULL) - return EBT_NOMATCH; - if (FWINV(info->saddr != (*ap & info->smsk), - EBT_ARP_SRC_IP)) - return EBT_NOMATCH; - } - - if (info->bitmask & EBT_ARP_DST_IP) { - ap = skb_header_pointer(skb, sizeof(struct arphdr) + - 2*ah->ar_hln+sizeof(__be32), - sizeof(_addr), &_addr); - if (ap == NULL) - return EBT_NOMATCH; - if (FWINV(info->daddr != (*ap & info->dmsk), - EBT_ARP_DST_IP)) - return EBT_NOMATCH; - } } if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { unsigned char _mac[ETH_ALEN], *mp; uint8_t verdict, i; - /* MAC addresses are 6 bytes */ - if (ah->ar_hln != ETH_ALEN) + if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER)) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_SRC_MAC) { mp = skb_header_pointer(skb, sizeof(struct arphdr), diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 45712aec6a0..031bfa4a51f 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -196,14 +196,10 @@ static int __init ebt_log_init(void) ret = ebt_register_watcher(&log); if (ret < 0) return ret; - if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) { - printk(KERN_WARNING "ebt_log: not logging via system console " - "since somebody else already registered for PF_INET\n"); - /* we cannot make module load fail here, since otherwise - * ebtables userspace would abort */ - } - - return 0; + ret = nf_log_register(PF_BRIDGE, &ebt_log_logger); + if (ret < 0 && ret != -EEXIST) + ebt_unregister_watcher(&log); + return ret; } static void __exit ebt_log_fini(void) diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 8e15cc47f6c..9411db62591 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -130,6 +130,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ulog_buffers[group]; spinlock_t *lock = &ub->lock; + ktime_t kt; if ((uloginfo->cprange == 0) || (uloginfo->cprange > skb->len + ETH_HLEN)) @@ -164,9 +165,10 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, /* Fill in the ulog data */ pm->version = EBT_ULOG_VERSION; - do_gettimeofday(&pm->stamp); + kt = ktime_get_real(); + pm->stamp = ktime_to_timeval(kt); if (ub->qlen == 1) - skb_set_timestamp(ub->skb, &pm->stamp); + ub->skb->tstamp = kt; pm->data_len = copy_len; pm->mark = skb->mark; pm->hook = hooknr; @@ -295,14 +297,12 @@ static int __init ebt_ulog_init(void) /* initialize ulog_buffers */ for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - init_timer(&ulog_buffers[i].timer); - ulog_buffers[i].timer.function = ulog_timer; - ulog_buffers[i].timer.data = i; + setup_timer(&ulog_buffers[i].timer, ulog_timer, i); spin_lock_init(&ulog_buffers[i].lock); } ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS, - NULL, THIS_MODULE); + NULL, NULL, THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff --git a/net/compat.c b/net/compat.c index 
1f32866d09b..9a0f5f2b90c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -34,11 +34,11 @@ static inline int iov_from_user_compat_to_kern(struct iovec *kiov, { int tot_len = 0; - while(niov > 0) { + while (niov > 0) { compat_uptr_t buf; compat_size_t len; - if(get_user(len, &uiov32->iov_len) || + if (get_user(len, &uiov32->iov_len) || get_user(buf, &uiov32->iov_base)) { tot_len = -EFAULT; break; @@ -78,12 +78,12 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, { int tot_len; - if(kern_msg->msg_namelen) { - if(mode==VERIFY_READ) { + if (kern_msg->msg_namelen) { + if (mode==VERIFY_READ) { int err = move_addr_to_kernel(kern_msg->msg_name, kern_msg->msg_namelen, kern_address); - if(err < 0) + if (err < 0) return err; } kern_msg->msg_name = kern_address; @@ -93,7 +93,7 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, tot_len = iov_from_user_compat_to_kern(kern_iov, (struct compat_iovec __user *)kern_msg->msg_iov, kern_msg->msg_iovlen); - if(tot_len >= 0) + if (tot_len >= 0) kern_msg->msg_iov = kern_iov; return tot_len; @@ -146,8 +146,8 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, kcmlen = 0; kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); - while(ucmsg != NULL) { - if(get_user(ucmlen, &ucmsg->cmsg_len)) + while (ucmsg != NULL) { + if (get_user(ucmlen, &ucmsg->cmsg_len)) return -EFAULT; /* Catch bogons. */ @@ -160,7 +160,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, kcmlen += tmp; ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } - if(kcmlen == 0) + if (kcmlen == 0) return -EINVAL; /* The kcmlen holds the 64-bit version of the control length. @@ -176,7 +176,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, /* Now copy them over neatly. */ memset(kcmsg, 0, kcmlen); ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); - while(ucmsg != NULL) { + while (ucmsg != NULL) { if (__get_user(ucmlen, &ucmsg->cmsg_len)) goto Efault; if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) @@ -215,11 +215,12 @@ Efault: int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data) { struct compat_timeval ctv; + struct compat_timespec cts; struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; struct compat_cmsghdr cmhdr; int cmlen; - if(cm == NULL || kmsg->msg_controllen < sizeof(*cm)) { + if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) { kmsg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. 
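(POSIX seems to treat control-message truncation as a success case: recvmsg() just sets MSG_CTRUNC, so returning 0 here looks right.)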
*/ } @@ -229,11 +230,18 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat ctv.tv_sec = tv->tv_sec; ctv.tv_usec = tv->tv_usec; data = &ctv; - len = sizeof(struct compat_timeval); + len = sizeof(ctv); + } + if (level == SOL_SOCKET && type == SO_TIMESTAMPNS) { + struct timespec *ts = (struct timespec *)data; + cts.tv_sec = ts->tv_sec; + cts.tv_nsec = ts->tv_nsec; + data = &cts; + len = sizeof(cts); } cmlen = CMSG_COMPAT_LEN(len); - if(kmsg->msg_controllen < cmlen) { + if (kmsg->msg_controllen < cmlen) { kmsg->msg_flags |= MSG_CTRUNC; cmlen = kmsg->msg_controllen; } @@ -241,9 +249,9 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat cmhdr.cmsg_type = type; cmhdr.cmsg_len = cmlen; - if(copy_to_user(cm, &cmhdr, sizeof cmhdr)) + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) return -EFAULT; - if(copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr))) + if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr))) return -EFAULT; cmlen = CMSG_COMPAT_SPACE(len); kmsg->msg_control += cmlen; @@ -545,20 +553,49 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct compat_timeval __user *ctv = (struct compat_timeval __user*) userstamp; int err = -ENOENT; + struct timeval tv; if (!sock_flag(sk, SOCK_TIMESTAMP)) sock_enable_timestamp(sk); - if (sk->sk_stamp.tv_sec == -1) + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) return err; - if (sk->sk_stamp.tv_sec == 0) - do_gettimeofday(&sk->sk_stamp); - if (put_user(sk->sk_stamp.tv_sec, &ctv->tv_sec) || - put_user(sk->sk_stamp.tv_usec, &ctv->tv_usec)) + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + err = 0; + if (put_user(tv.tv_sec, &ctv->tv_sec) || + put_user(tv.tv_usec, &ctv->tv_usec)) err = -EFAULT; return err; } EXPORT_SYMBOL(compat_sock_get_timestamp); +int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct compat_timespec __user *ctv = + (struct compat_timespec __user*) userstamp; + int err = -ENOENT; + struct timespec ts; + + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return err; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + err = 0; + if (put_user(ts.tv_sec, &ctv->tv_sec) || + put_user(ts.tv_nsec, &ctv->tv_nsec)) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(compat_sock_get_timestampns); + asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { @@ -617,7 +654,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) a0 = a[0]; a1 = a[1]; - switch(call) { + switch (call) { case SYS_SOCKET: ret = sys_socket(a0, a1, a[2]); break; diff --git a/net/core/Makefile b/net/core/Makefile index 73272d506e9..4751613e1b5 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -13,7 +13,6 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ obj-$(CONFIG_XFRM) += flow.o obj-$(CONFIG_SYSFS) += net-sysfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o -obj-$(CONFIG_WIRELESS_EXT) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 186212b5b7d..cb056f47612 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -411,11 +411,11 @@ fault: return -EFAULT; } -__sum16 
__skb_checksum_complete(struct sk_buff *skb) +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; - sum = csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); @@ -423,6 +423,12 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } return sum; } +EXPORT_SYMBOL(__skb_checksum_complete_head); + +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + return __skb_checksum_complete_head(skb, skb->len); +} EXPORT_SYMBOL(__skb_checksum_complete); /** diff --git a/net/core/dev.c b/net/core/dev.c index 4dc93cc4d5b..f27d4ab181e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -109,7 +109,7 @@ #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> -#include <linux/wireless.h> +#include <net/wext.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> @@ -146,8 +146,8 @@ */ static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[16]; /* 16 way hashed list */ -static struct list_head ptype_all; /* Taps */ +static struct list_head ptype_base[16] __read_mostly; /* 16 way hashed list */ +static struct list_head ptype_all __read_mostly; /* Taps */ #ifdef CONFIG_NET_DMA static struct dma_client *net_dma_client; @@ -156,13 +156,13 @@ static spinlock_t net_dma_event_lock; #endif /* - * The @dev_base list is protected by @dev_base_lock and the rtnl + * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * * Pure readers hold dev_base_lock for reading. * * Writers must hold the rtnl semaphore while they loop through the - * dev_base list, and hold dev_base_lock for writing when they do the + * dev_base_head list, and hold dev_base_lock for writing when they do the * actual updates. This allows pure readers to access the list even * while a writer is preparing to update it. * @@ -174,11 +174,10 @@ static spinlock_t net_dma_event_lock; * unregister_netdevice(), which must be called with the rtnl * semaphore held. */ -struct net_device *dev_base; -static struct net_device **dev_tail = &dev_base; +LIST_HEAD(dev_base_head); DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base); +EXPORT_SYMBOL(dev_base_head); EXPORT_SYMBOL(dev_base_lock); #define NETDEV_HASHBITS 8 @@ -226,12 +225,6 @@ extern void netdev_unregister_sysfs(struct net_device *); *******************************************************************************/ /* - * For efficiency - */ - -static int netdev_nit; - -/* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. 
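For orientation, the dev_base rework visible throughout this diff replaces the open-coded singly linked list (dev_base, dev_tail, dev->next) with a standard list_head. Only the call sites appear in the patch, so the helper bodies below are a sketch inferred from how for_each_netdev(), for_each_netdev_safe(), first_net_device() and next_net_device() are used here, not text copied from the kernel:

/* Assumed shape of the new iteration helpers (sketch only; relies on
 * <linux/list.h>, and on struct net_device gaining a dev_list member,
 * as the list_add_tail() in register_netdevice() below suggests). */
extern struct list_head dev_base_head;	/* replaces struct net_device *dev_base */

#define for_each_netdev(d) \
	list_for_each_entry(d, &dev_base_head, dev_list)

#define for_each_netdev_safe(d, n) \
	list_for_each_entry_safe(d, n, &dev_base_head, dev_list)

static inline struct net_device *first_net_device(void)
{
	return list_empty(&dev_base_head) ? NULL :
		list_entry(dev_base_head.next, struct net_device, dev_list);
}

static inline struct net_device *next_net_device(struct net_device *dev)
{
	struct list_head *next = dev->dev_list.next;

	return next == &dev_base_head ? NULL :
		list_entry(next, struct net_device, dev_list);
}

The locking rules stay as the comment above describes: pure readers take dev_base_lock, writers hold the rtnl semaphore and take dev_base_lock for the actual list update.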
@@ -265,10 +258,9 @@ void dev_add_pack(struct packet_type *pt) int hash; spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit++; + if (pt->type == htons(ETH_P_ALL)) list_add_rcu(&pt->list, &ptype_all); - } else { + else { hash = ntohs(pt->type) & 15; list_add_rcu(&pt->list, &ptype_base[hash]); } @@ -295,10 +287,9 @@ void __dev_remove_pack(struct packet_type *pt) spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit--; + if (pt->type == htons(ETH_P_ALL)) head = &ptype_all; - } else + else head = &ptype_base[ntohs(pt->type) & 15]; list_for_each_entry(pt1, head, list) { @@ -575,26 +566,38 @@ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) ASSERT_RTNL(); - for (dev = dev_base; dev; dev = dev->next) + for_each_netdev(dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) - break; - return dev; + return dev; + + return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr); +struct net_device *__dev_getfirstbyhwtype(unsigned short type) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(dev) + if (dev->type == type) + return dev; + + return NULL; +} + +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + struct net_device *dev_getfirstbyhwtype(unsigned short type) { struct net_device *dev; rtnl_lock(); - for (dev = dev_base; dev; dev = dev->next) { - if (dev->type == type) { - dev_hold(dev); - break; - } - } + dev = __dev_getfirstbyhwtype(type); + if (dev) + dev_hold(dev); rtnl_unlock(); return dev; } @@ -614,17 +617,19 @@ EXPORT_SYMBOL(dev_getfirstbyhwtype); struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) { - struct net_device *dev; + struct net_device *dev, *ret; + ret = NULL; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); + ret = dev; break; } } read_unlock(&dev_base_lock); - return dev; + return ret; } /** @@ -690,7 +695,7 @@ int dev_alloc_name(struct net_device *dev, const char *name) if (!inuse) return -ENOMEM; - for (d = dev_base; d; d = d->next) { + for_each_netdev(d) { if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) @@ -817,7 +822,6 @@ static int default_rebuild_header(struct sk_buff *skb) return 1; } - /** * dev_open - prepare an interface for use. * @dev: device to open @@ -973,7 +977,7 @@ int register_netdevice_notifier(struct notifier_block *nb) rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (!err) { - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { nb->notifier_call(nb, NETDEV_REGISTER, dev); if (dev->flags & IFF_UP) @@ -1031,23 +1035,12 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed); } -void __net_timestamp(struct sk_buff *skb) -{ - struct timeval tv; - - do_gettimeofday(&tv); - skb_set_timestamp(skb, &tv); -} -EXPORT_SYMBOL(__net_timestamp); - static inline void net_timestamp(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) __net_timestamp(skb); - else { - skb->tstamp.off_sec = 0; - skb->tstamp.off_usec = 0; - } + else + skb->tstamp.tv64 = 0; } /* @@ -1077,18 +1070,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) set by sender, so that the second statement is just protection against buggy protocols. 
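(The rewritten hunk below performs the same fixups via skb_reset_mac_header()
and skb_reset_network_header(), which set the header offsets relative to
skb->data instead of poking the raw pointers directly.)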
*/ - skb2->mac.raw = skb2->data; + skb_reset_mac_header(skb2); - if (skb2->nh.raw < skb2->data || - skb2->nh.raw > skb2->tail) { + if (skb_network_header(skb2) < skb2->data || + skb2->network_header > skb2->tail) { if (net_ratelimit()) printk(KERN_CRIT "protocol %04x is " "buggy, dev %s\n", skb2->protocol, dev->name); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); } - skb2->h.raw = skb2->nh.raw; + skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; ptype->func(skb2, skb->dev, ptype, skb->dev); } @@ -1167,7 +1160,7 @@ EXPORT_SYMBOL(netif_device_attach); int skb_checksum_help(struct sk_buff *skb) { __wsum csum; - int ret = 0, offset = skb->h.raw - skb->data; + int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; @@ -1183,15 +1176,16 @@ int skb_checksum_help(struct sk_buff *skb) goto out; } + offset = skb->csum_start - skb_headroom(skb); BUG_ON(offset > (int)skb->len); csum = skb_checksum(skb, offset, skb->len-offset, 0); - offset = skb->tail - skb->h.raw; + offset = skb_headlen(skb) - offset; BUG_ON(offset <= 0); BUG_ON(skb->csum_offset + 2 > offset); - *(__sum16*)(skb->h.raw + skb->csum_offset) = csum_fold(csum); - + *(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) = + csum_fold(csum); out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: @@ -1217,11 +1211,11 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) BUG_ON(skb_shinfo(skb)->frag_list); - skb->mac.raw = skb->data; - skb->mac_len = skb->nh.raw - skb->data; + skb_reset_mac_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) { if (skb_header_cloned(skb) && (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) return ERR_PTR(err); @@ -1235,7 +1229,8 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) segs = ERR_PTR(err); if (err || skb_gso_ok(skb, features)) break; - __skb_push(skb, skb->data - skb->nh.raw); + __skb_push(skb, (skb->data - + skb_network_header(skb))); } segs = ptype->gso_segment(skb, features); break; @@ -1243,7 +1238,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) } rcu_read_unlock(); - __skb_push(skb, skb->data - skb->mac.raw); + __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } @@ -1340,7 +1335,7 @@ static int dev_gso_segment(struct sk_buff *skb) int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { if (likely(!skb->next)) { - if (netdev_nit) + if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); if (netif_needs_gso(dev, skb)) { @@ -1442,12 +1437,16 @@ int dev_queue_xmit(struct sk_buff *skb) /* If packet is not checksummed and device does not support * checksumming for this protocol, complete checksumming here. 
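 * (CHECKSUM_PARTIAL packets carry csum_start/csum_offset describing where
 * the device would have checksummed; when the device offers neither
 * NETIF_F_GEN_CSUM nor a usable protocol offload, skb_checksum_help()
 * folds the sum in software, as the rewritten hunk below shows.)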
*/ - if (skb->ip_summed == CHECKSUM_PARTIAL && - (!(dev->features & NETIF_F_GEN_CSUM) && - (!(dev->features & NETIF_F_IP_CSUM) || - skb->protocol != htons(ETH_P_IP)))) - if (skb_checksum_help(skb)) - goto out_kfree_skb; + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, skb->csum_start - + skb_headroom(skb)); + + if (!(dev->features & NETIF_F_GEN_CSUM) && + (!(dev->features & NETIF_F_IP_CSUM) || + skb->protocol != htons(ETH_P_IP))) + if (skb_checksum_help(skb)) + goto out_kfree_skb; + } gso: spin_lock_prefetch(&dev->queue_lock); @@ -1543,9 +1542,9 @@ out: Receiver routines =======================================================================*/ -int netdev_max_backlog = 1000; -int netdev_budget = 300; -int weight_p = 64; /* old backlog weight */ +int netdev_max_backlog __read_mostly = 1000; +int netdev_budget __read_mostly = 300; +int weight_p __read_mostly = 64; /* old backlog weight */ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; @@ -1577,7 +1576,7 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.off_sec) + if (!skb->tstamp.tv64) net_timestamp(skb); /* @@ -1684,40 +1683,46 @@ static void net_tx_action(struct softirq_action *h) } } -static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { atomic_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) -int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +/* These hooks defined here for ATM */ struct net_bridge; struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, unsigned char *addr); -void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; -static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) +/* + * If bridge module is loaded call bridging hook. + * returns NULL if packet was consumed. 
+ */ +struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, + struct sk_buff *skb) __read_mostly; +static inline struct sk_buff *handle_bridge(struct sk_buff *skb, + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { struct net_bridge_port *port; - if ((*pskb)->pkt_type == PACKET_LOOPBACK || - (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) - return 0; + if (skb->pkt_type == PACKET_LOOPBACK || + (port = rcu_dereference(skb->dev->br_port)) == NULL) + return skb; if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev, orig_dev); + *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return br_handle_frame_hook(port, pskb); + return br_handle_frame_hook(port, skb); } #else -#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) +#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) #endif #ifdef CONFIG_NET_CLS_ACT @@ -1747,10 +1752,10 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); - spin_lock(&dev->queue_lock); + spin_lock(&dev->ingress_lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); - spin_unlock(&dev->queue_lock); + spin_unlock(&dev->ingress_lock); } @@ -1769,7 +1774,7 @@ int netif_receive_skb(struct sk_buff *skb) if (skb->dev->poll && netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->tstamp.off_sec) + if (!skb->tstamp.tv64) net_timestamp(skb); if (!skb->iif) @@ -1782,8 +1787,9 @@ int netif_receive_skb(struct sk_buff *skb) __get_cpu_var(netdev_rx_stat).total++; - skb->h.raw = skb->nh.raw = skb->data; - skb->mac_len = skb->nh.raw - skb->mac.raw; + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; pt_prev = NULL; @@ -1823,7 +1829,8 @@ int netif_receive_skb(struct sk_buff *skb) ncls: #endif - if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) + skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); + if (!skb) goto out; type = skb->protocol; @@ -2044,7 +2051,7 @@ static int dev_ifconf(char __user *arg) */ total = 0; - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { for (i = 0; i < NPROTO; i++) { if (gifconf_list[i]) { int done; @@ -2076,26 +2083,28 @@ static int dev_ifconf(char __user *arg) * This is invoked by the /proc filesystem handler to display a device * in detail. */ -static __inline__ struct net_device *dev_get_idx(loff_t pos) +void *dev_seq_start(struct seq_file *seq, loff_t *pos) { + loff_t off; struct net_device *dev; - loff_t i; - for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); + read_lock(&dev_base_lock); + if (!*pos) + return SEQ_START_TOKEN; - return i == pos ? dev : NULL; -} + off = 1; + for_each_netdev(dev) + if (off++ == *pos) + return dev; -void *dev_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock(&dev_base_lock); - return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; + return NULL; } void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; + return v == SEQ_START_TOKEN ? 
+ first_net_device() : next_net_device((struct net_device *)v); } void dev_seq_stop(struct seq_file *seq, void *v) @@ -2105,28 +2114,25 @@ void dev_seq_stop(struct seq_file *seq, void *v) static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { - if (dev->get_stats) { - struct net_device_stats *stats = dev->get_stats(dev); - - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); - } else - seq_printf(seq, "%6s: No statistics available.\n", dev->name); + struct net_device_stats *stats = dev->get_stats(dev); + + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); } /* @@ -2185,7 +2191,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations dev_seq_ops = { +static const struct seq_operations dev_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, @@ -2205,7 +2211,7 @@ static const struct file_operations dev_seq_fops = { .release = seq_release, }; -static struct seq_operations softnet_seq_ops = { +static const struct seq_operations softnet_seq_ops = { .start = softnet_seq_start, .next = softnet_seq_next, .stop = softnet_seq_stop, @@ -2225,12 +2231,135 @@ static const struct file_operations softnet_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_WIRELESS_EXT -extern int wireless_proc_init(void); -#else -#define wireless_proc_init() 0 +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < 16; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & 15; + + while (nxt == &ptype_base[hash]) { + if (++hash >= 16) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) +{ + rcu_read_unlock(); +} + +static void ptype_seq_decode(struct seq_file *seq, void *sym) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char namebuf[128]; + + symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset, + &modname, namebuf); + + if (symname) { + char *delim = ":"; + + if (!modname) + modname = delim = ""; + seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim, + symname, offset); + return; + } #endif + seq_printf(seq, "[%p]", sym); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s ", + pt->dev ? pt->dev->name : ""); + ptype_seq_decode(seq, pt->func); + seq_putc(seq, '\n'); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ptype_seq_ops); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + static int __init dev_proc_init(void) { int rc = -ENOMEM; @@ -2239,13 +2368,18 @@ static int __init dev_proc_init(void) goto out; if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) goto out_dev; - if (wireless_proc_init()) + if (!proc_net_fops_create("ptype", S_IRUGO, &ptype_seq_fops)) + goto out_dev2; + + if (wext_proc_init()) goto out_softnet; rc = 0; out: return rc; out_softnet: proc_net_remove("softnet_stat"); +out_dev2: + proc_net_remove("ptype"); out_dev: proc_net_remove("dev"); goto out; @@ -2795,29 +2929,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) ret = -EFAULT; return ret; } -#ifdef CONFIG_WIRELESS_EXT /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - /* If command is `set a parameter', or - * `get the encoding parameters', check if - * the user has the right to do it */ - if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE - || cmd == SIOCGIWENCODEEXT) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - } - dev_load(ifr.ifr_name); - rtnl_lock(); - /* Follow me in net/core/wireless.c */ - ret = wireless_process_ioctl(&ifr, cmd); - rtnl_unlock(); - if (IW_IS_GET(cmd) && - copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } -#endif /* CONFIG_WIRELESS_EXT */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(&ifr, cmd, arg); return -EINVAL; } } @@ -2847,7 +2961,7 @@ static int dev_boot_phase = 1; static 
DEFINE_SPINLOCK(net_todo_list_lock); static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); -static inline void net_set_todo(struct net_device *dev) +static void net_set_todo(struct net_device *dev) { spin_lock(&net_todo_list_lock); list_add_tail(&dev->todo_list, &net_todo_list); @@ -2888,9 +3002,7 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1; -#ifdef CONFIG_NET_CLS_ACT spin_lock_init(&dev->ingress_lock); -#endif dev->iflink = -1; @@ -2974,11 +3086,9 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); - dev->next = NULL; dev_init_scheduler(dev); write_lock_bh(&dev_base_lock); - *dev_tail = dev; - dev_tail = &dev->next; + list_add_tail(&dev->dev_list, &dev_base_head); hlist_add_head(&dev->name_hlist, head); hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); dev_hold(dev); @@ -3002,7 +3112,7 @@ out: * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * - * This is a wrapper around register_netdev that takes the rtnl semaphore + * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ @@ -3157,6 +3267,11 @@ out: mutex_unlock(&net_todo_run_mutex); } +static struct net_device_stats *internal_stats(struct net_device *dev) +{ + return &dev->stats; +} + /** * alloc_netdev - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -3192,6 +3307,7 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, if (sizeof_priv) dev->priv = netdev_priv(dev); + dev->get_stats = internal_stats; setup(dev); strcpy(dev->name, name); return dev; @@ -3246,8 +3362,6 @@ void synchronize_net(void) void unregister_netdevice(struct net_device *dev) { - struct net_device *d, **dp; - BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -3267,19 +3381,11 @@ void unregister_netdevice(struct net_device *dev) dev_close(dev); /* And unlink it from device chain. 
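 * (with the dev_base_head list conversion this is a plain list_del() plus the name/index hlist_del()s, all under dev_base_lock)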
*/ - for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { - if (d == dev) { - write_lock_bh(&dev_base_lock); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); - if (dev_tail == &dev->next) - dev_tail = dp; - *dp = d->next; - write_unlock_bh(&dev_base_lock); - break; - } - } - BUG_ON(!d); + write_lock_bh(&dev_base_lock); + list_del(&dev->dev_list); + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERING; diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 56b310c0c86..5a54053386c 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -223,7 +223,7 @@ static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) loff_t off = 0; read_lock(&dev_base_lock); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { if (off++ == *pos) return dev; } @@ -232,9 +232,8 @@ static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = v; ++*pos; - return dev->next; + return next_net_device((struct net_device *)v); } static void dev_mc_seq_stop(struct seq_file *seq, void *v) @@ -264,7 +263,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations dev_mc_seq_ops = { +static const struct seq_operations dev_mc_seq_ops = { .start = dev_mc_seq_start, .next = dev_mc_seq_next, .stop = dev_mc_seq_stop, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6168edd137d..8d5e5a09b57 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -836,7 +836,7 @@ int dev_ethtool(struct ifreq *ifr) return -EPERM; } - if(dev->ethtool_ops->begin) + if (dev->ethtool_ops->begin) if ((rc = dev->ethtool_ops->begin(dev)) < 0) return rc; @@ -952,7 +952,7 @@ int dev_ethtool(struct ifreq *ifr) rc = -EOPNOTSUPP; } - if(dev->ethtool_ops->complete) + if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (old_features != dev->features) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7174ced75ef..8c5474e1668 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -44,6 +44,12 @@ static void rules_ops_put(struct fib_rules_ops *ops) module_put(ops->owner); } +static void flush_route_cache(struct fib_rules_ops *ops) +{ + if (ops->flush_cache) + ops->flush_cache(); +} + int fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; @@ -132,10 +138,25 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, rcu_read_lock(); list_for_each_entry_rcu(rule, ops->rules_list, list) { +jumped: if (!fib_rule_match(rule, ops, fl, flags)) continue; - err = ops->action(rule, fl, flags, arg); + if (rule->action == FR_ACT_GOTO) { + struct fib_rule *target; + + target = rcu_dereference(rule->ctarget); + if (target == NULL) { + continue; + } else { + rule = target; + goto jumped; + } + } else if (rule->action == FR_ACT_NOP) + continue; + else + err = ops->action(rule, fl, flags, arg); + if (err != -EAGAIN) { fib_rule_get(rule); arg->rule = rule; @@ -174,13 +195,13 @@ errout: return err; } -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r, *last = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; + int err = -EINVAL, unresolved = 0; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) goto errout; @@ 
-237,6 +258,28 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (!rule->pref && ops->default_pref) rule->pref = ops->default_pref(); + err = -EINVAL; + if (tb[FRA_GOTO]) { + if (rule->action != FR_ACT_GOTO) + goto errout_free; + + rule->target = nla_get_u32(tb[FRA_GOTO]); + /* Backward jumps are prohibited to avoid endless loops */ + if (rule->target <= rule->pref) + goto errout_free; + + list_for_each_entry(r, ops->rules_list, list) { + if (r->pref == rule->target) { + rule->ctarget = r; + break; + } + } + + if (rule->ctarget == NULL) + unresolved = 1; + } else if (rule->action == FR_ACT_GOTO) + goto errout_free; + err = ops->configure(rule, skb, nlh, frh, tb); if (err < 0) goto errout_free; @@ -249,12 +292,35 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) fib_rule_get(rule); + if (ops->unresolved_rules) { + /* + * There are unresolved goto rules in the list, check if + * any of them are pointing to this new rule. + */ + list_for_each_entry(r, ops->rules_list, list) { + if (r->action == FR_ACT_GOTO && + r->target == rule->pref) { + BUG_ON(r->ctarget != NULL); + rcu_assign_pointer(r->ctarget, rule); + if (--ops->unresolved_rules == 0) + break; + } + } + } + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules++; + + if (unresolved) + ops->unresolved_rules++; + if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, ops->rules_list); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + flush_route_cache(ops); rules_ops_put(ops); return 0; @@ -265,11 +331,11 @@ errout: return err; } -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; - struct fib_rule *rule; + struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; @@ -322,10 +388,30 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } list_del_rcu(&rule->list); + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules--; + + /* + * Check if this rule is the target of any goto rule; if so, + * invalidate those references. As this operation is potentially + * very expensive, it is only performed if goto rules have + * actually been added.
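 + * E.g. deleting the pref 10 target of a 'goto 10' rule leaves + * that goto rule unresolved; it is re-resolved if a new rule + * with pref 10 shows up later (see fib_nl_newrule() above).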
+ */ + if (ops->nr_goto_rules > 0) { + list_for_each_entry(tmp, ops->rules_list, list) { + if (tmp->ctarget == rule) { + rcu_assign_pointer(tmp->ctarget, NULL); + ops->unresolved_rules++; + } + } + } + synchronize_rcu(); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); fib_rule_put(rule); + flush_route_cache(ops); rules_ops_put(ops); return 0; } @@ -371,9 +457,16 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; - if (rule->ifname[0]) + if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) + frh->flags |= FIB_RULE_UNRESOLVED; + + if (rule->ifname[0]) { NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname); + if (rule->ifindex == -1) + frh->flags |= FIB_RULE_DEV_DETACHED; + } + if (rule->pref) NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); @@ -383,6 +476,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->mark_mask || rule->mark) NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask); + if (rule->target) + NLA_PUT_U32(skb, FRA_GOTO, rule->target); + if (ops->fill(rule, skb, nlh, frh) < 0) goto nla_put_failure; @@ -393,19 +489,14 @@ nla_put_failure: return -EMSGSIZE; } -int fib_rules_dump(struct sk_buff *skb, struct netlink_callback *cb, int family) +static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; - struct fib_rules_ops *ops; - - ops = lookup_rules_ops(family); - if (ops == NULL) - return -EAFNOSUPPORT; - rcu_read_lock(); - list_for_each_entry_rcu(rule, ops->rules_list, list) { - if (idx < cb->args[0]) + list_for_each_entry(rule, ops->rules_list, list) { + if (idx < cb->args[1]) goto skip; if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid, @@ -415,14 +506,44 @@ int fib_rules_dump(struct sk_buff *skb, struct netlink_callback *cb, int family) skip: idx++; } - rcu_read_unlock(); - cb->args[0] = idx; + cb->args[1] = idx; rules_ops_put(ops); return skb->len; } -EXPORT_SYMBOL_GPL(fib_rules_dump); +static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct fib_rules_ops *ops; + int idx = 0, family; + + family = rtnl_msg_family(cb->nlh); + if (family != AF_UNSPEC) { + /* Protocol specific dump request */ + ops = lookup_rules_ops(family); + if (ops == NULL) + return -EAFNOSUPPORT; + + return dump_rules(skb, cb, ops); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &rules_ops, list) { + if (idx < cb->args[0] || !try_module_get(ops->owner)) + goto skip; + + if (dump_rules(skb, cb, ops) < 0) + break; + + cb->args[1] = 0; + skip: + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -501,6 +622,10 @@ static struct notifier_block fib_rules_notifier = { static int __init fib_rules_init(void) { + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule); + return register_netdevice_notifier(&fib_rules_notifier); } diff --git a/net/core/filter.c b/net/core/filter.c index 8d185a089c5..bd903aaf7aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -42,11 +42,11 @@ static void *__load_pointer(struct sk_buff *skb, int k) u8 *ptr = NULL; if (k >= SKF_NET_OFF) - ptr = skb->nh.raw + k - SKF_NET_OFF; + ptr = skb_network_header(skb) + k - SKF_NET_OFF; else if (k >= SKF_LL_OFF) - ptr = 
skb->mac.raw + k - SKF_LL_OFF; + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr < skb->tail) + if (ptr >= skb->head && ptr < skb_tail_pointer(skb)) return ptr; return NULL; } diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 259473d0559..bcc25591d8a 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -61,7 +61,7 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, spin_lock_bh(lock); d->lock = lock; if (type) - d->tail = (struct rtattr *) skb->tail; + d->tail = (struct rtattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; @@ -212,7 +212,7 @@ int gnet_stats_finish_copy(struct gnet_dump *d) { if (d->tail) - d->tail->rta_len = d->skb->tail - (u8 *) d->tail; + d->tail->rta_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8b45c9d3b24..e3c26a9ccad 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -79,7 +79,7 @@ static void rfc2863_policy(struct net_device *dev) case IF_LINK_MODE_DEFAULT: default: break; - }; + } dev->operstate = operstate; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 841e3f32cab..6f3bb73053c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1125,7 +1125,7 @@ int neigh_compat_output(struct sk_buff *skb) { struct net_device *dev = skb->dev; - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); if (dev->hard_header && dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, @@ -1147,7 +1147,7 @@ int neigh_resolve_output(struct sk_buff *skb) if (!dst || !(neigh = dst->neighbour)) goto discard; - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); if (!neigh_event_send(neigh, skb)) { int err; @@ -1190,7 +1190,7 @@ int neigh_connected_output(struct sk_buff *skb) struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); read_lock_bh(&neigh->lock); err = dev->hard_header(skb, dev, ntohs(skb->protocol), @@ -1441,7 +1441,7 @@ int neigh_table_clear(struct neigh_table *tbl) return 0; } -int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct ndmsg *ndm; struct nlattr *dst_attr; @@ -1506,7 +1506,7 @@ out: return err; } -int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1786,7 +1786,7 @@ static struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] __read_mostly = { [NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; -int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct neigh_table *tbl; struct ndtmsg *ndtmsg; @@ -1910,7 +1910,7 @@ errout: return err; } -int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; @@ -2034,7 +2034,7 @@ out: return rc; } -int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) 
{ struct neigh_table *tbl; int t, family, s_t; @@ -2393,7 +2393,7 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations neigh_stat_seq_ops = { +static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, @@ -2746,14 +2746,26 @@ void neigh_sysctl_unregister(struct neigh_parms *p) #endif /* CONFIG_SYSCTL */ +static int __init neigh_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info); + + rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL); + + return 0; +} + +subsys_initcall(neigh_init); + EXPORT_SYMBOL(__neigh_event_send); EXPORT_SYMBOL(neigh_changeaddr); EXPORT_SYMBOL(neigh_compat_output); EXPORT_SYMBOL(neigh_connected_output); EXPORT_SYMBOL(neigh_create); -EXPORT_SYMBOL(neigh_delete); EXPORT_SYMBOL(neigh_destroy); -EXPORT_SYMBOL(neigh_dump_info); EXPORT_SYMBOL(neigh_event_ns); EXPORT_SYMBOL(neigh_ifdown); EXPORT_SYMBOL(neigh_lookup); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4cbb1290a6a..b21307b15b8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -352,8 +352,8 @@ static ssize_t wireless_show(struct device *d, char *buf, read_lock(&dev_base_lock); if (dev_isalive(dev)) { - if(dev->wireless_handlers && - dev->wireless_handlers->get_wireless_stats) + if (dev->wireless_handlers && + dev->wireless_handlers->get_wireless_stats) iw = dev->wireless_handlers->get_wireless_stats(dev); if (iw != NULL) ret = (*format)(iw, buf); @@ -412,20 +412,25 @@ static int netdev_uevent(struct device *d, char **envp, int num_envp, char *buf, int size) { struct net_device *dev = to_net_dev(d); - int i = 0; - int n; + int retval, len = 0, i = 0; /* pass interface to uevent. */ - envp[i++] = buf; - n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; - buf += n; - size -= n; - - if ((size <= 0) || (i >= num_envp)) - return -ENOMEM; - + retval = add_uevent_var(envp, num_envp, &i, + buf, size, &len, + "INTERFACE=%s", dev->name); + if (retval) + goto exit; + + /* pass ifindex to uevent. + * ifindex is useful as it won't change (interface name may change) + * and is what RtNetlink uses natively. 
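 + * e.g. a rename of eth0 to lan0 leaves IFINDEX unchanged, so + * userspace can still correlate the uevent with the device.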
*/ + retval = add_uevent_var(envp, num_envp, &i, + buf, size, &len, + "IFINDEX=%d", dev->ifindex); + +exit: envp[i] = NULL; - return 0; + return retval; } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4581ece48bb..b316435b0e2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -86,7 +86,7 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, { __wsum psum; - if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) + if (uh->check == 0 || skb_csum_unnecessary(skb)) return 0; psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); @@ -293,10 +293,12 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) if (!skb) return; - memcpy(skb->data, msg, len); + skb_copy_to_linear_data(skb, msg, len); skb->len += len; - skb->h.uh = udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); + skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); @@ -308,7 +310,9 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) if (udph->check == 0) udph->check = CSUM_MANGLED_0; - skb->nh.iph = iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ put_unaligned(0x45, (unsigned char *)iph); @@ -324,7 +328,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); memcpy(eth->h_source, np->local_mac, 6); memcpy(eth->h_dest, np->remote_mac, 6); @@ -359,8 +363,9 @@ static void arp_reply(struct sk_buff *skb) (2 * sizeof(u32))))) return; - skb->h.raw = skb->nh.raw = skb->data; - arp = skb->nh.arph; + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + arp = arp_hdr(skb); if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || @@ -389,7 +394,7 @@ static void arp_reply(struct sk_buff *skb) if (!send_skb) return; - send_skb->nh.raw = send_skb->data; + skb_reset_network_header(send_skb); arp = (struct arphdr *) skb_put(send_skb, size); send_skb->dev = skb->dev; send_skb->protocol = htons(ETH_P_ARP); @@ -443,7 +448,7 @@ int __netpoll_rx(struct sk_buff *skb) goto out; /* check if netpoll clients need ARP */ - if (skb->protocol == __constant_htons(ETH_P_ARP) && + if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { skb_queue_tail(&npi->arp_tx, skb); return 1; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4b01496dc33..b92a322872a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -164,14 +164,11 @@ #define VERSION "pktgen v2.68: Packet Generator for packet performance testing.\n" -/* #define PG_DEBUG(a) a */ -#define PG_DEBUG(a) - /* The buckets are exponential in 'width' */ #define LAT_BUCKETS_MAX 32 #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ -#define MPLS_STACK_BOTTOM __constant_htonl(0x00000100) +#define MPLS_STACK_BOTTOM htonl(0x00000100) /* Device flag bits */ #define F_IPSRC_RND (1<<0) /* IP-Src Random */ @@ -214,15 +211,11 @@ struct flow_state { }; struct pktgen_dev { - /* * Try to keep frequent/infrequent used vars. separated. 
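 * (hot, per-packet fields are kept up front; the rarely touched * 512-byte result buffer moves to the tail of the struct below)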
*/ - - char ifname[IFNAMSIZ]; - char result[512]; - - struct pktgen_thread *pg_thread; /* the owner */ + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* Used for chaining in the thread's run-queue */ int running; /* if this changes to false, the test will stop */ @@ -349,6 +342,8 @@ struct pktgen_dev { unsigned cflows; /* Concurrent flows (config) */ unsigned lflow; /* Flow length (config) */ unsigned nflows; /* accumulated flows (stats) */ + + char result[512]; }; struct pktgen_hdr { @@ -468,17 +463,6 @@ static inline __u64 pg_div64(__u64 n, __u64 base) return tmp; } -static inline u32 pktgen_random(void) -{ -#if 0 - __u32 n; - get_random_bytes(&n, 4); - return n; -#else - return net_random(); -#endif -} - static inline __u64 getCurMs(void) { struct timeval tv; @@ -512,7 +496,7 @@ static void pktgen_stop_all_threads_ifs(void); static int pktgen_stop_device(struct pktgen_dev *pkt_dev); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); -static int pktgen_mark_device(const char *ifname); + static unsigned int scan_ip6(const char *s, char ip[16]); static unsigned int fmt_ip6(char *s, const char ip[16]); @@ -606,7 +590,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " frags: %d delay: %u clone_skb: %d ifname: %s\n", pkt_dev->nfrags, 1000 * pkt_dev->delay_us + pkt_dev->delay_ns, - pkt_dev->clone_skb, pkt_dev->ifname); + pkt_dev->clone_skb, pkt_dev->odev->name); seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); @@ -661,7 +645,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->nr_labels) { unsigned i; seq_printf(seq, " mpls: "); - for(i = 0; i < pkt_dev->nr_labels; i++) + for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), i == pkt_dev->nr_labels-1 ? "\n" : ", "); } @@ -766,7 +750,7 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, __u32 int i = 0; *num = 0; - for(; i < maxlen; i++) { + for (; i < maxlen; i++) { char c; *num <<= 4; if (get_user(c, &user_buffer[i])) @@ -802,7 +786,7 @@ static int count_trail_chars(const char __user * user_buffer, break; default: goto done; - }; + } } done: return i; @@ -845,7 +829,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) break; default: break; - }; + } } done_str: return i; @@ -874,7 +858,7 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) n++; if (n >= MAX_MPLS_LABELS) return -E2BIG; - } while(c == ','); + } while (c == ','); pkt_dev->nr_labels = n; return i; @@ -1503,7 +1487,7 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) { return len; } i += len; offset = sprintf(pg_result, "OK: mpls="); - for(n = 0; n < pkt_dev->nr_labels; n++) + for (n = 0; n < pkt_dev->nr_labels; n++) offset += sprintf(pg_result + offset, "%08x%s", ntohl(pkt_dev->labels[n]), n == pkt_dev->nr_labels-1 ? 
"" : ","); @@ -1697,13 +1681,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) if (pkt_dev->running) - seq_printf(seq, "%s ", pkt_dev->ifname); + seq_printf(seq, "%s ", pkt_dev->odev->name); seq_printf(seq, "\nStopped: "); list_for_each_entry(pkt_dev, &t->if_list, list) if (!pkt_dev->running) - seq_printf(seq, "%s ", pkt_dev->ifname); + seq_printf(seq, "%s ", pkt_dev->odev->name); if (t->result[0]) seq_printf(seq, "\nResult: %s\n", t->result); @@ -1849,16 +1833,14 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) /* * mark a device for removal */ -static int pktgen_mark_device(const char *ifname) +static void pktgen_mark_device(const char *ifname) { struct pktgen_dev *pkt_dev = NULL; const int max_tries = 10, msec_per_try = 125; int i = 0; - int ret = 0; mutex_lock(&pktgen_thread_lock); - PG_DEBUG(printk("pktgen: pktgen_mark_device marking %s for removal\n", - ifname)); + pr_debug("pktgen: pktgen_mark_device marking %s for removal\n", ifname); while (1) { @@ -1867,8 +1849,8 @@ static int pktgen_mark_device(const char *ifname) break; /* success */ mutex_unlock(&pktgen_thread_lock); - PG_DEBUG(printk("pktgen: pktgen_mark_device waiting for %s " - "to disappear....\n", ifname)); + pr_debug("pktgen: pktgen_mark_device waiting for %s " + "to disappear....\n", ifname); schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); mutex_lock(&pktgen_thread_lock); @@ -1876,79 +1858,91 @@ static int pktgen_mark_device(const char *ifname) printk("pktgen_mark_device: timed out after waiting " "%d msec for device %s to be removed\n", msec_per_try * i, ifname); - ret = 1; break; } } mutex_unlock(&pktgen_thread_lock); +} - return ret; +static void pktgen_change_name(struct net_device *dev) +{ + struct pktgen_thread *t; + + list_for_each_entry(t, &pktgen_threads, th_list) { + struct pktgen_dev *pkt_dev; + + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (pkt_dev->odev != dev) + continue; + + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + + pkt_dev->entry = create_proc_entry(dev->name, 0600, + pg_proc_dir); + if (!pkt_dev->entry) + printk(KERN_ERR "pktgen: can't move proc " + " entry for '%s'\n", dev->name); + break; + } + } } static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)(ptr); + struct net_device *dev = ptr; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { - case NETDEV_CHANGEADDR: - case NETDEV_GOING_DOWN: - case NETDEV_DOWN: - case NETDEV_UP: - /* Ignore for now */ + case NETDEV_CHANGENAME: + pktgen_change_name(dev); break; case NETDEV_UNREGISTER: pktgen_mark_device(dev->name); break; - }; + } return NOTIFY_DONE; } /* Associate pktgen_dev with a device. 
*/ -static struct net_device *pktgen_setup_dev(struct pktgen_dev *pkt_dev) +static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) { struct net_device *odev; + int err; /* Clean old setups */ - if (pkt_dev->odev) { dev_put(pkt_dev->odev); pkt_dev->odev = NULL; } - odev = dev_get_by_name(pkt_dev->ifname); - + odev = dev_get_by_name(ifname); if (!odev) { - printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); - goto out; + printk("pktgen: no such netdevice: \"%s\"\n", ifname); + return -ENODEV; } + if (odev->type != ARPHRD_ETHER) { - printk("pktgen: not an ethernet device: \"%s\"\n", - pkt_dev->ifname); - goto out_put; - } - if (!netif_running(odev)) { - printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname); - goto out_put; + printk("pktgen: not an ethernet device: \"%s\"\n", ifname); + err = -EINVAL; + } else if (!netif_running(odev)) { + printk("pktgen: device is down: \"%s\"\n", ifname); + err = -ENETDOWN; + } else { + pkt_dev->odev = odev; + return 0; } - pkt_dev->odev = odev; - return pkt_dev->odev; - -out_put: dev_put(odev); -out: - return NULL; - + return err; } /* Read pkt_dev from the interface and set up internal pktgen_dev @@ -1956,10 +1950,6 @@ out: */ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) { - /* Try once more, just in case it works now. */ - if (!pkt_dev->odev) - pktgen_setup_dev(pkt_dev); - if (!pkt_dev->odev) { printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n"); sprintf(pkt_dev->result, @@ -2096,7 +2086,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) int flow = 0; if (pkt_dev->cflows) { - flow = pktgen_random() % pkt_dev->cflows; + flow = random32() % pkt_dev->cflows; if (pkt_dev->flows[flow].count > pkt_dev->lflow) pkt_dev->flows[flow].count = 0; @@ -2108,7 +2098,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = pktgen_random() % (pkt_dev->src_mac_count); + mc = random32() % pkt_dev->src_mac_count; else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset > @@ -2134,7 +2124,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = pktgen_random() % (pkt_dev->dst_mac_count); + mc = random32() % pkt_dev->dst_mac_count; else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2158,27 +2148,26 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned i; - for(i = 0; i < pkt_dev->nr_labels; i++) + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)pktgen_random() & + ((__force __be32)random32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = pktgen_random() % 4096; + pkt_dev->vlan_id = random32() & (4096-1); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = pktgen_random() % 4096; + pkt_dev->svlan_id = random32() & (4096 - 1); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = - ((pktgen_random() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + - pkt_dev->udp_src_min); + pkt_dev->cur_udp_src = random32() % + (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; else { pkt_dev->cur_udp_src++; @@ -2189,10 +2178,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & 
F_UDPDST_RND) { - pkt_dev->cur_udp_dst = - ((pktgen_random() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + - pkt_dev->udp_dst_min); + pkt_dev->cur_udp_dst = random32() % + (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2207,7 +2195,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) saddr_max))) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = ((pktgen_random() % (imx - imn)) + imn); + t = random32() % (imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2228,14 +2216,13 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __be32 s; if (pkt_dev->flags & F_IPDST_RND) { - t = pktgen_random() % (imx - imn) + imn; + t = random32() % (imx - imn) + imn; s = htonl(t); while (LOOPBACK(s) || MULTICAST(s) || BADCLASS(s) || ZERONET(s) || LOCAL_MCAST(s)) { - t = (pktgen_random() % - (imx - imn)) + imn; + t = random32() % (imx - imn) + imn; s = htonl(t); } pkt_dev->cur_daddr = s; @@ -2267,7 +2254,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)pktgen_random() | + (((__force __be32)random32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2277,9 +2264,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = ((pktgen_random() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)) - + pkt_dev->min_pkt_size); + t = random32() % + (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2294,7 +2281,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned i; - for(i = 0; i < pkt_dev->nr_labels; i++) { + for (i = 0; i < pkt_dev->nr_labels; i++) { *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; } mpls--; @@ -2316,7 +2303,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, int datalen, iplen; struct iphdr *iph; struct pktgen_hdr *pgh = NULL; - __be16 protocol = __constant_htons(ETH_P_IP); + __be16 protocol = htons(ETH_P_IP); __be32 *mpls; __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ @@ -2325,10 +2312,10 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, if (pkt_dev->nr_labels) - protocol = __constant_htons(ETH_P_MPLS_UC); + protocol = htons(ETH_P_MPLS_UC); if (pkt_dev->vlan_id != 0xffff) - protocol = __constant_htons(ETH_P_8021Q); + protocol = htons(ETH_P_8021Q); /* Update any of the values, used when we're incrementing various * fields. 
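 * (mod_cur_headers() rotates the MAC/IP addresses, UDP ports and * packet size here, drawing on random32() for fields whose _RND * flag is set)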
@@ -2354,24 +2341,28 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { - if(pkt_dev->svlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); - *svlan_encapsulated_proto = __constant_htons(ETH_P_8021Q); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); } vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); - *vlan_encapsulated_proto = __constant_htons(ETH_P_IP); + *vlan_encapsulated_proto = htons(ETH_P_IP); } - iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); - udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct iphdr); + skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + + iph = ip_hdr(skb); + udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); *(__be16 *) & eth[12] = protocol; @@ -2400,12 +2391,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->check = 0; iph->check = ip_fast_csum((void *)iph, iph->ihl); skb->protocol = protocol; - skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->nr_labels * sizeof(u32) - + VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); skb->dev = odev; skb->pkt_type = PACKET_HOST; - skb->nh.iph = iph; - skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); @@ -2654,7 +2644,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, int datalen; struct ipv6hdr *iph; struct pktgen_hdr *pgh = NULL; - __be16 protocol = __constant_htons(ETH_P_IPV6); + __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ @@ -2662,10 +2652,10 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ if (pkt_dev->nr_labels) - protocol = __constant_htons(ETH_P_MPLS_UC); + protocol = htons(ETH_P_MPLS_UC); if (pkt_dev->vlan_id != 0xffff) - protocol = __constant_htons(ETH_P_8021Q); + protocol = htons(ETH_P_8021Q); /* Update any of the values, used when we're incrementing various * fields. 
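 * (same per-packet header rotation as in fill_packet_ipv4())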
@@ -2690,24 +2680,28 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { - if(pkt_dev->svlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); - *svlan_encapsulated_proto = __constant_htons(ETH_P_8021Q); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); } vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); - *vlan_encapsulated_proto = __constant_htons(ETH_P_IPV6); + *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); - udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + + iph = ipv6_hdr(skb); + udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); *(__be16 *) & eth[12] = protocol; @@ -2729,7 +2723,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, udph->len = htons(datalen + sizeof(struct udphdr)); udph->check = 0; /* No checksum */ - *(__be32 *) iph = __constant_htonl(0x60000000); /* Version + flow */ + *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ if (pkt_dev->traffic_class) { /* Version + traffic class + flow (0) */ @@ -2744,13 +2738,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); - skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->nr_labels * sizeof(u32) - + VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; - skb->nh.ipv6h = iph; - skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); @@ -2848,7 +2841,7 @@ static void pktgen_run(struct pktgen_thread *t) struct pktgen_dev *pkt_dev; int started = 0; - PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t)); + pr_debug("pktgen: entering pktgen_run. 
%p\n", t); if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) { @@ -2880,7 +2873,7 @@ static void pktgen_stop_all_threads_ifs(void) { struct pktgen_thread *t; - PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads_ifs.\n")); + pr_debug("pktgen: entering pktgen_stop_all_threads_ifs.\n"); mutex_lock(&pktgen_thread_lock); @@ -2948,7 +2941,7 @@ static void pktgen_run_all_threads(void) { struct pktgen_thread *t; - PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n")); + pr_debug("pktgen: entering pktgen_run_all_threads.\n"); mutex_lock(&pktgen_thread_lock); @@ -3006,7 +2999,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) if (!pkt_dev->running) { printk("pktgen: interface: %s is already stopped\n", - pkt_dev->ifname); + pkt_dev->odev->name); return -EINVAL; } @@ -3040,7 +3033,7 @@ static void pktgen_stop(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev; - PG_DEBUG(printk("pktgen: entering pktgen_stop\n")); + pr_debug("pktgen: entering pktgen_stop\n"); if_lock(t); @@ -3064,7 +3057,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) struct list_head *q, *n; struct pktgen_dev *cur; - PG_DEBUG(printk("pktgen: entering pktgen_rem_one_if\n")); + pr_debug("pktgen: entering pktgen_rem_one_if\n"); if_lock(t); @@ -3093,7 +3086,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - PG_DEBUG(printk("pktgen: entering pktgen_rem_all_ifs\n")); + pr_debug("pktgen: entering pktgen_rem_all_ifs\n"); if_lock(t); list_for_each_safe(q, n, &t->if_list) { @@ -3276,7 +3269,7 @@ static int pktgen_thread_worker(void *arg) t->pid = current->pid; - PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid)); + pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid); max_before_softirq = t->max_before_softirq; @@ -3339,13 +3332,13 @@ static int pktgen_thread_worker(void *arg) set_current_state(TASK_INTERRUPTIBLE); } - PG_DEBUG(printk("pktgen: %s stopping all device\n", t->tsk->comm)); + pr_debug("pktgen: %s stopping all device\n", t->tsk->comm); pktgen_stop(t); - PG_DEBUG(printk("pktgen: %s removing all device\n", t->tsk->comm)); + pr_debug("pktgen: %s removing all device\n", t->tsk->comm); pktgen_rem_all_ifs(t); - PG_DEBUG(printk("pktgen: %s removing thread.\n", t->tsk->comm)); + pr_debug("pktgen: %s removing thread.\n", t->tsk->comm); pktgen_rem_thread(t); return 0; @@ -3358,13 +3351,13 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, if_lock(t); list_for_each_entry(p, &t->if_list, list) - if (strncmp(p->ifname, ifname, IFNAMSIZ) == 0) { + if (strncmp(p->odev->name, ifname, IFNAMSIZ) == 0) { pkt_dev = p; break; } if_unlock(t); - PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev)); + pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3399,7 +3392,7 @@ out: static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *pkt_dev; - struct proc_dir_entry *pe; + int err; /* We don't allow a device to be on several threads */ @@ -3441,29 +3434,28 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; - strncpy(pkt_dev->ifname, ifname, IFNAMSIZ); + err = pktgen_setup_dev(pkt_dev, ifname); + if (err) + goto out1; - if (!pktgen_setup_dev(pkt_dev)) { - printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); - if (pkt_dev->flows) - vfree(pkt_dev->flows); - kfree(pkt_dev); - return -ENODEV; - } - - pe = create_proc_entry(ifname, 0600, 
pg_proc_dir); - if (!pe) { + pkt_dev->entry = create_proc_entry(ifname, 0600, pg_proc_dir); + if (!pkt_dev->entry) { printk("pktgen: cannot create %s/%s procfs entry.\n", PG_PROC_DIR, ifname); - if (pkt_dev->flows) - vfree(pkt_dev->flows); - kfree(pkt_dev); - return -EINVAL; + err = -EINVAL; + goto out2; } - pe->proc_fops = &pktgen_if_fops; - pe->data = pkt_dev; + pkt_dev->entry->proc_fops = &pktgen_if_fops; + pkt_dev->entry->data = pkt_dev; return add_dev_to_thread(t, pkt_dev); +out2: + dev_put(pkt_dev->odev); +out1: + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return err; } static int __init pktgen_create_thread(int cpu) @@ -3533,7 +3525,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) { - PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev)); + pr_debug("pktgen: remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n"); @@ -3551,9 +3543,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, _rem_dev_from_if_list(t, pkt_dev); - /* Clean up proc file system */ - - remove_proc_entry(pkt_dev->ifname, pg_proc_dir); + if (pkt_dev->entry) + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); if (pkt_dev->flows) vfree(pkt_dev->flows); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33ea8eac7fe..8c971a2efe2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -50,11 +50,13 @@ #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> -#include <net/netlink.h> -#ifdef CONFIG_NET_WIRELESS_RTNETLINK -#include <linux/wireless.h> -#include <net/iw_handler.h> -#endif /* CONFIG_NET_WIRELESS_RTNETLINK */ +#include <net/rtnetlink.h> + +struct rtnl_link +{ + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; +}; static DEFINE_MUTEX(rtnl_mutex); static struct sock *rtnl; @@ -95,7 +97,151 @@ int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) return 0; } -struct rtnetlink_link * rtnetlink_links[NPROTO]; +static struct rtnl_link *rtnl_msg_handlers[NPROTO]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL || tab[msgindex].doit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].doit : NULL; +} + +static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL || tab[msgindex].dumpit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].dumpit : NULL; +} + +/** + * __rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. 
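 + * + * Typical callers go through rtnl_register(); see fib_rules_init() + * and neigh_init() for the usual pattern.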
+ * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + struct rtnl_link *tab; + int msgindex; + + BUG_ON(protocol < 0 || protocol >= NPROTO); + msgindex = rtm_msgindex(msgtype); + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL) { + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); + if (tab == NULL) + return -ENOBUFS; + + rtnl_msg_handlers[protocol] = tab; + } + + if (doit) + tab[msgindex].doit = doit; + + if (dumpit) + tab[msgindex].dumpit = dumpit; + + return 0; +} + +EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register - Register a rtnetlink message type + * + * Identical to __rtnl_register() but panics on failure. This is useful + * as failure of this function is very unlikely; it can only happen due + * to lack of memory when allocating the chain to store all message + * handlers for a protocol. Meant for use in init functions where lack + * of memory implies no sense in continuing. + */ +void rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0) + panic("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", + protocol, msgtype); +} + +EXPORT_SYMBOL_GPL(rtnl_register); + +/** + * rtnl_unregister - Unregister a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * + * Returns 0 on success or a negative error code. + */ +int rtnl_unregister(int protocol, int msgtype) +{ + int msgindex; + + BUG_ON(protocol < 0 || protocol >= NPROTO); + msgindex = rtm_msgindex(msgtype); + + if (rtnl_msg_handlers[protocol] == NULL) + return -ENOENT; + + rtnl_msg_handlers[protocol][msgindex].doit = NULL; + rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + + return 0; +} + +EXPORT_SYMBOL_GPL(rtnl_unregister); + +/** + * rtnl_unregister_all - Unregister all rtnetlink message types of a protocol + * @protocol: Protocol family or PF_UNSPEC + * + * Identical to calling rtnl_unregister() for all registered message types + * of a certain protocol family.
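 + * Unlike rtnl_unregister(), this also frees the protocol's handler + * table itself.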
+ */ +void rtnl_unregister_all(int protocol) +{ + BUG_ON(protocol < 0 || protocol >= NPROTO); + + kfree(rtnl_msg_handlers[protocol]); + rtnl_msg_handlers[protocol] = NULL; +} + +EXPORT_SYMBOL_GPL(rtnl_unregister_all); static const int rtm_min[RTM_NR_FAMILIES] = { @@ -249,7 +395,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) operstate == IF_OPER_UNKNOWN) operstate = IF_OPER_DORMANT; break; - }; + } if (dev->operstate != operstate) { write_lock_bh(&dev_base_lock); @@ -393,16 +539,17 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) int s_idx = cb->args[0]; struct net_device *dev; - read_lock(&dev_base_lock); - for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < s_idx) - continue; + goto cont; if (rtnl_fill_ifinfo(skb, dev, NULL, 0, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) break; +cont: + idx++; } - read_unlock(&dev_base_lock); cb->args[0] = idx; return skb->len; @@ -536,17 +683,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) modified = 1; } -#ifdef CONFIG_NET_WIRELESS_RTNETLINK - if (tb[IFLA_WIRELESS]) { - /* Call Wireless Extensions. - * Various stuff checked in there... */ - err = wireless_rtnetlink_set(dev, nla_data(tb[IFLA_WIRELESS]), - nla_len(tb[IFLA_WIRELESS])); - if (err < 0) - goto errout_dev; - } -#endif /* CONFIG_NET_WIRELESS_RTNETLINK */ - if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); send_addr_notify = 1; @@ -610,22 +746,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } else return -EINVAL; - -#ifdef CONFIG_NET_WIRELESS_RTNETLINK - if (tb[IFLA_WIRELESS]) { - /* Call Wireless Extensions. We need to know the size before - * we can alloc. Various stuff checked in there... */ - err = wireless_rtnetlink_get(dev, nla_data(tb[IFLA_WIRELESS]), - nla_len(tb[IFLA_WIRELESS]), - &iw_buf, &iw_buf_len); - if (err < 0) - goto errout; - - /* Payload is at an offset in buffer */ - iw = iw_buf + IW_EV_POINT_OFF; - } -#endif /* CONFIG_NET_WIRELESS_RTNETLINK */ - nskb = nlmsg_new(if_nlmsg_size(iw_buf_len), GFP_KERNEL); if (nskb == NULL) { err = -ENOBUFS; @@ -659,12 +779,12 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) int type = cb->nlh->nlmsg_type-RTM_BASE; if (idx < s_idx || idx == PF_PACKET) continue; - if (rtnetlink_links[idx] == NULL || - rtnetlink_links[idx][type].dumpit == NULL) + if (rtnl_msg_handlers[idx] == NULL || + rtnl_msg_handlers[idx][type].dumpit == NULL) continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); - if (rtnetlink_links[idx][type].dumpit(skb, cb)) + if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) break; } cb->family = idx; @@ -700,30 +820,18 @@ static int rtattr_max; /* Process one rtnetlink message. 
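 * Errors are now returned straight to the netlink core instead of * being passed back through the old *errp argument.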
*/ -static __inline__ int -rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct rtnetlink_link *link; - struct rtnetlink_link *link_tab; + rtnl_doit_func doit; int sz_idx, kind; int min_len; int family; int type; int err; - /* Only requests are handled by kernel now */ - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < RTM_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ if (type > RTM_MAX) - goto err_inval; + return -EOPNOTSUPP; type -= RTM_BASE; @@ -732,45 +840,33 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) return 0; family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) { - *errp = -EAFNOSUPPORT; - return -1; - } - - link_tab = rtnetlink_links[family]; - if (link_tab == NULL) - link_tab = rtnetlink_links[PF_UNSPEC]; - link = &link_tab[type]; + if (family >= NPROTO) + return -EAFNOSUPPORT; sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) { - *errp = -EPERM; - return -1; - } + if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { - if (link->dumpit == NULL) - link = &(rtnetlink_links[PF_UNSPEC][type]); - - if (link->dumpit == NULL) - goto err_inval; + rtnl_dumpit_func dumpit; - if ((*errp = netlink_dump_start(rtnl, skb, nlh, - link->dumpit, NULL)) != 0) { - return -1; - } + dumpit = rtnl_get_dumpit(family, type); + if (dumpit == NULL) + return -EOPNOTSUPP; - netlink_queue_skip(nlh, skb); - return -1; + __rtnl_unlock(); + err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); + rtnl_lock(); + return err; } memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); min_len = rtm_min[sz_idx]; if (nlh->nlmsg_len < min_len) - goto err_inval; + return -EINVAL; if (nlh->nlmsg_len > min_len) { int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); @@ -780,25 +876,18 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) unsigned flavor = attr->rta_type; if (flavor) { if (flavor > rta_max[sz_idx]) - goto err_inval; + return -EINVAL; rta_buf[flavor-1] = attr; } attr = RTA_NEXT(attr, attrlen); } } - if (link->doit == NULL) - link = &(rtnetlink_links[PF_UNSPEC][type]); - if (link->doit == NULL) - goto err_inval; - err = link->doit(skb, nlh, (void *)&rta_buf[0]); + doit = rtnl_get_doit(family, type); + if (doit == NULL) + return -EOPNOTSUPP; - *errp = err; - return err; - -err_inval: - *errp = -EINVAL; - return -1; + return doit(skb, nlh, (void *)&rta_buf[0]); } static void rtnetlink_rcv(struct sock *sk, int len) @@ -814,25 +903,6 @@ static void rtnetlink_rcv(struct sock *sk, int len) } while (qlen); } -static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] = -{ - [RTM_GETLINK - RTM_BASE] = { .doit = rtnl_getlink, - .dumpit = rtnl_dump_ifinfo }, - [RTM_SETLINK - RTM_BASE] = { .doit = rtnl_setlink }, - [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnl_dump_all }, - [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnl_dump_all }, - [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add }, - [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete }, - [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info }, -#ifdef CONFIG_FIB_RULES - [RTM_NEWRULE - RTM_BASE] = { .doit = fib_nl_newrule }, - [RTM_DELRULE - RTM_BASE] = { .doit = fib_nl_delrule }, -#endif - [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnl_dump_all }, - [RTM_GETNEIGHTBL 
- RTM_BASE] = { .dumpit = neightbl_dump_info }, - [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set }, -}; - static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; @@ -874,19 +944,22 @@ void __init rtnetlink_init(void) panic("rtnetlink_init: cannot allocate rta_buf\n"); rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, - THIS_MODULE); + &rtnl_mutex, THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; - rtnetlink_links[PF_PACKET] = link_rtnetlink_table; + + rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); } EXPORT_SYMBOL(__rta_fill); EXPORT_SYMBOL(rtattr_strlcpy); EXPORT_SYMBOL(rtattr_parse); -EXPORT_SYMBOL(rtnetlink_links); EXPORT_SYMBOL(rtnetlink_put_metrics); EXPORT_SYMBOL(rtnl_lock); EXPORT_SYMBOL(rtnl_trylock); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 336958fbbcb..142257307fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> +#include <linux/scatterlist.h> #include <net/protocol.h> #include <net/dst.h> @@ -87,8 +88,9 @@ static struct kmem_cache *skbuff_fclone_cache __read_mostly; void skb_over_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " - "data:%p tail:%p end:%p dev:%s\n", - here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } @@ -105,8 +107,9 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) void skb_under_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " - "data:%p tail:%p end:%p dev:%s\n", - here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); BUG(); } @@ -155,20 +158,22 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, if (!skb) goto out; - /* Get the DATA. Size must match skb_add_mtu(). 
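
The allocation hunk just below, and the copy and expand hunks that follow, all trace back to one change: on 64-bit builds (NET_SKBUFF_DATA_USES_OFFSET) sk_buff's tail and end, and later the mac/network/transport header fields, are stored as offsets from skb->head rather than raw pointers, so they survive a move of the data area unchanged. A small user-space model of the two representations; the struct and helper names here are illustrative, not the kernel's:

#include <stdio.h>

#define USES_OFFSET 1	/* flip to 0 for the 32-bit pointer layout */

struct skb_model {
	unsigned char *head;
	unsigned char *data;
#if USES_OFFSET
	unsigned long tail, end;	/* offsets from head */
#else
	unsigned char *tail, *end;
#endif
};

static unsigned char *tail_pointer(const struct skb_model *skb)
{
#if USES_OFFSET
	return skb->head + skb->tail;
#else
	return skb->tail;
#endif
}

static void reset_tail_pointer(struct skb_model *skb)
{
#if USES_OFFSET
	skb->tail = skb->data - skb->head;
#else
	skb->tail = skb->data;
#endif
}

int main(void)
{
	unsigned char buf[256];
	struct skb_model skb = { .head = buf, .data = buf + 32 };

	reset_tail_pointer(&skb);
	skb.end = skb.tail + 128;	/* same shape as __alloc_skb below */
	printf("tail=%p (offset %lu)\n", (void *)tail_pointer(&skb),
	       (unsigned long)(tail_pointer(&skb) - skb.head));
	return 0;
}
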
*/ size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); if (!data) goto nodata; - memset(skb, 0, offsetof(struct sk_buff, truesize)); + /* + * See comment in sk_buff definition, just before the 'tail' member + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = size + sizeof(struct sk_buff); atomic_set(&skb->users, 1); skb->head = data; skb->data = data; - skb->tail = data; - skb->end = data + size; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -299,7 +304,7 @@ void kfree_skbmem(struct sk_buff *skb) if (atomic_dec_and_test(fclone_ref)) kmem_cache_free(skbuff_fclone_cache, other); break; - }; + } } /** @@ -321,15 +326,13 @@ void __kfree_skb(struct sk_buff *skb) WARN_ON(in_irq()); skb->destructor(skb); } -#ifdef CONFIG_NETFILTER - nf_conntrack_put(skb->nfct); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); nf_conntrack_put_reasm(skb->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif -#endif /* XXX: IS this still necessary? - JHS */ #ifdef CONFIG_NET_SCHED skb->tc_index = 0; @@ -396,9 +399,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->sk = NULL; C(tstamp); C(dev); - C(h); - C(nh); - C(mac); + C(transport_header); + C(network_header); + C(mac_header); C(dst); dst_clone(skb->dst); C(sp); @@ -422,19 +425,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) C(protocol); n->destructor = NULL; C(mark); -#ifdef CONFIG_NETFILTER - C(nfct); - nf_conntrack_get(skb->nfct); - C(nfctinfo); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - C(nfct_reasm); - nf_conntrack_get_reasm(skb->nfct_reasm); -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - C(nf_bridge); - nf_bridge_get(skb->nf_bridge); -#endif -#endif /*CONFIG_NETFILTER*/ + __nf_copy(n, skb); #ifdef CONFIG_NET_SCHED C(tc_index); #ifdef CONFIG_NET_CLS_ACT @@ -460,11 +451,12 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { +#ifndef NET_SKBUFF_DATA_USES_OFFSET /* * Shift between the two data areas in bytes */ unsigned long offset = new->data - old->data; - +#endif new->sk = NULL; new->dev = old->dev; new->priority = old->priority; @@ -473,9 +465,15 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_INET new->sp = secpath_get(old->sp); #endif - new->h.raw = old->h.raw + offset; - new->nh.raw = old->nh.raw + offset; - new->mac.raw = old->mac.raw + offset; + new->transport_header = old->transport_header; + new->network_header = old->network_header; + new->mac_header = old->mac_header; +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* {transport,network,mac}_header are relative to skb->head */ + new->transport_header += offset; + new->network_header += offset; + new->mac_header += offset; +#endif memcpy(new->cb, old->cb, sizeof(old->cb)); new->local_df = old->local_df; new->fclone = SKB_FCLONE_UNAVAILABLE; @@ -483,22 +481,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->tstamp = old->tstamp; new->destructor = NULL; new->mark = old->mark; -#ifdef CONFIG_NETFILTER - new->nfct = old->nfct; - nf_conntrack_get(old->nfct); - new->nfctinfo = old->nfctinfo; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - new->nfct_reasm 
= old->nfct_reasm; - nf_conntrack_get_reasm(old->nfct_reasm); -#endif + __nf_copy(new, old); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif -#ifdef CONFIG_BRIDGE_NETFILTER - new->nf_bridge = old->nf_bridge; - nf_bridge_get(old->nf_bridge); -#endif -#endif #ifdef CONFIG_NET_SCHED #ifdef CONFIG_NET_CLS_ACT new->tc_verd = old->tc_verd; @@ -535,8 +521,12 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) /* * Allocate the copy buffer */ - struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len, - gfp_mask); + struct sk_buff *n; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n = alloc_skb(skb->end + skb->data_len, gfp_mask); +#else + n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); +#endif if (!n) return NULL; @@ -573,8 +563,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) /* * Allocate the copy buffer */ - struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); - + struct sk_buff *n; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n = alloc_skb(skb->end, gfp_mask); +#else + n = alloc_skb(skb->end - skb->head, gfp_mask); +#endif if (!n) goto out; @@ -583,7 +577,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ - memcpy(n->data, skb->data, n->len); + skb_copy_from_linear_data(skb, n->data, n->len); n->csum = skb->csum; n->ip_summed = skb->ip_summed; @@ -632,7 +626,11 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, { int i; u8 *data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + int size = nhead + skb->end + ntail; +#else int size = nhead + (skb->end - skb->head) + ntail; +#endif long off; if (skb_shared(skb)) @@ -646,8 +644,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. 
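
Before the memcpy fix-ups below, the whole reallocation is easier to see in miniature. A hypothetical user-space model of the pskb_expand_head() logic in offset mode, where growing headroom by nhead simply shifts every offset by nhead; this is a sketch, not the kernel code (cloning, shared-info and frag handling are elided):

#include <stdlib.h>
#include <string.h>

struct model {
	unsigned char *head, *data;
	unsigned long tail;			/* offset from head */
	unsigned long mac, net, transport;	/* header offsets */
};

/* grow headroom by nhead bytes; `size` is the old data-area size */
static int expand_head(struct model *m, size_t nhead, size_t size)
{
	unsigned char *fresh = malloc(nhead + size);

	if (!fresh)
		return -1;
	memcpy(fresh + nhead, m->head, m->tail);	/* copy only real data */
	m->data = fresh + nhead + (m->data - m->head);
	free(m->head);
	m->head = fresh;
	/* offsets are relative to head, so each moves by exactly nhead */
	m->tail += nhead;
	m->mac += nhead;
	m->net += nhead;
	m->transport += nhead;
	return 0;
}

In pointer mode the same fields must instead be rebased by (fresh + nhead) - old_head, which is what the off variable in the hunk below computes.
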
*/ - memcpy(data + nhead, skb->head, skb->tail - skb->head); - memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + memcpy(data + nhead, skb->head, +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->tail); +#else + skb->tail - skb->head); +#endif + memcpy(data + size, skb_end_pointer(skb), + sizeof(struct skb_shared_info)); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); @@ -660,12 +664,18 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, off = (data + nhead) - skb->head; skb->head = data; - skb->end = data + size; skb->data += off; - skb->tail += off; - skb->mac.raw += off; - skb->h.raw += off; - skb->nh.raw += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + off = nhead; +#else + skb->end = skb->head + size; +#endif + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->tail += off; + skb->transport_header += off; + skb->network_header += off; + skb->mac_header += off; skb->cloned = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); @@ -726,7 +736,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, */ struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask); + int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + int off = 0; if (!n) return NULL; @@ -736,7 +748,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* Set the tail pointer and length */ skb_put(n, skb->len); - head_copy_len = skb_headroom(skb); + head_copy_len = oldheadroom; head_copy_off = 0; if (newheadroom <= head_copy_len) head_copy_len = newheadroom; @@ -750,6 +762,13 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); +#ifdef NET_SKBUFF_DATA_USES_OFFSET + off = newheadroom - oldheadroom; +#endif + n->transport_header += off; + n->network_header += off; + n->mac_header += off; + return n; } @@ -877,7 +896,7 @@ done: } else { skb->len = len; skb->data_len = 0; - skb->tail = skb->data + len; + skb_set_tail_pointer(skb, len); } return 0; @@ -922,7 +941,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) return NULL; } - if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) + if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) BUG(); /* Optimization: no fragments, no reasons to preestimate @@ -1018,7 +1037,7 @@ pull_pages: skb->tail += delta; skb->data_len -= delta; - return skb->tail; + return skb_tail_pointer(skb); } /* Copy some data bits from skb to kernel buffer. */ @@ -1035,7 +1054,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) if ((copy = start - offset) > 0) { if (copy > len) copy = len; - memcpy(to, skb->data + offset, copy); + skb_copy_from_linear_data_offset(skb, offset, to, copy); if ((len -= copy) == 0) return 0; offset += copy; @@ -1110,7 +1129,7 @@ fault: * traversing fragment lists and such. 
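
skb_copy_bits() above and skb_store_bits() below share one traversal shape: the first skb_headlen() bytes come from the linear area, the rest from page frags and then the frag_list chain, tracked by a running start cursor. A self-contained analogue over plain memory segments (illustrative, not the kernel API):

#include <stdio.h>
#include <string.h>

struct seg { const char *p; int len; };

static int copy_bits(struct seg *segs, int nsegs, int offset,
		     char *to, int len)
{
	int start = 0, i;

	for (i = 0; i < nsegs; i++) {
		int end = start + segs[i].len;
		int copy = end - offset;

		if (copy > 0) {
			if (copy > len)
				copy = len;
			memcpy(to, segs[i].p + (offset - start), copy);
			to += copy;
			offset += copy;
			if ((len -= copy) == 0)
				return 0;
		}
		start = end;
	}
	return -1;	/* ran past the data, like the kernel's -EFAULT */
}

int main(void)
{
	struct seg segs[] = { { "linear-", 7 }, { "frag0-", 6 }, { "frag1", 5 } };
	char out[19] = "";

	if (copy_bits(segs, 3, 2, out, 10) == 0)
		printf("%s\n", out);	/* prints "near-frag0" */
	return 0;
}
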
*/ -int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len) +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) { int i, copy; int start = skb_headlen(skb); @@ -1121,7 +1140,7 @@ int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len) if ((copy = start - offset) > 0) { if (copy > len) copy = len; - memcpy(skb->data + offset, from, copy); + skb_copy_to_linear_data_offset(skb, offset, from, copy); if ((len -= copy) == 0) return 0; offset += copy; @@ -1348,13 +1367,13 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) long csstart; if (skb->ip_summed == CHECKSUM_PARTIAL) - csstart = skb->h.raw - skb->data; + csstart = skb->csum_start - skb_headroom(skb); else csstart = skb_headlen(skb); BUG_ON(csstart > skb_headlen(skb)); - memcpy(to, skb->data, csstart); + skb_copy_from_linear_data(skb, to, csstart); csum = 0; if (csstart != skb->len) @@ -1522,27 +1541,14 @@ void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head spin_unlock_irqrestore(&list->lock, flags); } -#if 0 -/* - * Tune the memory allocator for a new MTU size. - */ -void skb_add_mtu(int mtu) -{ - /* Must match allocation in alloc_skb */ - mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); - - kmem_add_cache_size(mtu); -} -#endif - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) { int i; - memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len); - + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), + pos - len); /* And move data appendix as is. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; @@ -1553,7 +1559,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb, skb1->len += skb1->data_len; skb->data_len = 0; skb->len = len; - skb->tail = skb->data + len; + skb_set_tail_pointer(skb, len); } static inline void skb_split_no_header(struct sk_buff *skb, @@ -1878,7 +1884,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb->mac.raw; + unsigned int doffset = skb->data - skb_mac_header(skb); unsigned int offset = doffset; unsigned int headroom; unsigned int len; @@ -1928,11 +1934,12 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) nskb->mac_len = skb->mac_len; skb_reserve(nskb, headroom); - nskb->mac.raw = nskb->data; - nskb->nh.raw = nskb->data + skb->mac_len; - nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw); - memcpy(skb_put(nskb, doffset), skb->data, doffset); - + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb->mac_len); + nskb->transport_header = (nskb->network_header + + skb_network_header_len(skb)); + skb_copy_from_linear_data(skb, skb_put(nskb, doffset), + doffset); if (!sg) { nskb->csum = skb_copy_and_csum_bits(skb, offset, skb_put(nskb, len), @@ -1945,7 +1952,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum = skb->csum; - memcpy(skb_put(nskb, hsize), skb->data + offset, hsize); + skb_copy_from_linear_data_offset(skb, offset, + skb_put(nskb, hsize), hsize); while (pos < offset + len) { BUG_ON(i >= nfrags); @@ -2005,6 +2013,190 @@ void __init skb_init(void) NULL, NULL); } +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to 
be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. + */ +int +skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg[elt].page = virt_to_page(skb->data + offset); + sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; + sg[elt].length = copy; + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg[elt].page = frag->page; + sg[elt].offset = frag->page_offset+offset-start; + sg[elt].length = copy; + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += skb_to_sgvec(list, sg+elt, offset - start, copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + } + BUG_ON(len); + return elt; +} + +/** + * skb_cow_data - Check that a socket buffer's data buffers are writable + * @skb: The socket buffer to check. + * @tailbits: Amount of trailing space to be added + * @trailer: Returned pointer to the skb where the @tailbits space begins + * + * Make sure that the data buffers attached to a socket buffer are + * writable. If they are not, private copies are made of the data buffers + * and the socket buffer is set to use these instead. + * + * If @tailbits is given, make sure that there is space to write @tailbits + * bytes of data beyond current end of socket buffer. @trailer will be + * set to point to the skb in which this space begins. + * + * The number of scatterlist elements required to completely map the + * COW'd and extended socket buffer will be returned. + */ +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_shinfo(skb)->frag_list) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... 
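
skb_to_sgvec() and skb_cow_data() are exported together because IPsec-style transforms use them as a pair: make the packet writable (with room for a trailer), then hand the whole buffer to the crypto layer as a scatterlist. A hedged sketch of that call sequence; the caller, the trailer length and the fixed-size sg array are assumptions, not part of this patch:

#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/errno.h>

static int map_for_crypto(struct sk_buff *skb, int trailer_len)
{
	struct sk_buff *trailer;
	struct scatterlist sg[16];	/* assume the packet fits */
	int nsg;

	nsg = skb_cow_data(skb, trailer_len, &trailer);
	if (nsg < 0)
		return nsg;		/* -ENOMEM */
	if (nsg > 16)
		return -EMSGSIZE;

	skb_to_sgvec(skb, sg, 0, skb->len);
	/* pass sg/nsg to the crypto API here */
	return 0;
}
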
*/ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} + EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(kfree_skb); @@ -2039,3 +2231,6 @@ EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); EXPORT_SYMBOL(skb_append_datato_frags); + +EXPORT_SYMBOL_GPL(skb_to_sgvec); +EXPORT_SYMBOL_GPL(skb_cow_data); diff --git a/net/core/sock.c b/net/core/sock.c index 27c4f62382b..22183c2ef28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -154,7 +154,8 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , "sk_lock-27" , "sk_lock-28" , "sk_lock-29" , - "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_MAX" + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , + "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX" }; static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -167,7 +168,8 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , "slock-27" , "slock-28" , "slock-29" , - "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_MAX" + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , + "slock-AF_RXRPC" , "slock-AF_MAX" }; #endif @@ -361,8 +363,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } #endif - if(optlen<sizeof(int)) - return(-EINVAL); + if (optlen < sizeof(int)) + return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; @@ -371,265 +373,270 @@ int sock_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); - switch(optname) - { - case SO_DEBUG: - if(val && !capable(CAP_NET_ADMIN)) - { - ret = -EACCES; - } - else if (valbool) - sock_set_flag(sk, SOCK_DBG); - else - sock_reset_flag(sk, SOCK_DBG); - break; - case SO_REUSEADDR: - sk->sk_reuse = valbool; - break; - case SO_TYPE: - case SO_ERROR: - ret = -ENOPROTOOPT; - break; - case SO_DONTROUTE: - if (valbool) - sock_set_flag(sk, SOCK_LOCALROUTE); - else - sock_reset_flag(sk, SOCK_LOCALROUTE); - break; - case SO_BROADCAST: - sock_valbool_flag(sk, SOCK_BROADCAST, valbool); - break; - case SO_SNDBUF: - /* Don't error on this BSD doesn't and if you think - about it this is right. 
Otherwise apps have to - play 'guess the biggest size' games. RCVBUF/SNDBUF - are treated in BSD as hints */ - - if (val > sysctl_wmem_max) - val = sysctl_wmem_max; + switch(optname) { + case SO_DEBUG: + if (val && !capable(CAP_NET_ADMIN)) { + ret = -EACCES; + } + else if (valbool) + sock_set_flag(sk, SOCK_DBG); + else + sock_reset_flag(sk, SOCK_DBG); + break; + case SO_REUSEADDR: + sk->sk_reuse = valbool; + break; + case SO_TYPE: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + if (valbool) + sock_set_flag(sk, SOCK_LOCALROUTE); + else + sock_reset_flag(sk, SOCK_LOCALROUTE); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; set_sndbuf: - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - if ((val * 2) < SOCK_MIN_SNDBUF) - sk->sk_sndbuf = SOCK_MIN_SNDBUF; - else - sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + if ((val * 2) < SOCK_MIN_SNDBUF) + sk->sk_sndbuf = SOCK_MIN_SNDBUF; + else + sk->sk_sndbuf = val * 2; - /* - * Wake up sending tasks if we - * upped the value. - */ - sk->sk_write_space(sk); - break; + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->sk_write_space(sk); + break; - case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - goto set_sndbuf; + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; - case SO_RCVBUF: - /* Don't error on this BSD doesn't and if you think - about it this is right. Otherwise apps have to - play 'guess the biggest size' games. RCVBUF/SNDBUF - are treated in BSD as hints */ + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ - if (val > sysctl_rmem_max) - val = sysctl_rmem_max; + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; set_rcvbuf: - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - if ((val * 2) < SOCK_MIN_RCVBUF) - sk->sk_rcvbuf = SOCK_MIN_RCVBUF; - else - sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* + * We double it on the way in to account for + * "struct sk_buff" etc. overhead. Applications + * assume that the SO_RCVBUF setting they make will + * allow that much actual data to be received on that + * socket. + * + * Applications are unaware that "struct sk_buff" and + * other overheads allocate from the receive buffer + * during socket buffer allocation. + * + * And after considering the possible alternatives, + * returning the value we actually used in getsockopt + * is the most desirable behavior. 
+ */ + if ((val * 2) < SOCK_MIN_RCVBUF) + sk->sk_rcvbuf = SOCK_MIN_RCVBUF; + else + sk->sk_rcvbuf = val * 2; + break; + + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; break; + } + goto set_rcvbuf; - case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - goto set_rcvbuf; - - case SO_KEEPALIVE: + case SO_KEEPALIVE: #ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP) - tcp_set_keepalive(sk, valbool); + if (sk->sk_protocol == IPPROTO_TCP) + tcp_set_keepalive(sk, valbool); #endif - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - - case SO_OOBINLINE: - sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if (optlen < sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ break; - - case SO_NO_CHECK: - sk->sk_no_check = valbool; - break; - - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) - sk->sk_priority = val; - else - ret = -EPERM; + } + if (copy_from_user(&ling,optval,sizeof(ling))) { + ret = -EFAULT; break; - - case SO_LINGER: - if(optlen<sizeof(ling)) { - ret = -EINVAL; /* 1003.1g */ - break; - } - if (copy_from_user(&ling,optval,sizeof(ling))) { - ret = -EFAULT; - break; - } - if (!ling.l_onoff) - sock_reset_flag(sk, SOCK_LINGER); - else { + } + if (!ling.l_onoff) + sock_reset_flag(sk, SOCK_LINGER); + else { #if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; - else + if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else #endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; - sock_set_flag(sk, SOCK_LINGER); - } - break; - - case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); - break; - - case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); + sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + case SO_TIMESTAMPNS: + if (valbool) { + if (optname == SO_TIMESTAMP) + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); else - clear_bit(SOCK_PASSCRED, &sock->flags); - break; + sock_set_flag(sk, SOCK_RCVTSTAMPNS); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } + break; - case SO_TIMESTAMP: - if (valbool) { - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk); - } else - sock_reset_flag(sk, SOCK_RCVTSTAMP); - break; + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - sk->sk_rcvlowat = val ? 
: 1; - break; + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; - case SO_RCVTIMEO: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); - break; + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; - case SO_SNDTIMEO: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); +#ifdef CONFIG_NETDEVICES + case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; break; + } -#ifdef CONFIG_NETDEVICES - case SO_BINDTODEVICE: - { - char devname[IFNAMSIZ]; + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ - /* Sorry... */ - if (!capable(CAP_NET_RAW)) { - ret = -EPERM; + if (!valbool) { + sk->sk_bound_dev_if = 0; + } else { + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + memset(devname, 0, sizeof(devname)); + if (copy_from_user(devname, optval, optlen)) { + ret = -EFAULT; break; } - /* Bind this socket to a particular device like "eth0", - * as specified in the passed interface name. If the - * name is "" or the option length is zero the socket - * is not bound. - */ + /* Remove any cached route for this socket. */ + sk_dst_reset(sk); - if (!valbool) { + if (devname[0] == '\0') { sk->sk_bound_dev_if = 0; } else { - if (optlen > IFNAMSIZ - 1) - optlen = IFNAMSIZ - 1; - memset(devname, 0, sizeof(devname)); - if (copy_from_user(devname, optval, optlen)) { - ret = -EFAULT; + struct net_device *dev = dev_get_by_name(devname); + if (!dev) { + ret = -ENODEV; break; } - - /* Remove any cached route for this socket. */ - sk_dst_reset(sk); - - if (devname[0] == '\0') { - sk->sk_bound_dev_if = 0; - } else { - struct net_device *dev = dev_get_by_name(devname); - if (!dev) { - ret = -ENODEV; - break; - } - sk->sk_bound_dev_if = dev->ifindex; - dev_put(dev); - } + sk->sk_bound_dev_if = dev->ifindex; + dev_put(dev); } - break; } + break; + } #endif - case SO_ATTACH_FILTER: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; - - ret = sk_attach_filter(&fprog, sk); - } - break; - - case SO_DETACH_FILTER: - rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); - if (filter) { - rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_release(sk, filter); - rcu_read_unlock_bh(); + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) break; - } + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + rcu_read_lock_bh(); + filter = rcu_dereference(sk->sk_filter); + if (filter) { + rcu_assign_pointer(sk->sk_filter, NULL); + sk_filter_release(sk, filter); rcu_read_unlock_bh(); - ret = -ENONET; break; + } + rcu_read_unlock_bh(); + ret = -ENONET; + break; - case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); - break; + case SO_PASSSEC: + if (valbool) + set_bit(SOCK_PASSSEC, &sock->flags); + else + clear_bit(SOCK_PASSSEC, &sock->flags); + break; /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ - default: - ret = -ENOPROTOOPT; - break; + default: + ret = -ENOPROTOOPT; + break; } release_sock(sk); return ret; @@ -641,8 +648,7 @@ int sock_getsockopt(struct socket *sock, int level, 
int optname, { struct sock *sk = sock->sk; - union - { + union { int val; struct linger ling; struct timeval tm; @@ -651,148 +657,153 @@ int sock_getsockopt(struct socket *sock, int level, int optname, unsigned int lv = sizeof(int); int len; - if(get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; - if(len < 0) + if (len < 0) return -EINVAL; - switch(optname) - { - case SO_DEBUG: - v.val = sock_flag(sk, SOCK_DBG); - break; - - case SO_DONTROUTE: - v.val = sock_flag(sk, SOCK_LOCALROUTE); - break; - - case SO_BROADCAST: - v.val = !!sock_flag(sk, SOCK_BROADCAST); - break; - - case SO_SNDBUF: - v.val = sk->sk_sndbuf; - break; - - case SO_RCVBUF: - v.val = sk->sk_rcvbuf; - break; - - case SO_REUSEADDR: - v.val = sk->sk_reuse; - break; - - case SO_KEEPALIVE: - v.val = !!sock_flag(sk, SOCK_KEEPOPEN); - break; - - case SO_TYPE: - v.val = sk->sk_type; - break; - - case SO_ERROR: - v.val = -sock_error(sk); - if(v.val==0) - v.val = xchg(&sk->sk_err_soft, 0); - break; - - case SO_OOBINLINE: - v.val = !!sock_flag(sk, SOCK_URGINLINE); - break; - - case SO_NO_CHECK: - v.val = sk->sk_no_check; - break; - - case SO_PRIORITY: - v.val = sk->sk_priority; - break; - - case SO_LINGER: - lv = sizeof(v.ling); - v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; - break; - - case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); - break; - - case SO_TIMESTAMP: - v.val = sock_flag(sk, SOCK_RCVTSTAMP); - break; + switch(optname) { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = !!sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_KEEPALIVE: + v.val = !!sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if (v.val==0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = !!sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPNS: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_RCVTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; - case SO_RCVTIMEO: - lv=sizeof(struct timeval); - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; - } - break; + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; - case SO_SNDTIMEO: - 
lv=sizeof(struct timeval); - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; - } - break; + case SO_SNDLOWAT: + v.val=1; + break; - case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; - break; + case SO_PASSCRED: + v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; + break; - case SO_SNDLOWAT: - v.val=1; - break; + case SO_PEERCRED: + if (len > sizeof(sk->sk_peercred)) + len = sizeof(sk->sk_peercred); + if (copy_to_user(optval, &sk->sk_peercred, len)) + return -EFAULT; + goto lenout; - case SO_PASSCRED: - v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; - break; - - case SO_PEERCRED: - if (len > sizeof(sk->sk_peercred)) - len = sizeof(sk->sk_peercred); - if (copy_to_user(optval, &sk->sk_peercred, len)) - return -EFAULT; - goto lenout; - - case SO_PEERNAME: - { - char address[128]; - - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) - return -ENOTCONN; - if (lv < len) - return -EINVAL; - if (copy_to_user(optval, address, len)) - return -EFAULT; - goto lenout; - } + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } - /* Dubious BSD thing... Probably nobody even uses it, but - * the UNIX standard wants it for whatever reason... -DaveM - */ - case SO_ACCEPTCONN: - v.val = sk->sk_state == TCP_LISTEN; - break; + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; - case SO_PASSSEC: - v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0; - break; + case SO_PASSSEC: + v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 
1 : 0; + break; - case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + case SO_PEERSEC: + return security_socket_getpeersec_stream(sock, optval, optlen, len); - default: - return(-ENOPROTOOPT); + default: + return -ENOPROTOOPT; } + if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) @@ -904,6 +915,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; atomic_set(&newsk->sk_rmem_alloc, 0); atomic_set(&newsk->sk_wmem_alloc, 0); @@ -923,7 +935,6 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_send_head = NULL; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); @@ -970,6 +981,21 @@ out: EXPORT_SYMBOL_GPL(sk_clone); +void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_MASK; + if (sk_can_gso(sk)) { + if (dst->header_len) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + else + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + } +} +EXPORT_SYMBOL_GPL(sk_setup_caps); + void __init sk_init(void) { if (num_physpages <= 4096) { @@ -1220,13 +1246,13 @@ static void __lock_sock(struct sock *sk) { DEFINE_WAIT(wait); - for(;;) { + for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); - if(!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); @@ -1258,7 +1284,7 @@ static void __release_sock(struct sock *sk) } while (skb != NULL); bh_lock_sock(sk); - } while((skb = sk->sk_backlog.head) != NULL); + } while ((skb = sk->sk_backlog.head) != NULL); } /** @@ -1420,7 +1446,7 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -1482,8 +1508,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sock_set_flag(sk, SOCK_ZAPPED); - if(sock) - { + if (sock) { sk->sk_type = sock->type; sk->sk_sleep = &sock->wait; sock->sk = sk; @@ -1512,8 +1537,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp.tv_sec = -1L; - sk->sk_stamp.tv_usec = -1L; + sk->sk_stamp = ktime_set(-1L, -1L); atomic_set(&sk->sk_refcnt, 1); } @@ -1554,17 +1578,36 @@ EXPORT_SYMBOL(release_sock); int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) { + struct timeval tv; if (!sock_flag(sk, SOCK_TIMESTAMP)) sock_enable_timestamp(sk); - if (sk->sk_stamp.tv_sec == -1) + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) return -ENOENT; - if (sk->sk_stamp.tv_sec == 0) - do_gettimeofday(&sk->sk_stamp); - return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ? - -EFAULT : 0; + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + return copy_to_user(userstamp, &tv, sizeof(tv)) ? 
-EFAULT : 0; } EXPORT_SYMBOL(sock_get_timestamp); +int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct timespec ts; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return -ENOENT; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestampns); + void sock_enable_timestamp(struct sock *sk) { if (!sock_flag(sk, SOCK_TIMESTAMP)) { @@ -1899,7 +1942,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations proto_seq_ops = { +static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1e75b158546..b29712033dd 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -136,6 +136,14 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_CORE_WARNINGS, + .procname = "warnings", + .data = &net_msg_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/core/utils.c b/net/core/utils.c index 07236c17fab..adecfd281ae 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -30,8 +30,10 @@ #include <asm/system.h> #include <asm/uaccess.h> -int net_msg_cost = 5*HZ; -int net_msg_burst = 10; +int net_msg_cost __read_mostly = 5*HZ; +int net_msg_burst __read_mostly = 10; +int net_msg_warn __read_mostly = 1; +EXPORT_SYMBOL(net_msg_warn); /* * All net warning printk()s should be guarded by this function. diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index a086c6312d3..01030f34617 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -157,7 +157,7 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) if (av != NULL) { av->dccpav_buf_head = DCCP_MAX_ACKVEC_LEN - 1; - av->dccpav_buf_ackno = DCCP_MAX_SEQNO + 1; + av->dccpav_buf_ackno = UINT48_MAX + 1; av->dccpav_buf_nonce = av->dccpav_buf_nonce = 0; av->dccpav_time.tv_sec = 0; av->dccpav_time.tv_usec = 0; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 746f79d104b..d7d9ce73724 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -33,7 +33,6 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ - #include "../ccid.h" #include "../dccp.h" #include "lib/packet_history.h" @@ -52,6 +51,9 @@ static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; static struct dccp_li_hist *ccid3_li_hist; +/* + * Transmitter Half-Connection Routines + */ #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) { @@ -80,23 +82,37 @@ static void ccid3_hc_tx_set_state(struct sock *sk, } /* - * Recalculate scheduled nominal send time t_nom, inter-packet interval - * t_ipi, and delta value. Should be called after each change to X. + * Compute the initial sending rate X_init according to RFC 3390: + * w_init = min(4 * MSS, max(2 * MSS, 4380 bytes)) + * X_init = w_init / RTT + * For consistency with other parts of the code, X_init is scaled by 2^6. 
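
A worked instance of the formula, assuming MSS = 1460 bytes and an RTT of 100 ms:

#include <stdio.h>

int main(void)
{
	unsigned long long mss = 1460, rtt_us = 100000;
	unsigned long long two = 2 * mss, four = 4 * mss;
	unsigned long long w_init = two > 4380 ? two : 4380;

	if (w_init > four)
		w_init = four;
	/* plain bytes per second; the kernel additionally keeps X << 6 */
	printf("w_init = %llu bytes, X_init = %llu bytes/s\n",
	       w_init, (w_init * 1000000) / rtt_us);	/* 4380, 43800 */
	return 0;
}
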
*/ -static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) +static inline u64 rfc3390_initial_rate(struct sock *sk) { - timeval_sub_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); + const struct dccp_sock *dp = dccp_sk(sk); + const __u32 w_init = min(4 * dp->dccps_mss_cache, + max(2 * dp->dccps_mss_cache, 4380U)); - /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ - hctx->ccid3hctx_t_ipi = scaled_div(hctx->ccid3hctx_s, - hctx->ccid3hctx_x >> 6); + return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->ccid3hctx_rtt); +} - /* Update nominal send time with regard to the new t_ipi */ - timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); +/* + * Recalculate t_ipi and delta (should be called whenever X changes) + */ +static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) +{ + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_x); /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); + + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + } /* * Update X by @@ -112,19 +128,28 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) * fine-grained resolution of sending rates. This requires scaling by 2^6 * throughout the code. Only X_calc is unscaled (in bytes/second). * - * If X has changed, we also update the scheduled send time t_now, - * the inter-packet interval t_ipi, and the delta value. */ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; const __u64 old_x = hctx->ccid3hctx_x; + /* + * Handle IDLE periods: do not reduce below RFC3390 initial sending rate + * when idling [RFC 4342, 5.1]. See also draft-ietf-dccp-rfc3448bis. + * For consistency with X and X_recv, min_rate is also scaled by 2^6. 
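
Condensed, the X update implemented in the hunk below behaves like this sketch (the p == 0 branch and the timestamp bookkeeping are left out; all rates except x_calc carry the 2^6 scaling, and the helper is standalone, not kernel code):

typedef unsigned long long u64;

u64 update_x_loss(u64 x_calc, u64 x_recv, u64 s_scaled,
		  u64 initial_rate, int idle)
{
	u64 min_rate = 2 * x_recv;
	u64 x;

	if (idle && initial_rate > min_rate)	/* RFC 4342, 5.1 */
		min_rate = initial_rate;

	x = (x_calc << 6) < min_rate ? (x_calc << 6) : min_rate;
	if (x < s_scaled / 64)			/* floor of s/t_mbi, t_mbi = 64 s */
		x = s_scaled / 64;
	return x;
}
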
+ */ + if (unlikely(hctx->ccid3hctx_idle)) { + min_rate = rfc3390_initial_rate(sk); + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); + } + if (hctx->ccid3hctx_p > 0) { hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, - hctx->ccid3hctx_x_recv * 2); + min_rate); hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, (((__u64)hctx->ccid3hctx_s) << 6) / TFRC_T_MBI); @@ -133,14 +158,21 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) (suseconds_t)hctx->ccid3hctx_rtt >= 0) { hctx->ccid3hctx_x = - max(2 * min(hctx->ccid3hctx_x, hctx->ccid3hctx_x_recv), + max(min(2 * hctx->ccid3hctx_x, min_rate), scaled_div(((__u64)hctx->ccid3hctx_s) << 6, hctx->ccid3hctx_rtt)); hctx->ccid3hctx_t_ld = *now; } - if (hctx->ccid3hctx_x != old_x) - ccid3_update_send_time(hctx); + if (hctx->ccid3hctx_x != old_x) { + ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " + "X_recv=%u\n", (unsigned)(old_x >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6), + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); + + ccid3_update_send_interval(hctx); + } } /* @@ -149,17 +181,12 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) */ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { - if (unlikely(len == 0)) - ccid3_pr_debug("Packet payload length is 0 - not updating\n"); - else - hctx->ccid3hctx_s = hctx->ccid3hctx_s == 0 ? len : - (9 * hctx->ccid3hctx_s + len) / 10; - /* - * Note: We could do a potential optimisation here - when `s' changes, - * recalculate sending rate and consequently t_ipi, t_delta, and - * t_now. This is however non-standard, and the benefits are not - * clear, so it is currently left out. - */ + const u16 old_s = hctx->ccid3hctx_s; + + hctx->ccid3hctx_s = old_s == 0 ? len : (9 * old_s + len) / 10; + + if (hctx->ccid3hctx_s != old_s) + ccid3_update_send_interval(hctx); } /* @@ -193,6 +220,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + struct timeval now; unsigned long t_nfb = USEC_PER_SEC / 5; bh_lock_sock(sk); @@ -205,6 +233,8 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); + hctx->ccid3hctx_idle = 1; + switch (hctx->ccid3hctx_state) { case TFRC_SSTATE_NO_FBACK: /* RFC 3448, 4.4: Halve send rate directly */ @@ -219,53 +249,37 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* The value of R is still undefined and so we can not recompute * the timout value. Keep initial value as per [RFC 4342, 5]. 
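
For a sense of the quantities ccid3_update_send_interval() produces in these paths: with s = 1460 bytes and X = 43800 bytes/s (the RFC 3390 figure from the example above), and assuming a 5 ms half-granularity (USEC_PER_SEC / (2 * HZ) at HZ = 100):

#include <stdio.h>

int main(void)
{
	unsigned long long s = 1460, x = 43800;
	unsigned t_ipi = (unsigned)((s * 1000000) / x);	/* 33333 us */
	unsigned half_gran = 5000;			/* assumed, see above */
	unsigned delta = t_ipi / 2 < half_gran ? t_ipi / 2 : half_gran;

	printf("t_ipi = %u us, delta = %u us\n", t_ipi, delta); /* 33333, 5000 */
	return 0;
}
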
*/ t_nfb = TFRC_INITIAL_TIMEOUT; - ccid3_update_send_time(hctx); + ccid3_update_send_interval(hctx); break; case TFRC_SSTATE_FBACK: /* - * Check if IDLE since last timeout and recv rate is less than - * 4 packets (in units of 64*bytes/sec) per RTT + * Modify the cached value of X_recv [RFC 3448, 4.4] + * + * If (p == 0 || X_calc > 2 * X_recv) + * X_recv = max(X_recv / 2, s / (2 * t_mbi)); + * Else + * X_recv = X_calc / 4; + * + * Note that X_recv is scaled by 2^6 while X_calc is not */ - if (!hctx->ccid3hctx_idle || - (hctx->ccid3hctx_x_recv >= 4 * - scaled_div(((__u64)hctx->ccid3hctx_s) << 6, - hctx->ccid3hctx_rtt))) { - struct timeval now; + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - ccid3_pr_debug("%s(%p, state=%s), not idle\n", - dccp_role(sk), sk, - ccid3_tx_state_name(hctx->ccid3hctx_state)); + if (hctx->ccid3hctx_p == 0 || + (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))) { - /* - * Modify the cached value of X_recv [RFC 3448, 4.4] - * - * If (p == 0 || X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - - if (hctx->ccid3hctx_p == 0 || - (hctx->ccid3hctx_x_calc > - (hctx->ccid3hctx_x_recv >> 5))) { - - hctx->ccid3hctx_x_recv = - max(hctx->ccid3hctx_x_recv / 2, - (((__u64)hctx->ccid3hctx_s) << 6) / - (2 * TFRC_T_MBI)); - - if (hctx->ccid3hctx_p == 0) - dccp_timestamp(sk, &now); - } else { - hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; - hctx->ccid3hctx_x_recv <<= 4; - } - /* Now recalculate X [RFC 3448, 4.3, step (4)] */ - ccid3_hc_tx_update_x(sk, &now); + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); + + if (hctx->ccid3hctx_p == 0) + dccp_timestamp(sk, &now); + } else { + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; } + /* Now recalculate X [RFC 3448, 4.3, step (4)] */ + ccid3_hc_tx_update_x(sk, &now); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) @@ -280,8 +294,6 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto out; } - hctx->ccid3hctx_idle = 1; - restart_timer: sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); @@ -322,24 +334,35 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hctx->ccid3hctx_last_win_count = 0; hctx->ccid3hctx_t_last_win_count = now; - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* Set initial sending rate X/s to 1pps (X is scaled by 2^6) */ - ccid3_hc_tx_update_s(hctx, skb->len); - hctx->ccid3hctx_x = hctx->ccid3hctx_s; - hctx->ccid3hctx_x <<= 6; - - /* First timeout, according to [RFC 3448, 4.2], is 1 second */ - hctx->ccid3hctx_t_ipi = USEC_PER_SEC; - /* Initial delta: minimum of 0.5 sec and t_gran/2 */ - hctx->ccid3hctx_delta = TFRC_OPSYS_HALF_TIME_GRAN; /* Set t_0 for initial packet */ hctx->ccid3hctx_t_nom = now; + + hctx->ccid3hctx_s = skb->len; + + /* + * Use initial RTT sample when available: recommended by erratum + * to RFC 4342. This implements the initialisation procedure of + * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. 
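
The branch below is worth a quick calculation. With MSS = 1460 bytes and a 100 ms SYN RTT, the RFC 3390 path starts at w_init / RTT = 4380 / 0.1 = 43800 bytes/s, while the no-sample fallback of one MSS per second starts at 1460 bytes/s: a factor of thirty in initial sending rate, which is why the erratum recommends using the SYN exchange's RTT sample whenever it is available.
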
+ */ + if (dp->dccps_syn_rtt) { + ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; + } else { + /* Sender does not have RTT sample: X = MSS/second */ + hctx->ccid3hctx_x = dp->dccps_mss_cache; + hctx->ccid3hctx_x <<= 6; + } + ccid3_update_send_interval(hctx); + + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: delay = timeval_delta(&hctx->ccid3hctx_t_nom, &now); + ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] * @@ -361,6 +384,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; + hctx->ccid3hctx_idle = 0; /* set the nominal send time for the next following packet */ timeval_add_usecs(&hctx->ccid3hctx_t_nom, hctx->ccid3hctx_t_ipi); @@ -391,7 +415,6 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, packet->dccphtx_seqno = dccp_sk(sk)->dccps_gss; packet->dccphtx_rtt = hctx->ccid3hctx_rtt; packet->dccphtx_sent = 1; - hctx->ccid3hctx_idle = 0; } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -402,8 +425,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) struct dccp_tx_hist_entry *packet; struct timeval now; unsigned long t_nfb; - u32 pinv; - suseconds_t r_sample, t_elapsed; + u32 pinv, r_sample; BUG_ON(hctx == NULL); @@ -445,18 +467,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * Calculate new round trip sample as per [RFC 3448, 4.3] by * R_sample = (now - t_recvdata) - t_elapsed */ - r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); - t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; - - DCCP_BUG_ON(r_sample < 0); - if (unlikely(r_sample <= t_elapsed)) - DCCP_WARN("WARNING: r_sample=%dus <= t_elapsed=%dus\n", - (int)r_sample, (int)t_elapsed); - else - r_sample -= t_elapsed; - CCID3_RTT_SANITY_CHECK(r_sample); + r_sample = dccp_sample_rtt(sk, &now, &packet->dccphtx_tstamp); - /* Update RTT estimate by + /* + * Update RTT estimate by * If (No feedback recv) * R = R_sample; * Else @@ -467,27 +481,23 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { /* * Larger Initial Windows [RFC 4342, sec. 5] - * We deviate in that we use `s' instead of `MSS'. 
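
Once feedback is flowing, the estimator below keeps R = (9*R + R_sample) / 10, the q = 0.9 low-pass filter from RFC 3448. A standalone trace of how it absorbs an outlier:

#include <stdio.h>

int main(void)
{
	unsigned rtt = 100000;	/* usec */
	unsigned samples[] = { 100000, 180000, 90000, 100000 };
	unsigned i;

	for (i = 0; i < 4; i++) {
		rtt = (9 * rtt + samples[i]) / 10;
		printf("sample=%u -> rtt=%u\n", samples[i], rtt);
	}
	/* 100000, 108000, 106200, 105580: one spike moves R by only 8% */
	return 0;
}
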
*/ - __u64 w_init = min(4 * hctx->ccid3hctx_s, - max(2 * hctx->ccid3hctx_s, 4380)); hctx->ccid3hctx_rtt = r_sample; - hctx->ccid3hctx_x = scaled_div(w_init << 6, r_sample); + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); hctx->ccid3hctx_t_ld = now; - ccid3_update_send_time(hctx); + ccid3_update_send_interval(hctx); - ccid3_pr_debug("%s(%p), s=%u, w_init=%llu, " - "R_sample=%dus, X=%u\n", dccp_role(sk), + ccid3_pr_debug("%s(%p), s=%u, MSS=%u, " + "R_sample=%uus, X=%u\n", dccp_role(sk), sk, hctx->ccid3hctx_s, - (unsigned long long)w_init, - (int)r_sample, + dp->dccps_mss_cache, r_sample, (unsigned)(hctx->ccid3hctx_x >> 6)); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); } else { hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt + - (u32)r_sample) / 10; + r_sample) / 10; /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ if (hctx->ccid3hctx_p > 0) @@ -497,10 +507,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->ccid3hctx_p); ccid3_hc_tx_update_x(sk, &now); - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%dus), s=%u, " + ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", dccp_role(sk), - sk, hctx->ccid3hctx_rtt, (int)r_sample, + sk, hctx->ccid3hctx_rtt, r_sample, hctx->ccid3hctx_s, hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, (unsigned)(hctx->ccid3hctx_x_recv >> 6), @@ -644,10 +654,50 @@ static void ccid3_hc_tx_exit(struct sock *sk) dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist); } +static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) +{ + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + BUG_ON(hctx == NULL); + + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; +} + +static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, + u32 __user *optval, int __user *optlen) +{ + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const void *val; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + switch (optname) { + case DCCP_SOCKOPT_CCID_TX_INFO: + if (len < sizeof(hctx->ccid3hctx_tfrc)) + return -EINVAL; + len = sizeof(hctx->ccid3hctx_tfrc); + val = &hctx->ccid3hctx_tfrc; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen) || copy_to_user(optval, val, len)) + return -EFAULT; + + return 0; +} + /* - * RX Half Connection methods + * Receiver Half-Connection Routines */ - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) { @@ -977,8 +1027,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) const struct dccp_options_received *opt_recv; struct dccp_rx_hist_entry *packet; struct timeval now; - u32 p_prev, rtt_prev; - suseconds_t r_sample, t_elapsed; + u32 p_prev, r_sample, rtt_prev; int loss, payload_size; BUG_ON(hcrx == NULL); @@ -994,17 +1043,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) break; rtt_prev = hcrx->ccid3hcrx_rtt; dccp_timestamp(sk, &now); - timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10); - r_sample = timeval_usecs(&now); - t_elapsed = opt_recv->dccpor_elapsed_time * 10; - - DCCP_BUG_ON(r_sample < 0); - if (unlikely(r_sample <= t_elapsed)) - DCCP_WARN("r_sample=%ldus, t_elapsed=%ldus\n", - (long)r_sample, (long)t_elapsed); - else - r_sample -= t_elapsed; - CCID3_RTT_SANITY_CHECK(r_sample); + r_sample = 
dccp_sample_rtt(sk, &now, NULL); if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) hcrx->ccid3hcrx_rtt = r_sample; @@ -1132,20 +1171,6 @@ static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - BUG_ON(hctx == NULL); - - info->tcpi_rto = hctx->ccid3hctx_t_rto; - info->tcpi_rtt = hctx->ccid3hctx_rtt; -} - static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { @@ -1173,33 +1198,6 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, return 0; } -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - const void *val; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hctx->ccid3hctx_tfrc)) - return -EINVAL; - len = sizeof(hctx->ccid3hctx_tfrc); - val = &hctx->ccid3hctx_tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - static struct ccid_operations ccid3 = { .ccid_id = DCCPC_CCID3, .ccid_name = "ccid3", diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 15776a88c09..8d31b389c19 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -51,16 +51,6 @@ /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ #define TFRC_T_MBI 64 -/* What we think is a reasonable upper limit on RTT values */ -#define CCID3_SANE_RTT_MAX ((suseconds_t)(4 * USEC_PER_SEC)) - -#define CCID3_RTT_SANITY_CHECK(rtt) do { \ - if (rtt > CCID3_SANE_RTT_MAX) { \ - DCCP_CRIT("RTT (%d) too large, substituting %d", \ - (int)rtt, (int)CCID3_SANE_RTT_MAX); \ - rtt = CCID3_SANE_RTT_MAX; \ - } } while (0) - enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 0a0baef16b3..372d7e75cdd 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -91,7 +91,7 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list) u32 w_tot = 0; list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) { - if (li_entry->dccplih_interval != ~0) { + if (li_entry->dccplih_interval != ~0U) { i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i]; w_tot += dccp_li_hist_w[i]; if (i != 0) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e33a9edb403..d8ad27bfe01 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -31,13 +31,9 @@ __stringify(cond)); \ } while (0) -#ifdef MODULE #define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ printk(fmt, ##args); \ } while(0) -#else -#define DCCP_PRINTK(enable, fmt, args...) printk(fmt, ##args) -#endif #define DCCP_PR_DEBUG(enable, fmt, a...) 
DCCP_PRINTK(enable, KERN_DEBUG \ "%s: " fmt, __FUNCTION__, ##a) @@ -75,11 +71,15 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); /* RFC 1122, 4.2.3.1 initial RTO value */ #define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) +#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ + +/* bounds for sampled RTT values from packet exchanges (in usec) */ +#define DCCP_SANE_RTT_MIN 100 +#define DCCP_SANE_RTT_MAX (4 * USEC_PER_SEC) + /* Maximal interval between probes for local resources. */ #define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) -#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ - /* sysctl variables for DCCP */ extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; @@ -92,17 +92,43 @@ extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; +/* + * 48-bit sequence number arithmetic (signed and unsigned) + */ +#define INT48_MIN 0x800000000000LL /* 2^47 */ +#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ +#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ +#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) +#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) +#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) +#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) + +static inline void dccp_set_seqno(u64 *seqno, u64 value) +{ + *seqno = value & UINT48_MAX; +} + +static inline void dccp_inc_seqno(u64 *seqno) +{ + *seqno = ADD48(*seqno, 1); +} + +/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ +static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) +{ + u64 delta = SUB48(seqno2, seqno1); + + return TO_SIGNED48(delta); +} + /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) { - return (s64)((seq1 << 16) - (seq2 << 16)) < 0; + return (s64)((seq2 << 16) - (seq1 << 16)) > 0; } /* is seq1 > seq2 ? */ -static inline int after48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) < 0; -} +#define after48(seq1, seq2) before48(seq2, seq1) /* is seq2 <= seq1 <= seq3 ? 
*/ static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) @@ -118,9 +144,7 @@ static inline u64 max48(const u64 seq1, const u64 seq2) /* is seq1 next seqno after seq2 */ static inline int follows48(const u64 seq1, const u64 seq2) { - int diff = (seq1 & 0xFFFF) - (seq2 & 0xFFFF); - - return diff==1; + return dccp_delta_seqno(seq2, seq1) == 1; } enum { @@ -272,6 +296,8 @@ extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); +extern u32 dccp_sample_rtt(struct sock *sk, struct timeval *t_recv, + struct timeval *t_history); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) @@ -313,26 +339,7 @@ static inline int dccp_packet_without_ack(const struct sk_buff *skb) return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; } -#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1) -#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2) - -static inline void dccp_set_seqno(u64 *seqno, u64 value) -{ - if (value > DCCP_MAX_SEQNO) - value -= DCCP_MAX_SEQNO + 1; - *seqno = value; -} - -static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2) -{ - return ((seqno2 << 16) - (seqno1 << 16)) >> 16; -} - -static inline void dccp_inc_seqno(u64 *seqno) -{ - if (++*seqno > DCCP_MAX_SEQNO) - *seqno = 0; -} +#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) { diff --git a/net/dccp/input.c b/net/dccp/input.c index 78b043c458b..da6ec185ed5 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -86,7 +86,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) dh->dccph_type == DCCP_PKT_SYNCACK) { if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh) && - !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) + dccp_delta_seqno(dp->dccps_swl, + DCCP_SKB_CB(skb)->dccpd_seq) >= 0) dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); else return -1; @@ -203,7 +204,8 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dp->dccps_role != DCCP_ROLE_CLIENT) goto send_sync; check_seq: - if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) { + if (dccp_delta_seqno(dp->dccps_osr, + DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { send_sync: dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); @@ -298,6 +300,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, if (dccp_parse_options(sk, skb)) goto out_invalid_packet; + /* Obtain RTT sample from SYN exchange (used by CCID 3) */ + if (dp->dccps_options_received.dccpor_timestamp_echo) { + struct timeval now; + + dccp_timestamp(sk, &now); + dp->dccps_syn_rtt = dccp_sample_rtt(sk, &now, NULL); + } + if (dccp_msk(sk)->dccpms_send_ack_vector && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, @@ -575,3 +585,43 @@ discard: } EXPORT_SYMBOL_GPL(dccp_rcv_state_process); + +/** + * dccp_sample_rtt - Sample RTT from packet exchange + * + * @sk: connected dccp_sock + * @t_recv: receive timestamp of packet with timestamp echo + * @t_hist: packet history timestamp or NULL + */ +u32 dccp_sample_rtt(struct sock *sk, struct timeval *t_recv, + struct timeval *t_hist) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_options_received *or = &dp->dccps_options_received; + suseconds_t delta; + + if (t_hist == NULL) { + if (!or->dccpor_timestamp_echo) { + DCCP_WARN("packet 
without timestamp echo\n"); + return DCCP_SANE_RTT_MAX; + } + timeval_sub_usecs(t_recv, or->dccpor_timestamp_echo * 10); + delta = timeval_usecs(t_recv); + } else + delta = timeval_delta(t_recv, t_hist); + + delta -= or->dccpor_elapsed_time * 10; /* either set or 0 */ + + if (unlikely(delta <= 0)) { + DCCP_WARN("unusable RTT sample %ld, using min\n", (long)delta); + return DCCP_SANE_RTT_MIN; + } + if (unlikely(delta - (suseconds_t)DCCP_SANE_RTT_MAX > 0)) { + DCCP_WARN("RTT sample %ld too large, using max\n", (long)delta); + return DCCP_SANE_RTT_MAX; + } + + return delta; +} + +EXPORT_SYMBOL_GPL(dccp_sample_rtt); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 4a83978aa66..718f2fa923a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -207,8 +207,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) (iph->ihl << 2)); struct dccp_sock *dp; struct inet_sock *inet; - const int type = skb->h.icmph->type; - const int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; __u64 seq; int err; @@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(dccp_v4_send_check); static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) { - return secure_dccp_sequence_number(skb->nh.iph->daddr, - skb->nh.iph->saddr, + return secure_dccp_sequence_number(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, dccp_hdr(skb)->dccph_dport, dccp_hdr(skb)->dccph_sport); } @@ -405,7 +405,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = skb->nh.iph->ttl; + newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->id = jiffies; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -428,7 +428,7 @@ EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. 
*/ @@ -460,8 +460,8 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk, struct rtable *rt; struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif, .nl_u = { .ip4_u = - { .daddr = skb->nh.iph->saddr, - .saddr = skb->nh.iph->daddr, + { .daddr = ip_hdr(skb)->saddr, + .saddr = ip_hdr(skb)->daddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, .uli_u = { .ports = @@ -513,6 +513,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) { int err; struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const struct iphdr *rxiph; const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_reset); @@ -559,13 +560,13 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v4_csum_finish(skb, rxskb->nh.iph->saddr, - rxskb->nh.iph->daddr); + rxiph = ip_hdr(rxskb); + dh->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, + rxiph->daddr); bh_lock_sock(dccp_v4_ctl_socket->sk); err = ip_build_and_send_pkt(skb, dccp_v4_ctl_socket->sk, - rxskb->nh.iph->daddr, - rxskb->nh.iph->saddr, NULL); + rxiph->daddr, rxiph->saddr, NULL); bh_unlock_sock(dccp_v4_ctl_socket->sk); if (net_xmit_eval(err) == 0) { @@ -640,8 +641,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; ireq = inet_rsk(req); - ireq->loc_addr = skb->nh.iph->daddr; - ireq->rmt_addr = skb->nh.iph->saddr; + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->opt = NULL; /* @@ -809,6 +810,7 @@ EXPORT_SYMBOL_GPL(dccp_invalid_packet); static int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; + const struct iphdr *iph; struct sock *sk; int min_cov; @@ -817,8 +819,9 @@ static int dccp_v4_rcv(struct sk_buff *skb) if (dccp_invalid_packet(skb)) goto discard_it; + iph = ip_hdr(skb); /* Step 1: If header checksum is incorrect, drop packet and return */ - if (dccp_v4_csum_finish(skb, skb->nh.iph->saddr, skb->nh.iph->daddr)) { + if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { DCCP_WARN("dropped packet with invalid checksum\n"); goto discard_it; } @@ -832,8 +835,8 @@ static int dccp_v4_rcv(struct sk_buff *skb) "src=%u.%u.%u.%u@%-5d " "dst=%u.%u.%u.%u@%-5d seq=%llu", dccp_packet_name(dh->dccph_type), - NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport), - NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport), + NIPQUAD(iph->saddr), ntohs(dh->dccph_sport), + NIPQUAD(iph->daddr), ntohs(dh->dccph_dport), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); if (dccp_packet_without_ack(skb)) { @@ -848,10 +851,8 @@ static int dccp_v4_rcv(struct sk_buff *skb) /* Step 2: * Look up flow ID in table and get corresponding socket */ sk = __inet_lookup(&dccp_hashinfo, - skb->nh.iph->saddr, dh->dccph_sport, - skb->nh.iph->daddr, dh->dccph_dport, - inet_iif(skb)); - + iph->saddr, dh->dccph_sport, + iph->daddr, dh->dccph_dport, inet_iif(skb)); /* * Step 2: * If no socket ... 
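
The mod-2^48 helpers introduced in dccp.h earlier in this patch replace the old shift-by-16 comparisons with explicit wrap-aware arithmetic. A minimal user-space harness makes the wrap behaviour visible; the macro bodies are copied from that hunk, only the test code around them is illustrative:

	#include <stdio.h>
	#include <stdint.h>

	#define INT48_MIN	0x800000000000LL		/* 2^47 */
	#define UINT48_MAX	0xFFFFFFFFFFFFLL		/* 2^48 - 1 */
	#define COMPLEMENT48(x)	(0x1000000000000LL - (x))	/* 2^48 - x */
	#define TO_SIGNED48(x)	(((x) < INT48_MIN) ? (x) : -COMPLEMENT48((x)))
	#define ADD48(a, b)	(((a) + (b)) & UINT48_MAX)
	#define SUB48(a, b)	ADD48((a), COMPLEMENT48(b))

	int main(void)
	{
		uint64_t gss = UINT48_MAX;	/* highest valid 48-bit seqno */

		/* dccp_inc_seqno(): the increment wraps to zero */
		gss = ADD48(gss, 1);
		printf("wrapped seqno: %llu\n", (unsigned long long)gss);

		/* dccp_delta_seqno(UINT48_MAX, 0): zero sits one place
		 * *after* the maximum, so the signed distance is +1 and
		 * before48()-style ordering survives the wrap */
		printf("delta: %lld\n",
		       (long long)TO_SIGNED48(SUB48(0, UINT48_MAX)));
		return 0;
	}
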
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7f51e8db396..64eac2515aa 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -84,8 +84,8 @@ static inline __u32 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb) { - return secure_dccpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, - skb->nh.ipv6h->saddr.s6_addr32, + return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, dccp_hdr(skb)->dccph_dport, dccp_hdr(skb)->dccph_sport ); @@ -261,8 +261,8 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, if (rxopt->srcrt) opt = ipv6_invert_rthdr(sk, - (struct ipv6_rt_hdr *)(pktopts->nh.raw + - rxopt->srcrt)); + (struct ipv6_rt_hdr *)(skb_network_header(pktopts) + + rxopt->srcrt)); } if (opt != NULL && opt->srcrt != NULL) { @@ -313,6 +313,7 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) { struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + struct ipv6hdr *rxip6h; const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext) + sizeof(struct dccp_hdr_reset); @@ -352,12 +353,13 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v6_csum_finish(skb, &rxskb->nh.ipv6h->saddr, - &rxskb->nh.ipv6h->daddr); + rxip6h = ipv6_hdr(rxskb); + dh->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, + &rxip6h->daddr); memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); - ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + ipv6_addr_copy(&fl.fl6_dst, &rxip6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxip6h->daddr); fl.proto = IPPROTO_DCCP; fl.oif = inet6_iif(rxskb); @@ -390,7 +392,7 @@ static struct request_sock_ops dccp6_request_sock_ops = { static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); - const struct ipv6hdr *iph = skb->nh.ipv6h; + const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. 
*/ @@ -460,8 +462,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; ireq6 = inet6_rsk(req); - ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); - ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); ireq6->pktopts = NULL; if (ipv6_opt_accepted(sk, skb) || @@ -546,7 +548,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -573,8 +575,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, if (rxopt->srcrt) opt = ipv6_invert_rthdr(sk, - (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw + - rxopt->srcrt)); + (struct ipv6_rt_hdr *)(skb_network_header(ireq6->pktopts) + + rxopt->srcrt)); } if (dst == NULL) { @@ -653,7 +655,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, } newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; /* * Clone native IPv6 options from listening socket (if any) @@ -826,8 +828,8 @@ static int dccp_v6_rcv(struct sk_buff **pskb) goto discard_it; /* Step 1: If header checksum is incorrect, drop packet and return. */ - if (dccp_v6_csum_finish(skb, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr)) { + if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr)) { DCCP_WARN("dropped packet with invalid checksum\n"); goto discard_it; } @@ -844,9 +846,9 @@ static int dccp_v6_rcv(struct sk_buff **pskb) /* Step 2: * Look up flow ID in table and get corresponding socket */ - sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr, + sk = __inet6_lookup(&dccp_hashinfo, &ipv6_hdr(skb)->saddr, dh->dccph_sport, - &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport), + &ipv6_hdr(skb)->daddr, ntohs(dh->dccph_dport), inet6_iif(skb)); /* * Step 2: diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 6d235b3013d..e18e249ac49 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -27,7 +27,7 @@ struct inet_timewait_death_row dccp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = SPIN_LOCK_UNLOCKED, + .death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock), .hashinfo = &dccp_hashinfo, .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, (unsigned long)&dccp_death_row), diff --git a/net/dccp/options.c b/net/dccp/options.c index ca13f773199..34d536d5f1a 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -29,8 +29,6 @@ int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; -EXPORT_SYMBOL_GPL(sysctl_dccp_feat_sequence_window); - void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; @@ -174,21 +172,25 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) opt_recv->dccpor_timestamp_echo = ntohl(*(__be32 *)value); dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu, ", dccp_role(sk), + "ackno=%llu", dccp_role(sk), opt_recv->dccpor_timestamp_echo, len + 2, (unsigned long long) 
DCCP_SKB_CB(skb)->dccpd_ack_seq); - if (len == 4) + if (len == 4) { + dccp_pr_debug_cat("\n"); break; + } if (len == 6) elapsed_time = ntohs(*(__be16 *)(value + 4)); else elapsed_time = ntohl(*(__be32 *)(value + 4)); + dccp_pr_debug_cat(", ELAPSED_TIME=%d\n", elapsed_time); + /* Give precedence to the biggest ELAPSED_TIME */ if (elapsed_time > opt_recv->dccpor_elapsed_time) opt_recv->dccpor_elapsed_time = elapsed_time; @@ -565,6 +567,14 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dccp_insert_options_feat(sk, skb)) return -1; + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && + dccp_insert_option_timestamp(sk, skb)) + return -1; + /* XXX: insert other options when appropriate */ if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { diff --git a/net/dccp/output.c b/net/dccp/output.c index aa21cc4de37..c8d843e983f 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -194,6 +194,7 @@ static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb) rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); if (rc <= 0) break; + dccp_pr_debug("delayed send by %d msec\n", rc); delay = msecs_to_jiffies(rc); sk->sk_write_pending++; release_sock(sk); @@ -255,7 +256,7 @@ void dccp_write_xmit(struct sock *sk, int block) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); } else { - dccp_pr_debug("packet discarded\n"); + dccp_pr_debug("packet discarded due to err=%d\n", err); kfree_skb(skb); } } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 3b1f509f51d..1f5e3ba6206 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -90,15 +90,18 @@ static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, if (port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { if (hctx) - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size, - hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p, hctx->ccid3hctx_t_ipi); + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " + "%llu %llu %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size, + hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, + hctx->ccid3hctx_x_recv >> 6, + hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); else printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size); + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size); } jprobe_return(); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index c6568d637e1..9fbe87c9380 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -721,7 +721,7 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr; - struct net_device *dev; + struct net_device *dev, *ldev; int rv; if (addr_len != sizeof(struct sockaddr_dn)) @@ -746,14 +746,17 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(saddr->sdn_flags & SDF_WILD)) { if (dn_ntohs(saddr->sdn_nodeaddrl)) { read_lock(&dev_base_lock); - for(dev = dev_base; dev; dev = dev->next) { + ldev = NULL; + for_each_netdev(dev) { if (!dev->dn_ptr) continue; - if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) + if 
(dn_dev_islocal(dev, dn_saddr2dn(saddr))) { + ldev = dev; break; + } } read_unlock(&dev_base_lock); - if (dev == NULL) + if (ldev == NULL) return -EADDRNOTAVAIL; } } @@ -2413,6 +2416,7 @@ module_init(decnet_init); static void __exit decnet_exit(void) { sock_unregister(AF_DECnet); + rtnl_unregister_all(PF_DECnet); dev_remove_pack(&dn_dix_packet_type); dn_unregister_sysctl(); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 060d725e294..764a56a13e3 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -799,10 +799,10 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) skip_ndevs = cb->args[0]; skip_naddr = cb->args[1]; - read_lock(&dev_base_lock); - for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < skip_ndevs) - continue; + goto cont; else if (idx > skip_ndevs) { /* Only skip over addresses for first dev dumped * in this iteration (idx == skip_ndevs) */ @@ -810,22 +810,22 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) } if ((dn_db = dev->dn_ptr) == NULL) - continue; + goto cont; for (ifa = dn_db->ifa_list, dn_idx = 0; ifa; ifa = ifa->ifa_next, dn_idx++) { if (dn_idx < skip_naddr) - continue; + goto cont; if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWADDR, NLM_F_MULTI) < 0) goto done; } +cont: + idx++; } done: - read_unlock(&dev_base_lock); - cb->args[0] = idx; cb->args[1] = dn_idx; @@ -913,7 +913,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) pktlen = (__le16 *)skb_push(skb,2); *pktlen = dn_htons(skb->len - 2); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id); } @@ -1005,7 +1005,7 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) pktlen = (__le16 *)skb_push(skb, 2); *pktlen = dn_htons(skb->len - 2); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); if (dn_am_i_a_router(dn, dn_db, ifa)) { struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); @@ -1299,7 +1299,7 @@ void dn_dev_devices_off(void) struct net_device *dev; rtnl_lock(); - for(dev = dev_base; dev; dev = dev->next) + for_each_netdev(dev) dn_dev_down(dev); rtnl_unlock(); @@ -1310,7 +1310,7 @@ void dn_dev_devices_on(void) struct net_device *dev; rtnl_lock(); - for(dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { if (dev->flags & IFF_UP) dn_dev_up(dev); } @@ -1328,62 +1328,56 @@ int unregister_dnaddr_notifier(struct notifier_block *nb) } #ifdef CONFIG_PROC_FS -static inline struct net_device *dn_dev_get_next(struct seq_file *seq, struct net_device *dev) +static inline int is_dn_dev(struct net_device *dev) { - do { - dev = dev->next; - } while(dev && !dev->dn_ptr); - - return dev; + return dev->dn_ptr != NULL; } -static struct net_device *dn_dev_get_idx(struct seq_file *seq, loff_t pos) +static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) { + int i; struct net_device *dev; - dev = dev_base; - if (dev && !dev->dn_ptr) - dev = dn_dev_get_next(seq, dev); - if (pos) { - while(dev && (dev = dn_dev_get_next(seq, dev))) - --pos; - } - return dev; -} + read_lock(&dev_base_lock); -static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos) { - struct net_device *dev; - read_lock(&dev_base_lock); - dev = dn_dev_get_idx(seq, *pos - 1); - if (dev == NULL) - read_unlock(&dev_base_lock); - return dev; + if (*pos == 0) + return SEQ_START_TOKEN; + + i = 1; + for_each_netdev(dev) { + if 
(!is_dn_dev(dev)) + continue; + + if (i++ == *pos) + return dev; } - return SEQ_START_TOKEN; + + return NULL; } static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = v; - loff_t one = 1; + struct net_device *dev; - if (v == SEQ_START_TOKEN) { - dev = dn_dev_seq_start(seq, &one); - } else { - dev = dn_dev_get_next(seq, dev); - if (dev == NULL) - read_unlock(&dev_base_lock); - } ++*pos; - return dev; + + dev = (struct net_device *)v; + if (v == SEQ_START_TOKEN) + dev = net_device_entry(&dev_base_head); + + for_each_netdev_continue(dev) { + if (!is_dn_dev(dev)) + continue; + + return dev; + } + + return NULL; } static void dn_dev_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - read_unlock(&dev_base_lock); + read_unlock(&dev_base_lock); } static char *dn_type2asc(char type) @@ -1447,24 +1441,6 @@ static const struct file_operations dn_dev_seq_fops = { #endif /* CONFIG_PROC_FS */ -static struct rtnetlink_link dnet_rtnetlink_table[RTM_NR_MSGTYPES] = -{ - [RTM_NEWADDR - RTM_BASE] = { .doit = dn_nl_newaddr, }, - [RTM_DELADDR - RTM_BASE] = { .doit = dn_nl_deladdr, }, - [RTM_GETADDR - RTM_BASE] = { .dumpit = dn_nl_dump_ifaddr, }, -#ifdef CONFIG_DECNET_ROUTER - [RTM_NEWROUTE - RTM_BASE] = { .doit = dn_fib_rtm_newroute, }, - [RTM_DELROUTE - RTM_BASE] = { .doit = dn_fib_rtm_delroute, }, - [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute, - .dumpit = dn_fib_dump, }, - [RTM_GETRULE - RTM_BASE] = { .dumpit = dn_fib_dump_rules, }, -#else - [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute, - .dumpit = dn_cache_dump, }, -#endif - -}; - static int __initdata addr[2]; module_param_array(addr, int, NULL, 0444); MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node"); @@ -1485,7 +1461,9 @@ void __init dn_dev_init(void) dn_dev_devices_on(); - rtnetlink_links[PF_DECnet] = dnet_rtnetlink_table; + rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL); + rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL); + rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr); proc_net_fops_create("decnet_dev", S_IRUGO, &dn_dev_seq_fops); @@ -1500,8 +1478,6 @@ void __init dn_dev_init(void) void __exit dn_dev_cleanup(void) { - rtnetlink_links[PF_DECnet] = NULL; - #ifdef CONFIG_SYSCTL { int i; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 82d58a977e6..d2bc19d4795 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -504,7 +504,7 @@ static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) return 0; } -int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct dn_fib_table *tb; struct rtattr **rta = arg; @@ -520,7 +520,7 @@ int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -ESRCH; } -int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct dn_fib_table *tb; struct rtattr **rta = arg; @@ -602,7 +602,7 @@ static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa) /* Scan device list */ read_lock(&dev_base_lock); - for(dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { dn_db = dev->dn_ptr; if (dn_db == NULL) continue; @@ -748,11 +748,13 @@ void __exit dn_fib_cleanup(void) void __init dn_fib_init(void) { - dn_fib_table_init(); dn_fib_rules_init(); register_dnaddr_notifier(&dn_fib_dnaddr_notifier); + + 
rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL); + rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL); } diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index bf701cf5a38..4bf066c416e 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -261,7 +261,7 @@ static int dn_long_output(struct sk_buff *skb) lp->s_class = 0; lp->pt = 0; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); } @@ -300,7 +300,7 @@ static int dn_short_output(struct sk_buff *skb) sp->srcnode = cb->src; sp->forward = cb->hops & 0x3f; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); } @@ -342,7 +342,7 @@ static int dn_phase3_output(struct sk_buff *skb) sp->srcnode = cb->src & dn_htons(0x03ff); sp->forward = cb->hops & 0x3f; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); } diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 9d20904f6f5..4074a6e5d0d 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -362,7 +362,8 @@ static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb) u16 dlen = *skb->data; if ((dlen <= 16) && (dlen <= skb->len)) { scp->conndata_in.opt_optl = dn_htons(dlen); - memcpy(scp->conndata_in.opt_data, skb->data + 1, dlen); + skb_copy_from_linear_data_offset(skb, 1, + scp->conndata_in.opt_data, dlen); } } dn_nsp_send_link(sk, DN_NOCHANGE, 0); @@ -406,7 +407,7 @@ static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb) u16 dlen = *skb->data; if ((dlen <= 16) && (dlen <= skb->len)) { scp->discdata_in.opt_optl = dn_htons(dlen); - memcpy(scp->discdata_in.opt_data, skb->data + 1, dlen); + skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen); } } @@ -725,7 +726,7 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) if (!pskb_may_pull(skb, 2)) goto free_out; - skb->h.raw = skb->data; + skb_reset_transport_header(skb); cb->nsp_flags = *ptr++; if (decnet_debug_level & 2) diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 2d2cda82c7d..7404653880b 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -79,7 +79,7 @@ static void dn_nsp_send(struct sk_buff *skb) struct dst_entry *dst; struct flowi fl; - skb->h.raw = skb->data; + skb_reset_transport_header(skb); scp->stamp = jiffies; dst = sk_dst_check(sk, 0); @@ -681,8 +681,10 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) if (scp->peer.sdn_objnum) type = 0; - skb_put(skb, dn_sockaddr2username(&scp->peer, skb->tail, type)); - skb_put(skb, dn_sockaddr2username(&scp->addr, skb->tail, 2)); + skb_put(skb, dn_sockaddr2username(&scp->peer, + skb_tail_pointer(skb), type)); + skb_put(skb, dn_sockaddr2username(&scp->addr, + skb_tail_pointer(skb), 2)); menuver = DN_MENUVER_ACC | DN_MENUVER_USR; if (scp->peer.sdn_flags & SDF_PROXY) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index c1b5502f195..a8bf106b7a6 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -77,6 +77,7 @@ #include <linux/rcupdate.h> #include <linux/times.h> #include <asm/errno.h> +#include <net/netlink.h> #include <net/neighbour.h> #include <net/dst.h> #include <net/flow.h> @@ -386,7 +387,7 @@ static int dn_return_short(struct sk_buff *skb) __le16 tmp; /* Add back headers */ - skb_push(skb, skb->data - 
skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -425,7 +426,7 @@ static int dn_return_long(struct sk_buff *skb) unsigned char tmp[ETH_ALEN]; /* Add back all headers */ - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -504,7 +505,7 @@ static int dn_route_rx_long(struct sk_buff *skb) goto drop_it; skb_pull(skb, 20); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); /* Destination info */ ptr += 2; @@ -542,7 +543,7 @@ static int dn_route_rx_short(struct sk_buff *skb) goto drop_it; skb_pull(skb, 5); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); cb->dst = *(__le16 *)ptr; ptr += 2; @@ -615,7 +616,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type flags = *skb->data; } - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* * Weed out future version DECnet @@ -885,7 +886,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old .iif = loopback_dev.ifindex, .oif = oldflp->oif }; struct dn_route *rt = NULL; - struct net_device *dev_out = NULL; + struct net_device *dev_out = NULL, *dev; struct neighbour *neigh = NULL; unsigned hash; unsigned flags = 0; @@ -924,15 +925,17 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old goto out; } read_lock(&dev_base_lock); - for(dev_out = dev_base; dev_out; dev_out = dev_out->next) { - if (!dev_out->dn_ptr) + for_each_netdev(dev) { + if (!dev->dn_ptr) continue; - if (!dn_dev_islocal(dev_out, oldflp->fld_src)) + if (!dn_dev_islocal(dev, oldflp->fld_src)) continue; - if ((dev_out->flags & IFF_LOOPBACK) && + if ((dev->flags & IFF_LOOPBACK) && oldflp->fld_dst && - !dn_dev_islocal(dev_out, oldflp->fld_dst)) + !dn_dev_islocal(dev, oldflp->fld_dst)) continue; + + dev_out = dev; break; } read_unlock(&dev_base_lock); @@ -1468,7 +1471,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, struct dn_route *rt = (struct dn_route *)skb->dst; struct rtmsg *r; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); long expires; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); @@ -1509,19 +1512,19 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, if (rt->fl.iif) RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } /* * This is called by both endnodes and routers now. 
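
The dn_rt_fill_info() conversion above shows the message-building bracket this series applies throughout: remember the tail, emit the header and attributes, then either commit the final length or trim back to the mark on failure. A hypothetical minimal fill routine of the same shape (names invented, attribute emission elided; NLMSG_NEW jumps to nlmsg_failure when the skb runs out of tailroom):

	static int demo_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event)
	{
		unsigned char *b = skb_tail_pointer(skb);	/* was: skb->tail */
		struct nlmsghdr *nlh;

		nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(struct rtmsg), 0);
		/* ... fill NLMSG_DATA(nlh), RTA_PUT() attributes ... */
		nlh->nlmsg_len = skb_tail_pointer(skb) - b;
		return skb->len;

	nlmsg_failure:
		nlmsg_trim(skb, b);	/* was: skb_trim(skb, b - skb->data) */
		return -1;
	}
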
*/ -int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) { struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); @@ -1537,7 +1540,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); cb = DN_SKB_CB(skb); if (rta[RTA_SRC-1]) @@ -1812,6 +1815,13 @@ void __init dn_route_init(void) dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops); + +#ifdef CONFIG_DECNET_ROUTER + rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, dn_fib_dump); +#else + rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, + dn_cache_dump); +#endif } void __exit dn_route_cleanup(void) diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 5e86dd54230..17a1932216d 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -31,6 +31,7 @@ #include <net/dn_fib.h> #include <net/dn_neigh.h> #include <net/dn_dev.h> +#include <net/dn_route.h> static struct fib_rules_ops dn_fib_rules_ops; @@ -239,9 +240,9 @@ static u32 dn_fib_rule_default_pref(void) return 0; } -int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +static void dn_fib_rule_flush_cache(void) { - return fib_rules_dump(skb, cb, AF_DECnet); + dn_rt_cache_flush(-1); } static struct fib_rules_ops dn_fib_rules_ops = { @@ -254,6 +255,7 @@ static struct fib_rules_ops dn_fib_rules_ops = { .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, .default_pref = dn_fib_rule_default_pref, + .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, .rules_list = &dn_fib_rules, diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 780a141f834..d6615c9361e 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -28,6 +28,7 @@ #include <asm/uaccess.h> #include <linux/route.h> /* RTF_xxx */ #include <net/neighbour.h> +#include <net/netlink.h> #include <net/dst.h> #include <net/flow.h> #include <net/fib_rules.h> @@ -295,7 +296,7 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, { struct rtmsg *rtm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags); rtm = NLMSG_DATA(nlh); @@ -337,19 +338,19 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, nhp->rtnh_ifindex = nh->nh_oif; if (nh->nh_gw) RTA_PUT(skb, RTA_GATEWAY, 2, &nh->nh_gw); - nhp->rtnh_len = skb->tail - (unsigned char *)nhp; + nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp; } endfor_nexthops(fi); mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb->tail - (u8*)mp_head; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -EMSGSIZE; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 0e62def05a5..696234688cf 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -33,7 +33,7 @@ static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) { struct sk_buff *skb = NULL; size_t size; - unsigned char *old_tail; + 
sk_buff_data_t old_tail; struct nlmsghdr *nlh; unsigned char *ptr; struct nf_dn_rtmsg *rtm; @@ -48,7 +48,7 @@ static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh); rtm->nfdn_ifindex = rt_skb->dev->ifindex; ptr = NFDN_RTMSG(rtm); - memcpy(ptr, rt_skb->data, rt_skb->len); + skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len); nlh->nlmsg_len = skb->tail - old_tail; return skb; @@ -102,7 +102,7 @@ static unsigned int dnrmg_hook(unsigned int hook, static inline void dnrmg_receive_user_skb(struct sk_buff *skb) { - struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + struct nlmsghdr *nlh = nlmsg_hdr(skb); if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; @@ -138,7 +138,7 @@ static int __init dn_rtmsg_init(void) int rv = 0; dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, - dnrmg_receive_user_sk, THIS_MODULE); + dnrmg_receive_user_sk, NULL, THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index bc12e36263f..b5524f32ac2 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -162,7 +162,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_toiovec(msg->msg_iov, skb->data, copied); if (err) goto out_free; - skb_get_timestamp(skb, &sk->sk_stamp); + sk->sk_stamp = skb->tstamp; if (msg->msg_name) memcpy(msg->msg_name, skb->cb, msg->msg_namelen); @@ -345,7 +345,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); eb = (struct ec_cb *)&skb->cb; @@ -366,7 +366,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, fh->cb = cb; fh->port = port; if (sock->type != SOCK_DGRAM) { - skb->tail = skb->data; + skb_reset_tail_pointer(skb); skb->len = 0; } else if (res < 0) goto out_free; @@ -727,6 +727,9 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg case SIOCGSTAMP: return sock_get_timestamp(sk, argp); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, argp); + case SIOCSIFADDR: case SIOCGIFADDR: return ec_dev_ioctl(sock, cmd, argp); @@ -845,7 +848,7 @@ static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb) static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) { - struct iphdr *ip = skb->nh.iph; + struct iphdr *ip = ip_hdr(skb); unsigned char stn = ntohl(ip->saddr) & 0xff; struct sock *sk; struct sk_buff *newskb; @@ -940,10 +943,10 @@ static void aun_data_available(struct sock *sk, int slen) printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err); } - data = skb->h.raw + sizeof(struct udphdr); + data = skb_transport_header(skb) + sizeof(struct udphdr); ah = (struct aunhdr *)data; len = skb->len - sizeof(struct udphdr); - ip = skb->nh.iph; + ip = ip_hdr(skb); switch (ah->code) { diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 7391f55904d..0ac2524f3b6 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -156,7 +156,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) struct ethhdr *eth; unsigned char *rawp; - skb->mac.raw = skb->data; + skb->dev = dev; + skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); eth = eth_hdr(skb); @@ -228,7 +229,7 @@ int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh) eth = (struct ethhdr *) (((u8 *) hh->hh_data) + 
(HH_DATA_OFF(sizeof(*eth)))); - if (type == __constant_htons(ETH_P_802_3)) + if (type == htons(ETH_P_802_3)) return -1; eth->h_proto = type; diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig index 6ef766ef961..1438adedbc8 100644 --- a/net/ieee80211/Kconfig +++ b/net/ieee80211/Kconfig @@ -56,7 +56,8 @@ config IEEE80211_CRYPT_CCMP config IEEE80211_CRYPT_TKIP tristate "IEEE 802.11i TKIP encryption" - depends on IEEE80211 && NET_RADIO + depends on IEEE80211 + select WIRELESS_EXT select CRYPTO select CRYPTO_MICHAEL_MIC select CRYPTO_ECB diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c index 5ed0a98b2d7..df5592c9339 100644 --- a/net/ieee80211/ieee80211_crypt.c +++ b/net/ieee80211/ieee80211_crypt.c @@ -1,7 +1,7 @@ /* * Host AP crypto routines * - * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com> * * This program is free software; you can redistribute it and/or modify diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c index 35aa3426c3f..b016b4104de 100644 --- a/net/ieee80211/ieee80211_crypt_ccmp.c +++ b/net/ieee80211/ieee80211_crypt_ccmp.c @@ -1,7 +1,7 @@ /* * Host AP crypt: host-based CCMP encryption implementation for Host AP driver * - * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -338,7 +338,7 @@ static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) if (ccmp_replay_check(pn, key->rx_pn)) { if (net_ratelimit()) { - printk(KERN_DEBUG "CCMP: replay detected: STA=" MAC_FMT + IEEE80211_DEBUG_DROP("CCMP: replay detected: STA=" MAC_FMT " previous PN %02x%02x%02x%02x%02x%02x " "received PN %02x%02x%02x%02x%02x%02x\n", MAC_ARG(hdr->addr2), MAC_ARG(key->rx_pn), diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c index fc1f99a5973..5a48d8e0aec 100644 --- a/net/ieee80211/ieee80211_crypt_tkip.c +++ b/net/ieee80211/ieee80211_crypt_tkip.c @@ -1,7 +1,7 @@ /* * Host AP crypt: host-based TKIP encryption implementation for Host AP driver * - * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -465,7 +465,7 @@ static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { if (net_ratelimit()) { - printk(KERN_DEBUG "TKIP: replay detected: STA=" MAC_FMT + IEEE80211_DEBUG_DROP("TKIP: replay detected: STA=" MAC_FMT " previous TSC %08x%04x received TSC " "%08x%04x\n", MAC_ARG(hdr->addr2), tkey->rx_iv32, tkey->rx_iv16, iv32, iv16); @@ -507,7 +507,7 @@ static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) tkey->rx_phase1_done = 0; } if (net_ratelimit()) { - printk(KERN_DEBUG "TKIP: ICV error detected: STA=" + IEEE80211_DEBUG_DROP("TKIP: ICV error detected: STA=" MAC_FMT "\n", MAC_ARG(hdr->addr2)); } tkey->dot11RSNAStatsTKIPICVErrors++; diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c index ec6d8851a06..8d182459344 100644 --- a/net/ieee80211/ieee80211_crypt_wep.c +++ 
b/net/ieee80211/ieee80211_crypt_wep.c @@ -1,7 +1,7 @@ /* * Host AP crypt: host-based WEP encryption implementation for Host AP driver * - * Copyright (c) 2002-2004, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright (c) 2002-2004, Jouni Malinen <j@w1.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -152,7 +152,7 @@ static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) return -1; /* Copy the IV into the first 3 bytes of the key */ - memcpy(key, skb->data + hdr_len, 3); + skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(key + 3, wep->key, wep->key_len); diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c index b1c6d1f717d..7ec6610841b 100644 --- a/net/ieee80211/ieee80211_module.c +++ b/net/ieee80211/ieee80211_module.c @@ -5,8 +5,8 @@ Portions of this file are based on the WEP enablement code provided by the Host AP project hostap-drivers v0.1.3 Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - <jkmaline@cc.hut.fi> - Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi> + <j@w1.fi> + Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -229,6 +229,7 @@ void free_ieee80211(struct net_device *dev) static int debug = 0; u32 ieee80211_debug_level = 0; +EXPORT_SYMBOL_GPL(ieee80211_debug_level); static struct proc_dir_entry *ieee80211_proc = NULL; static int show_debug_level(char *page, char **start, off_t offset, diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 4084909f6f9..f2de2e48b02 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -3,8 +3,8 @@ * for Intersil Prism2/2.5/3 - hostap.o module, common routines * * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * <jkmaline@cc.hut.fi> - * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi> + * <j@w1.fi> + * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> * Copyright (c) 2004-2005, Intel Corporation * * This program is free software; you can redistribute it and/or modify @@ -42,7 +42,7 @@ static void ieee80211_monitor_rx(struct ieee80211_device *ieee, u16 fc = le16_to_cpu(hdr->frame_ctl); skb->dev = ieee->dev; - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); skb_pull(skb, ieee80211_get_hdrlen(fc)); skb->pkt_type = PACKET_OTHERHOST; skb->protocol = __constant_htons(ETH_P_80211_RAW); @@ -606,12 +606,12 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, if (frag == 0) { /* copy first fragment (including full headers) into * beginning of the fragment cache skb */ - memcpy(skb_put(frag_skb, flen), skb->data, flen); + skb_copy_from_linear_data(skb, skb_put(frag_skb, flen), flen); } else { /* append frame payload to the end of the fragment * cache skb */ - memcpy(skb_put(frag_skb, flen), skb->data + hdrlen, - flen); + skb_copy_from_linear_data_offset(skb, hdrlen, + skb_put(frag_skb, flen), flen); } dev_kfree_skb_any(skb); skb = NULL; @@ -759,8 +759,9 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, IEEE80211_FCTL_TODS) && skb->len >= ETH_HLEN + ETH_ALEN) { /* Non-standard frame: get addr4 from its bogus location after * the payload */ - memcpy(skb->data + ETH_ALEN, - skb->data + skb->len - ETH_ALEN, ETH_ALEN); + 
skb_copy_to_linear_data_offset(skb, ETH_ALEN, + skb->data + skb->len - ETH_ALEN, + ETH_ALEN); skb_trim(skb, skb->len - ETH_ALEN); } #endif @@ -789,10 +790,11 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, if (skb2 != NULL) { /* send to wireless media */ - skb2->protocol = __constant_htons(ETH_P_802_3); - skb2->mac.raw = skb2->nh.raw = skb2->data; - /* skb2->nh.raw = skb2->data + ETH_HLEN; */ skb2->dev = dev; + skb2->protocol = __constant_htons(ETH_P_802_3); + skb_reset_mac_header(skb2); + skb_reset_network_header(skb2); + /* skb2->network_header += ETH_HLEN; */ dev_queue_xmit(skb2); } #endif @@ -800,7 +802,6 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, if (skb) { skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - skb->dev = dev; skb->ip_summed = CHECKSUM_NONE; /* 802.11 crc not sufficient */ if (netif_rx(skb) == NET_RX_DROP) { /* netif_rx always succeeds, but it might drop diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c index 0292d6348e1..a4c3c51140a 100644 --- a/net/ieee80211/ieee80211_tx.c +++ b/net/ieee80211/ieee80211_tx.c @@ -225,10 +225,10 @@ static int ieee80211_classify(struct sk_buff *skb) struct iphdr *ip; eth = (struct ethhdr *)skb->data; - if (eth->h_proto != __constant_htons(ETH_P_IP)) + if (eth->h_proto != htons(ETH_P_IP)) return 0; - ip = skb->nh.iph; + ip = ip_hdr(skb); switch (ip->tos & 0xfc) { case 0x20: return 2; @@ -309,8 +309,8 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) } /* Save source and destination addresses */ - memcpy(dest, skb->data, ETH_ALEN); - memcpy(src, skb->data + ETH_ALEN, ETH_ALEN); + skb_copy_from_linear_data(skb, dest, ETH_ALEN); + skb_copy_from_linear_data_offset(skb, ETH_ALEN, src, ETH_ALEN); if (host_encrypt || host_build_iv) fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA | @@ -363,7 +363,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) snapped = 1; ieee80211_copy_snap(skb_put(skb_new, SNAP_SIZE + sizeof(u16)), ether_type); - memcpy(skb_put(skb_new, skb->len), skb->data, skb->len); + skb_copy_from_linear_data(skb, skb_put(skb_new, skb->len), skb->len); res = crypt->ops->encrypt_msdu(skb_new, hdr_len, crypt->priv); if (res < 0) { IEEE80211_ERROR("msdu encryption failed\n"); @@ -492,7 +492,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) bytes -= SNAP_SIZE + sizeof(u16); } - memcpy(skb_put(skb_frag, bytes), skb->data, bytes); + skb_copy_from_linear_data(skb, skb_put(skb_frag, bytes), bytes); /* Advance the SKB... */ skb_pull(skb, bytes); diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 40d7a55fe03..cee5e13bc42 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -5,8 +5,8 @@ Portions of this file are based on the WEP enablement code provided by the Host AP project hostap-drivers v0.1.3 Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - <jkmaline@cc.hut.fi> - Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi> + <j@w1.fi> + Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9e8ef509c51..e62aee0ec4c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -574,6 +574,33 @@ config TCP_CONG_VENO loss packets. 
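
The two Kconfig entries added just below (TCP_CONG_YEAH and TCP_CONG_ILLINOIS) each gate a module that plugs into the TCP core by registering a struct tcp_congestion_ops. A hypothetical minimal module of the same shape ("demo" names are invented; the cong_avoid prototype is the one this kernel generation uses, rtt argument included):

	#include <linux/module.h>
	#include <net/tcp.h>

	static void demo_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
				    u32 in_flight, int good)
	{
		/* fall back to plain Reno behaviour */
		tcp_reno_cong_avoid(sk, ack, rtt, in_flight, good);
	}

	static struct tcp_congestion_ops demo_ops = {
		.name		= "demo",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= demo_cong_avoid,
		.min_cwnd	= tcp_reno_min_cwnd,
	};

	static int __init demo_register(void)
	{
		return tcp_register_congestion_control(&demo_ops);
	}

	static void __exit demo_unregister(void)
	{
		tcp_unregister_congestion_control(&demo_ops);
	}

	module_init(demo_register);
	module_exit(demo_unregister);
	MODULE_LICENSE("GPL");

Once loaded, the algorithm becomes selectable through the net.ipv4.tcp_congestion_control sysctl, whose default the choice block below configures.
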
See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+config TCP_CONG_YEAH
+	tristate "YeAH TCP"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+	algorithm, which uses a mixed loss/delay approach to compute the
+	congestion window. Its design goals target high efficiency,
+	internal, RTT and Reno fairness, resilience to link loss while
+	keeping network element load as low as possible.
+
+	For further details look here:
+	  http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+	tristate "TCP Illinois"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP-Illinois is a sender-side modification of TCP Reno for
+	high speed long delay links. It uses round-trip time to
+	adjust the alpha and beta parameters to achieve a higher average
+	throughput and maintain fairness.
+
+	For further details see:
+	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7a068626fee..4ff6c151d7f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,8 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cf358c84c44..16aae8ef555 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -87,6 +87,7 @@
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -217,6 +218,26 @@ out:
 	return err;
 }
+u32 inet_ehash_secret __read_mostly;
+EXPORT_SYMBOL(inet_ehash_secret);
+
+/*
+ * inet_ehash_secret must be set exactly once
+ * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
+ */
+void build_ehash_secret(void)
+{
+	u32 rnd;
+	do {
+		get_random_bytes(&rnd, sizeof(rnd));
+	} while (rnd == 0);
+	spin_lock_bh(&inetsw_lock);
+	if (!inet_ehash_secret)
+		inet_ehash_secret = rnd;
+	spin_unlock_bh(&inetsw_lock);
+}
+EXPORT_SYMBOL(build_ehash_secret);
+
 /*
  * Create an inet socket.
  */
@@ -233,6 +254,11 @@ static int inet_create(struct socket *sock, int protocol)
 	int try_loading_module = 0;
 	int err;
+	if (sock->type != SOCK_RAW &&
+	    sock->type != SOCK_DGRAM &&
+	    !inet_ehash_secret)
+		build_ehash_secret();
+
 	sock->state = SS_UNCONNECTED;
 	/* Look for the requested type/protocol pair. 
*/ @@ -755,6 +781,9 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: err = sock_get_timestamp(sk, (struct timeval __user *)arg); break; + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *)arg); + break; case SIOCADDRT: case SIOCDELRT: case SIOCRTMSG: @@ -1109,7 +1138,7 @@ static int inet_gso_send_check(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; @@ -1117,8 +1146,9 @@ static int inet_gso_send_check(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - skb->h.raw = __skb_pull(skb, ihl); - iph = skb->nh.iph; + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); err = -EPROTONOSUPPORT; @@ -1152,7 +1182,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; @@ -1160,8 +1190,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - skb->h.raw = __skb_pull(skb, ihl); - iph = skb->nh.iph; + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); id = ntohs(iph->id); proto = iph->protocol & (MAX_INET_PROTOS - 1); segs = ERR_PTR(-EPROTONOSUPPORT); @@ -1177,17 +1208,57 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) skb = segs; do { - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->id = htons(id++); iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; - iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); } while ((skb = skb->next)); out: return segs; } +unsigned long snmp_fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for_each_possible_cpu(i) { + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} +EXPORT_SYMBOL_GPL(snmp_fold_field); + +int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +{ + BUG_ON(ptr == NULL); + ptr[0] = __alloc_percpu(mibsize); + if (!ptr[0]) + goto err0; + ptr[1] = __alloc_percpu(mibsize); + if (!ptr[1]) + goto err1; + return 0; +err1: + free_percpu(ptr[0]); + ptr[0] = NULL; +err0: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(snmp_mib_init); + +void snmp_mib_free(void *ptr[2]) +{ + BUG_ON(ptr == NULL); + free_percpu(ptr[0]); + free_percpu(ptr[1]); + ptr[0] = ptr[1] = NULL; +} +EXPORT_SYMBOL_GPL(snmp_mib_free); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1214,28 +1285,47 @@ static struct net_protocol icmp_protocol = { static int __init init_ipv4_mibs(void) { - net_statistics[0] = alloc_percpu(struct linux_mib); - net_statistics[1] = alloc_percpu(struct linux_mib); - ip_statistics[0] = alloc_percpu(struct ipstats_mib); - ip_statistics[1] = alloc_percpu(struct ipstats_mib); - icmp_statistics[0] = alloc_percpu(struct icmp_mib); - icmp_statistics[1] = alloc_percpu(struct icmp_mib); - tcp_statistics[0] = alloc_percpu(struct tcp_mib); - tcp_statistics[1] = alloc_percpu(struct tcp_mib); - udp_statistics[0] = alloc_percpu(struct udp_mib); - udp_statistics[1] = alloc_percpu(struct udp_mib); - udplite_statistics[0] = alloc_percpu(struct udp_mib); - 
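The snmp_fold_field()/snmp_mib_init()/snmp_mib_free() helpers above factor out a layout every IPv4 MIB shares: two per-cpu counter blocks per statistic (conventionally one updated from process context and one from softirq context, so increments need no atomic operations), with a read being the sum of one field across both blocks and all CPUs. A flattened sketch of the fold, with plain arrays standing in for the per-cpu machinery (hypothetical helper, not from the patch):

/* mib0/mib1: one block of 'stride' counters per CPU */
static unsigned long fold_sketch(const unsigned long *mib0,
                                 const unsigned long *mib1,
                                 int ncpus, int stride, int offt)
{
        unsigned long res = 0;
        int cpu;

        for (cpu = 0; cpu < ncpus; cpu++) {
                res += mib0[cpu * stride + offt];
                res += mib1[cpu * stride + offt];
        }
        return res;
}

Readers tolerate the slight raciness of summing live counters; the payoff is that the hot increment paths never bounce a shared cacheline.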
udplite_statistics[1] = alloc_percpu(struct udp_mib); - if (! - (net_statistics[0] && net_statistics[1] && ip_statistics[0] - && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] - && udp_statistics[0] && udp_statistics[1] - && udplite_statistics[0] && udplite_statistics[1] ) ) - return -ENOMEM; - - (void) tcp_mib_init(); + if (snmp_mib_init((void **)net_statistics, + sizeof(struct linux_mib), + __alignof__(struct linux_mib)) < 0) + goto err_net_mib; + if (snmp_mib_init((void **)ip_statistics, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void **)icmp_statistics, + sizeof(struct icmp_mib), + __alignof__(struct icmp_mib)) < 0) + goto err_icmp_mib; + if (snmp_mib_init((void **)tcp_statistics, + sizeof(struct tcp_mib), + __alignof__(struct tcp_mib)) < 0) + goto err_tcp_mib; + if (snmp_mib_init((void **)udp_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + if (snmp_mib_init((void **)udplite_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udplite_mib; + + tcp_mib_init(); return 0; + +err_udplite_mib: + snmp_mib_free((void **)udp_statistics); +err_udp_mib: + snmp_mib_free((void **)tcp_statistics); +err_tcp_mib: + snmp_mib_free((void **)icmp_statistics); +err_icmp_mib: + snmp_mib_free((void **)ip_statistics); +err_ip_mib: + snmp_mib_free((void **)net_statistics); +err_net_mib: + return -ENOMEM; } static int ipv4_proc_init(void); @@ -1336,7 +1426,7 @@ static int __init inet_init(void) * Initialise per-cpu ipv4 mibs */ - if(init_ipv4_mibs()) + if (init_ipv4_mibs()) printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; ipv4_proc_init(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 7194eb40b6d..6da8ff597ad 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -65,7 +65,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) char buf[60]; } tmp_iph; - top_iph = skb->nh.iph; + top_iph = ip_hdr(skb); iph = &tmp_iph.iph; iph->tos = top_iph->tos; @@ -152,9 +152,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; ah = (struct ip_auth_hdr*)skb->data; - iph = skb->nh.iph; + iph = ip_hdr(skb); - ihl = skb->data - skb->nh.raw; + ihl = skb->data - skb_network_header(skb); memcpy(work_buf, iph, ihl); iph->ttl = 0; @@ -181,7 +181,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_buf, ihl); + skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + ihl); return 0; @@ -196,8 +198,8 @@ static void ah4_err(struct sk_buff *skb, u32 info) struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a3488a83f4..7110779a024 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -342,13 +342,13 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) - saddr = 
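The rewritten init_ipv4_mibs() above replaces the allocate-everything-then-test-everything pattern with the kernel's standard goto unwind ladder: each failure jumps to a label that releases exactly what was already allocated, in reverse order, leaving a single error path to audit. Reduced to its skeleton (alloc_a/free_a and friends are hypothetical names):

int init_sketch(void)
{
        if (alloc_a() < 0)
                goto err_a;
        if (alloc_b() < 0)
                goto err_b;
        if (alloc_c() < 0)
                goto err_c;
        return 0;

err_c:                  /* c failed: undo b, then a */
        free_b();
err_b:                  /* b failed: undo a */
        free_a();
err_a:                  /* a failed: nothing to undo */
        return -ENOMEM; /* <linux/errno.h>, or <errno.h> in user space */
}

Note the naming convention: each err_X label marks "allocation X failed", so the code below it frees everything that succeeded before X, exactly as the err_*_mib labels do above.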
skb->nh.iph->saddr; + saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; - saddr = skb->nh.iph->saddr; + saddr = ip_hdr(skb)->saddr; if (inet_addr_type(saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) @@ -578,7 +578,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, return NULL; skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); @@ -721,7 +721,7 @@ static int arp_process(struct sk_buff *skb) if (in_dev == NULL) goto out; - arp = skb->nh.arph; + arp = arp_hdr(skb); switch (dev_type) { default: @@ -937,7 +937,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, (2 * sizeof(u32))))) goto freeskb; - arp = skb->nh.arph; + arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || @@ -1178,7 +1178,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) goto out; } - switch(cmd) { + switch (cmd) { case SIOCDARP: err = arp_req_delete(&r, dev); break; @@ -1360,7 +1360,7 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ -static struct seq_operations arp_seq_ops = { +static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2ce5b693a8b..e1f18489db1 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0; int cipso_v4_rbm_strictvalid = 1; /* + * Protocol Constants + */ + +/* Maximum size of the CIPSO IP option, derived from the fact that the maximum + * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ +#define CIPSO_V4_OPT_LEN_MAX 40 + +/* Length of the base CIPSO option; this includes the option type (1 byte), the + * option length (1 byte), and the DOI (4 bytes). */ +#define CIPSO_V4_HDR_LEN 6 + +/* Base length of the restrictive category bitmap tag (tag #1). */ +#define CIPSO_V4_TAG_RBM_BLEN 4 + +/* Base length of the enumerated category tag (tag #2). */ +#define CIPSO_V4_TAG_ENUM_BLEN 4 + +/* Base length of the ranged categories bitmap tag (tag #5). */ +#define CIPSO_V4_TAG_RNG_BLEN 4 +/* The maximum number of category ranges permitted in the ranged category tag + * (tag #5). You may note that the IETF draft states that the maximum number + * of category ranges is 7, but if the low end of the last category range is + * zero then it is possible to fit 8 category ranges because the zero should + * be omitted. */ +#define CIPSO_V4_TAG_RNG_CAT_MAX 8 + +/* * Helper Functions */ @@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, unsigned char *net_cat, u32 net_cat_len) { - /* The constant '16' is not random, it is the maximum number of - * high/low category range pairs as permitted by the CIPSO draft based - * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion - * does a sanity check to make sure we don't overflow the array. 
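The constants introduced above encode fixed protocol arithmetic, and the replacement of the hard-coded array can be checked by hand (illustrative walk-through, not part of the patch):

/*
 * max IPv4 header     = 15 words * 4 = 60 bytes (ihl is a 4-bit word count)
 * base IPv4 header    = 20 bytes
 * option space        = 60 - 20 = 40     (CIPSO_V4_OPT_LEN_MAX)
 * tag #5 payload      = 40 - CIPSO_V4_HDR_LEN(6) - CIPSO_V4_TAG_RNG_BLEN(4)
 *                     = 30 bytes
 * full high/low pairs = 30 / 4 = 7, consuming 28 bytes
 * leftover 2 bytes    = one "high only" range whose zero low end is
 *                       omitted  =>  8 ranges (CIPSO_V4_TAG_RNG_CAT_MAX)
 *
 * Hence the u16 array needs 8 * 2 = 16 slots, matching the old
 * hard-coded array[16], and the new -ENOSPC length check replaces the
 * old BUG_ON(net_cat_len > 30).
 */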
*/ int iter = -1; - u16 array[16]; + u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; - BUG_ON(net_cat_len > 30); + /* make sure we don't overflow the 'array[]' variable */ + if (net_cat_len > + (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) + return -ENOSPC; for (;;) { iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); @@ -1174,7 +1200,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, u16 cat_low; u16 cat_high; - for(net_iter = 0; net_iter < net_cat_len; net_iter += 4) { + for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = ntohs(*((__be16 *)&net_cat[net_iter])); if ((net_iter + 4) <= net_cat_len) cat_low = ntohs(*((__be16 *)&net_cat[net_iter + 2])); @@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, * Protocol Handling Functions */ -#define CIPSO_V4_OPT_LEN_MAX 40 -#define CIPSO_V4_HDR_LEN 6 - /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition @@ -1676,7 +1699,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - if (skb->nh.iph->protocol == IPPROTO_ICMP || error != -EACCES) + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; if (gateway) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 98a00d0edc7..7f95e6e9bee 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -48,7 +48,6 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> @@ -62,7 +61,7 @@ #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> -#include <net/netlink.h> +#include <net/rtnetlink.h> struct ipv4_devconf ipv4_devconf = { .accept_redirects = 1, @@ -633,7 +632,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) dev_load(ifr.ifr_name); #endif - switch(cmd) { + switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ @@ -708,7 +707,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; - switch(cmd) { + switch (cmd) { case SIOCGIFADDR: /* Get interface address */ sin->sin_addr.s_addr = ifa->ifa_local; goto rarok; @@ -911,7 +910,7 @@ no_in_dev: */ read_lock(&dev_base_lock); rcu_read_lock(); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; @@ -990,7 +989,7 @@ __be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, read_lock(&dev_base_lock); rcu_read_lock(); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1183,34 +1182,29 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) int s_ip_idx, s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; - read_lock(&dev_base_lock); - for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < s_idx) - continue; + goto cont; if (idx > s_idx) s_ip_idx = 0; - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { - rcu_read_unlock(); - continue; - } + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + goto cont; for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = 
ifa->ifa_next, ip_idx++) { if (ip_idx < s_ip_idx) - continue; + goto cont; if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) { - rcu_read_unlock(); + RTM_NEWADDR, NLM_F_MULTI) <= 0) goto done; - } } - rcu_read_unlock(); +cont: + idx++; } done: - read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; @@ -1241,19 +1235,6 @@ errout: rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err); } -static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = { - [RTM_NEWADDR - RTM_BASE] = { .doit = inet_rtm_newaddr, }, - [RTM_DELADDR - RTM_BASE] = { .doit = inet_rtm_deladdr, }, - [RTM_GETADDR - RTM_BASE] = { .dumpit = inet_dump_ifaddr, }, - [RTM_NEWROUTE - RTM_BASE] = { .doit = inet_rtm_newroute, }, - [RTM_DELROUTE - RTM_BASE] = { .doit = inet_rtm_delroute, }, - [RTM_GETROUTE - RTM_BASE] = { .doit = inet_rtm_getroute, - .dumpit = inet_dump_fib, }, -#ifdef CONFIG_IP_MULTIPLE_TABLES - [RTM_GETRULE - RTM_BASE] = { .dumpit = fib4_rules_dump, }, -#endif -}; - #ifdef CONFIG_SYSCTL void inet_forward_change(void) @@ -1265,7 +1246,7 @@ void inet_forward_change(void) ipv4_devconf_dflt.forwarding = on; read_lock(&dev_base_lock); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1636,7 +1617,10 @@ void __init devinet_init(void) { register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); - rtnetlink_links[PF_INET] = inet_rtnetlink_table; + + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); #ifdef CONFIG_SYSCTL devinet_sysctl.sysctl_header = register_sysctl_table(devinet_sysctl.devinet_root_dir); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 31041127eeb..47c95e8ef04 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -21,13 +21,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct blkcipher_desc desc; struct esp_data *esp; struct sk_buff *trailer; + u8 *tail; int blksize; int clen; int alen; int nfrags; /* Strip IP+ESP header. */ - __skb_pull(skb, skb->h.raw - skb->data); + __skb_pull(skb, skb_transport_offset(skb)); /* Now skb is pure payload to encrypt */ err = -ENOMEM; @@ -49,19 +50,21 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) goto error; /* Fill padding... */ + tail = skb_tail_pointer(trailer); do { int i; for (i=0; i<clen-skb->len - 2; i++) - *(u8*)(trailer->tail + i) = i+1; + tail[i] = i + 1; } while (0); - *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + tail[clen - skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); - __skb_push(skb, skb->data - skb->nh.raw); - top_iph = skb->nh.iph; - esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4); + __skb_push(skb, skb->data - skb_network_header(skb)); + top_iph = ip_hdr(skb); + esph = (struct ip_esp_hdr *)(skb_network_header(skb) + + top_iph->ihl * 4); top_iph->tot_len = htons(skb->len + alen); - *(u8*)(trailer->tail - 1) = top_iph->protocol; + *(skb_tail_pointer(trailer) - 1) = top_iph->protocol; /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -217,12 +220,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. 
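The devinet.c hunks above (and similar ones in igmp.c further down) convert every open-coded walk of the global device list to the new for_each_netdev()/for_each_netdev_safe() iterators. Semantically they match the loops they replace; roughly (sketch only, written against the classic singly linked dev_base, whereas the real macros in linux/netdevice.h abstract the list representation precisely so it can change):

#define for_each_netdev_sketch(d) \
        for ((d) = dev_base; (d); (d) = (d)->next)

/* _safe variant caches the successor so the body may unregister d */
#define for_each_netdev_safe_sketch(d, n) \
        for ((d) = dev_base; (d) && ((n) = (d)->next, 1); (d) = (n))

The address-dump loop also drops its per-device rcu_read_lock() pairs in favour of __in_dev_get_rtnl(), i.e. it now assumes rtnl-level protection rather than RCU.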
:-) */ - iph = skb->nh.iph; + iph = ip_hdr(skb); ihl = iph->ihl * 4; if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh = (void *)(skb->nh.raw + ihl); + struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -260,7 +263,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; + __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen); + skb_set_transport_header(skb, -ihl); return 0; @@ -268,32 +272,33 @@ out: return -EINVAL; } -static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) +static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) { struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); - int enclen = 0; + u32 align = max_t(u32, blksize, esp->conf.padlen); + u32 rem; + + mtu -= x->props.header_len + esp->auth.icv_trunc_len; + rem = mtu & (align - 1); + mtu &= ~(align - 1); switch (x->props.mode) { case XFRM_MODE_TUNNEL: - mtu = ALIGN(mtu +2, blksize); break; default: case XFRM_MODE_TRANSPORT: /* The worst case */ - mtu = ALIGN(mtu + 2, 4) + blksize - 4; + mtu -= blksize - 4; + mtu += min_t(u32, blksize - 4, rem); break; case XFRM_MODE_BEET: /* The worst case. */ - enclen = IPV4_BEET_PHMAXLEN; - mtu = ALIGN(mtu + enclen + 2, blksize); + mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem); break; } - if (esp->conf.padlen) - mtu = ALIGN(mtu, esp->conf.padlen); - - return mtu + x->props.header_len + esp->auth.icv_trunc_len - enclen; + return mtu - 2; } static void esp4_err(struct sk_buff *skb, u32 info) @@ -302,8 +307,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); @@ -336,6 +341,7 @@ static int esp_init_state(struct xfrm_state *x) { struct esp_data *esp = NULL; struct crypto_blkcipher *tfm; + u32 align; /* null auth and encryption can have zero length keys */ if (x->aalg) { @@ -402,6 +408,8 @@ static int esp_init_state(struct xfrm_state *x) x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); + else if (x->props.mode == XFRM_MODE_BEET) + x->props.header_len += IPV4_BEET_PHMAXLEN; if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; @@ -417,7 +425,10 @@ static int esp_init_state(struct xfrm_state *x) } } x->data = esp; - x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len; + align = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); + if (esp->conf.padlen) + align = max_t(u32, align, esp->conf.padlen); + x->props.trailer_len = align + 1 + esp->auth.icv_trunc_len; return 0; error: @@ -434,7 +445,7 @@ static struct xfrm_type esp_type = .proto = IPPROTO_ESP, .init_state = esp_init_state, .destructor = esp_destroy, - .get_max_size = esp4_get_max_size, + .get_mtu = esp4_get_mtu, .input = esp_input, .output = esp_output }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cac06c43f00..837f2957fa8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -34,7 +34,6 @@ #include <linux/if_addr.h> 
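The esp4_get_mtu() rewrite above answers "how much plaintext fits in a link MTU" directly, instead of the old inverse get_max_size() question. Working the tunnel-mode arithmetic with concrete, illustrative numbers (AES-CBC with a 16-byte block and IV, HMAC-SHA1-96 for authentication):

/*
 * header_len = 8 (ESP) + 16 (IV) + 20 (outer IP) = 44
 * icv_trunc_len = 12; blksize = 16, padlen = 0  =>  align = 16
 *
 * mtu  = 1500 - 44 - 12 = 1444
 * rem  = 1444 & 15      = 4
 * mtu &= ~15            = 1440
 * return 1440 - 2       = 1438
 *
 * Check: 1438 payload + pad-length byte + next-header byte = 1440,
 * an exact cipher-block multiple, and 1440 + 44 + 12 = 1496 <= 1500.
 */

The final "- 2" is the mandatory two-byte ESP trailer; the transport and BEET branches additionally credit back the part of the remainder (rem) that their smaller encapsulation can still use.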
#include <linux/if_arp.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> @@ -46,6 +45,7 @@ #include <net/icmp.h> #include <net/arp.h> #include <net/ip_fib.h> +#include <net/rtnetlink.h> #define FFprint(a...) printk(KERN_DEBUG a) @@ -540,7 +540,7 @@ errout: return err; } -int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_config cfg; struct fib_table *tb; @@ -561,7 +561,7 @@ errout: return err; } -int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib_config cfg; struct fib_table *tb; @@ -582,7 +582,7 @@ errout: return err; } -int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int h, s_h; unsigned int e = 0, s_e; @@ -777,6 +777,10 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) .tos = frn->fl_tos, .scope = frn->fl_scope } } }; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + frn->err = -ENOENT; if (tb) { local_bh_disable(); @@ -807,7 +811,7 @@ static void nl_fib_input(struct sock *sk, int len) if (skb == NULL) return; - nlh = (struct nlmsghdr *)skb->data; + nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { kfree_skb(skb); @@ -827,7 +831,8 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); + netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL, + THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) @@ -925,6 +930,10 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); nl_fib_lookup_init(); + + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); } EXPORT_SYMBOL(inet_addr_type); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index a4949f957ab..9cfecf1215c 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -1027,7 +1027,7 @@ out: return 0; } -static struct seq_operations fib_seq_ops = { +static const struct seq_operations fib_seq_ops = { .start = fib_seq_start, .next = fib_seq_next, .stop = fib_seq_stop, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index c660c074c76..33083ad52e9 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -274,11 +274,6 @@ nla_put_failure: return -ENOBUFS; } -int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return fib_rules_dump(skb, cb, AF_INET); -} - static u32 fib4_rule_default_pref(void) { struct list_head *pos; @@ -303,6 +298,11 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(4); /* flow */ } +static void fib4_rule_flush_cache(void) +{ + rt_cache_flush(-1); +} + static struct fib_rules_ops fib4_rules_ops = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), @@ -314,6 +314,7 @@ static struct fib_rules_ops fib4_rules_ops = { .fill = fib4_rule_fill, .default_pref = fib4_rule_default_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, + .flush_cache = fib4_rule_flush_cache, .nlgroup = 
RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, .rules_list = &fib4_rules, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3dad12ee76c..406ea7050ae 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -927,7 +927,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, default: printk(KERN_DEBUG "impossible 102\n"); return -EINVAL; - }; + } } return err; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 214c34732e8..9be7da7c3a8 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -50,7 +50,7 @@ * Patrick McHardy <kaber@trash.net> */ -#define VERSION "0.407" +#define VERSION "0.408" #include <asm/uaccess.h> #include <asm/system.h> @@ -292,8 +292,8 @@ static inline void check_tnode(const struct tnode *tn) static int halve_threshold = 25; static int inflate_threshold = 50; -static int halve_threshold_root = 15; -static int inflate_threshold_root = 25; +static int halve_threshold_root = 8; +static int inflate_threshold_root = 15; static void __alias_free_mem(struct rcu_head *head) @@ -350,11 +350,10 @@ static void __tnode_free_rcu(struct rcu_head *head) static inline void tnode_free(struct tnode *tn) { - if(IS_LEAF(tn)) { + if (IS_LEAF(tn)) { struct leaf *l = (struct leaf *) tn; call_rcu_bh(&l->rcu, __leaf_free_rcu); - } - else + } else call_rcu(&tn->rcu, __tnode_free_rcu); } @@ -459,6 +458,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; + int max_resize; if (!tn) return NULL; @@ -553,13 +553,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if(!tn->parent) + if (!tn->parent) inflate_threshold_use = inflate_threshold_root; else inflate_threshold_use = inflate_threshold; err = 0; - while ((tn->full_children > 0 && + max_resize = 10; + while ((tn->full_children > 0 && max_resize-- && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold_use * tnode_child_length(tn))) { @@ -574,6 +575,15 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } + if (max_resize < 0) { + if (!tn->parent) + printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", + inflate_threshold_root, tn->bits); + else + printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", + inflate_threshold, tn->bits); + } + check_tnode(tn); /* @@ -584,13 +594,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if(!tn->parent) + if (!tn->parent) halve_threshold_use = halve_threshold_root; else halve_threshold_use = halve_threshold; err = 0; - while (tn->bits > 1 && + max_resize = 10; + while (tn->bits > 1 && max_resize-- && 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold_use * tnode_child_length(tn)) { @@ -605,6 +616,14 @@ static struct node *resize(struct trie *t, struct tnode *tn) } } + if (max_resize < 0) { + if (!tn->parent) + printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", + halve_threshold_root, tn->bits); + else + printk(KERN_WARNING "Fix halve_threshold. 
Now=%d size=%d bits\n", + halve_threshold, tn->bits); + } /* Only one child remains */ if (tn->empty_children == tnode_child_length(tn) - 1) @@ -2039,12 +2058,12 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, { struct node *n ; - if(!t) + if (!t) return NULL; n = rcu_dereference(t->trie); - if(!iter) + if (!iter) return NULL; if (n) { @@ -2084,7 +2103,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) int i; s->tnodes++; - if(tn->bits < MAX_STAT_DEPTH) + if (tn->bits < MAX_STAT_DEPTH) s->nodesizes[tn->bits]++; for (i = 0; i < (1<<tn->bits); i++) @@ -2250,7 +2269,7 @@ static inline const char *rtn_scope(enum rt_scope_t s) { static char buf[32]; - switch(s) { + switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; @@ -2340,7 +2359,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations fib_trie_seq_ops = { +static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, @@ -2461,7 +2480,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations fib_route_seq_ops = { +static const struct seq_operations fib_route_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4b7a0d946a0..d38cbba92a4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -355,7 +355,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, ipc, rt, MSG_DONTWAIT) < 0) ip_flush_pending_frames(icmp_socket->sk); else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { - struct icmphdr *icmph = skb->h.icmph; + struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; @@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; icmp_out_count(icmp_param->data.icmph.type); - inet->tos = skb->nh.iph->tos; + inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; if (icmp_param->replyopts.optlen) { @@ -404,7 +404,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) @@ -448,9 +448,10 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Check this, icmp_send is called from the most obscure devices * sometimes. 
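Back in the fib_trie resize() changes above, the new max_resize cap bounds both balancing loops at ten passes and warns when the cap is hit, since a mistuned threshold could otherwise keep a node inflating or halving indefinitely. The inflate test itself is easy to verify with concrete numbers (illustrative):

/*
 * non-root tnode, bits = 4  =>  tnode_child_length() = 16
 * full_children = 4, empty_children = 2:
 *   50 * (4 + 16 - 2) = 900 >= inflate_threshold(50) * 16 = 800
 *     => inflate
 *
 * root tnode, full = 1, empty = 10, with the new root threshold 15:
 *   50 * (1 + 16 - 10) = 350 >= inflate_threshold_root(15) * 16 = 240
 *     => inflate
 *
 * Lowering the root thresholds (25 -> 15 for inflate, 15 -> 8 for
 * halve) therefore keeps the root node wide even when sparsely
 * populated.
 */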
*/ - iph = skb_in->nh.iph; + iph = ip_hdr(skb_in); - if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail) + if ((u8 *)iph < skb_in->head || + (skb_in->network_header + sizeof(*iph)) > skb_in->tail) goto out; /* @@ -484,7 +485,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) u8 _inner_type, *itp; itp = skb_header_pointer(skb_in, - skb_in->nh.raw + + skb_network_header(skb_in) + (iph->ihl << 2) + offsetof(struct icmphdr, type) - @@ -536,7 +537,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.data.icmph.un.gateway = info; icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; - icmp_param.offset = skb_in->nh.raw - skb_in->data; + icmp_param.offset = skb_network_offset(skb_in); icmp_out_count(icmp_param.data.icmph.type); inet_sk(icmp_socket->sk)->tos = tos; ipc.addr = iph->saddr; @@ -613,7 +614,7 @@ static void icmp_unreach(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out_err; - icmph = skb->h.icmph; + icmph = icmp_hdr(skb); iph = (struct iphdr *)skb->data; if (iph->ihl < 5) /* Mangled header, drop. */ @@ -676,7 +677,7 @@ static void icmp_unreach(struct sk_buff *skb) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " "type %u, code %u " "error to a broadcast: %u.%u.%u.%u on %s\n", - NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(ip_hdr(skb)->saddr), icmph->type, icmph->code, NIPQUAD(iph->daddr), skb->dev->name); @@ -743,7 +744,7 @@ static void icmp_redirect(struct sk_buff *skb) iph = (struct iphdr *)skb->data; - switch (skb->h.icmph->code & 7) { + switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: /* @@ -751,8 +752,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, - skb->h.icmph->un.gateway, + ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, + icmp_hdr(skb)->un.gateway, iph->saddr, skb->dev); break; } @@ -780,7 +781,7 @@ static void icmp_echo(struct sk_buff *skb) if (!sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; - icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_ECHOREPLY; icmp_param.skb = skb; icmp_param.offset = 0; @@ -816,7 +817,7 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); - icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; icmp_param.skb = skb; @@ -943,7 +944,7 @@ int icmp_rcv(struct sk_buff *skb) if (!pskb_pull(skb, sizeof(struct icmphdr))) goto error; - icmph = skb->h.icmph; + icmph = icmp_hdr(skb); /* * 18 is the highest 'known' ICMP type. 
Anything else is a mystery diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8cedb2a2c9d..f4dd4745310 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -314,7 +314,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + skb_reset_network_header(skb); + pip = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; @@ -331,8 +333,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) ((u8*)&pip[1])[2] = 0; ((u8*)&pip[1])[3] = 0; - pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig)); - skb->h.igmph = (struct igmphdr *)pig; + skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; + skb_put(skb, sizeof(*pig)); + pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; @@ -343,16 +346,14 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { - struct iphdr *pip = skb->nh.iph; - struct igmphdr *pig = skb->h.igmph; - int iplen, igmplen; + struct iphdr *pip = ip_hdr(skb); + struct igmphdr *pig = igmp_hdr(skb); + const int iplen = skb->tail - skb->network_header; + const int igmplen = skb->tail - skb->transport_header; - iplen = skb->tail - (unsigned char *)skb->nh.iph; pip->tot_len = htons(iplen); ip_send_check(pip); - - igmplen = skb->tail - (unsigned char *)skb->h.igmph; - pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); + pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, dst_output); @@ -379,7 +380,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; - pih = (struct igmpv3_report *)skb->h.igmph; + pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; @@ -412,7 +413,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (!*psf_list) goto empty_source; - pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL; + pih = skb ? 
igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { @@ -664,7 +665,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; @@ -827,8 +830,8 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group) static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { - struct igmphdr *ih = skb->h.igmph; - struct igmpv3_query *ih3 = (struct igmpv3_query *)ih; + struct igmphdr *ih = igmp_hdr(skb); + struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; @@ -861,12 +864,12 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; - ih3 = (struct igmpv3_query *) skb->h.raw; + ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return; - ih3 = (struct igmpv3_query *) skb->h.raw; + ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); @@ -943,7 +946,7 @@ int igmp_rcv(struct sk_buff *skb) goto drop; } - ih = skb->h.igmph; + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: igmp_heard_query(in_dev, skb, len); @@ -2285,9 +2288,8 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); - for (state->dev = dev_base, state->in_dev = NULL; - state->dev; - state->dev = state->dev->next) { + state->in_dev = NULL; + for_each_netdev(state->dev) { struct in_device *in_dev; in_dev = in_dev_get(state->dev); if (!in_dev) @@ -2313,7 +2315,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li read_unlock(&state->in_dev->mc_list_lock); in_dev_put(state->in_dev); } - state->dev = state->dev->next; + state->dev = next_net_device(state->dev); if (!state->dev) { state->in_dev = NULL; break; @@ -2397,7 +2399,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations igmp_mc_seq_ops = { +static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, @@ -2447,9 +2449,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); - for (state->dev = dev_base, state->idev = NULL, state->im = NULL; - state->dev; - state->dev = state->dev->next) { + state->idev = NULL; + state->im = NULL; + for_each_netdev(state->dev) { struct in_device *idev; idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) @@ -2485,7 +2487,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l read_unlock(&state->idev->mc_list_lock); in_dev_put(state->idev); } - state->dev = state->dev->next; + state->dev = next_net_device(state->dev); if (!state->dev) { state->idev = NULL; goto out; @@ -2571,7 +2573,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations igmp_mcf_seq_ops = { +static const struct seq_operations igmp_mcf_seq_ops = { .start = 
igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5df71cd08da..dbeacd8b0f9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -27,6 +27,7 @@ #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/inet6_hashtables.h> +#include <net/netlink.h> #include <linux/inet.h> #include <linux/stddef.h> @@ -60,7 +61,7 @@ static int inet_csk_diag_fill(struct sock *sk, struct nlmsghdr *nlh; void *info = NULL; struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); const struct inet_diag_handler *handler; handler = inet_diag_table[unlh->nlmsg_type]; @@ -147,12 +148,12 @@ static int inet_csk_diag_fill(struct sock *sk, icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) icsk->icsk_ca_ops->get_info(sk, ext, skb); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -EMSGSIZE; } @@ -163,7 +164,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, { long tmo; struct inet_diag_msg *r; - const unsigned char *previous_tail = skb->tail; + const unsigned char *previous_tail = skb_tail_pointer(skb); struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); @@ -205,10 +206,10 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, &tw6->tw_v6_daddr); } #endif - nlh->nlmsg_len = skb->tail - previous_tail; + nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; return skb->len; nlmsg_failure: - skb_trim(skb, previous_tail - skb->data); + nlmsg_trim(skb, previous_tail); return -EMSGSIZE; } @@ -535,7 +536,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -574,12 +575,12 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, &inet6_rsk(req)->rmt_addr); } #endif - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -805,68 +806,43 @@ done: return skb->len; } -static inline int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; + int hdrlen = sizeof(struct inet_diag_req); - if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) - goto err_inval; + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || + nlmsg_len(nlh) < hdrlen) + return -EINVAL; if (inet_diag_table[nlh->nlmsg_type] == NULL) return -ENOENT; - if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) - goto err_inval; - - if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > - (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { - struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + - sizeof(struct inet_diag_req)); - if (rta->rta_type != INET_DIAG_REQ_BYTECODE || - rta->rta_len < 8 || - rta->rta_len > - (nlh->nlmsg_len - - NLMSG_SPACE(sizeof(struct inet_diag_req)))) - goto err_inval; - if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) - goto err_inval; + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (nlmsg_attrlen(nlh, hdrlen)) { + struct nlattr *attr; + + attr = nlmsg_find_attr(nlh, 
hdrlen, + INET_DIAG_REQ_BYTECODE); + if (attr == NULL || + nla_len(attr) < sizeof(struct inet_diag_bc_op) || + inet_diag_bc_audit(nla_data(attr), nla_len(attr))) + return -EINVAL; } + return netlink_dump_start(idiagnl, skb, nlh, inet_diag_dump, NULL); - } else - return inet_diag_get_exact(skb, nlh); - -err_inval: - return -EINVAL; -} - - -static inline void inet_diag_rcv_skb(struct sk_buff *skb) -{ - if (skb->len >= NLMSG_SPACE(0)) { - int err; - struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; - - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return; - err = inet_diag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, err); } + + return inet_diag_get_exact(skb, nlh); } static void inet_diag_rcv(struct sock *sk, int len) { - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - inet_diag_rcv_skb(skb); - kfree_skb(skb); - } + do { + netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg); + } while (qlen); } static DEFINE_SPINLOCK(inet_diag_register_lock); @@ -917,7 +893,7 @@ static int __init inet_diag_init(void) goto out; idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, - THIS_MODULE); + NULL, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; err = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index db3ef96bdfd..2f44e612806 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -87,10 +87,12 @@ static DEFINE_RWLOCK(peer_pool_lock); static int peer_total; /* Exported for sysctl_net_ipv4. */ -int inet_peer_threshold = 65536 + 128; /* start to throw entries more +int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more * aggressively at this stage */ -int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ -int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ +int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ +int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +int inet_peer_gc_mintime __read_mostly = 10 * HZ; +int inet_peer_gc_maxtime __read_mostly = 120 * HZ; static struct inet_peer *inet_peer_unused_head; static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; @@ -99,9 +101,6 @@ static DEFINE_SPINLOCK(inet_peer_unused_lock); static void peer_check_expire(unsigned long dummy); static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); -/* Exported for sysctl_net_ipv4. */ -int inet_peer_gc_mintime = 10 * HZ, - inet_peer_gc_maxtime = 120 * HZ; /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -151,20 +150,27 @@ static void unlink_from_unused(struct inet_peer *p) spin_unlock_bh(&inet_peer_unused_lock); } -/* Called with local BH disabled and the pool lock held. */ -#define lookup(daddr) \ +/* + * Called with local BH disabled and the pool lock held. + * _stack is known to be NULL or not at compile time, + * so the compiler will optimize away the if (_stack) tests. 
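Two things happen in the inet_diag receive path above: message validation moves to the nlmsg_*/nla_* helpers, and the hand-rolled dequeue loop becomes netlink_run_queue(), which (as used throughout this series) dispatches queued skbs to the given handler in quota-sized batches, updating qlen with what remains, so the caller simply loops until it reports zero. The resulting control flow of inet_diag_rcv_msg(), paraphrased:

/*
 * type out of range, or payload shorter than inet_diag_req  -> -EINVAL
 * no handler registered for the type                        -> -ENOENT
 * NLM_F_DUMP set:
 *         any attribute present must be a well-formed
 *         INET_DIAG_REQ_BYTECODE that passes inet_diag_bc_audit(),
 *         else -EINVAL; then hand off to netlink_dump_start()
 * otherwise: answer the one-shot query via inet_diag_get_exact()
 */

Error replies and NLM_F_ACK handling, which the deleted inet_diag_rcv_skb() did by hand, are now left to the generic queue runner.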
+ */ +#define lookup(_daddr,_stack) \ ({ \ struct inet_peer *u, **v; \ - stackptr = stack; \ - *stackptr++ = &peer_root; \ + if (_stack) { \ + stackptr = _stack; \ + *stackptr++ = &peer_root; \ + } \ for (u = peer_root; u != peer_avl_empty; ) { \ - if (daddr == u->v4daddr) \ + if (_daddr == u->v4daddr) \ break; \ - if ((__force __u32)daddr < (__force __u32)u->v4daddr) \ + if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ - *stackptr++ = v; \ + if (_stack) \ + *stackptr++ = v; \ u = *v; \ } \ u; \ @@ -288,7 +294,7 @@ static void unlink_from_pool(struct inet_peer *p) if (atomic_read(&p->refcnt) == 1) { struct inet_peer **stack[PEER_MAXDEPTH]; struct inet_peer ***stackptr, ***delp; - if (lookup(p->v4daddr) != p) + if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ if (p->avl_left == peer_avl_empty) { @@ -373,7 +379,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) /* Look up for the address quickly. */ read_lock_bh(&peer_pool_lock); - p = lookup(daddr); + p = lookup(daddr, NULL); if (p != peer_avl_empty) atomic_inc(&p->refcnt); read_unlock_bh(&peer_pool_lock); @@ -400,7 +406,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) write_lock_bh(&peer_pool_lock); /* Check if an entry has suddenly appeared. */ - p = lookup(daddr); + p = lookup(daddr, stack); if (p != peer_avl_empty) goto out_free; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 369e721c4ba..9cb04df0054 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,14 +67,14 @@ int ip_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; - skb->ip_summed = CHECKSUM_NONE; + skb_forward_csum(skb); /* * According to the RFC, we must first decrease the TTL field. If * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ - if (skb->nh.iph->ttl <= 1) + if (ip_hdr(skb)->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) @@ -85,10 +85,18 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; + if (unlikely(skb->len > dst_mtu(&rt->u.dst) && + (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_mtu(&rt->u.dst))); + goto drop; + } + /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; - iph = skb->nh.iph; + iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b6f05538037..0231bdcb2ab 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -92,7 +92,7 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct timeval stamp; + ktime_t stamp; int iif; unsigned int rid; struct inet_peer *peer; @@ -184,7 +184,7 @@ static __inline__ struct ipq *frag_alloc_queue(void) { struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); - if(!qp) + if (!qp) return NULL; atomic_add(sizeof(struct ipq), &ip_frag_mem); return qp; @@ -321,11 +321,11 @@ static struct ipq *ip_frag_intern(struct ipq *qp_in) * promoted read lock to write lock. 
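The new check in ip_forward() above is the router half of path MTU discovery: a packet too big for the egress route, with DF set and without the local_df override, is counted as a fragmentation failure, dropped, and answered with ICMP "fragmentation needed". Per RFC 1191 the next-hop MTU travels in the otherwise-unused second word of the ICMP header, which is why htonl(dst_mtu(&rt->u.dst)) is passed as icmp_send()'s info argument. What the original sender sees, sketched:

/*
 * type = ICMP_DEST_UNREACH (3), code = ICMP_FRAG_NEEDED (4)
 * un.frag.mtu = MTU of the next hop
 *
 * The sending host lowers its cached path MTU for the destination and
 * retransmits in smaller packets, still with DF set.
 */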
*/ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { - if(qp->id == qp_in->id && - qp->saddr == qp_in->saddr && - qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol && - qp->user == qp_in->user) { + if (qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { atomic_inc(&qp->refcnt); write_unlock(&ipfrag_lock); qp_in->last_in |= COMPLETE; @@ -398,11 +398,11 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) read_lock(&ipfrag_lock); hash = ipqhashfn(id, saddr, daddr, protocol); hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { - if(qp->id == id && - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && - qp->user == user) { + if (qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) { atomic_inc(&qp->refcnt); read_unlock(&ipfrag_lock); return qp; @@ -479,11 +479,11 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; } - offset = ntohs(skb->nh.iph->frag_off); + offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ - ihl = skb->nh.iph->ihl * 4; + ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - ihl; @@ -524,7 +524,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = NULL; - for(next = qp->fragments; next != NULL; next = next->next) { + for (next = qp->fragments; next != NULL; next = next->next) { if (FRAG_CB(next)->offset >= offset) break; /* bingo! */ prev = next; @@ -592,7 +592,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - skb_get_timestamp(skb, &qp->stamp); + qp->stamp = skb->tstamp; qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -624,10 +624,10 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) BUG_TRAP(FRAG_CB(head)->offset == 0); /* Allocate a new buffer for the datagram. */ - ihlen = head->nh.iph->ihl*4; + ihlen = ip_hdrlen(head); len = ihlen + qp->len; - if(len > 65535) + if (len > 65535) goto out_oversize; /* Head of list must not be cloned. */ @@ -658,7 +658,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) } skb_shinfo(head)->frag_list = head->next; - skb_push(head, head->data - head->nh.raw); + skb_push(head, head->data - skb_network_header(head)); atomic_sub(head->truesize, &ip_frag_mem); for (fp=head->next; fp; fp = fp->next) { @@ -674,9 +674,9 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - skb_set_timestamp(head, &qp->stamp); + head->tstamp = qp->stamp; - iph = head->nh.iph; + iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); @@ -700,7 +700,6 @@ out_fail: /* Process an incoming IP datagram fragment. 
*/ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) { - struct iphdr *iph = skb->nh.iph; struct ipq *qp; struct net_device *dev; @@ -713,7 +712,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) dev = skb->dev; /* Lookup (or create) queue header */ - if ((qp = ip_find(iph, user)) != NULL) { + if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { struct sk_buff *ret = NULL; spin_lock(&qp->lock); @@ -734,7 +733,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) return NULL; } -void ipfrag_init(void) +void __init ipfrag_init(void) { ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9151da64231..63282934725 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -191,11 +191,11 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3 return NULL; } -static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) { - __be32 remote = t->parms.iph.daddr; - __be32 local = t->parms.iph.saddr; - __be32 key = t->parms.i_key; + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; unsigned h = HASH(key); int prio = 0; @@ -209,6 +209,11 @@ static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) return &tunnels[prio][h]; } +static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +{ + return __ipgre_bucket(&t->parms); +} + static void ipgre_tunnel_link(struct ip_tunnel *t) { struct ip_tunnel **tp = ipgre_bucket(t); @@ -240,17 +245,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int __be32 key = parms->i_key; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; - unsigned h = HASH(key); - int prio = 0; char name[IFNAMSIZ]; - if (local) - prio |= 1; - if (remote && !MULTICAST(remote)) { - prio |= 2; - h ^= HASH(remote); - } - for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { if (key == t->parms.i_key) return t; @@ -320,8 +317,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr*)skb->data; __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); int grehlen = (iph->ihl<<2) + 4; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; __be16 flags; @@ -388,8 +385,8 @@ out: struct iphdr *iph = (struct iphdr*)dp; struct iphdr *eiph; __be16 *p = (__be16*)(dp+(iph->ihl<<2)); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int rel_type = 0; int rel_code = 0; __be32 rel_info = 0; @@ -422,7 +419,7 @@ out: default: return; case ICMP_PARAMETERPROB: - n = ntohl(skb->h.icmph->un.gateway) >> 24; + n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; if (n < (iph->ihl<<2)) return; @@ -442,7 +439,7 @@ out: return; case ICMP_FRAG_NEEDED: /* And it is the only really necessary thing :-) */ - n = ntohs(skb->h.icmph->un.frag.mtu); + n = ntohs(icmp_hdr(skb)->un.frag.mtu); if (n < grehlen+68) return; n -= grehlen; @@ -474,7 +471,7 @@ out: dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)eiph); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ memset(&fl, 0, 
sizeof(fl)); @@ -533,9 +530,9 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) { if (skb->protocol == htons(ETH_P_IP)) { - IP_ECN_set_ce(skb->nh.iph); + IP_ECN_set_ce(ip_hdr(skb)); } else if (skb->protocol == htons(ETH_P_IPV6)) { - IP6_ECN_set_ce(skb->nh.ipv6h); + IP6_ECN_set_ce(ipv6_hdr(skb)); } } } @@ -565,7 +562,7 @@ static int ipgre_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, 16)) goto drop_nolock; - iph = skb->nh.iph; + iph = ip_hdr(skb); h = skb->data; flags = *(__be16*)h; @@ -616,9 +613,10 @@ static int ipgre_rcv(struct sk_buff *skb) offset += 4; } - skb->mac.raw = skb->nh.raw; - skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->h.raw, offset); + skb_reset_mac_header(skb); + __pskb_pull(skb, offset); + skb_reset_network_header(skb); + skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST if (MULTICAST(iph->daddr)) { @@ -669,7 +667,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; __be16 df; @@ -720,7 +718,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { - addr6 = &skb->nh.ipv6h->daddr; + addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } @@ -824,11 +822,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = skb->nh.raw; - skb->nh.raw = skb_push(skb, gre_hlen); + skb->transport_header = skb->network_header; + skb_push(skb, gre_hlen); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); @@ -839,7 +838,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) * Push down and install the IPIP header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = df; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f38e97647ac..97069399d86 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -158,7 +158,7 @@ DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; int ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; - u8 protocol = skb->nh.iph->protocol; + u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; read_lock(&ip_ra_lock); @@ -171,7 +171,7 @@ int ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { - if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN); if (skb == NULL) { read_unlock(&ip_ra_lock); @@ -198,17 +198,15 @@ int ip_call_ra_chain(struct sk_buff *skb) static inline int ip_local_deliver_finish(struct sk_buff *skb) { - int ihl = skb->nh.iph->ihl*4; - - __skb_pull(skb, ihl); + __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. 
*/ - skb->h.raw = skb->data; + skb_reset_transport_header(skb); rcu_read_lock(); { /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ - int protocol = skb->nh.iph->protocol; + int protocol = ip_hdr(skb)->protocol; int hash; struct sock *raw_sk; struct net_protocol *ipprot; @@ -220,7 +218,7 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { @@ -266,7 +264,7 @@ int ip_local_deliver(struct sk_buff *skb) * Reassemble IP fragments. */ - if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER); if (!skb) return 0; @@ -294,7 +292,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) goto drop; } - iph = skb->nh.iph; + iph = ip_hdr(skb); if (ip_options_compile(NULL, skb)) { IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); @@ -330,7 +328,8 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; /* * Initialise the virtual path cache for the packet. It describes @@ -342,6 +341,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -360,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; + rt = (struct rtable*)skb->dst; + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + return dst_input(skb); drop: @@ -391,7 +398,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); /* * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. @@ -410,13 +417,16 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl*4)) + if (skb->len < len) { + IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. 
Now we know it diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f906a80d5a8..251346828cb 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -40,7 +40,7 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, __be32 daddr, struct rtable *rt, int is_frag) { - unsigned char * iph = skb->nh.raw; + unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); @@ -104,13 +104,13 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) return 0; } - sptr = skb->nh.raw; + sptr = skb_network_header(skb); dptr = dopt->__data; if (skb->dst) daddr = ((struct rtable*)skb->dst)->rt_spec_dst; else - daddr = skb->nh.iph->daddr; + daddr = ip_hdr(skb)->daddr; if (sopt->rr) { optlen = sptr[sopt->rr+1]; @@ -180,7 +180,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) /* * RFC1812 requires to fix illegal source routes. */ - if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) + if (memcmp(&ip_hdr(skb)->saddr, + &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { @@ -217,7 +218,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) void ip_options_fragment(struct sk_buff * skb) { - unsigned char * optptr = skb->nh.raw + sizeof(struct iphdr); + unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options * opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; @@ -264,12 +265,13 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) if (!opt) { opt = &(IPCB(skb)->opt); - iph = skb->nh.raw; + iph = skb_network_header(skb); opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); optptr = iph + sizeof(struct iphdr); opt->is_data = 0; } else { - optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]); + optptr = opt->is_data ? opt->__data : + (unsigned char *)&(ip_hdr(skb)[1]); iph = optptr - sizeof(struct iphdr); } @@ -563,7 +565,7 @@ void ip_forward_options(struct sk_buff *skb) struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; struct rtable *rt = (struct rtable*)skb->dst; - unsigned char *raw = skb->nh.raw; + unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; @@ -587,7 +589,7 @@ void ip_forward_options(struct sk_buff *skb) if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], rt); - skb->nh.iph->daddr = rt->rt_dst; + ip_hdr(skb)->daddr = rt->rt_dst; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); @@ -599,7 +601,7 @@ void ip_forward_options(struct sk_buff *skb) } if (opt->is_changed) { opt->is_changed = 0; - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); } } @@ -608,8 +610,8 @@ int ip_options_rcv_srr(struct sk_buff *skb) struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; - struct iphdr *iph = skb->nh.iph; - unsigned char * optptr = skb->nh.raw + opt->srr; + struct iphdr *iph = ip_hdr(skb); + unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = (struct rtable*)skb->dst; struct rtable *rt2; int err; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d096332f6c6..d6427d91851 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -95,8 +95,8 @@ __inline__ void ip_send_check(struct iphdr *iph) /* dev_loopback_xmit for use with netfilter. 
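
The ip_options.c hunks above all recompute the same two layout facts from the header: options begin immediately after the fixed 20-byte struct iphdr (the ip_hdr(skb) + 1 idiom) and run for ihl*4 - 20 bytes. A tiny standalone illustration (hand-built header bytes, options validated elsewhere by assumption):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 24-byte header: version 4, ihl 6 -> 4 bytes of options */
	uint8_t hdr[24] = { 0x46, 0, 0, 24 };
	unsigned ihl, optlen;
	const uint8_t *optptr;

	hdr[20] = hdr[21] = hdr[22] = hdr[23] = 1; /* IPOPT_NOP */
	ihl = hdr[0] & 0x0f;
	optlen = ihl * 4 - 20;		/* what opt->optlen holds */
	optptr = hdr + 20;		/* the ip_hdr(skb) + 1 idiom */
	printf("optlen=%u first=%u\n", optlen, optptr[0]);
	return 0;
}
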
*/ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { - newskb->mac.raw = newskb->data; - __skb_pull(newskb, newskb->nh.raw - newskb->data); + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); @@ -125,11 +125,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, struct iphdr *iph; /* Build the IP header. */ - if (opt) - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); - + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; @@ -143,7 +141,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->protocol = sk->sk_protocol; iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); - skb->nh.iph = iph; if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; @@ -163,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2; @@ -192,6 +195,14 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? + skb->dst->dev->mtu : dst_mtu(skb->dst); +} + static inline int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -201,7 +212,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) + if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -248,7 +259,7 @@ int ip_mc_output(struct sk_buff *skb) /* Multicasts with ttl 0 must not go beyond the host */ - if (skb->nh.iph->ttl == 0) { + if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } @@ -333,7 +344,9 @@ packet_routed: goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) @@ -344,7 +357,6 @@ packet_routed: iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - skb->nh.iph = iph; /* Transport layer set skb->h.foo itself. 
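
ip_skb_dst_mtu() above chooses which MTU governs fragmentation decisions: in the new IP_PMTUDISC_PROBE mode the device MTU is used, so oversized probe packets are not silently cut down to the cached path MTU. A compilable sketch of just that selection, with toy parameters in place of sk_buff/dst_entry (pick_mtu is an illustrative name, not a kernel function):

#include <stdio.h>

/* Values mirror include/linux/in.h (DONT=0, WANT=1, DO=2, PROBE=3). */
enum { PMTUDISC_DONT, PMTUDISC_WANT, PMTUDISC_DO, PMTUDISC_PROBE };

static int pick_mtu(int pmtudisc, int dev_mtu, int path_mtu)
{
	return pmtudisc == PMTUDISC_PROBE ? dev_mtu : path_mtu;
}

int main(void)
{
	printf("DO:    %d\n", pick_mtu(PMTUDISC_DO, 1500, 1400));    /* 1400 */
	printf("PROBE: %d\n", pick_mtu(PMTUDISC_PROBE, 1500, 1400)); /* 1500 */
	return 0;
}
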
*/ if (opt && opt->optlen) { @@ -386,21 +398,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif -#ifdef CONFIG_NETFILTER - /* Connection association is same as pre-frag packet */ - nf_conntrack_put(to->nfct); - to->nfct = from->nfct; - nf_conntrack_get(to->nfct); - to->nfctinfo = from->nfctinfo; + nf_copy(to, from); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(to->nf_bridge); - to->nf_bridge = from->nf_bridge; - nf_bridge_get(to->nf_bridge); -#endif -#endif skb_copy_secmark(to, from); } @@ -430,12 +431,12 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * Point into the IP datagram header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); return -EMSGSIZE; } @@ -502,10 +503,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; - frag->h.raw = frag->data; - frag->nh.raw = __skb_push(frag, hlen); - memcpy(frag->nh.raw, iph, hlen); - iph = frag->nh.iph; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) @@ -566,7 +568,7 @@ slow_path: * Keep copying data until we run out. */ - while(left > 0) { + while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) @@ -593,8 +595,8 @@ slow_path: ip_copy_metadata(skb2, skb); skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); - skb2->nh.raw = skb2->data; - skb2->h.raw = skb2->data + hlen; + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner @@ -608,19 +610,19 @@ slow_path: * Copy the packet header into the new buffer. */ - memcpy(skb2->nh.raw, skb->data, hlen); + skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ - iph = skb2->nh.iph; + iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. Upgrade options only if @@ -722,10 +724,10 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_put(skb,fragheaderlen + transhdrlen); /* initialize network header pointer */ - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* initialize protocol header pointer */ - skb->h.raw = skb->data + fragheaderlen; + skb->transport_header = skb->network_header + fragheaderlen; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -799,7 +801,9 @@ int ip_append_data(struct sock *sk, inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 
+ rt->u.dst.dev->mtu : + dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; @@ -929,9 +933,10 @@ alloc_new_skb: * Find where to start putting bytes. */ data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; + skb_set_network_header(skb, exthdrlen); + skb->transport_header = (skb->network_header + + fragheaderlen); data += fragheaderlen; - skb->h.raw = data + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -1100,8 +1105,6 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (len <= 0) { struct sk_buff *skb_prev; - char *data; - struct iphdr *iph; int alloclen; skb_prev = skb; @@ -1124,15 +1127,15 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, /* * Find where to start putting bytes. */ - data = skb_put(skb, fragheaderlen + fraggap); - skb->nh.iph = iph = (struct iphdr *)data; - data += fragheaderlen; - skb->h.raw = data; - + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = (skb->network_header + + fragheaderlen); if (fraggap) { - skb->csum = skb_copy_and_csum_bits( - skb_prev, maxfraglen, - data, fraggap, 0); + skb->csum = skb_copy_and_csum_bits(skb_prev, + maxfraglen, + skb_transport_header(skb), + fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); pskb_trim_unique(skb_prev, maxfraglen); @@ -1198,10 +1201,10 @@ int ip_push_pending_frames(struct sock *sk) tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ - if (skb->data < skb->nh.raw) - __skb_pull(skb, skb->nh.raw - skb->data); + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; @@ -1216,13 +1219,13 @@ int ip_push_pending_frames(struct sock *sk) * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc != IP_PMTUDISC_DO) + if (inet->pmtudisc < IP_PMTUDISC_DO) skb->local_df = 1; /* DF bit is set when we want to see DF on outgoing frames. * If local_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc == IP_PMTUDISC_DO || + if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst))) df = htons(IP_DF); @@ -1352,11 +1355,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, /* Not quite clean, but right. */ .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } }, + { .sport = tcp_hdr(skb)->dest, + .dport = tcp_hdr(skb)->source } }, .proto = sk->sk_protocol }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(&rt, &fl)) @@ -1370,14 +1373,16 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar with locally disabled BH and that sk cannot be already spinlocked. 
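
The two pmtudisc tests changed just above were relaxed from equality to ordered comparisons so that the new IP_PMTUDISC_PROBE value inherits IP_PMTUDISC_DO behaviour without touching every test site. A small standalone illustration of the resulting truth table (constants mirror include/linux/in.h; this is a sketch of the decision, not the kernel code path):

#include <stdio.h>

enum { PMTUDISC_DONT, PMTUDISC_WANT, PMTUDISC_DO, PMTUDISC_PROBE };

int main(void)
{
	int mode;

	for (mode = PMTUDISC_DONT; mode <= PMTUDISC_PROBE; mode++) {
		int local_df = mode < PMTUDISC_DO;  /* may fragment locally */
		int set_df = mode >= PMTUDISC_DO;   /* set DF on the wire */
		printf("mode %d: local_df=%d df=%d\n", mode, local_df, set_df);
	}
	return 0;
}

Ordering the modes is the design trick: DONT and WANT sort below DO, PROBE sorts above it, so one comparison covers both "may fragment" and "set DF" policies.
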
*/ bh_lock_sock(sk); - inet->tos = skb->nh.iph->tos; + inet->tos = ip_hdr(skb)->tos; sk->sk_priority = skb->priority; - sk->sk_protocol = skb->nh.iph->protocol; + sk->sk_protocol = ip_hdr(skb)->protocol; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) - *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + *((__sum16 *)skb_transport_header(skb) + + arg->csumoffset) = csum_fold(csum_add(skb->csum, + arg->csum)); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 23048d9f358..4d544573f48 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -59,7 +59,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) struct in_pktinfo info; struct rtable *rt = (struct rtable *)skb->dst; - info.ipi_addr.s_addr = skb->nh.iph->daddr; + info.ipi_addr.s_addr = ip_hdr(skb)->daddr; if (rt) { info.ipi_ifindex = rt->rt_iif; info.ipi_spec_dst.s_addr = rt->rt_spec_dst; @@ -73,13 +73,13 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { - int ttl = skb->nh.iph->ttl; + int ttl = ip_hdr(skb)->ttl; put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) { - put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); + put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) @@ -87,7 +87,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) if (IPCB(skb)->opt.optlen == 0) return; - put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1); + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, + ip_hdr(skb) + 1); } @@ -268,18 +269,21 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; - serr->ee.ee_type = skb->h.icmph->type; - serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_type = icmp_hdr(skb)->type; + serr->ee.ee_code = icmp_hdr(skb)->code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - + skb_network_header(skb); serr->port = port; - skb->h.raw = payload; - if (!skb_pull(skb, payload - skb->data) || - sock_queue_err_skb(sk, skb)) - kfree_skb(skb); + if (skb_pull(skb, payload - skb->data) != NULL) { + skb_reset_transport_header(skb); + if (sock_queue_err_skb(sk, skb) == 0) + return; + } + kfree_skb(skb); } void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) @@ -296,8 +300,9 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf if (!skb) return; - iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); - skb->nh.iph = iph; + skb_put(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->daddr = daddr; serr = SKB_EXT_ERR(skb); @@ -308,11 +313,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->addr_offset = (u8 *)&iph->daddr - 
skb_network_header(skb); serr->port = port; - skb->h.raw = skb->tail; - __skb_pull(skb, skb->tail - skb->data); + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); @@ -354,7 +359,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; - sin->sin_addr.s_addr = *(__be32*)(skb->nh.raw + serr->addr_offset); + sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } @@ -366,7 +372,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); if (inet->cmsg_flags) @@ -403,20 +409,20 @@ out: */ static int do_ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, int optlen) + int optname, char __user *optval, int optlen) { struct inet_sock *inet = inet_sk(sk); int val=0,err; if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | - (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | - (1<<IP_RETOPTS) | (1<<IP_TOS) | - (1<<IP_TTL) | (1<<IP_HDRINCL) | - (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | - (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC))) || - optname == IP_MULTICAST_TTL || - optname == IP_MULTICAST_LOOP) { + (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | + (1<<IP_RETOPTS) | (1<<IP_TOS) | + (1<<IP_TTL) | (1<<IP_HDRINCL) | + (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | + (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | + (1<<IP_PASSSEC))) || + optname == IP_MULTICAST_TTL || + optname == IP_MULTICAST_LOOP) { if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -440,444 +446,444 @@ static int do_ip_setsockopt(struct sock *sk, int level, lock_sock(sk); switch (optname) { - case IP_OPTIONS: - { - struct ip_options * opt = NULL; - if (optlen > 40 || optlen < 0) - goto e_inval; - err = ip_options_get_from_user(&opt, optval, optlen); - if (err) - break; - if (inet->is_icsk) { - struct inet_connection_sock *icsk = inet_csk(sk); + case IP_OPTIONS: + { + struct ip_options * opt = NULL; + if (optlen > 40 || optlen < 0) + goto e_inval; + err = ip_options_get_from_user(&opt, optval, optlen); + if (err) + break; + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if (sk->sk_family == PF_INET || - (!((1 << sk->sk_state) & - (TCPF_LISTEN | TCPF_CLOSE)) && - inet->daddr != LOOPBACK4_IPV6)) { + if (sk->sk_family == PF_INET || + (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) && + inet->daddr != LOOPBACK4_IPV6)) { #endif - if (inet->opt) - icsk->icsk_ext_hdr_len -= inet->opt->optlen; - if (opt) - icsk->icsk_ext_hdr_len += opt->optlen; - icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + if (inet->opt) + icsk->icsk_ext_hdr_len -= inet->opt->optlen; + if (opt) + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - } -#endif } - opt = xchg(&inet->opt, opt); - kfree(opt); - break; +#endif } - case IP_PKTINFO: - if (val) - inet->cmsg_flags |= IP_CMSG_PKTINFO; - else - inet->cmsg_flags &= ~IP_CMSG_PKTINFO; - break; - case IP_RECVTTL: - if (val) - inet->cmsg_flags |= IP_CMSG_TTL; - else - 
inet->cmsg_flags &= ~IP_CMSG_TTL; - break; - case IP_RECVTOS: - if (val) - inet->cmsg_flags |= IP_CMSG_TOS; - else - inet->cmsg_flags &= ~IP_CMSG_TOS; - break; - case IP_RECVOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RECVOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; - break; - case IP_RETOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RETOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + opt = xchg(&inet->opt, opt); + kfree(opt); + break; + } + case IP_PKTINFO: + if (val) + inet->cmsg_flags |= IP_CMSG_PKTINFO; + else + inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + break; + case IP_RECVTTL: + if (val) + inet->cmsg_flags |= IP_CMSG_TTL; + else + inet->cmsg_flags &= ~IP_CMSG_TTL; + break; + case IP_RECVTOS: + if (val) + inet->cmsg_flags |= IP_CMSG_TOS; + else + inet->cmsg_flags &= ~IP_CMSG_TOS; + break; + case IP_RECVOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RECVOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + break; + case IP_RETOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RETOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + break; + case IP_PASSSEC: + if (val) + inet->cmsg_flags |= IP_CMSG_PASSSEC; + else + inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + break; + case IP_TOS: /* This sets both TOS and Precedence */ + if (sk->sk_type == SOCK_STREAM) { + val &= ~3; + val |= inet->tos & 3; + } + if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && + !capable(CAP_NET_ADMIN)) { + err = -EPERM; break; - case IP_PASSSEC: - if (val) - inet->cmsg_flags |= IP_CMSG_PASSSEC; - else - inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + } + if (inet->tos != val) { + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } + break; + case IP_TTL: + if (optlen<1) + goto e_inval; + if (val != -1 && (val < 1 || val>255)) + goto e_inval; + inet->uc_ttl = val; + break; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; break; - case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~3; - val |= inet->tos & 3; - } - if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && - !capable(CAP_NET_ADMIN)) { - err = -EPERM; + } + inet->hdrincl = val ? 
1 : 0; + break; + case IP_MTU_DISCOVER: + if (val<0 || val>3) + goto e_inval; + inet->pmtudisc = val; + break; + case IP_RECVERR: + inet->recverr = !!val; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + break; + case IP_MULTICAST_TTL: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (optlen<1) + goto e_inval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + goto e_inval; + inet->mc_ttl = val; + break; + case IP_MULTICAST_LOOP: + if (optlen<1) + goto e_inval; + inet->mc_loop = !!val; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct net_device *dev = NULL; + + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + /* + * Check the arguments are allowable + */ + + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) break; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } - break; - case IP_TTL: - if (optlen<1) - goto e_inval; - if (val != -1 && (val < 1 || val>255)) - goto e_inval; - inet->uc_ttl = val; - break; - case IP_HDRINCL: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + break; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == INADDR_ANY) { + inet->mc_index = 0; + inet->mc_addr = 0; + err = 0; break; } - inet->hdrincl = val ? 1 : 0; - break; - case IP_MTU_DISCOVER: - if (val<0 || val>2) - goto e_inval; - inet->pmtudisc = val; - break; - case IP_RECVERR: - inet->recverr = !!val; - if (!val) - skb_queue_purge(&sk->sk_error_queue); - break; - case IP_MULTICAST_TTL: - if (sk->sk_type == SOCK_STREAM) - goto e_inval; - if (optlen<1) - goto e_inval; - if (val==-1) - val = 1; - if (val < 0 || val > 255) - goto e_inval; - inet->mc_ttl = val; - break; - case IP_MULTICAST_LOOP: - if (optlen<1) - goto e_inval; - inet->mc_loop = !!val; - break; - case IP_MULTICAST_IF: - { - struct ip_mreqn mreq; - struct net_device *dev = NULL; + dev = ip_dev_find(mreq.imr_address.s_addr); + if (dev) { + mreq.imr_ifindex = dev->ifindex; + dev_put(dev); + } + } else + dev = __dev_get_by_index(mreq.imr_ifindex); - if (sk->sk_type == SOCK_STREAM) - goto e_inval; - /* - * Check the arguments are allowable - */ - err = -EFAULT; - if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq,optval,sizeof(mreq))) - break; - } else { - memset(&mreq, 0, sizeof(mreq)); - if (optlen >= sizeof(struct in_addr) && - copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) - break; - } + err = -EADDRNOTAVAIL; + if (!dev) + break; - if (!mreq.imr_ifindex) { - if (mreq.imr_address.s_addr == INADDR_ANY) { - inet->mc_index = 0; - inet->mc_addr = 0; - err = 0; - break; - } - dev = ip_dev_find(mreq.imr_address.s_addr); - if (dev) { - mreq.imr_ifindex = dev->ifindex; - dev_put(dev); - } - } else - dev = __dev_get_by_index(mreq.imr_ifindex); + err = -EINVAL; + if (sk->sk_bound_dev_if && + mreq.imr_ifindex != sk->sk_bound_dev_if) + break; + inet->mc_index = mreq.imr_ifindex; + inet->mc_addr = mreq.imr_address.s_addr; + err = 0; + break; + } - err = -EADDRNOTAVAIL; - if (!dev) - break; + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; - err = -EINVAL; - if (sk->sk_bound_dev_if && - mreq.imr_ifindex != sk->sk_bound_dev_if) + if (optlen < sizeof(struct ip_mreq)) + goto e_inval; + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if 
(copy_from_user(&mreq,optval,sizeof(mreq))) break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + break; + } - inet->mc_index = mreq.imr_ifindex; - inet->mc_addr = mreq.imr_address.s_addr; - err = 0; + if (optname == IP_ADD_MEMBERSHIP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case IP_MSFILTER: + { + extern int sysctl_igmp_max_msf; + struct ip_msfilter *msf; + + if (optlen < IP_MSFILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; break; } + msf = kmalloc(optlen, GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(msf, optval, optlen)) { + kfree(msf); + break; + } + /* numsrc >= (1G-4) overflow in 32 bits */ + if (msf->imsf_numsrc >= 0x3ffffffcU || + msf->imsf_numsrc > sysctl_igmp_max_msf) { + kfree(msf); + err = -ENOBUFS; + break; + } + if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + kfree(msf); + err = -EINVAL; + break; + } + err = ip_mc_msfilter(sk, msf, 0); + kfree(msf); + break; + } + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + struct ip_mreq_source mreqs; + int omode, add; - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - { - struct ip_mreqn mreq; - - if (optlen < sizeof(struct ip_mreq)) - goto e_inval; + if (optlen != sizeof(struct ip_mreq_source)) + goto e_inval; + if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; - if (optlen >= sizeof(struct ip_mreqn)) { - if(copy_from_user(&mreq,optval,sizeof(mreq))) - break; - } else { - memset(&mreq, 0, sizeof(mreq)); - if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) - break; - } - - if (optname == IP_ADD_MEMBERSHIP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); break; } - case IP_MSFILTER: - { - extern int sysctl_igmp_max_msf; - struct ip_msfilter *msf; + if (optname == IP_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == IP_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { + struct ip_mreqn mreq; - if (optlen < IP_MSFILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - msf = kmalloc(optlen, GFP_KERNEL); - if (msf == 0) { - err = -ENOBUFS; + mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; + mreq.imr_address.s_addr = mreqs.imr_interface; + mreq.imr_ifindex = 0; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) break; - } + omode = MCAST_INCLUDE; + add = 1; + } else /* IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, 0); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in *psin; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + err = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(greq))) + break; + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + goto e_inval; + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + 
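
For reference, the MCAST_JOIN_SOURCE_GROUP path being re-indented here is the kernel side of the RFC 3678 source-specific multicast join API. A hedged userspace sketch of a caller; the group, source and interface values below are placeholders:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct group_source_req gsr;
	struct sockaddr_in *grp = (struct sockaddr_in *)&gsr.gsr_group;
	struct sockaddr_in *src = (struct sockaddr_in *)&gsr.gsr_source;

	if (fd < 0)
		return 1;
	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = 0;			/* let the kernel pick */
	grp->sin_family = AF_INET;
	inet_pton(AF_INET, "232.1.1.1", &grp->sin_addr);
	src->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &src->sin_addr);

	if (setsockopt(fd, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
		       &gsr, sizeof(gsr)) < 0)
		perror("MCAST_JOIN_SOURCE_GROUP");
	return 0;
}
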
struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { err = -EFAULT; - if (copy_from_user(msf, optval, optlen)) { - kfree(msf); - break; - } - /* numsrc >= (1G-4) overflow in 32 bits */ - if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { - kfree(msf); - err = -ENOBUFS; - break; - } - if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { - kfree(msf); - err = -EINVAL; - break; - } - err = ip_mc_msfilter(sk, msf, 0); - kfree(msf); break; } - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - { - struct ip_mreq_source mreqs; - int omode, add; - - if (optlen != sizeof(struct ip_mreq_source)) - goto e_inval; - if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { - err = -EFAULT; - break; - } - if (optname == IP_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == IP_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { - struct ip_mreqn mreq; - - mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; - mreq.imr_address.s_addr = mreqs.imr_interface; - mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); - if (err && err != -EADDRINUSE) - break; - omode = MCAST_INCLUDE; - add = 1; - } else /* IP_DROP_SOURCE_MEMBERSHIP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, 0); + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) { + err = -EADDRNOTAVAIL; break; } - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in *psin; + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs.gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; - if (optlen < sizeof(struct group_req)) - goto e_inval; - err = -EFAULT; - if(copy_from_user(&greq, optval, sizeof(greq))) - break; - psin = (struct sockaddr_in *)&greq.gr_group; - if (psin->sin_family != AF_INET) - goto e_inval; - memset(&mreq, 0, sizeof(mreq)); + psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs.gsr_interface; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) + break; + greqs.gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, + greqs.gsr_interface); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_igmp_max_msf; + struct sockaddr_in *psin; + struct ip_msfilter *msf = NULL; + struct group_filter *gsf = NULL; + int msize, i, ifindex; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; break; } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - 
case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - struct ip_mreq_source mreqs; - struct sockaddr_in *psin; - int omode, add; - - if (optlen != sizeof(struct group_source_req)) - goto e_inval; - if (copy_from_user(&greqs, optval, sizeof(greqs))) { - err = -EFAULT; - break; - } - if (greqs.gsr_group.ss_family != AF_INET || - greqs.gsr_source.ss_family != AF_INET) { - err = -EADDRNOTAVAIL; - break; - } - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs.gsr_source; - mreqs.imr_sourceaddr = psin->sin_addr.s_addr; - mreqs.imr_interface = 0; /* use index for mc_source */ - - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct ip_mreqn mreq; - - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); - if (err && err != -EADDRINUSE) - break; - greqs.gsr_interface = mreq.imr_ifindex; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, - greqs.gsr_interface); + gsf = kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + err = -ENOBUFS; break; } - case MCAST_MSFILTER: - { - extern int sysctl_igmp_max_msf; - struct sockaddr_in *psin; - struct ip_msfilter *msf = NULL; - struct group_filter *gsf = NULL; - int msize, i, ifindex; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - gsf = kmalloc(optlen,GFP_KERNEL); - if (gsf == 0) { - err = -ENOBUFS; - break; - } - err = -EFAULT; - if (copy_from_user(gsf, optval, optlen)) { - goto mc_msf_out; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = kmalloc(msize,GFP_KERNEL); - if (msf == 0) { - err = -ENOBUFS; - goto mc_msf_out; - } - ifindex = gsf->gf_interface; - psin = (struct sockaddr_in *)&gsf->gf_group; - if (psin->sin_family != AF_INET) { - err = -EADDRNOTAVAIL; - goto mc_msf_out; - } - msf->imsf_multiaddr = psin->sin_addr.s_addr; - msf->imsf_interface = 0; - msf->imsf_fmode = gsf->gf_fmode; - msf->imsf_numsrc = gsf->gf_numsrc; + err = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + goto mc_msf_out; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); + msf = kmalloc(msize,GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + goto mc_msf_out; + } + ifindex = gsf->gf_interface; + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) { err = -EADDRNOTAVAIL; - for (i=0; i<gsf->gf_numsrc; ++i) { - psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - - if (psin->sin_family != AF_INET) - goto mc_msf_out; - msf->imsf_slist[i] = psin->sin_addr.s_addr; - } - kfree(gsf); - gsf = NULL; - - err = ip_mc_msfilter(sk, msf, ifindex); -mc_msf_out: - 
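
The imsf_numsrc test above guards IP_MSFILTER_SIZE() against 32-bit overflow before the result is used to size a copy. A standalone sketch of the same guard with toy constants (MSF_BASE and msf_len_ok are illustrative; the kernel's hard bound 0x3ffffffc plays the role of the computed limit here):

#include <stdio.h>
#include <stdint.h>

#define MSF_BASE	16u			/* toy fixed header size */
#define MSF_SIZE(n)	(MSF_BASE + (n) * 4u)

static int msf_len_ok(uint32_t numsrc, uint32_t optlen)
{
	if (numsrc >= (UINT32_MAX - MSF_BASE) / 4u)	/* would wrap */
		return 0;
	return MSF_SIZE(numsrc) <= optlen;
}

int main(void)
{
	printf("%d\n", msf_len_ok(4, 64));		/* 1: fits */
	printf("%d\n", msf_len_ok(0xffffffffu, 64));	/* 0: caught */
	return 0;
}
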
kfree(msf); - kfree(gsf); - break; + goto mc_msf_out; } - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; - - case IP_FREEBIND: - if (optlen<1) - goto e_inval; - inet->freebind = !!val; - break; + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = gsf->gf_fmode; + msf->imsf_numsrc = gsf->gf_numsrc; + err = -EADDRNOTAVAIL; + for (i=0; i<gsf->gf_numsrc; ++i) { + psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - case IP_IPSEC_POLICY: - case IP_XFRM_POLICY: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - break; - err = xfrm_user_policy(sk, optname, optval, optlen); + if (psin->sin_family != AF_INET) + goto mc_msf_out; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + kfree(gsf); + gsf = NULL; + + err = ip_mc_msfilter(sk, msf, ifindex); + mc_msf_out: + kfree(msf); + kfree(gsf); + break; + } + case IP_ROUTER_ALERT: + err = ip_ra_control(sk, val ? 1 : 0, NULL); + break; + + case IP_FREEBIND: + if (optlen<1) + goto e_inval; + inet->freebind = !!val; + break; + + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) break; + err = xfrm_user_policy(sk, optname, optval, optlen); + break; - default: - err = -ENOPROTOOPT; - break; + default: + err = -ENOPROTOOPT; + break; } release_sock(sk); return err; @@ -948,214 +954,213 @@ EXPORT_SYMBOL(compat_ip_setsockopt); */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { struct inet_sock *inet = inet_sk(sk); int val; int len; - if(level!=SOL_IP) + if (level != SOL_IP) return -EOPNOTSUPP; #ifdef CONFIG_IP_MROUTE - if(optname>=MRT_BASE && optname <=MRT_BASE+10) - { + if (optname >= MRT_BASE && optname <= MRT_BASE+10) { return ip_mroute_getsockopt(sk,optname,optval,optlen); } #endif - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; - if(len < 0) + if (len < 0) return -EINVAL; lock_sock(sk); - switch(optname) { - case IP_OPTIONS: - { - unsigned char optbuf[sizeof(struct ip_options)+40]; - struct ip_options * opt = (struct ip_options*)optbuf; - opt->optlen = 0; - if (inet->opt) - memcpy(optbuf, inet->opt, - sizeof(struct ip_options)+ - inet->opt->optlen); - release_sock(sk); - - if (opt->optlen == 0) - return put_user(0, optlen); - - ip_options_undo(opt); - - len = min_t(unsigned int, len, opt->optlen); - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval, opt->__data, len)) - return -EFAULT; - return 0; - } - case IP_PKTINFO: - val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; - break; - case IP_RECVTTL: - val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; - break; - case IP_RECVTOS: - val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; - break; - case IP_RECVOPTS: - val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; - break; - case IP_RETOPTS: - val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; - break; - case IP_PASSSEC: - val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; - break; - case IP_TOS: - val = inet->tos; - break; - case IP_TTL: - val = (inet->uc_ttl == -1 ? 
- sysctl_ip_default_ttl : - inet->uc_ttl); - break; - case IP_HDRINCL: - val = inet->hdrincl; - break; - case IP_MTU_DISCOVER: - val = inet->pmtudisc; - break; - case IP_MTU: - { - struct dst_entry *dst; - val = 0; - dst = sk_dst_get(sk); - if (dst) { - val = dst_mtu(dst); - dst_release(dst); - } - if (!val) { - release_sock(sk); - return -ENOTCONN; - } - break; + switch (optname) { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options * opt = (struct ip_options*)optbuf; + opt->optlen = 0; + if (inet->opt) + memcpy(optbuf, inet->opt, + sizeof(struct ip_options)+ + inet->opt->optlen); + release_sock(sk); + + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len = min_t(unsigned int, len, opt->optlen); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_PASSSEC: + val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; + break; + case IP_TOS: + val = inet->tos; + break; + case IP_TTL: + val = (inet->uc_ttl == -1 ? + sysctl_ip_default_ttl : + inet->uc_ttl); + break; + case IP_HDRINCL: + val = inet->hdrincl; + break; + case IP_MTU_DISCOVER: + val = inet->pmtudisc; + break; + case IP_MTU: + { + struct dst_entry *dst; + val = 0; + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); } - case IP_RECVERR: - val = inet->recverr; - break; - case IP_MULTICAST_TTL: - val = inet->mc_ttl; - break; - case IP_MULTICAST_LOOP: - val = inet->mc_loop; - break; - case IP_MULTICAST_IF: - { - struct in_addr addr; - len = min_t(unsigned int, len, sizeof(struct in_addr)); - addr.s_addr = inet->mc_addr; + if (!val) { release_sock(sk); - - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval, &addr, len)) - return -EFAULT; - return 0; + return -ENOTCONN; } - case IP_MSFILTER: - { - struct ip_msfilter msf; - int err; + break; + } + case IP_RECVERR: + val = inet->recverr; + break; + case IP_MULTICAST_TTL: + val = inet->mc_ttl; + break; + case IP_MULTICAST_LOOP: + val = inet->mc_loop; + break; + case IP_MULTICAST_IF: + { + struct in_addr addr; + len = min_t(unsigned int, len, sizeof(struct in_addr)); + addr.s_addr = inet->mc_addr; + release_sock(sk); - if (len < IP_MSFILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; - } - if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; - } - err = ip_mc_msfget(sk, &msf, - (struct ip_msfilter __user *)optval, optlen); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &addr, len)) + return -EFAULT; + return 0; + } + case IP_MSFILTER: + { + struct ip_msfilter msf; + int err; + + if (len < IP_MSFILTER_SIZE(0)) { release_sock(sk); - return err; + return -EINVAL; } - case MCAST_MSFILTER: - { - struct group_filter gsf; - int err; - - if (len < GROUP_FILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; - } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; - } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { 
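
The IP_MTU branch above only has an answer once the socket holds a cached route, hence the -ENOTCONN for unconnected sockets. A userspace sketch of that behaviour (the destination address is a placeholder, and connect() may fail if no matching route exists):

#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int mtu;
	socklen_t len = sizeof(mtu);
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(9) };

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
		perror("before connect");	/* expect ENOTCONN */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0 &&
	    getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("path mtu: %d\n", mtu);
	return 0;
}
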
release_sock(sk); - return err; + return -EFAULT; } - case IP_PKTOPTIONS: - { - struct msghdr msg; + err = ip_mc_msfget(sk, &msf, + (struct ip_msfilter __user *)optval, optlen); + release_sock(sk); + return err; + } + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + if (len < GROUP_FILTER_SIZE(0)) { release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_gsfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + case IP_PKTOPTIONS: + { + struct msghdr msg; + + release_sock(sk); - if (sk->sk_type != SOCK_STREAM) - return -ENOPROTOOPT; + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; - msg.msg_control = optval; - msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; - if (inet->cmsg_flags & IP_CMSG_PKTINFO) { - struct in_pktinfo info; + if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + struct in_pktinfo info; - info.ipi_addr.s_addr = inet->rcv_saddr; - info.ipi_spec_dst.s_addr = inet->rcv_saddr; - info.ipi_ifindex = inet->mc_index; - put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); - } - if (inet->cmsg_flags & IP_CMSG_TTL) { - int hlim = inet->mc_ttl; - put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); - } - len -= msg.msg_controllen; - return put_user(len, optlen); + info.ipi_addr.s_addr = inet->rcv_saddr; + info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_ifindex = inet->mc_index; + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } - case IP_FREEBIND: - val = inet->freebind; - break; - default: - release_sock(sk); - return -ENOPROTOOPT; + if (inet->cmsg_flags & IP_CMSG_TTL) { + int hlim = inet->mc_ttl; + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IP_FREEBIND: + val = inet->freebind; + break; + default: + release_sock(sk); + return -ENOPROTOOPT; } release_sock(sk); if (len < sizeof(int) && len > 0 && val>=0 && val<255) { unsigned char ucval = (unsigned char)val; len = 1; - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&ucval,1)) + if (copy_to_user(optval,&ucval,1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&val,len)) + if (copy_to_user(optval,&val,len)) return -EFAULT; } return 0; } int ip_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) + int optname, char __user *optval, int __user *optlen) { int err; @@ -1169,7 +1174,7 @@ int ip_getsockopt(struct sock *sk, int level, ) { int len; - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; lock_sock(sk); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index aa704b88f01..ab86137c71d 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -43,21 +43,15 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { - int err, plen, dlen; struct ipcomp_data *ipcd = x->data; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; - - plen = skb->len; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data; + const int plen = skb->len; + int dlen = IPCOMP_SCRATCH_SIZE; + const u8 *start = skb->data; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = 
*per_cpu_ptr(ipcd->tfms, cpu); + int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); if (err) goto out; @@ -72,7 +66,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb->truesize += dlen - plen; __skb_put(skb, dlen - plen); - memcpy(skb->data, scratch, dlen); + skb_copy_to_linear_data(skb, scratch, dlen); out: put_cpu(); return err; @@ -90,10 +84,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ - iph = skb->nh.iph; + iph = ip_hdr(skb); ipch = (void *)skb->data; iph->protocol = ipch->nexthdr; - skb->h.raw = skb->nh.raw + sizeof(*ipch); + skb->transport_header = skb->network_header + sizeof(*ipch); __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); @@ -103,23 +97,16 @@ out: static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { - int err, plen, dlen, ihlen; - struct iphdr *iph = skb->nh.iph; struct ipcomp_data *ipcd = x->data; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; + const int ihlen = ip_hdrlen(skb); + const int plen = skb->len - ihlen; + int dlen = IPCOMP_SCRATCH_SIZE; + u8 *start = skb->data + ihlen; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + int err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - ihlen = iph->ihl * 4; - plen = skb->len - ihlen; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data + ihlen; - - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); if (err) goto out; @@ -142,12 +129,11 @@ out: static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - struct iphdr *iph; struct ip_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; int hdr_len = 0; + struct iphdr *iph = ip_hdr(skb); - iph = skb->nh.iph; iph->tot_len = htons(skb->len); hdr_len = iph->ihl * 4; if ((skb->len - hdr_len) < ipcd->threshold) { @@ -159,7 +145,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; err = ipcomp_compress(x, skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); if (err) { goto out_ok; @@ -188,8 +174,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (skb->h.icmph->type != ICMP_DEST_UNREACH || - skb->h.icmph->code != ICMP_FRAG_NEEDED) + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; spi = htonl(ntohs(ipch->cpi)); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index cf49de1a498..342ca8d8945 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -192,7 +192,7 @@ static int __init ic_open_devs(void) if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { if (dev == &loopback_dev) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : @@ -432,7 +432,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop; /* Basic sanity checks can be done without the lock. 
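
ipcomp_decompress()/ipcomp_compress() above lean on preallocated per-CPU state: get_cpu() pins the task, per_cpu_ptr() hands back this CPU's scratch buffer and compression transform, and put_cpu() releases the pin, so no large allocation happens per packet. A toy single-threaded analogue of the buffer pattern (NCPUS is illustrative; SCRATCH_SIZE matches the kernel's IPCOMP_SCRATCH_SIZE of 65400):

#include <stdio.h>

#define NCPUS		4
#define SCRATCH_SIZE	65400

static char scratch[NCPUS][SCRATCH_SIZE];	/* kernel: alloc_percpu */

static char *get_scratch(int cpu)		/* kernel: per_cpu_ptr */
{
	return scratch[cpu];
}

int main(void)
{
	int cpu = 0;			/* kernel: cpu = get_cpu() */
	char *buf = get_scratch(cpu);

	buf[0] = 0;			/* decompress into buf ... */
	/* kernel: put_cpu() when done with the buffer */
	printf("scratch for cpu %d at %p\n", cpu, (void *)buf);
	return 0;
}
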
*/ - rarp = (struct arphdr *)skb->h.raw; + rarp = (struct arphdr *)skb_transport_header(skb); /* If this test doesn't pass, it's not IP, or we should * ignore it anyway. @@ -455,7 +455,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop; /* OK, it is all there and looks valid, process... */ - rarp = (struct arphdr *)skb->h.raw; + rarp = (struct arphdr *)skb_transport_header(skb); rarp_ptr = (unsigned char *) (rarp + 1); /* One reply at a time, please. */ @@ -702,7 +702,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d memset(b, 0, sizeof(struct bootp_pkt)); /* Construct IP header */ - skb->nh.iph = h = &b->iph; + skb_reset_network_header(skb); + h = ip_hdr(skb); h->version = 4; h->ihl = 5; h->tot_len = htons(sizeof(struct bootp_pkt)); @@ -782,7 +783,7 @@ static void __init ic_do_bootp_ext(u8 *ext) u8 *c; printk("DHCP/BOOTP: Got extension %d:",*ext); - for(c=ext+2; c<ext+2+ext[1]; c++) + for (c=ext+2; c<ext+2+ext[1]; c++) printk(" %02x", *c); printk("\n"); #endif @@ -845,7 +846,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str sizeof(struct udphdr))) goto drop; - b = (struct bootp_pkt *) skb->nh.iph; + b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) @@ -883,7 +884,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str if (!pskb_may_pull(skb, skb->len)) goto drop; - b = (struct bootp_pkt *) skb->nh.iph; + b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; /* One reply at a time, please. */ @@ -938,7 +939,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str if (opt[1] >= 4) memcpy(&server_id, opt + 2, 4); break; - }; + } } #ifdef IPCONFIG_DEBUG @@ -983,7 +984,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_myaddr = NONE; ic_servaddr = NONE; goto drop_unlock; - }; + } ic_dhcp_msgtype = mt; @@ -1094,7 +1095,7 @@ static int __init ic_dynamic(void) retries = CONF_SEND_RETRIES; get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); - for(;;) { + for (;;) { #ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3ec5ce0f549..ebd2f2d532f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -157,10 +157,10 @@ static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local) return NULL; } -static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms) { - __be32 remote = t->parms.iph.daddr; - __be32 local = t->parms.iph.saddr; + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; unsigned h = 0; int prio = 0; @@ -175,6 +175,10 @@ static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) return &tunnels[prio][h]; } +static inline struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +{ + return __ipip_bucket(&t->parms); +} static void ipip_tunnel_unlink(struct ip_tunnel *t) { @@ -206,19 +210,9 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c __be32 local = parms->iph.saddr; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; - unsigned h = 0; - int prio = 0; char name[IFNAMSIZ]; - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - for (tp = 
&tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -280,8 +274,8 @@ static int ipip_err(struct sk_buff *skb, u32 info) ICMP in the real Internet is absolutely infeasible. */ struct iphdr *iph = (struct iphdr*)skb->data; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; @@ -336,8 +330,8 @@ out: struct iphdr *iph = (struct iphdr*)dp; int hlen = iph->ihl<<2; struct iphdr *eiph; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int rel_type = 0; int rel_code = 0; __be32 rel_info = 0; @@ -354,7 +348,7 @@ out: default: return 0; case ICMP_PARAMETERPROB: - n = ntohl(skb->h.icmph->un.gateway) >> 24; + n = ntohl(icmp_hdr(skb)->un.gateway) >> 24; if (n < hlen) return 0; @@ -373,7 +367,7 @@ out: return 0; case ICMP_FRAG_NEEDED: /* And it is the only really necessary thing :-) */ - n = ntohs(skb->h.icmph->un.frag.mtu); + n = ntohs(icmp_hdr(skb)->un.frag.mtu); if (n < hlen+68) return 0; n -= hlen; @@ -405,7 +399,7 @@ out: dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)eiph); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ memset(&fl, 0, sizeof(fl)); @@ -461,9 +455,10 @@ out: #endif } -static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb) +static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, + struct sk_buff *skb) { - struct iphdr *inner_iph = skb->nh.iph; + struct iphdr *inner_iph = ip_hdr(skb); if (INET_ECN_is_ce(outer_iph->tos)) IP_ECN_set_ce(inner_iph); @@ -471,10 +466,8 @@ static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff static int ipip_rcv(struct sk_buff *skb) { - struct iphdr *iph; struct ip_tunnel *tunnel; - - iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); read_lock(&ipip_lock); if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { @@ -486,8 +479,8 @@ static int ipip_rcv(struct sk_buff *skb) secpath_reset(skb); - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; @@ -521,7 +514,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; @@ -615,11 +608,12 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = skb->nh.raw; - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); @@ -630,7 +624,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct 
net_device *dev) * Push down and install the IPIP header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 601e3df6925..0ebae413ae8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -62,6 +62,7 @@ #include <linux/netfilter_ipv4.h> #include <net/ipip.h> #include <net/checksum.h> +#include <net/netlink.h> #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) #define CONFIG_IP_PIMSM 1 @@ -302,8 +303,8 @@ static void ipmr_destroy_unres(struct mfc_cache *c) atomic_dec(&cache_resolve_queue_len); - while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { - if (skb->nh.iph->version == 0) { + while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -479,7 +480,7 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if(c==NULL) + if (c==NULL) return NULL; c->mfc_un.res.minvif = MAXVIFS; return c; @@ -488,7 +489,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if(c==NULL) + if (c==NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10*HZ; @@ -508,12 +509,13 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Play the pending entries through our router */ - while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { - if (skb->nh.iph->version == 0) { + while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { - nlh->nlmsg_len = skb->tail - (u8*)nlh; + nlh->nlmsg_len = (skb_tail_pointer(skb) - + (u8 *)nlh); } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -539,7 +541,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) { struct sk_buff *skb; - int ihl = pkt->nh.iph->ihl<<2; + const int ihl = ip_hdrlen(pkt); struct igmphdr *igmp; struct igmpmsg *msg; int ret; @@ -551,7 +553,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) #endif skb = alloc_skb(128, GFP_ATOMIC); - if(!skb) + if (!skb) return -ENOBUFS; #ifdef CONFIG_IP_PIMSM @@ -561,14 +563,17 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) And all this only to mangle msg->im_msgtype and to set msg->im_mbz to "mbz" :-) */ - msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); - skb->nh.raw = skb->h.raw = (u8*)msg; - memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + msg = (struct igmpmsg *)skb_network_header(skb); + memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; msg->im_vif = reg_vif_num; - skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; - skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + ip_hdr(skb)->ihl = 
sizeof(struct iphdr) >> 2; + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + + sizeof(struct iphdr)); } else #endif { @@ -577,10 +582,11 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) * Copy the IP header */ - skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); - memcpy(skb->data,pkt->data,ihl); - skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ - msg = (struct igmpmsg*)skb->nh.iph; + skb->network_header = skb->tail; + skb_put(skb, ihl); + skb_copy_to_linear_data(skb, pkt->data, ihl); + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; skb->dst = dst_clone(pkt->dst); @@ -592,8 +598,8 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) igmp->type = msg->im_msgtype = assert; igmp->code = 0; - skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ - skb->h.raw = skb->nh.raw; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + skb->transport_header = skb->network_header; } if (mroute_socket == NULL) { @@ -622,11 +628,12 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) { int err; struct mfc_cache *c; + const struct iphdr *iph = ip_hdr(skb); spin_lock_bh(&mfc_unres_lock); for (c=mfc_unres_queue; c; c=c->next) { - if (c->mfc_mcastgrp == skb->nh.iph->daddr && - c->mfc_origin == skb->nh.iph->saddr) + if (c->mfc_mcastgrp == iph->daddr && + c->mfc_origin == iph->saddr) break; } @@ -646,9 +653,9 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) /* * Fill in the new cache entry */ - c->mfc_parent=-1; - c->mfc_origin=skb->nh.iph->saddr; - c->mfc_mcastgrp=skb->nh.iph->daddr; + c->mfc_parent = -1; + c->mfc_origin = iph->saddr; + c->mfc_mcastgrp = iph->daddr; /* * Reflect first query at mrouted. 
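The ipip and ipmr hunks above all perform the same mechanical conversion: direct pokes at the old sk_buff union pointers (skb->nh.iph, skb->h.raw, skb->mac.raw) become the typed accessors, and open-coded pointer assignments become the skb_reset_*_header() helpers. A minimal sketch of the receive-path idiom after conversion, assuming the 2.6.22-era accessors from <linux/skbuff.h> and <net/ip.h>; example_rcv() is an illustrative name, not a function in this patch:

/* Sketch only: locate the L3/L4 headers with the new accessors. */
static unsigned int example_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;

	skb_reset_network_header(skb);	/* was: skb->nh.raw = skb->data */
	iph = ip_hdr(skb);		/* was: iph = skb->nh.iph       */

	/* was: skb->h.raw = skb->nh.raw + iph->ihl*4; ip_hdrlen() hides
	 * the "ihl counts 32-bit words" detail (ihl << 2 bytes) */
	skb_set_transport_header(skb, ip_hdrlen(skb));

	return iph->protocol;
}

The offset-based representation is what makes the bare assignments such as skb->transport_header = skb->network_header in these hunks work on both 32-bit (pointer) and 64-bit (offset) builds.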
@@ -734,7 +741,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) return 0; } - if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c=ipmr_cache_alloc(); @@ -788,7 +795,7 @@ static void mroute_clean_tables(struct sock *sk) /* * Shut down all active vif entries */ - for(i=0; i<maxvif; i++) { + for (i=0; i<maxvif; i++) { if (!(vif_table[i].flags&VIFF_STATIC)) vif_delete(i); } @@ -858,119 +865,117 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt struct vifctl vif; struct mfcctl mfc; - if(optname!=MRT_INIT) - { - if(sk!=mroute_socket && !capable(CAP_NET_ADMIN)) + if (optname != MRT_INIT) { + if (sk != mroute_socket && !capable(CAP_NET_ADMIN)) return -EACCES; } - switch(optname) - { - case MRT_INIT: - if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->num != IPPROTO_IGMP) - return -EOPNOTSUPP; - if(optlen!=sizeof(int)) - return -ENOPROTOOPT; - - rtnl_lock(); - if (mroute_socket) { - rtnl_unlock(); - return -EADDRINUSE; - } - - ret = ip_ra_control(sk, 1, mrtsock_destruct); - if (ret == 0) { - write_lock_bh(&mrt_lock); - mroute_socket=sk; - write_unlock_bh(&mrt_lock); + switch (optname) { + case MRT_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->num != IPPROTO_IGMP) + return -EOPNOTSUPP; + if (optlen!=sizeof(int)) + return -ENOPROTOOPT; - ipv4_devconf.mc_forwarding++; - } + rtnl_lock(); + if (mroute_socket) { rtnl_unlock(); - return ret; - case MRT_DONE: - if (sk!=mroute_socket) - return -EACCES; - return ip_ra_control(sk, 0, NULL); - case MRT_ADD_VIF: - case MRT_DEL_VIF: - if(optlen!=sizeof(vif)) - return -EINVAL; - if (copy_from_user(&vif,optval,sizeof(vif))) - return -EFAULT; - if(vif.vifc_vifi >= MAXVIFS) - return -ENFILE; - rtnl_lock(); - if (optname==MRT_ADD_VIF) { - ret = vif_add(&vif, sk==mroute_socket); - } else { - ret = vif_delete(vif.vifc_vifi); - } - rtnl_unlock(); - return ret; + return -EADDRINUSE; + } + + ret = ip_ra_control(sk, 1, mrtsock_destruct); + if (ret == 0) { + write_lock_bh(&mrt_lock); + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + + ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; + case MRT_DONE: + if (sk!=mroute_socket) + return -EACCES; + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if (optlen!=sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif,optval,sizeof(vif))) + return -EFAULT; + if (vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + rtnl_lock(); + if (optname==MRT_ADD_VIF) { + ret = vif_add(&vif, sk==mroute_socket); + } else { + ret = vif_delete(vif.vifc_vifi); + } + rtnl_unlock(); + return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ - case MRT_ADD_MFC: - case MRT_DEL_MFC: - if(optlen!=sizeof(mfc)) - return -EINVAL; - if (copy_from_user(&mfc,optval, sizeof(mfc))) - return -EFAULT; - rtnl_lock(); - if (optname==MRT_DEL_MFC) - ret = ipmr_mfc_delete(&mfc); - else - ret = ipmr_mfc_add(&mfc, sk==mroute_socket); - rtnl_unlock(); - return ret; + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if (optlen!=sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname==MRT_DEL_MFC) + ret = ipmr_mfc_delete(&mfc); + else + ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + rtnl_unlock(); + return ret; /* * Control PIM assert. 
*/ - case MRT_ASSERT: - { - int v; - if(get_user(v,(int __user *)optval)) - return -EFAULT; - mroute_do_assert=(v)?1:0; - return 0; - } + case MRT_ASSERT: + { + int v; + if (get_user(v,(int __user *)optval)) + return -EFAULT; + mroute_do_assert=(v)?1:0; + return 0; + } #ifdef CONFIG_IP_PIMSM - case MRT_PIM: - { - int v, ret; - if(get_user(v,(int __user *)optval)) - return -EFAULT; - v = (v)?1:0; - rtnl_lock(); - ret = 0; - if (v != mroute_do_pim) { - mroute_do_pim = v; - mroute_do_assert = v; + case MRT_PIM: + { + int v, ret; + if (get_user(v,(int __user *)optval)) + return -EFAULT; + v = (v)?1:0; + rtnl_lock(); + ret = 0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; #ifdef CONFIG_IP_PIMSM_V2 - if (mroute_do_pim) - ret = inet_add_protocol(&pim_protocol, - IPPROTO_PIM); - else - ret = inet_del_protocol(&pim_protocol, - IPPROTO_PIM); - if (ret < 0) - ret = -EAGAIN; + if (mroute_do_pim) + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); + else + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; #endif - } - rtnl_unlock(); - return ret; } + rtnl_unlock(); + return ret; + } #endif - /* - * Spurious command, or MRT_VERSION which you cannot - * set. - */ - default: - return -ENOPROTOOPT; + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; } } @@ -983,7 +988,7 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u int olr; int val; - if(optname!=MRT_VERSION && + if (optname!=MRT_VERSION && #ifdef CONFIG_IP_PIMSM optname!=MRT_PIM && #endif @@ -997,17 +1002,17 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __u if (olr < 0) return -EINVAL; - if(put_user(olr,optlen)) + if (put_user(olr,optlen)) return -EFAULT; - if(optname==MRT_VERSION) + if (optname==MRT_VERSION) val=0x0305; #ifdef CONFIG_IP_PIMSM - else if(optname==MRT_PIM) + else if (optname==MRT_PIM) val=mroute_do_pim; #endif else val=mroute_do_assert; - if(copy_to_user(optval,&val,olr)) + if (copy_to_user(optval,&val,olr)) return -EFAULT; return 0; } @@ -1023,48 +1028,47 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct vif_device *vif; struct mfc_cache *c; - switch(cmd) - { - case SIOCGETVIFCNT: - if (copy_from_user(&vr,arg,sizeof(vr))) - return -EFAULT; - if(vr.vifi>=maxvif) - return -EINVAL; - read_lock(&mrt_lock); - vif=&vif_table[vr.vifi]; - if(VIF_EXISTS(vr.vifi)) { - vr.icount=vif->pkt_in; - vr.ocount=vif->pkt_out; - vr.ibytes=vif->bytes_in; - vr.obytes=vif->bytes_out; - read_unlock(&mrt_lock); - - if (copy_to_user(arg,&vr,sizeof(vr))) - return -EFAULT; - return 0; - } + switch (cmd) { + case SIOCGETVIFCNT: + if (copy_from_user(&vr,arg,sizeof(vr))) + return -EFAULT; + if (vr.vifi>=maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif=&vif_table[vr.vifi]; + if (VIF_EXISTS(vr.vifi)) { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; read_unlock(&mrt_lock); - return -EADDRNOTAVAIL; - case SIOCGETSGCNT: - if (copy_from_user(&sr,arg,sizeof(sr))) - return -EFAULT; - read_lock(&mrt_lock); - c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); - if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); - - if (copy_to_user(arg,&sr,sizeof(sr))) - return -EFAULT; - return 0; - } + if (copy_to_user(arg,&vr,sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + 
case SIOCGETSGCNT: + if (copy_from_user(&sr,arg,sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; read_unlock(&mrt_lock); - return -EADDRNOTAVAIL; - default: - return -ENOIOCTLCMD; + + if (copy_to_user(arg,&sr,sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; } } @@ -1076,7 +1080,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; - for(ct=0;ct<maxvif;ct++,v++) { + for (ct=0;ct<maxvif;ct++,v++) { if (v->dev==ptr) vif_delete(ct); } @@ -1096,11 +1100,17 @@ static struct notifier_block ip_mr_notifier={ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) { - struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + struct iphdr *iph; + struct iphdr *old_iph = ip_hdr(skb); + + skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->version = 4; - iph->tos = skb->nh.iph->tos; - iph->ttl = skb->nh.iph->ttl; + iph->tos = old_iph->tos; + iph->ttl = old_iph->ttl; iph->frag_off = 0; iph->daddr = daddr; iph->saddr = saddr; @@ -1110,8 +1120,6 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) ip_select_ident(iph, skb->dst, NULL); ip_send_check(iph); - skb->h.ipiph = skb->nh.iph; - skb->nh.iph = iph; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset(skb); } @@ -1134,7 +1142,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &vif_table[vifi]; struct net_device *dev; struct rtable *rt; @@ -1200,8 +1208,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) dst_release(skb->dst); skb->dst = &rt->u.dst; - iph = skb->nh.iph; - ip_decrease_ttl(iph); + ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. * What do we do with netfilter? 
-- RR */ @@ -1301,7 +1308,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local * Forward the frame */ for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { - if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { + if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) @@ -1347,7 +1354,7 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) return 0; - } else if (skb->nh.iph->protocol == IPPROTO_IGMP){ + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ /* IGMPv1 (and broken IGMPv2 implementations sort of Cisco IOS <= 11.2(8)) do not put router alert option to IGMP packets destined to routable @@ -1366,7 +1373,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1426,14 +1433,15 @@ int pim_rcv_v1(struct sk_buff * skb) if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; - pim = (struct igmphdr*)skb->h.raw; + pim = igmp_hdr(skb); if (!mroute_do_pim || skb->len < sizeof(*pim) + sizeof(*encap) || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; - encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + encap = (struct iphdr *)(skb_transport_header(skb) + + sizeof(struct igmphdr)); /* Check that: a. packet is really destinted to a multicast group @@ -1455,9 +1463,9 @@ int pim_rcv_v1(struct sk_buff * skb) if (reg_dev == NULL) goto drop; - skb->mac.raw = skb->nh.raw; + skb->mac_header = skb->network_header; skb_pull(skb, (u8*)encap - skb->data); - skb->nh.iph = (struct iphdr *)skb->data; + skb_reset_network_header(skb); skb->dev = reg_dev; skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; @@ -1486,7 +1494,7 @@ static int pim_rcv(struct sk_buff * skb) if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; - pim = (struct pimreghdr*)skb->h.raw; + pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && @@ -1494,7 +1502,8 @@ static int pim_rcv(struct sk_buff * skb) goto drop; /* check if the inner packet is destined to mcast group */ - encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + encap = (struct iphdr *)(skb_transport_header(skb) + + sizeof(struct pimreghdr)); if (!MULTICAST(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) @@ -1510,9 +1519,9 @@ static int pim_rcv(struct sk_buff * skb) if (reg_dev == NULL) goto drop; - skb->mac.raw = skb->nh.raw; + skb->mac_header = skb->network_header; skb_pull(skb, (u8*)encap - skb->data); - skb->nh.iph = (struct iphdr *)skb->data; + skb_reset_network_header(skb); skb->dev = reg_dev; skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; @@ -1537,7 +1546,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct net_device *dev = vif_table[c->mfc_parent].dev; - u8 *b = skb->tail; + u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; if (dev) @@ -1557,12 +1566,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) } } mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb->tail - (u8*)mp_head; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; 
rtm->rtm_type = RTN_MULTICAST; return 1; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -EMSGSIZE; } @@ -1577,6 +1586,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) if (cache==NULL) { struct sk_buff *skb2; + struct iphdr *iph; struct net_device *dev; int vif; @@ -1596,11 +1606,13 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) return -ENOMEM; } - skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr)); - skb2->nh.iph->ihl = sizeof(struct iphdr)>>2; - skb2->nh.iph->saddr = rt->rt_src; - skb2->nh.iph->daddr = rt->rt_dst; - skb2->nh.iph->version = 0; + skb_push(skb2, sizeof(struct iphdr)); + skb_reset_network_header(skb2); + iph = ip_hdr(skb2); + iph->ihl = sizeof(struct iphdr) >> 2; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + iph->version = 0; err = ipmr_cache_unresolved(vif, skb2); read_unlock(&mrt_lock); return err; @@ -1625,7 +1637,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, loff_t pos) { for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { - if(!VIF_EXISTS(iter->ct)) + if (!VIF_EXISTS(iter->ct)) continue; if (pos-- == 0) return &vif_table[iter->ct]; @@ -1649,7 +1661,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ipmr_vif_seq_idx(iter, 0); while (++iter->ct < maxvif) { - if(!VIF_EXISTS(iter->ct)) + if (!VIF_EXISTS(iter->ct)) continue; return &vif_table[iter->ct]; } @@ -1680,7 +1692,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations ipmr_vif_seq_ops = { +static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, .next = ipmr_vif_seq_next, .stop = ipmr_vif_seq_stop, @@ -1732,14 +1744,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) it->cache = mfc_cache_array; read_lock(&mrt_lock); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) - for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) + for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) if (pos-- == 0) return mfc; read_unlock(&mrt_lock); it->cache = &mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); - for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) + for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) if (pos-- == 0) return mfc; spin_unlock_bh(&mfc_unres_lock); @@ -1829,9 +1841,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.wrong_if); if (it->cache != &mfc_unres_queue) { - for(n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++ ) { - if(VIF_EXISTS(n) + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++ ) { + if (VIF_EXISTS(n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -1843,7 +1855,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations ipmr_mfc_seq_ops = { +static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = ipmr_mfc_seq_next, .stop = ipmr_mfc_seq_stop, diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 22e104c6a49..15ad5dd2d98 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -331,14 +331,14 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb, struct ip_vs_app *app) { int diff; - unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + const unsigned int tcp_offset = ip_hdrlen(*pskb); struct tcphdr *th; __u32 seq; if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) return 0; - th = (struct tcphdr 
*)((*pskb)->nh.raw + tcp_offset); + th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset); /* * Remember seq number in case this pkt gets resized @@ -406,14 +406,14 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb, struct ip_vs_app *app) { int diff; - unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + const unsigned int tcp_offset = ip_hdrlen(*pskb); struct tcphdr *th; __u32 seq; if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) return 0; - th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + th = (struct tcphdr *)(skb_network_header(*pskb) + tcp_offset); /* * Remember seq number in case this pkt gets resized @@ -577,7 +577,6 @@ static const struct file_operations ip_vs_app_fops = { int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, char *o_buf, int o_len, char *n_buf, int n_len) { - struct iphdr *iph; int diff; int o_offset; int o_left; @@ -603,12 +602,11 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, skb_put(skb, diff); memmove(skb->data + o_offset + n_len, skb->data + o_offset + o_len, o_left); - memcpy(skb->data + o_offset, n_buf, n_len); + skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); } /* must update the iph total length here */ - iph = skb->nh.iph; - iph->tot_len = htons(skb->len); + ip_hdr(skb)->tot_len = htons(skb->len); LeaveFunction(9); return 0; diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 24d7b66eb6d..f005a2f929f 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -212,7 +212,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ @@ -381,7 +381,7 @@ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); struct ip_vs_dest *dest; __be16 _ports[2], *pptr; @@ -447,7 +447,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp) { __be16 _ports[2], *pptr; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); pptr = skb_header_pointer(skb, iph->ihl*4, sizeof(_ports), _ports); @@ -546,7 +546,7 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { skb = ip_defrag(skb, user); if (skb) - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); return skb; } @@ -557,9 +557,10 @@ ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset); + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { @@ -617,14 +618,14 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) *related = 1; /* reassemble IP fragments */ - if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); if (!skb) return NF_STOLEN; *pskb = skb; } - iph = skb->nh.iph; + iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) 
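Note how the ICMP handlers above fetch embedded headers with skb_header_pointer() rather than casting into skb->data: the header may live in a paged fragment, so the helper copies it into a caller-supplied stack buffer when it is not linearly accessible and returns NULL when the packet is too short. A condensed sketch of the pattern, matching its use in is_tcp_reset() below; the function name is illustrative:

/* Sketch: read a TCP header safely from a possibly non-linear skb. */
static int example_tcp_is_rst(const struct sk_buff *skb)
{
	struct tcphdr _tcph, *th;

	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
	if (th == NULL)
		return 0;	/* truncated packet: no header to inspect */
	return th->rst;
}

Because the result may point at the stack copy, callers must treat it as read-only; writers such as tcp_snat_handler() below instead call ip_vs_make_skb_writable() before touching the header in place.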
@@ -659,7 +660,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) return NF_ACCEPT; /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; @@ -680,8 +681,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) } /* Ensure the checksum is correct */ - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - ip_vs_checksum_complete(skb, ihl)) { + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); @@ -712,8 +712,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb) { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; @@ -740,14 +739,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, if (skb->ipvs_property) return NF_ACCEPT; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(iph->protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(pskb, &related); if (related) return verdict; skb = *pskb; - iph = skb->nh.iph; + iph = ip_hdr(skb); } pp = ip_vs_proto_get(iph->protocol); @@ -755,12 +754,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && + if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); if (!skb) return NF_STOLEN; - iph = skb->nh.iph; + iph = ip_hdr(skb); *pskb = skb; } @@ -810,8 +809,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) goto drop; skb = *pskb; - skb->nh.iph->saddr = cp->vaddr; - ip_send_check(skb->nh.iph); + ip_hdr(skb)->saddr = cp->vaddr; + ip_send_check(ip_hdr(skb)); /* For policy routing, packets originating from this * machine itself may be routed differently to packets @@ -861,7 +860,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) *related = 1; /* reassemble IP fragments */ - if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { skb = ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD); @@ -870,7 +869,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) *pskb = skb; } - iph = skb->nh.iph; + iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) @@ -905,7 +904,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) return NF_ACCEPT; /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; @@ -921,8 +920,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) verdict = NF_DROP; /* Ensure the checksum is correct */ - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - ip_vs_checksum_complete(skb, ihl)) { + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! 
*/ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); @@ -966,19 +964,19 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, || skb->dev == &loopback_dev || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", skb->pkt_type, - skb->nh.iph->protocol, - NIPQUAD(skb->nh.iph->daddr)); + ip_hdr(skb)->protocol, + NIPQUAD(ip_hdr(skb)->daddr)); return NF_ACCEPT; } - iph = skb->nh.iph; + iph = ip_hdr(skb); if (unlikely(iph->protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); if (related) return verdict; skb = *pskb; - iph = skb->nh.iph; + iph = ip_hdr(skb); } /* Protocol supported? */ @@ -1064,7 +1062,7 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, { int r; - if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) + if (ip_hdr(*pskb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; return ip_vs_in_icmp(pskb, &r, hooknum); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index 502111fba87..dcf5d46aaa5 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -204,7 +204,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 847c47af040..344ddbbdc75 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -159,10 +159,10 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 0; if (cp->app_data == &ip_vs_ftp_pasv) { - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); data = (char *)th + (th->doff << 2); - data_limit = (*pskb)->tail; + data_limit = skb_tail_pointer(*pskb); if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, @@ -262,14 +262,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Detecting whether it is passive */ - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); /* Since there may be OPTIONS in the TCP packet and the HLEN is the length of the header in 32-bit multiples, it is accurate to calculate data address by th+HLEN*4 */ data = data_start = (char *)th + (th->doff << 2); - data_limit = (*pskb)->tail; + data_limit = skb_tail_pointer(*pskb); while (data <= data_limit - 6) { if (strnicmp(data, "PASV\r\n", 6) == 0) { diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c801273cb88..052f4ed5917 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -521,7 +521,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_dest *dest; struct ip_vs_lblc_table *tbl; struct ip_vs_lblc_entry *en; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 23f9b9e73c8..6225acac7a3 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -775,7 +775,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_dest *dest; struct ip_vs_lblcr_table *tbl; struct ip_vs_lblcr_entry *en; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); diff --git 
a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 8b0505b0931..a842676e1c6 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -52,15 +52,15 @@ ah_conn_in_get(const struct sk_buff *skb, if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { @@ -89,15 +89,15 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 16a9ebee2fe..e65577a7700 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -76,16 +76,15 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_service *svc; struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } if (th->syn && - (svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, - skb->nh.iph->daddr, th->dest))) { + (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, + ip_hdr(skb)->daddr, th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. 
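These ISAKMP-port hunks, like the FDDI, HIPPI and IPVS ones earlier, replace __constant_htons() with plain htons(). Given a compile-time constant argument, htons() folds to a constant anyway, so the two spellings generate identical code in ordinary expressions; in this era the __constant_ form remains the safe spelling only where C requires an integer constant expression, a case label being the usual example. A small illustration under that assumption (example_classify() is hypothetical):

/* Sketch: htons() folds on constants; a case label is stricter. */
static int example_classify(__be16 proto)
{
	if (proto == htons(ETH_P_ARP))		/* folded at compile time */
		return 1;

	switch (proto) {
	case __constant_htons(ETH_P_IP):	/* constant expression needed */
		return 2;
	default:
		return 0;
	}
}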
@@ -127,7 +126,7 @@ tcp_snat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int tcphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) @@ -143,7 +142,7 @@ tcp_snat_handler(struct sk_buff **pskb, return 0; } - tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph = (void *)ip_hdr(*pskb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ @@ -175,7 +174,7 @@ tcp_dnat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int tcphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) @@ -194,7 +193,7 @@ tcp_dnat_handler(struct sk_buff **pskb, return 0; } - tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph = (void *)ip_hdr(*pskb) + tcphoff; tcph->dest = cp->dport; /* @@ -224,15 +223,15 @@ tcp_dnat_handler(struct sk_buff **pskb, static int tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) { - unsigned int tcphoff = skb->nh.iph->ihl*4; + const unsigned int tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, - skb->nh.iph->protocol, skb->csum)) { + ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); return 0; @@ -467,8 +466,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) return 0; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 03f0a414cfa..8ee5fe6a101 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -22,7 +22,7 @@ #include <linux/udp.h> #include <net/ip_vs.h> - +#include <net/ip.h> static struct ip_vs_conn * udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -56,7 +56,7 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, + pptr = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ports), _ports); if (pptr == NULL) return NULL; @@ -82,15 +82,15 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_service *svc; struct udphdr _udph, *uh; - uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + uh = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } - if ((svc = ip_vs_service_get(skb->mark, skb->nh.iph->protocol, - skb->nh.iph->daddr, uh->dest))) { + if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, + ip_hdr(skb)->daddr, uh->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. 
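tcp_csum_check() above and udp_csum_check() below share one verification idiom: if the driver left skb->ip_summed at CHECKSUM_NONE, accumulate the sum over the transport header and payload with skb_checksum(), then fall through and fold in the IPv4 pseudo-header with csum_tcpudp_magic(), which must yield zero for an intact packet. A condensed sketch of that flow, keeping the deliberate fall-through; the function name is illustrative and the headers assumed are <net/ip.h> and <net/checksum.h>:

/* Sketch: verify an L4 checksum the way the IPVS handlers do. */
static int example_l4_csum_ok(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	const unsigned int l4off = ip_hdrlen(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		/* no NIC help: sum the L4 header plus payload ourselves */
		skb->csum = skb_checksum(skb, l4off, skb->len - l4off, 0);
		/* fall through: skb->csum now holds the accumulated sum */
	case CHECKSUM_COMPLETE:
		/* pseudo-header folds to zero iff the checksum is valid */
		return csum_tcpudp_magic(iph->saddr, iph->daddr,
					 skb->len - l4off,
					 iph->protocol, skb->csum) == 0;
	default:
		return 1;	/* CHECKSUM_UNNECESSARY: verified by hardware */
	}
}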
@@ -133,7 +133,7 @@ udp_snat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + const unsigned int udphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) @@ -151,7 +151,7 @@ udp_snat_handler(struct sk_buff **pskb, return 0; } - udph = (void *)(*pskb)->nh.iph + udphoff; + udph = (void *)ip_hdr(*pskb) + udphoff; udph->source = cp->vport; /* @@ -187,7 +187,7 @@ udp_dnat_handler(struct sk_buff **pskb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + unsigned int udphoff = ip_hdrlen(*pskb); /* csum_check requires unshared skb */ if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) @@ -206,7 +206,7 @@ udp_dnat_handler(struct sk_buff **pskb, return 0; } - udph = (void *)(*pskb)->nh.iph + udphoff; + udph = (void *)ip_hdr(*pskb) + udphoff; udph->dest = cp->dport; /* @@ -239,7 +239,7 @@ static int udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) { struct udphdr _udph, *uh; - unsigned int udphoff = skb->nh.iph->ihl*4; + const unsigned int udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -251,10 +251,10 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(skb->nh.iph->saddr, - skb->nh.iph->daddr, + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, skb->len - udphoff, - skb->nh.iph->protocol, + ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 338668f88fe..1b25b00ef1e 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -201,7 +201,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_dest *dest; struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index e1f77bd7c9a..900ce29db38 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -156,7 +156,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); u8 tos = iph->tos; int mtu; struct flowi fl = { @@ -178,7 +178,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); @@ -193,7 +193,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); return NF_STOLEN; } - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); /* drop old route */ dst_release(skb->dst); @@ -226,7 +226,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rtable *rt; /* Route to the other host */ int mtu; - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); EnterFunction(10); @@ -245,7 +245,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct 
ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); @@ -266,8 +266,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) goto tx_error; - skb->nh.iph->daddr = cp->daddr; - ip_send_check(skb->nh.iph); + ip_hdr(skb)->daddr = cp->daddr; + ip_send_check(ip_hdr(skb)); IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); @@ -320,19 +320,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = skb->nh.iph; + struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; __be16 df = old_iph->frag_off; + sk_buff_data_t old_transport_header = skb->transport_header; struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ int mtu; EnterFunction(10); - if (skb->protocol != __constant_htons(ETH_P_IP)) { + if (skb->protocol != htons(ETH_P_IP)) { IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " "ETH_P_IP: %d, skb protocol: %d\n", - __constant_htons(ETH_P_IP), skb->protocol); + htons(ETH_P_IP), skb->protocol); goto tx_error; } @@ -350,9 +351,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb->dst) skb->dst->ops->update_pmtu(skb->dst, mtu); - df |= (old_iph->frag_off&__constant_htons(IP_DF)); + df |= (old_iph->frag_off & htons(IP_DF)); - if ((old_iph->frag_off&__constant_htons(IP_DF)) + if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -377,15 +378,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } kfree_skb(skb); skb = new_skb; - old_iph = skb->nh.iph; + old_iph = ip_hdr(skb); } - skb->h.raw = (void *) old_iph; + skb->transport_header = old_transport_header; /* fix old IP header checksum */ ip_send_check(old_iph); - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ @@ -395,7 +397,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* * Push down and install the IPIP header. 
*/ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; @@ -435,7 +437,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); int mtu; EnterFunction(10); @@ -445,7 +447,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); @@ -460,7 +462,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); return NF_STOLEN; } - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); /* drop old route */ dst_release(skb->dst); @@ -514,12 +516,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) goto tx_error_icmp; /* MTU checking */ mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index 574c735836f..b03c5ca2c82 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -100,7 +100,7 @@ static int drr_dev_event(struct notifier_block *this, spin_unlock_bh(&state_lock); break; - }; + } return NOTIFY_DONE; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 6069a11514f..b44192924f9 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -10,7 +10,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff **pskb, unsigned addr_type) { - struct iphdr *iph = (*pskb)->nh.iph; + const struct iphdr *iph = ip_hdr(*pskb); struct rtable *rt; struct flowi fl = {}; struct dst_entry *odst; @@ -142,7 +142,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) struct ip_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP_LOCAL_OUT) { - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; rt_info->daddr = iph->daddr; @@ -155,7 +155,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) const struct ip_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = (*pskb)->nh.iph; + const struct iphdr *iph = ip_hdr(*pskb); if (!(iph->tos == rt_info->tos && iph->daddr == rt_info->daddr @@ -168,7 +168,7 @@ static int nf_ip_reroute(struct sk_buff **pskb, const struct nf_info *info) __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 601808c796e..46509fae9fd 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -30,188 +30,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, 
say Y. -# connection tracking, helpers and protocols -config IP_NF_CT_ACCT - bool "Connection tracking flow accounting" - depends on IP_NF_CONNTRACK - help - If this option is enabled, the connection tracking code will - keep per-flow packet and byte counters. - - Those counters can be used for flow-based accounting or the - `connbytes' match. - - If unsure, say `N'. - -config IP_NF_CONNTRACK_MARK - bool 'Connection mark tracking support' - depends on IP_NF_CONNTRACK - help - This option enables support for connection marks, used by the - `CONNMARK' target and `connmark' match. Similar to the mark value - of packets, but this mark value is kept in the conntrack session - instead of the individual packets. - -config IP_NF_CONNTRACK_SECMARK - bool 'Connection tracking security mark support' - depends on IP_NF_CONNTRACK && NETWORK_SECMARK - help - This option enables security markings to be applied to - connections. Typically they are copied to connections from - packets using the CONNSECMARK target and copied back from - connections to packets with the same target, with the packets - being originally labeled via SECMARK. - - If unsure, say 'N'. - -config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events (EXPERIMENTAL)" - depends on EXPERIMENTAL && IP_NF_CONNTRACK - help - If this option is enabled, the connection tracking code will - provide a notifier chain that can be used by other kernel code - to get notified about changes in the connection tracking state. - - IF unsure, say `N'. - -config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface (EXPERIMENTAL)' - depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK - depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m - depends on IP_NF_NAT=n || IP_NF_NAT - help - This option enables support for a netlink-based userspace interface - - -config IP_NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - With this option enabled, the connection tracking code will - be able to do state tracking on SCTP connections. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_FTP - tristate "FTP protocol support" - depends on IP_NF_CONNTRACK - help - Tracking FTP connections is problematic: special helpers are - required for tracking them, and doing masquerading and other forms - of Network Address Translation on them. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_IRC - tristate "IRC protocol support" - depends on IP_NF_CONNTRACK - ---help--- - There is a commonly-used extension to IRC called - Direct Client-to-Client Protocol (DCC). This enables users to send - files to each other, and also chat to each other without the need - of a server. DCC Sending is used anywhere you send files over IRC, - and DCC Chat is most commonly used by Eggdrop bots. If you are - using NAT, this extension will enable you to send files and initiate - chats. Note that you do NOT need this extension to get files or - have others initiate chats, or everything else in IRC. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_NETBIOS_NS - tristate "NetBIOS name service protocol support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - NetBIOS name service requests are sent as broadcast messages from an - unprivileged port and responded to with unicast messages to the - same port. 
This make them hard to firewall properly because connection - tracking doesn't deal with broadcasts. This helper tracks locally - originating NetBIOS name service requests and the corresponding - responses. It relies on correct IP address configuration, specifically - netmask and broadcast address. When properly configured, the output - of "ip address show" should look similar to this: - - $ ip -4 address show eth0 - 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000 - inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0 - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_TFTP - tristate "TFTP protocol support" - depends on IP_NF_CONNTRACK - help - TFTP connection tracking helper, this is required depending - on how restrictive your ruleset is. - If you are using a tftp client behind -j SNAT or -j MASQUERADING - you will need this. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_AMANDA - tristate "Amanda backup protocol support" - depends on IP_NF_CONNTRACK - select TEXTSEARCH - select TEXTSEARCH_KMP - help - If you are running the Amanda backup package <http://www.amanda.org/> - on this machine or machines that will be MASQUERADED through this - machine, then you may want to enable this feature. This allows the - connection tracking and natting code to allow the sub-channels that - Amanda requires for communication of the backup data, messages and - index. - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_PPTP - tristate 'PPTP protocol support' - depends on IP_NF_CONNTRACK - help - This module adds support for PPTP (Point to Point Tunnelling - Protocol, RFC2637) connection tracking and NAT. - - If you are running PPTP sessions over a stateful firewall or NAT - box, you may want to enable this feature. - - Please note that not all PPTP modes of operation are supported yet. - For more info, read top of the file - net/ipv4/netfilter/ip_conntrack_pptp.c - - If you want to compile it as a module, say M here and read - Documentation/modules.txt. If unsure, say `N'. - -config IP_NF_H323 - tristate 'H.323 protocol support (EXPERIMENTAL)' - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - H.323 is a VoIP signalling protocol from ITU-T. As one of the most - important VoIP protocols, it is widely used by voice hardware and - software including voice gateways, IP phones, Netmeeting, OpenPhone, - Gnomemeeting, etc. - - With this module you can support H.323 on a connection tracking/NAT - firewall. - - This module supports RAS, Fast Start, H.245 Tunnelling, Call - Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, - whiteboard, file transfer, etc. For more information, please - visit http://nath323.sourceforge.net/. - - If you want to compile it as a module, say 'M' here and read - Documentation/modules.txt. If unsure, say 'N'. - -config IP_NF_SIP - tristate "SIP protocol support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK && EXPERIMENTAL - help - SIP is an application-layer control protocol that can establish, - modify, and terminate multimedia sessions (conferences) such as - Internet telephony calls. With the ip_conntrack_sip and - the ip_nat_sip modules you can support the protocol on a connection - tracking/NATing firewall. - - To compile it as a module, choose M here. If unsure, say Y. - config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -361,17 +179,6 @@ config IP_NF_TARGET_ULOG To compile it as a module, choose M here. If unsure, say N. 
-# NAT + specific targets: ip_conntrack -config IP_NF_NAT - tristate "Full NAT" - depends on IP_NF_IPTABLES && IP_NF_CONNTRACK - help - The Full NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in iptables: see the man page for iptables(8). - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack config NF_NAT tristate "Full NAT" @@ -383,11 +190,6 @@ config NF_NAT To compile it as a module, choose M here. If unsure, say N. -config IP_NF_NAT_NEEDED - bool - depends on IP_NF_NAT - default y - config NF_NAT_NEEDED bool depends on NF_NAT @@ -395,7 +197,7 @@ config NF_NAT_NEEDED config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and @@ -407,7 +209,7 @@ config IP_NF_TARGET_MASQUERADE config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to @@ -418,7 +220,7 @@ config IP_NF_TARGET_REDIRECT config IP_NF_TARGET_NETMAP tristate "NETMAP target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host @@ -429,28 +231,13 @@ config IP_NF_TARGET_NETMAP config IP_NF_TARGET_SAME tristate "SAME target support" - depends on (NF_NAT || IP_NF_NAT) + depends on NF_NAT help This option adds a `SAME' target, which works like the standard SNAT target, but attempts to give clients the same IP for all connections. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support (EXPERIMENTAL)" - depends on EXPERIMENTAL && IP_NF_NAT - ---help--- - - This module implements an Application Layer Gateway (ALG) for - SNMP payloads. In conjunction with NAT, it allows a network - management system to access multiple private networks with - conflicting addresses. It works by modifying IP addresses - inside SNMP payloads to match IP-layer NAT mapping. - - This is the "basic" form of SNMP-ALG, as described in RFC 2962 - - To compile it as a module, choose M here. If unsure, say N. 
- config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_NAT @@ -477,78 +264,37 @@ config NF_NAT_PROTO_GRE tristate depends on NF_NAT && NF_CT_PROTO_GRE -config IP_NF_NAT_FTP - tristate - depends on IP_NF_IPTABLES && IP_NF_CONNTRACK && IP_NF_NAT - default IP_NF_NAT && IP_NF_FTP - config NF_NAT_FTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_FTP -config IP_NF_NAT_IRC - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_IRC=y - default m if IP_NF_IRC=m - config NF_NAT_IRC tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_IRC -config IP_NF_NAT_TFTP - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_TFTP=y - default m if IP_NF_TFTP=m - config NF_NAT_TFTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP -config IP_NF_NAT_AMANDA - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_AMANDA=y - default m if IP_NF_AMANDA=m - config NF_NAT_AMANDA tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_AMANDA -config IP_NF_NAT_PPTP - tristate - depends on IP_NF_NAT!=n && IP_NF_PPTP!=n - default IP_NF_NAT if IP_NF_PPTP=y - default m if IP_NF_PPTP=m - config NF_NAT_PPTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_PPTP select NF_NAT_PROTO_GRE -config IP_NF_NAT_H323 - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_H323=y - default m if IP_NF_H323=m - config NF_NAT_H323 tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_H323 -config IP_NF_NAT_SIP - tristate - depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n - default IP_NF_NAT if IP_NF_SIP=y - default m if IP_NF_SIP=m - config NF_NAT_SIP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT @@ -606,9 +352,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" depends on IP_NF_MANGLE && EXPERIMENTAL - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - select IP_NF_CONNTRACK_MARK if IP_NF_CONNTRACK - select NF_CONNTRACK_MARK if NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK_IPV4 + select NF_CONNTRACK_MARK help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 6625ec68180..409d273f6f8 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -2,8 +2,6 @@ # Makefile for the netfilter modules on top of IPv4. 
 #
-# objects for the standalone - connection tracking / NAT
-ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
 
 # objects for l3 independent conntrack
 nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
 ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
@@ -12,53 +10,14 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
 endif
 endif
 
-ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
-nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
-ifneq ($(CONFIG_NF_NAT),)
+nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
 iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o
-else
-iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o
-endif
-
-ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
-ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
-
-ip_conntrack_h323-objs := ip_conntrack_helper_h323.o ../../netfilter/nf_conntrack_h323_asn1.o
-ip_nat_h323-objs := ip_nat_helper_h323.o
 
 # connection tracking
-obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
 
-obj-$(CONFIG_IP_NF_NAT) += ip_nat.o
 obj-$(CONFIG_NF_NAT) += nf_nat.o
 
-# conntrack netlink interface
-obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
-
-
-# SCTP protocol connection tracking
-obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
-
-# connection tracking helpers
-obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o
-obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
-obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
-obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
-obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
-obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
-obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
-obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
-
-# NAT helpers (ip_conntrack)
-obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o
-obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
-obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
-obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
-obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
-obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
-obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
-
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -78,7 +37,6 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
 obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
 obj-$(CONFIG_NF_NAT) += iptable_nat.o
 obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
@@ -100,7 +58,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
 obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
-obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
 obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 57b0221f9e2..cae41215e3c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@
-245,7 +245,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); - arp = (*pskb)->nh.arph; + arp = arp_hdr(*pskb); do { if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { struct arpt_entry_target *t; @@ -297,7 +297,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, t->data); /* Target might have changed stuff. */ - arp = (*pskb)->nh.arph; + arp = arp_hdr(*pskb); if (verdict == ARPT_CONTINUE) e = (void *)e + e->next_offset; diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 709db4d3f48..6298d404e7c 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -30,35 +30,35 @@ target(struct sk_buff **pskb, *pskb = nskb; } - arp = (*pskb)->nh.arph; - arpptr = (*pskb)->nh.raw + sizeof(*arp); + arp = arp_hdr(*pskb); + arpptr = skb_network_header(*pskb) + sizeof(*arp); pln = arp->ar_pln; hln = arp->ar_hln; /* We assume that pln and hln were checked in the match */ if (mangle->flags & ARPT_MANGLE_SDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || - (arpptr + hln > (**pskb).tail)) + (arpptr + hln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, mangle->src_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_SIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || - (arpptr + pln > (**pskb).tail)) + (arpptr + pln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, &mangle->u_s.src_ip, pln); } arpptr += pln; if (mangle->flags & ARPT_MANGLE_TDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || - (arpptr + hln > (**pskb).tail)) + (arpptr + hln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, mangle->tgt_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_TIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || - (arpptr + pln > (**pskb).tail)) + (arpptr + pln > skb_tail_pointer(*pskb))) return NF_DROP; memcpy(arpptr, &mangle->u_t.tgt_ip, pln); } diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c deleted file mode 100644 index 4f561f52c83..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ /dev/null @@ -1,229 +0,0 @@ -/* Amanda extension for IP connection tracking, Version 0.2 - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on HW's ip_conntrack_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Module load syntax: - * insmod ip_conntrack_amanda.o [master_timeout=n] - * - * Where master_timeout is the timeout (in seconds) of the master - * connection (port 10080). This defaults to 5 minutes but if - * your clients take longer than 5 minutes to do their work - * before getting back to the Amanda server, you can increase - * this value. - * - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/textsearch.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> - -static unsigned int master_timeout = 300; -static char *ts_algo = "kmp"; - -MODULE_AUTHOR("Brian J. 
Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda connection tracking module"); -MODULE_LICENSE("GPL"); -module_param(master_timeout, uint, 0600); -MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); -module_param(ts_algo, charp, 0400); -MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); - -unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); - -enum amanda_strings { - SEARCH_CONNECT, - SEARCH_NEWLINE, - SEARCH_DATA, - SEARCH_MESG, - SEARCH_INDEX, -}; - -static struct { - char *string; - size_t len; - struct ts_config *ts; -} search[] = { - [SEARCH_CONNECT] = { - .string = "CONNECT ", - .len = 8, - }, - [SEARCH_NEWLINE] = { - .string = "\n", - .len = 1, - }, - [SEARCH_DATA] = { - .string = "DATA ", - .len = 5, - }, - [SEARCH_MESG] = { - .string = "MESG ", - .len = 5, - }, - [SEARCH_INDEX] = { - .string = "INDEX ", - .len = 6, - }, -}; - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - struct ts_state ts; - struct ip_conntrack_expect *exp; - unsigned int dataoff, start, stop, off, i; - char pbuf[sizeof("65535")], *tmp; - u_int16_t port, len; - int ret = NF_ACCEPT; - typeof(ip_nat_amanda_hook) ip_nat_amanda; - - /* Only look at packets from the Amanda server */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* increase the UDP timeout of the master connection as replies from - * Amanda clients to the server can be quite delayed */ - ip_ct_refresh(ct, *pskb, master_timeout * HZ); - - /* No data? */ - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) { - if (net_ratelimit()) - printk("amanda_help: skblen = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - - memset(&ts, 0, sizeof(ts)); - start = skb_find_text(*pskb, dataoff, (*pskb)->len, - search[SEARCH_CONNECT].ts, &ts); - if (start == UINT_MAX) - goto out; - start += dataoff + search[SEARCH_CONNECT].len; - - memset(&ts, 0, sizeof(ts)); - stop = skb_find_text(*pskb, start, (*pskb)->len, - search[SEARCH_NEWLINE].ts, &ts); - if (stop == UINT_MAX) - goto out; - stop += start; - - for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { - memset(&ts, 0, sizeof(ts)); - off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); - if (off == UINT_MAX) - continue; - off += start + search[i].len; - - len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); - if (skb_copy_bits(*pskb, off, pbuf, len)) - break; - pbuf[len] = '\0'; - - port = simple_strtoul(pbuf, &tmp, 10); - len = tmp - pbuf; - if (port == 0 || len > 5) - break; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - exp->expectfn = NULL; - exp->flags = 0; - - exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->tuple.dst.u.tcp.port = htons(port); - - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.protonum = 0xFF; - exp->mask.dst.u.tcp.port = htons(0xFFFF); - - /* RCU read locked by nf_hook_slow */ - ip_nat_amanda = rcu_dereference(ip_nat_amanda_hook); - if (ip_nat_amanda) - ret = ip_nat_amanda(pskb, ctinfo, off - dataoff, - len, exp); - else if (ip_conntrack_expect_related(exp) != 
0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - } - -out: - return ret; -} - -static struct ip_conntrack_helper amanda_helper = { - .max_expected = 3, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "amanda", - - .tuple = { .src = { .u = { .udp = {.port = __constant_htons(10080) } } }, - .dst = { .protonum = IPPROTO_UDP }, - }, - .mask = { .src = { .u = { 0xFFFF } }, - .dst = { .protonum = 0xFF }, - }, -}; - -static void __exit ip_conntrack_amanda_fini(void) -{ - int i; - - ip_conntrack_helper_unregister(&amanda_helper); - for (i = 0; i < ARRAY_SIZE(search); i++) - textsearch_destroy(search[i].ts); -} - -static int __init ip_conntrack_amanda_init(void) -{ - int ret, i; - - ret = -ENOMEM; - for (i = 0; i < ARRAY_SIZE(search); i++) { - search[i].ts = textsearch_prepare(ts_algo, search[i].string, - search[i].len, - GFP_KERNEL, TS_AUTOLOAD); - if (search[i].ts == NULL) - goto err; - } - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) - goto err; - return 0; - -err: - for (; i >= 0; i--) { - if (search[i].ts) - textsearch_destroy(search[i].ts); - } - return ret; -} - -module_init(ip_conntrack_amanda_init); -module_exit(ip_conntrack_amanda_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c deleted file mode 100644 index 23b99ae2cc3..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ /dev/null @@ -1,1550 +0,0 @@ -/* Connection state tracking for netfilter. This is separated from, - but required by, the NAT layer; it can also be used by an iptables - extension. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> - * - new API and handling of conntrack/nat helpers - * - now capable of multiple expectations for one master - * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> - * - add usage/reference counts to ip_conntrack_expect - * - export ip_conntrack[_expect]_{find_get,put} functions - * */ - -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/vmalloc.h> -#include <net/checksum.h> -#include <net/ip.h> -#include <linux/stddef.h> -#include <linux/sysctl.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/jhash.h> -#include <linux/err.h> -#include <linux/percpu.h> -#include <linux/moduleparam.h> -#include <linux/notifier.h> - -/* ip_conntrack_lock protects the main hash table, protocol/helper/expected - registrations, conntrack timers*/ -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#define IP_CONNTRACK_VERSION "2.4" - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
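/*
 * [Editor's note -- not part of the patch.]  The "#if 0" block this sits
 * in is the file's compile-time debug switch: as shipped, DEBUGP()
 * expands to nothing, so the debug calls throughout the file cost
 * nothing at runtime; flipping the condition turns them into printk()s.
 * A sketch of a slightly safer variant keeps the empty macro a single
 * statement, so it still parses cleanly after an unbraced "if":
 *
 *	#define DEBUGP(format, args...) do { } while (0)
 */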
-#endif - -DEFINE_RWLOCK(ip_conntrack_lock); - -/* ip_conntrack_standalone needs this */ -atomic_t ip_conntrack_count = ATOMIC_INIT(0); - -void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; -LIST_HEAD(ip_conntrack_expect_list); -struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly; -static LIST_HEAD(helpers); -unsigned int ip_conntrack_htable_size __read_mostly = 0; -int ip_conntrack_max __read_mostly; -struct list_head *ip_conntrack_hash __read_mostly; -static struct kmem_cache *ip_conntrack_cachep __read_mostly; -static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly; -struct ip_conntrack ip_conntrack_untracked; -unsigned int ip_ct_log_invalid __read_mostly; -static LIST_HEAD(unconfirmed); -static int ip_conntrack_vmalloc __read_mostly; - -static unsigned int ip_conntrack_next_id; -static unsigned int ip_conntrack_expect_next_id; -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain); -ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain); - -DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); - -/* deliver cached events and clear cache entry - must be called with locally - * disabled softirqs */ -static inline void -__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) -{ - DEBUGP("ecache: delivering events for %p\n", ecache->ct); - if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) - atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events, - ecache->ct); - ecache->events = 0; - ip_conntrack_put(ecache->ct); - ecache->ct = NULL; -} - -/* Deliver all cached events for a particular conntrack. This is called - * by code prior to async packet handling or freeing the skb */ -void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) -{ - struct ip_conntrack_ecache *ecache; - - local_bh_disable(); - ecache = &__get_cpu_var(ip_conntrack_ecache); - if (ecache->ct == ct) - __ip_ct_deliver_cached_events(ecache); - local_bh_enable(); -} - -void __ip_ct_event_cache_init(struct ip_conntrack *ct) -{ - struct ip_conntrack_ecache *ecache; - - /* take care of delivering potentially old events */ - ecache = &__get_cpu_var(ip_conntrack_ecache); - BUG_ON(ecache->ct == ct); - if (ecache->ct) - __ip_ct_deliver_cached_events(ecache); - /* initialize for this conntrack/packet */ - ecache->ct = ct; - nf_conntrack_get(&ct->ct_general); -} - -/* flush the event cache - touches other CPU's data and must not be called while - * packets are still passing through the code */ -static void ip_ct_event_cache_flush(void) -{ - struct ip_conntrack_ecache *ecache; - int cpu; - - for_each_possible_cpu(cpu) { - ecache = &per_cpu(ip_conntrack_ecache, cpu); - if (ecache->ct) - ip_conntrack_put(ecache->ct); - } -} -#else -static inline void ip_ct_event_cache_flush(void) {} -#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ - -DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); - -static int ip_conntrack_hash_rnd_initted; -static unsigned int ip_conntrack_hash_rnd; - -static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, - unsigned int size, unsigned int rnd) -{ - return (jhash_3words((__force u32)tuple->src.ip, - ((__force u32)tuple->dst.ip ^ tuple->dst.protonum), - (tuple->src.u.all | (tuple->dst.u.all << 16)), - rnd) % size); -} - -static u_int32_t -hash_conntrack(const struct ip_conntrack_tuple *tuple) -{ - return __hash_conntrack(tuple, ip_conntrack_htable_size, - ip_conntrack_hash_rnd); -} - -int -ip_ct_get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - 
unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol) -{ - /* Never happen */ - if (iph->frag_off & htons(IP_OFFSET)) { - printk("ip_conntrack_core: Frag of proto %u.\n", - iph->protocol); - return 0; - } - - tuple->src.ip = iph->saddr; - tuple->dst.ip = iph->daddr; - tuple->dst.protonum = iph->protocol; - tuple->dst.dir = IP_CT_DIR_ORIGINAL; - - return protocol->pkt_to_tuple(skb, dataoff, tuple); -} - -int -ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig, - const struct ip_conntrack_protocol *protocol) -{ - inverse->src.ip = orig->dst.ip; - inverse->dst.ip = orig->src.ip; - inverse->dst.protonum = orig->dst.protonum; - inverse->dst.dir = !orig->dst.dir; - - return protocol->invert_tuple(inverse, orig); -} - - -/* ip_conntrack_expect helper functions */ -void ip_ct_unlink_expect(struct ip_conntrack_expect *exp) -{ - IP_NF_ASSERT(!timer_pending(&exp->timeout)); - list_del(&exp->list); - CONNTRACK_STAT_INC(expect_delete); - exp->master->expecting--; - ip_conntrack_expect_put(exp); -} - -static void expectation_timed_out(unsigned long ul_expect) -{ - struct ip_conntrack_expect *exp = (void *)ul_expect; - - write_lock_bh(&ip_conntrack_lock); - ip_ct_unlink_expect(exp); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_expect_put(exp); -} - -struct ip_conntrack_expect * -__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) - return i; - } - return NULL; -} - -/* Just find a expectation corresponding to a tuple. */ -struct ip_conntrack_expect * -ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - read_lock_bh(&ip_conntrack_lock); - i = __ip_conntrack_expect_find(tuple); - if (i) - atomic_inc(&i->use); - read_unlock_bh(&ip_conntrack_lock); - - return i; -} - -/* If an expectation for this connection is found, it gets delete from - * global list then returned. */ -static struct ip_conntrack_expect * -find_expectation(const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if - master ct never got confirmed, we'd hold a reference to it - and weird things would happen to future packets). */ - if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) - && is_confirmed(i->master)) { - if (i->flags & IP_CT_EXPECT_PERMANENT) { - atomic_inc(&i->use); - return i; - } else if (del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - return i; - } - } - } - return NULL; -} - -/* delete all expectations for this conntrack */ -void ip_ct_remove_expectations(struct ip_conntrack *ct) -{ - struct ip_conntrack_expect *i, *tmp; - - /* Optimization: most connection never expect any others. 
*/ - if (ct->expecting == 0) - return; - - list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { - if (i->master == ct && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - ip_conntrack_expect_put(i); - } - } -} - -static void -clean_from_lists(struct ip_conntrack *ct) -{ - DEBUGP("clean_from_lists(%p)\n", ct); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list); - - /* Destroy all pending expectations */ - ip_ct_remove_expectations(ct); -} - -static void -destroy_conntrack(struct nf_conntrack *nfct) -{ - struct ip_conntrack *ct = (struct ip_conntrack *)nfct; - struct ip_conntrack_protocol *proto; - struct ip_conntrack_helper *helper; - typeof(ip_conntrack_destroyed) destroyed; - - DEBUGP("destroy_conntrack(%p)\n", ct); - IP_NF_ASSERT(atomic_read(&nfct->use) == 0); - IP_NF_ASSERT(!timer_pending(&ct->timeout)); - - ip_conntrack_event(IPCT_DESTROY, ct); - set_bit(IPS_DYING_BIT, &ct->status); - - helper = ct->helper; - if (helper && helper->destroy) - helper->destroy(ct); - - /* To make sure we don't get any weird locking issues here: - * destroy_conntrack() MUST NOT be called with a write lock - * to ip_conntrack_lock!!! -HW */ - rcu_read_lock(); - proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); - if (proto && proto->destroy) - proto->destroy(ct); - - destroyed = rcu_dereference(ip_conntrack_destroyed); - if (destroyed) - destroyed(ct); - - rcu_read_unlock(); - - write_lock_bh(&ip_conntrack_lock); - /* Expectations will have been removed in clean_from_lists, - * except TFTP can create an expectation on the first packet, - * before connection is in the list, so we need to clean here, - * too. */ - ip_ct_remove_expectations(ct); - - /* We overload first tuple to link into unconfirmed list. */ - if (!is_confirmed(ct)) { - BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - } - - CONNTRACK_STAT_INC(delete); - write_unlock_bh(&ip_conntrack_lock); - - if (ct->master) - ip_conntrack_put(ct->master); - - DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - ip_conntrack_free(ct); -} - -static void death_by_timeout(unsigned long ul_conntrack) -{ - struct ip_conntrack *ct = (void *)ul_conntrack; - - write_lock_bh(&ip_conntrack_lock); - /* Inside lock so preempt is disabled on module removal path. - * Otherwise we can get spurious warnings. */ - CONNTRACK_STAT_INC(delete_list); - clean_from_lists(ct); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_put(ct); -} - -struct ip_conntrack_tuple_hash * -__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - unsigned int hash = hash_conntrack(tuple); - - list_for_each_entry(h, &ip_conntrack_hash[hash], list) { - if (tuplehash_to_ctrack(h) != ignored_conntrack && - ip_ct_tuple_equal(tuple, &h->tuple)) { - CONNTRACK_STAT_INC(found); - return h; - } - CONNTRACK_STAT_INC(searched); - } - - return NULL; -} - -/* Find a connection corresponding to a tuple. 
*/ -struct ip_conntrack_tuple_hash * -ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - - read_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_find(tuple, ignored_conntrack); - if (h) - atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - read_unlock_bh(&ip_conntrack_lock); - - return h; -} - -static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, - unsigned int hash, - unsigned int repl_hash) -{ - ct->id = ++ip_conntrack_next_id; - list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, - &ip_conntrack_hash[hash]); - list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list, - &ip_conntrack_hash[repl_hash]); -} - -void ip_conntrack_hash_insert(struct ip_conntrack *ct) -{ - unsigned int hash, repl_hash; - - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - write_lock_bh(&ip_conntrack_lock); - __ip_conntrack_hash_insert(ct, hash, repl_hash); - write_unlock_bh(&ip_conntrack_lock); -} - -/* Confirm a connection given skb; places it in hash table */ -int -__ip_conntrack_confirm(struct sk_buff **pskb) -{ - unsigned int hash, repl_hash; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* ipt_REJECT uses ip_conntrack_attach to attach related - ICMP/TCP RST packets in other direction. Actual packet - which created connection will be IP_CT_NEW or for an - expected connection, IP_CT_RELATED. */ - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - /* We're not in hash table, and we refuse to set up related - connections for unconfirmed conns. But packet copies and - REJECT will give spurious warnings here. */ - /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ - - /* No external references means noone else could have - confirmed us. */ - IP_NF_ASSERT(!is_confirmed(ct)); - DEBUGP("Confirming conntrack %p\n", ct); - - write_lock_bh(&ip_conntrack_lock); - - /* See if there's one in the list already, including reverse: - NAT could have grabbed it without realizing, since we're - not in the hash. If there is, we lost race. */ - list_for_each_entry(h, &ip_conntrack_hash[hash], list) - if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple)) - goto out; - list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list) - if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple)) - goto out; - - /* Remove from unconfirmed list */ - list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - - __ip_conntrack_hash_insert(ct, hash, repl_hash); - /* Timer relative to confirmation time, not original - setting time, otherwise we'd get timer wrap in - weird delay cases. */ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); - atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); - CONNTRACK_STAT_INC(insert); - write_unlock_bh(&ip_conntrack_lock); - if (ct->helper) - ip_conntrack_event_cache(IPCT_HELPER, *pskb); -#ifdef CONFIG_IP_NF_NAT_NEEDED - if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || - test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - ip_conntrack_event_cache(IPCT_NATINFO, *pskb); -#endif - ip_conntrack_event_cache(master_ct(ct) ? 
- IPCT_RELATED : IPCT_NEW, *pskb); - - return NF_ACCEPT; - -out: - CONNTRACK_STAT_INC(insert_failed); - write_unlock_bh(&ip_conntrack_lock); - return NF_DROP; -} - -/* Returns true if a connection correspondings to the tuple (required - for NAT). */ -int -ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - struct ip_conntrack_tuple_hash *h; - - read_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_find(tuple, ignored_conntrack); - read_unlock_bh(&ip_conntrack_lock); - - return h != NULL; -} - -/* There's a small race here where we may free a just-assured - connection. Too bad: we're in trouble anyway. */ -static int early_drop(struct list_head *chain) -{ - /* Traverse backwards: gives us oldest, which is roughly LRU */ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct = NULL, *tmp; - int dropped = 0; - - read_lock_bh(&ip_conntrack_lock); - list_for_each_entry_reverse(h, chain, list) { - tmp = tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) { - ct = tmp; - atomic_inc(&ct->ct_general.use); - break; - } - } - read_unlock_bh(&ip_conntrack_lock); - - if (!ct) - return dropped; - - if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - dropped = 1; - CONNTRACK_STAT_INC_ATOMIC(early_drop); - } - ip_conntrack_put(ct); - return dropped; -} - -static struct ip_conntrack_helper * -__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_helper *h; - - list_for_each_entry(h, &helpers, list) { - if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask)) - return h; - } - return NULL; -} - -struct ip_conntrack_helper * -ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) -{ - struct ip_conntrack_helper *helper; - - /* need ip_conntrack_lock to assure that helper exists until - * try_module_get() is called */ - read_lock_bh(&ip_conntrack_lock); - - helper = __ip_conntrack_helper_find(tuple); - if (helper) { - /* need to increase module usage count to assure helper will - * not go away while the caller is e.g. busy putting a - * conntrack in the hash that uses the helper */ - if (!try_module_get(helper->me)) - helper = NULL; - } - - read_unlock_bh(&ip_conntrack_lock); - - return helper; -} - -void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) -{ - module_put(helper->me); -} - -struct ip_conntrack_protocol * -__ip_conntrack_proto_find(u_int8_t protocol) -{ - return ip_ct_protos[protocol]; -} - -/* this is guaranteed to always return a valid protocol helper, since - * it falls back to generic_protocol */ -struct ip_conntrack_protocol * -ip_conntrack_proto_find_get(u_int8_t protocol) -{ - struct ip_conntrack_protocol *p; - - rcu_read_lock(); - p = __ip_conntrack_proto_find(protocol); - if (p) { - if (!try_module_get(p->me)) - p = &ip_conntrack_generic_protocol; - } - rcu_read_unlock(); - - return p; -} - -void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) -{ - module_put(p->me); -} - -struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, - struct ip_conntrack_tuple *repl) -{ - struct ip_conntrack *conntrack; - - if (!ip_conntrack_hash_rnd_initted) { - get_random_bytes(&ip_conntrack_hash_rnd, 4); - ip_conntrack_hash_rnd_initted = 1; - } - - /* We don't want any race condition at early drop stage */ - atomic_inc(&ip_conntrack_count); - - if (ip_conntrack_max - && atomic_read(&ip_conntrack_count) > ip_conntrack_max) { - unsigned int hash = hash_conntrack(orig); - /* Try dropping from this hash chain. 
*/ - if (!early_drop(&ip_conntrack_hash[hash])) { - atomic_dec(&ip_conntrack_count); - if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: table full, dropping" - " packet.\n"); - return ERR_PTR(-ENOMEM); - } - } - - conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC); - if (!conntrack) { - DEBUGP("Can't allocate conntrack.\n"); - atomic_dec(&ip_conntrack_count); - return ERR_PTR(-ENOMEM); - } - - atomic_set(&conntrack->ct_general.use, 1); - conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; - /* Don't set timer yet: wait for confirmation */ - init_timer(&conntrack->timeout); - conntrack->timeout.data = (unsigned long)conntrack; - conntrack->timeout.function = death_by_timeout; - - return conntrack; -} - -void -ip_conntrack_free(struct ip_conntrack *conntrack) -{ - atomic_dec(&ip_conntrack_count); - kmem_cache_free(ip_conntrack_cachep, conntrack); -} - -/* Allocate a new conntrack: we return -ENOMEM if classification - * failed due to stress. Otherwise it really is unclassifiable */ -static struct ip_conntrack_tuple_hash * -init_conntrack(struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) -{ - struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - struct ip_conntrack_expect *exp; - - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - - conntrack = ip_conntrack_alloc(tuple, &repl_tuple); - if (conntrack == NULL || IS_ERR(conntrack)) - return (struct ip_conntrack_tuple_hash *)conntrack; - - if (!protocol->new(conntrack, skb)) { - ip_conntrack_free(conntrack); - return NULL; - } - - write_lock_bh(&ip_conntrack_lock); - exp = find_expectation(tuple); - - if (exp) { - DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", - conntrack, exp); - /* Welcome, Mr. Bond. We've been expecting you... */ - __set_bit(IPS_EXPECTED_BIT, &conntrack->status); - conntrack->master = exp->master; -#ifdef CONFIG_IP_NF_CONNTRACK_MARK - conntrack->mark = exp->master->mark; -#endif -#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ - defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) - /* this is ugly, but there is no other place where to put it */ - conntrack->nat.masq_index = exp->master->nat.masq_index; -#endif -#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK - conntrack->secmark = exp->master->secmark; -#endif - nf_conntrack_get(&conntrack->master->ct_general); - CONNTRACK_STAT_INC(expect_new); - } else { - conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); - - CONNTRACK_STAT_INC(new); - } - - /* Overload tuple linked list to put us in unconfirmed list. 
*/ - list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - - write_unlock_bh(&ip_conntrack_lock); - - if (exp) { - if (exp->expectfn) - exp->expectfn(conntrack, exp); - ip_conntrack_expect_put(exp); - } - - return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; -} - -/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ -static inline struct ip_conntrack * -resolve_normal_ct(struct sk_buff *skb, - struct ip_conntrack_protocol *proto, - int *set_reply, - unsigned int hooknum, - enum ip_conntrack_info *ctinfo) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - - IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); - - if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, - &tuple,proto)) - return NULL; - - /* look for tuple match */ - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) { - h = init_conntrack(&tuple, proto, skb); - if (!h) - return NULL; - if (IS_ERR(h)) - return (void *)h; - } - ct = tuplehash_to_ctrack(h); - - /* It exists; we have (non-exclusive) reference. */ - if (DIRECTION(h) == IP_CT_DIR_REPLY) { - *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; - /* Please set reply bit if this packet OK */ - *set_reply = 1; - } else { - /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - DEBUGP("ip_conntrack_in: normal packet for %p\n", - ct); - *ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - DEBUGP("ip_conntrack_in: related packet for %p\n", - ct); - *ctinfo = IP_CT_RELATED; - } else { - DEBUGP("ip_conntrack_in: new packet for %p\n", - ct); - *ctinfo = IP_CT_NEW; - } - *set_reply = 0; - } - skb->nfct = &ct->ct_general; - skb->nfctinfo = *ctinfo; - return ct; -} - -/* Netfilter hook itself. */ -unsigned int ip_conntrack_in(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - struct ip_conntrack_protocol *proto; - int set_reply = 0; - int ret; - - /* Previously seen (loopback or untracked)? Ignore. */ - if ((*pskb)->nfct) { - CONNTRACK_STAT_INC_ATOMIC(ignore); - return NF_ACCEPT; - } - - /* Never happen */ - if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { - if (net_ratelimit()) { - printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", - (*pskb)->nh.iph->protocol, hooknum); - } - return NF_DROP; - } - -/* Doesn't cover locally-generated broadcast, so not worth it. */ -#if 0 - /* Ignore broadcast: no `connection'. */ - if ((*pskb)->pkt_type == PACKET_BROADCAST) { - printk("Broadcast packet!\n"); - return NF_ACCEPT; - } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) - == htonl(0x000000FF)) { - printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), - (*pskb)->sk, (*pskb)->pkt_type); - } -#endif - - /* rcu_read_lock()ed by nf_hook_slow */ - proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); - - /* It may be an special packet, error, unclean... - * inverse of the return code tells to the netfilter - * core what to do with the packet. 
*/ - if (proto->error != NULL - && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { - CONNTRACK_STAT_INC_ATOMIC(error); - CONNTRACK_STAT_INC_ATOMIC(invalid); - return -ret; - } - - if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { - /* Not valid part of a connection */ - CONNTRACK_STAT_INC_ATOMIC(invalid); - return NF_ACCEPT; - } - - if (IS_ERR(ct)) { - /* Too stressed to deal. */ - CONNTRACK_STAT_INC_ATOMIC(drop); - return NF_DROP; - } - - IP_NF_ASSERT((*pskb)->nfct); - - ret = proto->packet(ct, *pskb, ctinfo); - if (ret < 0) { - /* Invalid: inverse of the return code tells - * the netfilter core what to do*/ - nf_conntrack_put((*pskb)->nfct); - (*pskb)->nfct = NULL; - CONNTRACK_STAT_INC_ATOMIC(invalid); - return -ret; - } - - if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - ip_conntrack_event_cache(IPCT_STATUS, *pskb); - - return ret; -} - -int invert_tuplepr(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig) -{ - struct ip_conntrack_protocol *proto; - int ret; - - rcu_read_lock(); - proto = __ip_conntrack_proto_find(orig->dst.protonum); - ret = ip_ct_invert_tuple(inverse, orig, proto); - rcu_read_unlock(); - - return ret; -} - -/* Would two expected things clash? */ -static inline int expect_clash(const struct ip_conntrack_expect *a, - const struct ip_conntrack_expect *b) -{ - /* Part covered by intersection of masks must be unequal, - otherwise they clash */ - struct ip_conntrack_tuple intersect_mask - = { { a->mask.src.ip & b->mask.src.ip, - { a->mask.src.u.all & b->mask.src.u.all } }, - { a->mask.dst.ip & b->mask.dst.ip, - { a->mask.dst.u.all & b->mask.dst.u.all }, - a->mask.dst.protonum & b->mask.dst.protonum } }; - - return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); -} - -static inline int expect_matches(const struct ip_conntrack_expect *a, - const struct ip_conntrack_expect *b) -{ - return a->master == b->master - && ip_ct_tuple_equal(&a->tuple, &b->tuple) - && ip_ct_tuple_equal(&a->mask, &b->mask); -} - -/* Generally a bad idea to call this: could have matched already. */ -void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) -{ - struct ip_conntrack_expect *i; - - write_lock_bh(&ip_conntrack_lock); - /* choose the the oldest expectation to evict */ - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { - if (expect_matches(i, exp) && del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - write_unlock_bh(&ip_conntrack_lock); - ip_conntrack_expect_put(i); - return; - } - } - write_unlock_bh(&ip_conntrack_lock); -} - -/* We don't increase the master conntrack refcount for non-fulfilled - * conntracks. 
During the conntrack destruction, the expectations are - * always killed before the conntrack itself */ -struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) -{ - struct ip_conntrack_expect *new; - - new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); - if (!new) { - DEBUGP("expect_related: OOM allocating expect\n"); - return NULL; - } - new->master = me; - atomic_set(&new->use, 1); - return new; -} - -void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) -{ - if (atomic_dec_and_test(&exp->use)) - kmem_cache_free(ip_conntrack_expect_cachep, exp); -} - -static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) -{ - atomic_inc(&exp->use); - exp->master->expecting++; - list_add(&exp->list, &ip_conntrack_expect_list); - - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; - exp->timeout.function = expectation_timed_out; - exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; - add_timer(&exp->timeout); - - exp->id = ++ip_conntrack_expect_next_id; - atomic_inc(&exp->use); - CONNTRACK_STAT_INC(expect_create); -} - -/* Race with expectations being used means we could have none to find; OK. */ -static void evict_oldest_expect(struct ip_conntrack *master) -{ - struct ip_conntrack_expect *i; - - list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { - if (i->master == master) { - if (del_timer(&i->timeout)) { - ip_ct_unlink_expect(i); - ip_conntrack_expect_put(i); - } - break; - } - } -} - -static inline int refresh_timer(struct ip_conntrack_expect *i) -{ - if (!del_timer(&i->timeout)) - return 0; - - i->timeout.expires = jiffies + i->master->helper->timeout*HZ; - add_timer(&i->timeout); - return 1; -} - -int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) -{ - struct ip_conntrack_expect *i; - int ret; - - DEBUGP("ip_conntrack_expect_related %p\n", related_to); - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - - write_lock_bh(&ip_conntrack_lock); - list_for_each_entry(i, &ip_conntrack_expect_list, list) { - if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; - } - } else if (expect_clash(i, expect)) { - ret = -EBUSY; - goto out; - } - } - - /* Will be over limit? */ - if (expect->master->helper->max_expected && - expect->master->expecting >= expect->master->helper->max_expected) - evict_oldest_expect(expect->master); - - ip_conntrack_expect_insert(expect); - ip_conntrack_expect_event(IPEXP_NEW, expect); - ret = 0; -out: - write_unlock_bh(&ip_conntrack_lock); - return ret; -} - -/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is - implicitly racy: see __ip_conntrack_confirm */ -void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, - const struct ip_conntrack_tuple *newreply) -{ - write_lock_bh(&ip_conntrack_lock); - /* Should be unconfirmed, so not in hash table yet */ - IP_NF_ASSERT(!is_confirmed(conntrack)); - - DEBUGP("Altering reply tuple of %p to ", conntrack); - DUMP_TUPLE(newreply); - - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = __ip_conntrack_helper_find(newreply); - write_unlock_bh(&ip_conntrack_lock); -} - -int ip_conntrack_helper_register(struct ip_conntrack_helper *me) -{ - BUG_ON(me->timeout == 0); - write_lock_bh(&ip_conntrack_lock); - list_add(&me->list, &helpers); - write_unlock_bh(&ip_conntrack_lock); - - return 0; -} - -struct ip_conntrack_helper * -__ip_conntrack_helper_find_byname(const char *name) -{ - struct ip_conntrack_helper *h; - - list_for_each_entry(h, &helpers, list) { - if (!strcmp(h->name, name)) - return h; - } - - return NULL; -} - -static inline void unhelp(struct ip_conntrack_tuple_hash *i, - const struct ip_conntrack_helper *me) -{ - if (tuplehash_to_ctrack(i)->helper == me) { - ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); - tuplehash_to_ctrack(i)->helper = NULL; - } -} - -void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) -{ - unsigned int i; - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_expect *exp, *tmp; - - /* Need write lock here, to delete helper. */ - write_lock_bh(&ip_conntrack_lock); - list_del(&me->list); - - /* Get rid of expectations */ - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { - if (exp->master->helper == me && del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - /* Get rid of expecteds, set helpers to NULL. */ - list_for_each_entry(h, &unconfirmed, list) - unhelp(h, me); - for (i = 0; i < ip_conntrack_htable_size; i++) { - list_for_each_entry(h, &ip_conntrack_hash[i], list) - unhelp(h, me); - } - write_unlock_bh(&ip_conntrack_lock); - - /* Someone could be still looking at the helper in a bh. */ - synchronize_net(); -} - -/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ -void __ip_ct_refresh_acct(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - unsigned long extra_jiffies, - int do_acct) -{ - int event = 0; - - IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); - IP_NF_ASSERT(skb); - - write_lock_bh(&ip_conntrack_lock); - - /* Only update if this is not a fixed timeout */ - if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { - write_unlock_bh(&ip_conntrack_lock); - return; - } - - /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) { - ct->timeout.expires = extra_jiffies; - event = IPCT_REFRESH; - } else { - /* Need del_timer for race avoidance (may already be dying). 
*/ - if (del_timer(&ct->timeout)) { - ct->timeout.expires = jiffies + extra_jiffies; - add_timer(&ct->timeout); - event = IPCT_REFRESH; - } - } - -#ifdef CONFIG_IP_NF_CT_ACCT - if (do_acct) { - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - ntohs(skb->nh.iph->tot_len); - if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) - || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) - event |= IPCT_COUNTER_FILLING; - } -#endif - - write_unlock_bh(&ip_conntrack_lock); - - /* must be unlocked when calling event cache */ - if (event) - ip_conntrack_event_cache(event, skb); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be - * in ip_conntrack_core, since we don't want the protocols to autoload - * or depend on ctnetlink */ -int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16), - &tuple->src.u.tcp.port); - NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16), - &tuple->dst.u.tcp.port); - return 0; - -nfattr_failure: - return -1; -} - -int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], - struct ip_conntrack_tuple *t) -{ - if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) - return -EINVAL; - - t->src.u.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); - t->dst.u.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); - - return 0; -} -#endif - -/* Returns new sk_buff, or NULL */ -struct sk_buff * -ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - skb_orphan(skb); - - local_bh_disable(); - skb = ip_defrag(skb, user); - local_bh_enable(); - - if (skb) - ip_send_check(skb->nh.iph); - return skb; -} - -/* Used by ipt_REJECT. */ -static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - /* This ICMP is in reverse direction to the packet which caused it */ - ct = ip_conntrack_get(skb, &ctinfo); - - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; - else - ctinfo = IP_CT_RELATED; - - /* Attach to new skbuff, and increment count */ - nskb->nfct = &ct->ct_general; - nskb->nfctinfo = ctinfo; - nf_conntrack_get(nskb->nfct); -} - -/* Bring out ya dead! */ -static struct ip_conntrack * -get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), - void *data, unsigned int *bucket) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack *ct; - - write_lock_bh(&ip_conntrack_lock); - for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { - list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) { - ct = tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } - } - list_for_each_entry(h, &unconfirmed, list) { - ct = tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - write_unlock_bh(&ip_conntrack_lock); - return NULL; - -found: - atomic_inc(&ct->ct_general.use); - write_unlock_bh(&ip_conntrack_lock); - return ct; -} - -void -ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) -{ - struct ip_conntrack *ct; - unsigned int bucket = 0; - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); - /* ... else the timer will get him soon. 
*/ - - ip_conntrack_put(ct); - } -} - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. */ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - struct inet_sock *inet = inet_sk(sk); - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - - IP_CT_TUPLE_U_BLANK(&tuple); - tuple.src.ip = inet->rcv_saddr; - tuple.src.u.tcp.port = inet->sport; - tuple.dst.ip = inet->daddr; - tuple.dst.u.tcp.port = inet->dport; - tuple.dst.protonum = IPPROTO_TCP; - - /* We only do TCP at the moment: is there a better way? */ - if (strcmp(sk->sk_prot->name, "TCP")) { - DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = ip_conntrack_find_get(&tuple, NULL); - if (h) { - struct sockaddr_in sin; - struct ip_conntrack *ct = tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", - NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); - ip_conntrack_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", - NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), - NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = &getorigdst, -}; - -static int kill_all(struct ip_conntrack *i, void *data) -{ - return 1; -} - -void ip_conntrack_flush(void) -{ - ip_ct_iterate_cleanup(kill_all, NULL); -} - -static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) -{ - if (vmalloced) - vfree(hash); - else - free_pages((unsigned long)hash, - get_order(sizeof(struct list_head) * size)); -} - -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - rcu_assign_pointer(ip_ct_attach, NULL); - - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... 
*/ - synchronize_net(); - - ip_ct_event_cache_flush(); - i_see_dead_people: - ip_conntrack_flush(); - if (atomic_read(&ip_conntrack_count) != 0) { - schedule(); - goto i_see_dead_people; - } - /* wait until all references to ip_conntrack_untracked are dropped */ - while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) - schedule(); - - kmem_cache_destroy(ip_conntrack_cachep); - kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, - ip_conntrack_htable_size); - nf_unregister_sockopt(&so_getorigdst); -} - -static struct list_head *alloc_hashtable(int size, int *vmalloced) -{ - struct list_head *hash; - unsigned int i; - - *vmalloced = 0; - hash = (void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - * size)); - if (!hash) { - *vmalloced = 1; - printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct list_head) * size); - } - - if (hash) - for (i = 0; i < size; i++) - INIT_LIST_HEAD(&hash[i]); - - return hash; -} - -static int set_hashsize(const char *val, struct kernel_param *kp) -{ - int i, bucket, hashsize, vmalloced; - int old_vmalloced, old_size; - int rnd; - struct list_head *hash, *old_hash; - struct ip_conntrack_tuple_hash *h; - - /* On boot, we can set this without any fancy locking. */ - if (!ip_conntrack_htable_size) - return param_set_int(val, kp); - - hashsize = simple_strtol(val, NULL, 0); - if (!hashsize) - return -EINVAL; - - hash = alloc_hashtable(hashsize, &vmalloced); - if (!hash) - return -ENOMEM; - - /* We have to rehash for the new table anyway, so we also can - * use a new random seed */ - get_random_bytes(&rnd, 4); - - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) { - while (!list_empty(&ip_conntrack_hash[i])) { - h = list_entry(ip_conntrack_hash[i].next, - struct ip_conntrack_tuple_hash, list); - list_del(&h->list); - bucket = __hash_conntrack(&h->tuple, hashsize, rnd); - list_add_tail(&h->list, &hash[bucket]); - } - } - old_size = ip_conntrack_htable_size; - old_vmalloced = ip_conntrack_vmalloc; - old_hash = ip_conntrack_hash; - - ip_conntrack_htable_size = hashsize; - ip_conntrack_vmalloc = vmalloced; - ip_conntrack_hash = hash; - ip_conntrack_hash_rnd = rnd; - write_unlock_bh(&ip_conntrack_lock); - - free_conntrack_hash(old_hash, old_vmalloced, old_size); - return 0; -} - -module_param_call(hashsize, set_hashsize, param_get_uint, - &ip_conntrack_htable_size, 0600); - -int __init ip_conntrack_init(void) -{ - unsigned int i; - int ret; - - /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB - * machine has 256 buckets. >= 1GB machines have 8192 buckets. 
*/ - if (!ip_conntrack_htable_size) { - ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - ip_conntrack_htable_size = 8192; - if (ip_conntrack_htable_size < 16) - ip_conntrack_htable_size = 16; - } - ip_conntrack_max = 8 * ip_conntrack_htable_size; - - printk("ip_conntrack version %s (%u buckets, %d max)" - " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, - ip_conntrack_htable_size, ip_conntrack_max, - sizeof(struct ip_conntrack)); - - ret = nf_register_sockopt(&so_getorigdst); - if (ret != 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); - return ret; - } - - ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, - &ip_conntrack_vmalloc); - if (!ip_conntrack_hash) { - printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); - goto err_unreg_sockopt; - } - - ip_conntrack_cachep = kmem_cache_create("ip_conntrack", - sizeof(struct ip_conntrack), 0, - 0, NULL, NULL); - if (!ip_conntrack_cachep) { - printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); - goto err_free_hash; - } - - ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", - sizeof(struct ip_conntrack_expect), - 0, 0, NULL, NULL); - if (!ip_conntrack_expect_cachep) { - printk(KERN_ERR "Unable to create ip_expect slab cache\n"); - goto err_free_conntrack_slab; - } - - /* Don't NEED lock here, but good form anyway. */ - write_lock_bh(&ip_conntrack_lock); - for (i = 0; i < MAX_IP_CT_PROTO; i++) - rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol); - /* Sew in builtin protocols. */ - rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp); - rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp); - rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp); - write_unlock_bh(&ip_conntrack_lock); - - /* For use by ipt_REJECT */ - rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach); - - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&ip_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); - - return ret; - -err_free_conntrack_slab: - kmem_cache_destroy(ip_conntrack_cachep); -err_free_hash: - free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, - ip_conntrack_htable_size); -err_unreg_sockopt: - nf_unregister_sockopt(&so_getorigdst); - - return -ENOMEM; -} diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c deleted file mode 100644 index 1faa68ab943..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ /dev/null @@ -1,520 +0,0 @@ -/* FTP extension for IP connection tracking. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/ctype.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> -#include <linux/moduleparam.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp connection tracking helper"); - -/* This is slow, but it's simple. --RR */ -static char *ftp_buffer; -static DEFINE_SPINLOCK(ip_ftp_lock); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); - -static int loose; -module_param(loose, bool, 0600); - -unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq); -EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -static int try_rfc959(const char *, size_t, u_int32_t [], char); -static int try_eprt(const char *, size_t, u_int32_t [], char); -static int try_epsv_response(const char *, size_t, u_int32_t [], char); - -static const struct ftp_search { - const char *pattern; - size_t plen; - char skip; - char term; - enum ip_ct_ftp_type ftptype; - int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[IP_CT_DIR_MAX][2] = { - [IP_CT_DIR_ORIGINAL] = { - { - .pattern = "PORT", - .plen = sizeof("PORT") - 1, - .skip = ' ', - .term = '\r', - .ftptype = IP_CT_FTP_PORT, - .getnum = try_rfc959, - }, - { - .pattern = "EPRT", - .plen = sizeof("EPRT") - 1, - .skip = ' ', - .term = '\r', - .ftptype = IP_CT_FTP_EPRT, - .getnum = try_eprt, - }, - }, - [IP_CT_DIR_REPLY] = { - { - .pattern = "227 ", - .plen = sizeof("227 ") - 1, - .skip = '(', - .term = ')', - .ftptype = IP_CT_FTP_PASV, - .getnum = try_rfc959, - }, - { - .pattern = "229 ", - .plen = sizeof("229 ") - 1, - .skip = '(', - .term = ')', - .ftptype = IP_CT_FTP_EPSV, - .getnum = try_epsv_response, - }, - }, -}; - -static int try_number(const char *data, size_t dlen, u_int32_t array[], - int array_size, char sep, char term) -{ - u_int32_t i, len; - - memset(array, 0, sizeof(array[0])*array_size); - - /* Keep data pointing at next char. */ - for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { - if (*data >= '0' && *data <= '9') { - array[i] = array[i]*10 + *data - '0'; - } - else if (*data == sep) - i++; - else { - /* Unexpected character; true if it's the - terminator and we're finished. */ - if (*data == term && i == array_size - 1) - return len; - - DEBUGP("Char %u (got %u nums) `%u' unexpected\n", - len, i, *data); - return 0; - } - } - DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); - - return 0; -} - -/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ -static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - return try_number(data, dlen, array, 6, ',', term); -} - -/* Grab port: number up to delimiter */ -static int get_port(const char *data, int start, size_t dlen, char delim, - u_int32_t array[2]) -{ - u_int16_t port = 0; - int i; - - for (i = start; i < dlen; i++) { - /* Finished? 
*/ - if (data[i] == delim) { - if (port == 0) - break; - array[0] = port >> 8; - array[1] = port; - return i + 1; - } - else if (data[i] >= '0' && data[i] <= '9') - port = port*10 + data[i] - '0'; - else /* Some other crap */ - break; - } - return 0; -} - -/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */ -static int try_eprt(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - char delim; - int length; - - /* First character is delimiter, then "1" for IPv4, then - delimiter again. */ - if (dlen <= 3) return 0; - delim = data[0]; - if (isdigit(delim) || delim < 33 || delim > 126 - || data[1] != '1' || data[2] != delim) - return 0; - - DEBUGP("EPRT: Got |1|!\n"); - /* Now we have IP address. */ - length = try_number(data + 3, dlen - 3, array, 4, '.', delim); - if (length == 0) - return 0; - - DEBUGP("EPRT: Got IP address!\n"); - /* Start offset includes initial "|1|", and trailing delimiter */ - return get_port(data, 3 + length + 1, dlen, delim, array+4); -} - -/* Returns 0, or length of numbers: |||6446| */ -static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6], - char term) -{ - char delim; - - /* Three delimiters. */ - if (dlen <= 3) return 0; - delim = data[0]; - if (isdigit(delim) || delim < 33 || delim > 126 - || data[1] != delim || data[2] != delim) - return 0; - - return get_port(data, 3, dlen, delim, array+4); -} - -/* Return 1 for match, 0 for accept, -1 for partial. */ -static int find_pattern(const char *data, size_t dlen, - const char *pattern, size_t plen, - char skip, char term, - unsigned int *numoff, - unsigned int *numlen, - u_int32_t array[6], - int (*getnum)(const char *, size_t, u_int32_t[], char)) -{ - size_t i; - - DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); - if (dlen == 0) - return 0; - - if (dlen <= plen) { - /* Short packet: try for partial? */ - if (strnicmp(data, pattern, dlen) == 0) - return -1; - else return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { -#if 0 - size_t i; - - DEBUGP("ftp: string mismatch\n"); - for (i = 0; i < plen; i++) { - DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", - i, data[i], data[i], - pattern[i], pattern[i]); - } -#endif - return 0; - } - - DEBUGP("Pattern matches!\n"); - /* Now we've found the constant string, try to skip - to the 'skip' character */ - for (i = plen; data[i] != skip; i++) - if (i == dlen - 1) return -1; - - /* Skip over the last character */ - i++; - - DEBUGP("Skipped up to `%c'!\n", skip); - - *numoff = i; - *numlen = getnum(data + i, dlen - i, array, term); - if (!*numlen) - return -1; - - DEBUGP("Match succeeded!\n"); - return 1; -} - -/* Look up to see if we're just after a \n. */ -static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) -{ - unsigned int i; - - for (i = 0; i < info->seq_aft_nl_num[dir]; i++) - if (info->seq_aft_nl[dir][i] == seq) - return 1; - return 0; -} - -/* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, - struct sk_buff *skb) -{ - unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; - - /* Look for oldest: if we find exact match, we're done. 
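The parsers deleted above share one contract: try_number() fills an array of decimal fields split by a separator and closed by a terminator, returning the number of bytes consumed (0 on failure), and try_rfc959() instantiates it for the `192,168,1,1,5,6` form of PORT commands and 227 replies. A minimal re-implementation of that contract, not the kernel code itself:

    #include <stdio.h>
    #include <string.h>

    static int parse_csv_numbers(const char *data, size_t dlen,
                                 unsigned int array[], int array_size,
                                 char sep, char term)
    {
        size_t len;
        int i = 0;

        memset(array, 0, sizeof(array[0]) * array_size);
        for (len = 0; len < dlen && i < array_size; len++, data++) {
            if (*data >= '0' && *data <= '9')
                array[i] = array[i] * 10 + (unsigned int)(*data - '0');
            else if (*data == sep)
                i++;
            else    /* terminator is only valid after the last field */
                return (*data == term && i == array_size - 1) ? (int)len : 0;
        }
        return 0;   /* ran out of data or of fields */
    }

    int main(void)
    {
        const char *pasv = "192,168,1,1,5,6)";   /* 227-style payload */
        unsigned int a[6];

        if (parse_csv_numbers(pasv, strlen(pasv), a, 6, ',', ')'))
            printf("%u.%u.%u.%u port %u\n",
                   a[0], a[1], a[2], a[3], a[4] * 256 + a[5]);
        return 0;
    }

get_port() and try_eprt()/try_epsv_response() above handle the |1|ip|port| and |||port| EPRT/EPSV variants under the same return convention.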
*/ - for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { - if (info->seq_aft_nl[dir][i] == nl_seq) - return; - - if (oldest == info->seq_aft_nl_num[dir] - || before(info->seq_aft_nl[dir][i], oldest)) - oldest = i; - } - - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { - info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); - } else if (oldest != NUM_SEQ_TO_REMEMBER) { - info->seq_aft_nl[dir][oldest] = nl_seq; - ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); - } -} - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff, datalen; - struct tcphdr _tcph, *th; - char *fb_ptr; - int ret; - u32 seq, array[6] = { 0 }; - int dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff; - struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info; - struct ip_conntrack_expect *exp; - unsigned int i; - int found = 0, ends_in_nl; - typeof(ip_nat_ftp_hook) ip_nat_ftp; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); - return NF_ACCEPT; - } - - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return NF_ACCEPT; - - dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; - /* No data? */ - if (dataoff >= (*pskb)->len) { - DEBUGP("ftp: pskblen = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - datalen = (*pskb)->len - dataoff; - - spin_lock_bh(&ip_ftp_lock); - fb_ptr = skb_header_pointer(*pskb, dataoff, - (*pskb)->len - dataoff, ftp_buffer); - BUG_ON(fb_ptr == NULL); - - ends_in_nl = (fb_ptr[datalen - 1] == '\n'); - seq = ntohl(th->seq) + datalen; - - /* Look up to see if we're just after a \n. */ - if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { - /* Now if this ends in \n, update ftp info. */ - DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", - ct_ftp_info->seq_aft_nl[0][dir] - old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl); - ret = NF_ACCEPT; - goto out_update_nl; - } - - /* Initialize IP array to expected address (it's not mentioned - in EPSV responses) */ - array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF; - array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF; - array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; - array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - - for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { - found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[dir][i].pattern, - search[dir][i].plen, - search[dir][i].skip, - search[dir][i].term, - &matchoff, &matchlen, - array, - search[dir][i].getnum); - if (found) break; - } - if (found == -1) { - /* We don't usually drop packets. After all, this is - connection tracking, not packet filtering. - However, it is necessary for accurate tracking in - this case. 
*/ - if (net_ratelimit()) - printk("conntrack_ftp: partial %s %u+%u\n", - search[dir][i].pattern, - ntohl(th->seq), datalen); - ret = NF_DROP; - goto out; - } else if (found == 0) { /* No match */ - ret = NF_ACCEPT; - goto out_update_nl; - } - - DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n", - fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); - - /* Allocate expectation which will be inserted */ - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - /* We refer to the reverse direction ("!dir") tuples here, - * because we're expecting something in the other direction. - * Doesn't matter unless NAT is happening. */ - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - - if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) - != ct->tuplehash[dir].tuple.src.ip) { - /* Enrico Scholz's passive FTP to partially RNAT'd ftp - server: it really wants us to connect to a - different IP address. Simply don't record it for - NAT. */ - DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", - array[0], array[1], array[2], array[3], - NIPQUAD(ct->tuplehash[dir].tuple.src.ip)); - - /* Thanks to Cristiano Lincoln Mattos - <lincoln@cesar.org.br> for reporting this potential - problem (DMZ machines opening holes to internal - networks, or the packet filter itself). */ - if (!loose) { - ret = NF_ACCEPT; - goto out_put_expect; - } - exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) - | (array[2] << 8) | array[3]); - } - - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]); - exp->tuple.src.u.tcp.port = 0; /* Don't care. */ - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask = ((struct ip_conntrack_tuple) - { { htonl(0xFFFFFFFF), { 0 } }, - { htonl(0xFFFFFFFF), { .tcp = { htons(0xFFFF) } }, 0xFF }}); - - exp->expectfn = NULL; - exp->flags = 0; - - /* Now, NAT might want to mangle the packet, and register the - * (possibly changed) expectation itself. */ - ip_nat_ftp = rcu_dereference(ip_nat_ftp_hook); - if (ip_nat_ftp) - ret = ip_nat_ftp(pskb, ctinfo, search[dir][i].ftptype, - matchoff, matchlen, exp, &seq); - else { - /* Can't expect this? Best to drop packet now. */ - if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - else - ret = NF_ACCEPT; - } - -out_put_expect: - ip_conntrack_expect_put(exp); - -out_update_nl: - /* Now if this ends in \n, update ftp info. Seq may have been - * adjusted by NAT code. 
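Once a command matches, help() above folds the six parsed numbers into an expectation endpoint: bytes 0-3 become a network-order IPv4 address, bytes 4-5 a 16-bit TCP port. The same folding in isolation, assuming the byte-order helpers from <arpa/inet.h>:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* six fields as produced by the parser: a.b.c.d, hi, lo */
        unsigned int a[6] = { 192, 168, 1, 1, 5, 6 };
        uint32_t ip = htonl((a[0] << 24) | (a[1] << 16) |
                            (a[2] << 8) | a[3]);
        uint16_t port = htons(a[4] << 8 | a[5]);

        printf("expect 0x%08x:%u\n",
               (unsigned)ntohl(ip), (unsigned)ntohs(port));
        return 0;
    }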
*/ - if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir, *pskb); - out: - spin_unlock_bh(&ip_ftp_lock); - return ret; -} - -static struct ip_conntrack_helper ftp[MAX_PORTS]; -static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; - -/* Not __exit: called from init() */ -static void ip_conntrack_ftp_fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&ftp[i]); - } - - kfree(ftp_buffer); -} - -static int __init ip_conntrack_ftp_init(void) -{ - int i, ret; - char *tmpname; - - ftp_buffer = kmalloc(65536, GFP_KERNEL); - if (!ftp_buffer) - return -ENOMEM; - - if (ports_c == 0) - ports[ports_c++] = FTP_PORT; - - for (i = 0; i < ports_c; i++) { - ftp[i].tuple.src.u.tcp.port = htons(ports[i]); - ftp[i].tuple.dst.protonum = IPPROTO_TCP; - ftp[i].mask.src.u.tcp.port = htons(0xFFFF); - ftp[i].mask.dst.protonum = 0xFF; - ftp[i].max_expected = 1; - ftp[i].timeout = 5 * 60; /* 5 minutes */ - ftp[i].me = THIS_MODULE; - ftp[i].help = help; - - tmpname = &ftp_names[i][0]; - if (ports[i] == FTP_PORT) - sprintf(tmpname, "ftp"); - else - sprintf(tmpname, "ftp-%d", ports[i]); - ftp[i].name = tmpname; - - DEBUGP("ip_ct_ftp: registering helper for port %d\n", - ports[i]); - ret = ip_conntrack_helper_register(&ftp[i]); - - if (ret) { - ip_conntrack_ftp_fini(); - return ret; - } - } - return 0; -} - -module_init(ip_conntrack_ftp_init); -module_exit(ip_conntrack_ftp_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c deleted file mode 100644 index 53eb365ccc7..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ /dev/null @@ -1,1841 +0,0 @@ -/* - * H.323 connection tracking helper - * - * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> - * - * This source code is licensed under General Public License version 2. - * - * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * - * For more information, please see http://nath323.sourceforge.net/ - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> -#include <linux/netfilter_ipv4/ip_conntrack_h323.h> -#include <linux/moduleparam.h> -#include <linux/ctype.h> -#include <linux/inet.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
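The init loop deleted above registers one helper per configured port and names them so that the default port keeps the bare name while extra ports get a "ftp-<port>" suffix sized for the worst case. A sketch of just the naming, with the ports[] values as made-up examples:

    #include <stdio.h>

    #define FTP_PORT 21

    int main(void)
    {
        unsigned short ports[] = { FTP_PORT, 2121 };   /* example config */
        char names[2][sizeof("ftp-65535")];            /* worst case fits */
        int i;

        for (i = 0; i < 2; i++) {
            if (ports[i] == FTP_PORT)
                sprintf(names[i], "ftp");
            else
                sprintf(names[i], "ftp-%d", ports[i]);
            printf("helper: %s\n", names[i]);
        }
        return 0;
    }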
-#endif - -/* Parameters */ -static unsigned int default_rrq_ttl = 300; -module_param(default_rrq_ttl, uint, 0600); -MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ"); - -static int gkrouted_only = 1; -module_param(gkrouted_only, int, 0600); -MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); - -static int callforward_filter = 1; -module_param(callforward_filter, bool, 0600); -MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " - "if both endpoints are on different sides " - "(determined by routing information)"); - -/* Hooks for NAT */ -int (*set_h245_addr_hook) (struct sk_buff ** pskb, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - __be32 ip, u_int16_t port); -int (*set_h225_addr_hook) (struct sk_buff ** pskb, - unsigned char **data, int dataoff, - TransportAddress * addr, - __be32 ip, u_int16_t port); -int (*set_sig_addr_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count); -int (*set_ras_addr_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count); -int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - u_int16_t port, u_int16_t rtp_port, - struct ip_conntrack_expect * rtp_exp, - struct ip_conntrack_expect * rtcp_exp); -int (*nat_t120_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_h245_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_callforwarding_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect * exp); -int (*nat_q931_hook) (struct sk_buff ** pskb, - struct ip_conntrack * ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, TransportAddress * addr, int idx, - u_int16_t port, struct ip_conntrack_expect * exp); - - -static DEFINE_SPINLOCK(ip_h323_lock); -static char *h323_buffer; - -/****************************************************************************/ -static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int *datalen, int *dataoff) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - struct tcphdr _tcph, *th; - int tcpdatalen; - int tcpdataoff; - unsigned char *tpkt; - int tpktlen; - int tpktoff; - - /* Get TCP header */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - /* Get TCP data offset */ - tcpdataoff = (*pskb)->nh.iph->ihl * 4 + th->doff * 4; - - /* Get TCP data length */ - tcpdatalen = (*pskb)->len - tcpdataoff; - if (tcpdatalen <= 0) /* No TCP data */ - goto clear_out; - - if (*data == NULL) { /* first TPKT */ - /* Get first TPKT pointer */ - tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen, - h323_buffer); - BUG_ON(tpkt == NULL); - - /* Validate TPKT identifier 
*/ - if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { - /* Netmeeting sends TPKT header and data separately */ - if (info->tpkt_len[dir] > 0) { - DEBUGP("ip_ct_h323: previous packet " - "indicated separate TPKT data of %hu " - "bytes\n", info->tpkt_len[dir]); - if (info->tpkt_len[dir] <= tcpdatalen) { - /* Yes, there was a TPKT header - * received */ - *data = tpkt; - *datalen = info->tpkt_len[dir]; - *dataoff = 0; - goto out; - } - - /* Fragmented TPKT */ - if (net_ratelimit()) - printk("ip_ct_h323: " - "fragmented TPKT\n"); - goto clear_out; - } - - /* It is not even a TPKT */ - return 0; - } - tpktoff = 0; - } else { /* Next TPKT */ - tpktoff = *dataoff + *datalen; - tcpdatalen -= tpktoff; - if (tcpdatalen <= 4) /* No more TPKT */ - goto clear_out; - tpkt = *data + *datalen; - - /* Validate TPKT identifier */ - if (tpkt[0] != 0x03 || tpkt[1] != 0) - goto clear_out; - } - - /* Validate TPKT length */ - tpktlen = tpkt[2] * 256 + tpkt[3]; - if (tpktlen < 4) - goto clear_out; - if (tpktlen > tcpdatalen) { - if (tcpdatalen == 4) { /* Separate TPKT header */ - /* Netmeeting sends TPKT header and data separately */ - DEBUGP("ip_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", - tpktlen - 4); - info->tpkt_len[dir] = tpktlen - 4; - return 0; - } - - if (net_ratelimit()) - printk("ip_ct_h323: incomplete TPKT (fragmented?)\n"); - goto clear_out; - } - - /* This is the encapsulated data */ - *data = tpkt + 4; - *datalen = tpktlen - 4; - *dataoff = tpktoff + 4; - - out: - /* Clear TPKT length */ - info->tpkt_len[dir] = 0; - return 1; - - clear_out: - info->tpkt_len[dir] = 0; - return 0; -} - -/****************************************************************************/ -static int get_h245_addr(unsigned char *data, H245_TransportAddress * addr, - __be32 * ip, u_int16_t * port) -{ - unsigned char *p; - - if (addr->choice != eH245_TransportAddress_unicastAddress || - addr->unicastAddress.choice != eUnicastAddress_iPAddress) - return 0; - - p = data + addr->unicastAddress.iPAddress.network; - *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3])); - *port = (p[4] << 8) | (p[5]); - - return 1; -} - -/****************************************************************************/ -static int expect_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - u_int16_t rtp_port; - struct ip_conntrack_expect *rtp_exp; - struct ip_conntrack_expect *rtcp_exp; - typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; - - /* Read RTP or RTCP address */ - if (!get_h245_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* RTP port is even */ - rtp_port = port & (~1); - - /* Create expect for RTP */ - if ((rtp_exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - rtp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - rtp_exp->tuple.src.u.udp.port = 0; - rtp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - rtp_exp->tuple.dst.u.udp.port = htons(rtp_port); - rtp_exp->tuple.dst.protonum = IPPROTO_UDP; - rtp_exp->mask.src.ip = htonl(0xFFFFFFFF); - rtp_exp->mask.src.u.udp.port = 0; - rtp_exp->mask.dst.ip = htonl(0xFFFFFFFF); - rtp_exp->mask.dst.u.udp.port = htons(0xFFFF); - rtp_exp->mask.dst.protonum = 0xFF; - rtp_exp->flags = 0; - - /* Create expect for RTCP */ - if ((rtcp_exp = ip_conntrack_expect_alloc(ct)) == NULL) { - 
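get_tpkt_data() above walks TPKT records: each starts with 0x03 0x00 followed by a 16-bit big-endian total length, and the H.225/H.245 payload begins 4 bytes in. The framing check in isolation (the fragment handling and the NetMeeting split-header case are omitted):

    #include <stddef.h>
    #include <stdio.h>

    static int tpkt_payload(const unsigned char *buf, size_t len,
                            size_t *payload_off, size_t *payload_len)
    {
        size_t tpktlen;

        if (len < 4 || buf[0] != 0x03 || buf[1] != 0)
            return 0;               /* not a TPKT */
        tpktlen = buf[2] * 256 + buf[3];
        if (tpktlen < 4 || tpktlen > len)
            return 0;               /* bogus or fragmented */
        *payload_off = 4;
        *payload_len = tpktlen - 4;
        return 1;
    }

    int main(void)
    {
        unsigned char pkt[] = { 0x03, 0x00, 0x00, 0x08, 1, 2, 3, 4 };
        size_t off, plen;

        if (tpkt_payload(pkt, sizeof(pkt), &off, &plen))
            printf("payload %zu bytes at offset %zu\n", plen, off);
        return 0;
    }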
ip_conntrack_expect_put(rtp_exp); - return -1; - } - rtcp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - rtcp_exp->tuple.src.u.udp.port = 0; - rtcp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - rtcp_exp->tuple.dst.u.udp.port = htons(rtp_port + 1); - rtcp_exp->tuple.dst.protonum = IPPROTO_UDP; - rtcp_exp->mask.src.ip = htonl(0xFFFFFFFF); - rtcp_exp->mask.src.u.udp.port = 0; - rtcp_exp->mask.dst.ip = htonl(0xFFFFFFFF); - rtcp_exp->mask.dst.u.udp.port = htons(0xFFFF); - rtcp_exp->mask.dst.protonum = 0xFF; - rtcp_exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook))) { - /* NAT needed */ - ret = nat_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - addr, port, rtp_port, rtp_exp, rtcp_exp); - } else { /* Conntrack only */ - rtp_exp->expectfn = NULL; - rtcp_exp->expectfn = NULL; - - if (ip_conntrack_expect_related(rtp_exp) == 0) { - if (ip_conntrack_expect_related(rtcp_exp) == 0) { - DEBUGP("ip_ct_h323: expect RTP " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtp_exp->tuple.src.ip), - ntohs(rtp_exp->tuple.src.u.udp.port), - NIPQUAD(rtp_exp->tuple.dst.ip), - ntohs(rtp_exp->tuple.dst.u.udp.port)); - DEBUGP("ip_ct_h323: expect RTCP " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtcp_exp->tuple.src.ip), - ntohs(rtcp_exp->tuple.src.u.udp.port), - NIPQUAD(rtcp_exp->tuple.dst.ip), - ntohs(rtcp_exp->tuple.dst.u.udp.port)); - } else { - ip_conntrack_unexpect_related(rtp_exp); - ret = -1; - } - } else - ret = -1; - } - - ip_conntrack_expect_put(rtp_exp); - ip_conntrack_expect_put(rtcp_exp); - - return ret; -} - -/****************************************************************************/ -static int expect_t120(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_t120_hook) nat_t120; - - /* Read T.120 address */ - if (!get_h245_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* Create expect for T.120 connections */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple channels */ - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_t120 = rcu_dereference(nat_t120_hook))) { - /* NAT needed */ - ret = nat_t120(pskb, ct, ctinfo, data, dataoff, addr, - port, exp); - } else { /* Conntrack only */ - exp->expectfn = NULL; - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_h323: expect T.120 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_h245_channel(struct sk_buff **pskb, - 
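expect_rtp_rtcp() above derives both expectations from one signalled port: RTP takes the even port, RTCP the odd one above it, so a single H.245 address yields two related connections. The pairing rule by itself, with a made-up port:

    #include <stdio.h>

    int main(void)
    {
        unsigned short signalled = 49171;       /* example port */
        unsigned short rtp  = signalled & ~1;   /* 49170: even */
        unsigned short rtcp = rtp + 1;          /* 49171: odd  */

        printf("rtp=%u rtcp=%u\n", rtp, rtcp);
        return 0;
    }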
struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H2250LogicalChannelParameters * channel) -{ - int ret; - - if (channel->options & eH2250LogicalChannelParameters_mediaChannel) { - /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &channel->mediaChannel); - if (ret < 0) - return -1; - } - - if (channel-> - options & eH2250LogicalChannelParameters_mediaControlChannel) { - /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &channel->mediaControlChannel); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_olc(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - OpenLogicalChannel * olc) -{ - int ret; - - DEBUGP("ip_ct_h323: OpenLogicalChannel\n"); - - if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == - eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) - { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olc-> - forwardLogicalChannelParameters. - multiplexParameters. - h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olc->options & - eOpenLogicalChannel_reverseLogicalChannelParameters) && - (olc->reverseLogicalChannelParameters.options & - eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters) - && (olc->reverseLogicalChannelParameters.multiplexParameters. - choice == - eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) - { - ret = - process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olc-> - reverseLogicalChannelParameters. - multiplexParameters. - h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olc->options & eOpenLogicalChannel_separateStack) && - olc->forwardLogicalChannelParameters.dataType.choice == - eDataType_data && - olc->forwardLogicalChannelParameters.dataType.data.application. - choice == eDataApplicationCapability_application_t120 && - olc->forwardLogicalChannelParameters.dataType.data.application. - t120.choice == eDataProtocolCapability_separateLANStack && - olc->separateStack.networkAddress.choice == - eNetworkAccessParameters_networkAddress_localAreaAddress) { - ret = expect_t120(pskb, ct, ctinfo, data, dataoff, - &olc->separateStack.networkAddress. - localAreaAddress); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_olca(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - OpenLogicalChannelAck * olca) -{ - H2250LogicalChannelAckParameters *ack; - int ret; - - DEBUGP("ip_ct_h323: OpenLogicalChannelAck\n"); - - if ((olca->options & - eOpenLogicalChannelAck_reverseLogicalChannelParameters) && - (olca->reverseLogicalChannelParameters.options & - eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters) - && (olca->reverseLogicalChannelParameters.multiplexParameters. - choice == - eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) - { - ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff, - &olca-> - reverseLogicalChannelParameters. - multiplexParameters. 
- h2250LogicalChannelParameters); - if (ret < 0) - return -1; - } - - if ((olca->options & - eOpenLogicalChannelAck_forwardMultiplexAckParameters) && - (olca->forwardMultiplexAckParameters.choice == - eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters)) - { - ack = &olca->forwardMultiplexAckParameters. - h2250LogicalChannelAckParameters; - if (ack->options & - eH2250LogicalChannelAckParameters_mediaChannel) { - /* RTP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &ack->mediaChannel); - if (ret < 0) - return -1; - } - - if (ack->options & - eH2250LogicalChannelAckParameters_mediaControlChannel) { - /* RTCP */ - ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff, - &ack->mediaControlChannel); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - MultimediaSystemControlMessage * mscm) -{ - switch (mscm->choice) { - case eMultimediaSystemControlMessage_request: - if (mscm->request.choice == - eRequestMessage_openLogicalChannel) { - return process_olc(pskb, ct, ctinfo, data, dataoff, - &mscm->request.openLogicalChannel); - } - DEBUGP("ip_ct_h323: H.245 Request %d\n", - mscm->request.choice); - break; - case eMultimediaSystemControlMessage_response: - if (mscm->response.choice == - eResponseMessage_openLogicalChannelAck) { - return process_olca(pskb, ct, ctinfo, data, dataoff, - &mscm->response. - openLogicalChannelAck); - } - DEBUGP("ip_ct_h323: H.245 Response %d\n", - mscm->response.choice); - break; - default: - DEBUGP("ip_ct_h323: H.245 signal %d\n", mscm->choice); - break; - } - - return 0; -} - -/****************************************************************************/ -static int h245_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static MultimediaSystemControlMessage mscm; - unsigned char *data = NULL; - int datalen; - int dataoff; - int ret; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - return NF_ACCEPT; - } - DEBUGP("ip_ct_h245: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Process each TPKT */ - while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("ip_ct_h245: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode H.245 signal */ - ret = DecodeMultimediaSystemControlMessage(data, datalen, - &mscm); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_h245: decoding error: %s\n", - ret == H323_ERROR_BOUND ? 
- "out of bound" : "out of range"); - /* We don't drop when decoding error */ - break; - } - - /* Process H.245 signal */ - if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0) - goto drop; - } - - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_h245: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_h245 = { - .name = "H.245", - .me = THIS_MODULE, - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */ , - .timeout = 240, - .tuple = {.dst = {.protonum = IPPROTO_TCP}}, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}}, - .help = h245_help -}; - -/****************************************************************************/ -void ip_conntrack_h245_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_h245; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -int get_h225_addr(unsigned char *data, TransportAddress * addr, - __be32 * ip, u_int16_t * port) -{ - unsigned char *p; - - if (addr->choice != eTransportAddress_ipAddress) - return 0; - - p = data + addr->ipAddress.ip; - *ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3])); - *port = (p[4] << 8) | (p[5]); - - return 1; -} - -/****************************************************************************/ -static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_h245_hook) nat_h245; - - /* Read h245Address */ - if (!get_h225_addr(*data, addr, &ip, &port) || - ip != ct->tuplehash[dir].tuple.src.ip || port == 0) - return 0; - - /* Create expect for h245 connection */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_h245 = rcu_dereference(nat_h245_hook))) { - /* NAT needed */ - ret = nat_h245(pskb, ct, ctinfo, data, dataoff, addr, - port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_h245_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_q931: expect H.245 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/* Forwarding declaration */ -void ip_conntrack_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this); - -/****************************************************************************/ -static int expect_callforwarding(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info 
ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - typeof(nat_callforwarding_hook) nat_callforwarding; - - /* Read alternativeAddress */ - if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) - return 0; - - /* If the calling party is on the same side of the forward-to party, - * we don't need to track the second call */ - if (callforward_filter) { - struct rtable *rt1, *rt2; - struct flowi fl1 = { - .fl4_dst = ip, - }; - struct flowi fl2 = { - .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, - }; - - if (ip_route_output_key(&rt1, &fl1) == 0) { - if (ip_route_output_key(&rt2, &fl2) == 0) { - if (rt1->rt_gateway == rt2->rt_gateway && - rt1->u.dst.dev == rt2->u.dst.dev) - ret = 1; - dst_release(&rt2->u.dst); - } - dst_release(&rt1->u.dst); - } - if (ret) { - DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); - return 0; - } - } - - /* Create expect for the second call leg */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip && - (nat_callforwarding = rcu_dereference(nat_callforwarding_hook))) { - /* Need NAT */ - ret = nat_callforwarding(pskb, ct, ctinfo, data, dataoff, - addr, port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_q931: expect Call Forwarding " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Setup_UUIE * setup) -{ - int dir = CTINFO2DIR(ctinfo); - int ret; - int i; - __be32 ip; - u_int16_t port; - typeof(set_h225_addr_hook) set_h225_addr; - - DEBUGP("ip_ct_q931: Setup\n"); - - if (setup->options & eSetup_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &setup->h245Address); - if (ret < 0) - return -1; - } - - set_h225_addr = rcu_dereference(set_h225_addr_hook); - - if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && - get_h225_addr(*data, &setup->destCallSignalAddress, &ip, &port) && - ip != ct->tuplehash[!dir].tuple.src.ip) { - DEBUGP("ip_ct_q931: set destCallSignalAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.src.ip), - ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, - &setup->destCallSignalAddress, - ct->tuplehash[!dir].tuple.src.ip, - ntohs(ct->tuplehash[!dir].tuple.src. 
- u.tcp.port)); - if (ret < 0) - return -1; - } - - if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && - (set_h225_addr) && - get_h225_addr(*data, &setup->sourceCallSignalAddress, &ip, &port) - && ip != ct->tuplehash[!dir].tuple.dst.ip) { - DEBUGP("ip_ct_q931: set sourceCallSignalAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), - ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); - ret = set_h225_addr(pskb, data, dataoff, - &setup->sourceCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - ntohs(ct->tuplehash[!dir].tuple.dst. - u.tcp.port)); - if (ret < 0) - return -1; - } - - if (setup->options & eSetup_UUIE_fastStart) { - for (i = 0; i < setup->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &setup->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_callproceeding(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - CallProceeding_UUIE * callproc) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: CallProceeding\n"); - - if (callproc->options & eCallProceeding_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &callproc->h245Address); - if (ret < 0) - return -1; - } - - if (callproc->options & eCallProceeding_UUIE_fastStart) { - for (i = 0; i < callproc->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &callproc->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_connect(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Connect_UUIE * connect) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Connect\n"); - - if (connect->options & eConnect_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &connect->h245Address); - if (ret < 0) - return -1; - } - - if (connect->options & eConnect_UUIE_fastStart) { - for (i = 0; i < connect->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &connect->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_alerting(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Alerting_UUIE * alert) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Alerting\n"); - - if (alert->options & eAlerting_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &alert->h245Address); - if (ret < 0) - return -1; - } - - if (alert->options & eAlerting_UUIE_fastStart) { - for (i = 0; i < alert->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &alert->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_information(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Information_UUIE * info) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Information\n"); - - if (info->options & eInformation_UUIE_fastStart) { - for (i = 0; i < info->fastStart.count; i++) { - ret = 
process_olc(pskb, ct, ctinfo, data, dataoff, - &info->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Facility_UUIE * facility) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Facility\n"); - - if (facility->reason.choice == eFacilityReason_callForwarded) { - if (facility->options & eFacility_UUIE_alternativeAddress) - return expect_callforwarding(pskb, ct, ctinfo, data, - dataoff, - &facility-> - alternativeAddress); - return 0; - } - - if (facility->options & eFacility_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &facility->h245Address); - if (ret < 0) - return -1; - } - - if (facility->options & eFacility_UUIE_fastStart) { - for (i = 0; i < facility->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &facility->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_progress(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - Progress_UUIE * progress) -{ - int ret; - int i; - - DEBUGP("ip_ct_q931: Progress\n"); - - if (progress->options & eProgress_UUIE_h245Address) { - ret = expect_h245(pskb, ct, ctinfo, data, dataoff, - &progress->h245Address); - if (ret < 0) - return -1; - } - - if (progress->options & eProgress_UUIE_fastStart) { - for (i = 0; i < progress->fastStart.count; i++) { - ret = process_olc(pskb, ct, ctinfo, data, dataoff, - &progress->fastStart.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int process_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, Q931 * q931) -{ - H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; - int i; - int ret = 0; - - switch (pdu->h323_message_body.choice) { - case eH323_UU_PDU_h323_message_body_setup: - ret = process_setup(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.setup); - break; - case eH323_UU_PDU_h323_message_body_callProceeding: - ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body. - callProceeding); - break; - case eH323_UU_PDU_h323_message_body_connect: - ret = process_connect(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.connect); - break; - case eH323_UU_PDU_h323_message_body_alerting: - ret = process_alerting(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.alerting); - break; - case eH323_UU_PDU_h323_message_body_information: - ret = process_information(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body. 
- information); - break; - case eH323_UU_PDU_h323_message_body_facility: - ret = process_facility(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.facility); - break; - case eH323_UU_PDU_h323_message_body_progress: - ret = process_progress(pskb, ct, ctinfo, data, dataoff, - &pdu->h323_message_body.progress); - break; - default: - DEBUGP("ip_ct_q931: Q.931 signal %d\n", - pdu->h323_message_body.choice); - break; - } - - if (ret < 0) - return -1; - - if (pdu->options & eH323_UU_PDU_h245Control) { - for (i = 0; i < pdu->h245Control.count; i++) { - ret = process_h245(pskb, ct, ctinfo, data, dataoff, - &pdu->h245Control.item[i]); - if (ret < 0) - return -1; - } - } - - return 0; -} - -/****************************************************************************/ -static int q931_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static Q931 q931; - unsigned char *data = NULL; - int datalen; - int dataoff; - int ret; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - return NF_ACCEPT; - } - DEBUGP("ip_ct_q931: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Process each TPKT */ - while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) { - DEBUGP("ip_ct_q931: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode Q.931 signal */ - ret = DecodeQ931(data, datalen, &q931); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_q931: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); - /* We don't drop when decoding error */ - break; - } - - /* Process Q.931 signal */ - if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0) - goto drop; - } - - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_q931: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_q931 = { - .name = "Q.931", - .me = THIS_MODULE, - .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4 /* T.120 and H.245 */ , - .timeout = 240, - .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(Q931_PORT)}}}, - .dst = {.protonum = IPPROTO_TCP}}, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}}, - .help = q931_help -}; - -/****************************************************************************/ -void ip_conntrack_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_q931; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -static unsigned char *get_udp_data(struct sk_buff **pskb, int *datalen) -{ - struct udphdr _uh, *uh; - int dataoff; - - uh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, sizeof(_uh), - &_uh); - if (uh == NULL) - return NULL; - dataoff = (*pskb)->nh.iph->ihl * 4 + sizeof(_uh); - if (dataoff >= (*pskb)->len) - return NULL; - *datalen = (*pskb)->len - dataoff; - return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer); -} - -/****************************************************************************/ -static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct, - __be32 ip, u_int16_t port) -{ - struct 
ip_conntrack_expect *exp; - struct ip_conntrack_tuple tuple; - - tuple.src.ip = 0; - tuple.src.u.tcp.port = 0; - tuple.dst.ip = ip; - tuple.dst.u.tcp.port = htons(port); - tuple.dst.protonum = IPPROTO_TCP; - - exp = __ip_conntrack_expect_find(&tuple); - if (exp && exp->master == ct) - return exp; - return NULL; -} - -/****************************************************************************/ -static int set_expect_timeout(struct ip_conntrack_expect *exp, - unsigned timeout) -{ - if (!exp || !del_timer(&exp->timeout)) - return 0; - - exp->timeout.expires = jiffies + timeout * HZ; - add_timer(&exp->timeout); - - return 1; -} - -/****************************************************************************/ -static int expect_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - int i; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - typeof(nat_q931_hook) nat_q931; - - /* Look for the first related address */ - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && port != 0) - break; - } - - if (i >= count) /* Not found */ - return 0; - - /* Create expect for Q.931 */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = gkrouted_only ? /* only accept calls from GK? */ - ct->tuplehash[!dir].tuple.src.ip : 0; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = gkrouted_only ? htonl(0xFFFFFFFF) : 0; - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; /* Accept multiple calls */ - - nat_q931 = rcu_dereference(nat_q931_hook); - if (nat_q931) { /* Need NAT */ - ret = nat_q931(pskb, ct, ctinfo, data, addr, i, port, exp); - } else { /* Conntrack only */ - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - - /* Save port for looking up expect in processing RCF */ - info->sig_port[dir] = port; - } else - ret = -1; - } - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_grq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, GatekeeperRequest * grq) -{ - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: GRQ\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) /* NATed */ - return set_ras_addr(pskb, ct, ctinfo, data, - &grq->rasAddress, 1); - return 0; -} - -/* Declare before using */ -static void ip_conntrack_ras_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this); - -/****************************************************************************/ -static int process_gcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, GatekeeperConfirm * gcf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - 
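expect_q931() above wildcards the expectation's source when gkrouted_only is off by zeroing mask.src.ip, so any caller can match the expected Q.931 connection; masked tuple matching reduces to a bitwise compare. A sketch with made-up addresses:

    #include <stdint.h>
    #include <stdio.h>

    static int tuple_match(uint32_t pkt_src, uint32_t exp_src,
                           uint32_t src_mask)
    {
        return (pkt_src & src_mask) == (exp_src & src_mask);
    }

    int main(void)
    {
        uint32_t gk = 0xc0a80001;       /* 192.168.0.1, the gatekeeper */

        printf("gk-routed, other caller: %d\n",
               tuple_match(0xc0a80002, gk, 0xFFFFFFFF));   /* 0 */
        printf("open, other caller: %d\n",
               tuple_match(0xc0a80002, gk, 0x00000000));   /* 1 */
        return 0;
    }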
__be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - - DEBUGP("ip_ct_ras: GCF\n"); - - if (!get_h225_addr(*data, &gcf->rasAddress, &ip, &port)) - return 0; - - /* Registration port is the same as discovery port */ - if (ip == ct->tuplehash[dir].tuple.src.ip && - port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) - return 0; - - /* Avoid RAS expectation loops. A GCF is never expected. */ - if (test_bit(IPS_EXPECTED_BIT, &ct->status)) - return 0; - - /* Need new expect */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_UDP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = 0; - exp->expectfn = ip_conntrack_ras_expect; - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect RAS " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_rrq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RegistrationRequest * rrq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int ret; - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: RRQ\n"); - - ret = expect_q931(pskb, ct, ctinfo, data, - rrq->callSignalAddress.item, - rrq->callSignalAddress.count); - if (ret < 0) - return -1; - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) { - ret = set_ras_addr(pskb, ct, ctinfo, data, - rrq->rasAddress.item, - rrq->rasAddress.count); - if (ret < 0) - return -1; - } - - if (rrq->options & eRegistrationRequest_timeToLive) { - DEBUGP("ip_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); - info->timeout = rrq->timeToLive; - } else - info->timeout = default_rrq_ttl; - - return 0; -} - -/****************************************************************************/ -static int process_rcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RegistrationConfirm * rcf) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret; - struct ip_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: RCF\n"); - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - rcf->callSignalAddress.item, - rcf->callSignalAddress.count); - if (ret < 0) - return -1; - } - - if (rcf->options & eRegistrationConfirm_timeToLive) { - DEBUGP("ip_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); - info->timeout = rcf->timeToLive; - } - - if (info->timeout > 0) { - DEBUGP - ("ip_ct_ras: set RAS connection timeout to %u seconds\n", - info->timeout); - ip_ct_refresh(ct, *pskb, info->timeout * HZ); - - /* Set expect timeout */ - read_lock_bh(&ip_conntrack_lock); - exp = find_expect(ct, ct->tuplehash[dir].tuple.dst.ip, - info->sig_port[!dir]); - if (exp) { - DEBUGP("ip_ct_ras: set Q.931 expect " - "(%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu) " - "timeout to %u seconds\n", - 
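The RRQ handler above takes the registration TTL from the message when present and otherwise falls back to the default_rrq_ttl module parameter (300 s); the same policy in isolation:

    #include <stdio.h>

    #define DEFAULT_RRQ_TTL 300     /* module parameter default */

    static unsigned int ras_timeout(int has_ttl, unsigned int ttl)
    {
        return has_ttl ? ttl : DEFAULT_RRQ_TTL;
    }

    int main(void)
    {
        printf("with TTL: %u, without: %u\n",
               ras_timeout(1, 60), ras_timeout(0, 0));
        return 0;
    }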
NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port), - info->timeout); - set_expect_timeout(exp, info->timeout); - } - read_unlock_bh(&ip_conntrack_lock); - } - - return 0; -} - -/****************************************************************************/ -static int process_urq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, UnregistrationRequest * urq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int ret; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: URQ\n"); - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - urq->callSignalAddress.item, - urq->callSignalAddress.count); - if (ret < 0) - return -1; - } - - /* Clear old expect */ - ip_ct_remove_expectations(ct); - info->sig_port[dir] = 0; - info->sig_port[!dir] = 0; - - /* Give it 30 seconds for UCF or URJ */ - ip_ct_refresh(ct, *pskb, 30 * HZ); - - return 0; -} - -/****************************************************************************/ -static int process_arq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, AdmissionRequest * arq) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - __be32 ip; - u_int16_t port; - typeof(set_h225_addr_hook) set_h225_addr; - - DEBUGP("ip_ct_ras: ARQ\n"); - - set_h225_addr = rcu_dereference(set_h225_addr_hook); - if ((arq->options & eAdmissionRequest_destCallSignalAddress) && - get_h225_addr(*data, &arq->destCallSignalAddress, &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && - port == info->sig_port[dir] && set_h225_addr) { - /* Answering ARQ */ - return set_h225_addr(pskb, data, 0, - &arq->destCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - info->sig_port[!dir]); - } - - if ((arq->options & eAdmissionRequest_srcCallSignalAddress) && - get_h225_addr(*data, &arq->srcCallSignalAddress, &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && set_h225_addr) { - /* Calling ARQ */ - return set_h225_addr(pskb, data, 0, - &arq->srcCallSignalAddress, - ct->tuplehash[!dir].tuple.dst.ip, - port); - } - - return 0; -} - -/****************************************************************************/ -static int process_acf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, AdmissionConfirm * acf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: ACF\n"); - - if (!get_h225_addr(*data, &acf->destCallSignalAddress, &ip, &port)) - return 0; - - if (ip == ct->tuplehash[dir].tuple.dst.ip) { /* Answering ACF */ - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) - return set_sig_addr(pskb, ct, ctinfo, data, - &acf->destCallSignalAddress, 1); - return 0; - } - - /* Need new expect */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - 
exp->flags = IP_CT_EXPECT_PERMANENT; - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - return ret; -} - -/****************************************************************************/ -static int process_lrq(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, LocationRequest * lrq) -{ - typeof(set_ras_addr_hook) set_ras_addr; - - DEBUGP("ip_ct_ras: LRQ\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) - return set_ras_addr(pskb, ct, ctinfo, data, - &lrq->replyAddress, 1); - return 0; -} - -/****************************************************************************/ -static int process_lcf(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, LocationConfirm * lcf) -{ - int dir = CTINFO2DIR(ctinfo); - int ret = 0; - __be32 ip; - u_int16_t port; - struct ip_conntrack_expect *exp = NULL; - - DEBUGP("ip_ct_ras: LCF\n"); - - if (!get_h225_addr(*data, &lcf->callSignalAddress, &ip, &port)) - return 0; - - /* Need new expect for call signal */ - if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) - return -1; - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.tcp.port = 0; - exp->tuple.dst.ip = ip; - exp->tuple.dst.u.tcp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_TCP; - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.tcp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.tcp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - exp->flags = IP_CT_EXPECT_PERMANENT; - exp->expectfn = ip_conntrack_q931_expect; - - if (ip_conntrack_expect_related(exp) == 0) { - DEBUGP("ip_ct_ras: expect Q.931 " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), - ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), - ntohs(exp->tuple.dst.u.tcp.port)); - } else - ret = -1; - - ip_conntrack_expect_put(exp); - - /* Ignore rasAddress */ - - return ret; -} - -/****************************************************************************/ -static int process_irr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, InfoRequestResponse * irr) -{ - int ret; - typeof(set_ras_addr_hook) set_ras_addr; - typeof(set_sig_addr_hook) set_sig_addr; - - DEBUGP("ip_ct_ras: IRR\n"); - - set_ras_addr = rcu_dereference(set_ras_addr_hook); - if (set_ras_addr) { - ret = set_ras_addr(pskb, ct, ctinfo, data, - &irr->rasAddress, 1); - if (ret < 0) - return -1; - } - - set_sig_addr = rcu_dereference(set_sig_addr_hook); - if (set_sig_addr) { - ret = set_sig_addr(pskb, ct, ctinfo, data, - irr->callSignalAddress.item, - irr->callSignalAddress.count); - if (ret < 0) - return -1; - } - - return 0; -} - -/****************************************************************************/ -static int process_ras(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, RasMessage * ras) -{ - switch (ras->choice) { - case eRasMessage_gatekeeperRequest: - return process_grq(pskb, ct, ctinfo, data, - &ras->gatekeeperRequest); - case eRasMessage_gatekeeperConfirm: - return process_gcf(pskb, ct, ctinfo, data, - &ras->gatekeeperConfirm); 
- case eRasMessage_registrationRequest: - return process_rrq(pskb, ct, ctinfo, data, - &ras->registrationRequest); - case eRasMessage_registrationConfirm: - return process_rcf(pskb, ct, ctinfo, data, - &ras->registrationConfirm); - case eRasMessage_unregistrationRequest: - return process_urq(pskb, ct, ctinfo, data, - &ras->unregistrationRequest); - case eRasMessage_admissionRequest: - return process_arq(pskb, ct, ctinfo, data, - &ras->admissionRequest); - case eRasMessage_admissionConfirm: - return process_acf(pskb, ct, ctinfo, data, - &ras->admissionConfirm); - case eRasMessage_locationRequest: - return process_lrq(pskb, ct, ctinfo, data, - &ras->locationRequest); - case eRasMessage_locationConfirm: - return process_lcf(pskb, ct, ctinfo, data, - &ras->locationConfirm); - case eRasMessage_infoRequestResponse: - return process_irr(pskb, ct, ctinfo, data, - &ras->infoRequestResponse); - default: - DEBUGP("ip_ct_ras: RAS message %d\n", ras->choice); - break; - } - - return 0; -} - -/****************************************************************************/ -static int ras_help(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - static RasMessage ras; - unsigned char *data; - int datalen = 0; - int ret; - - DEBUGP("ip_ct_ras: skblen = %u\n", (*pskb)->len); - - spin_lock_bh(&ip_h323_lock); - - /* Get UDP data */ - data = get_udp_data(pskb, &datalen); - if (data == NULL) - goto accept; - DEBUGP("ip_ct_ras: RAS message %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n", - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr), datalen); - - /* Decode RAS message */ - ret = DecodeRasMessage(data, datalen, &ras); - if (ret < 0) { - if (net_ratelimit()) - printk("ip_ct_ras: decoding error: %s\n", - ret == H323_ERROR_BOUND ? - "out of bound" : "out of range"); - goto accept; - } - - /* Process RAS message */ - if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0) - goto drop; - - accept: - spin_unlock_bh(&ip_h323_lock); - return NF_ACCEPT; - - drop: - spin_unlock_bh(&ip_h323_lock); - if (net_ratelimit()) - printk("ip_ct_ras: packet dropped\n"); - return NF_DROP; -} - -/****************************************************************************/ -static struct ip_conntrack_helper ip_conntrack_helper_ras = { - .name = "RAS", - .me = THIS_MODULE, - .max_expected = 32, - .timeout = 240, - .tuple = {.src = {.u = {.tcp = {.port = __constant_htons(RAS_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}}, - .mask = {.src = {.u = {0xFFFE}}, - .dst = {.protonum = 0xFF}}, - .help = ras_help, -}; - -/****************************************************************************/ -static void ip_conntrack_ras_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - write_lock_bh(&ip_conntrack_lock); - new->helper = &ip_conntrack_helper_ras; - write_unlock_bh(&ip_conntrack_lock); -} - -/****************************************************************************/ -/* Not __exit - called from init() */ -static void fini(void) -{ - ip_conntrack_helper_unregister(&ip_conntrack_helper_ras); - ip_conntrack_helper_unregister(&ip_conntrack_helper_q931); - kfree(h323_buffer); - DEBUGP("ip_ct_h323: fini\n"); -} - -/****************************************************************************/ -static int __init init(void) -{ - int ret; - - h323_buffer = kmalloc(65536, GFP_KERNEL); - if (!h323_buffer) - return -ENOMEM; - if ((ret = ip_conntrack_helper_register(&ip_conntrack_helper_q931)) || - (ret = ip_conntrack_helper_register(&ip_conntrack_helper_ras))) { - fini(); - return ret; - } 
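One detail of the RAS helper registration above deserves a note: the source-port mask of 0xFFFE clears the lowest bit of the port, so the helper binds to a port pair rather than a single port. A standalone userspace sketch of the intended comparison (my illustration, assuming the conventional RAS_PORT value of 1719 and host-order arithmetic; the kernel compares __be16 values):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint16_t ras_port = 1719;	/* assumed value of RAS_PORT */
	const uint16_t mask = 0xFFFE;	/* the helper's source-port mask */
	const uint16_t probe[] = { 1717, 1718, 1719, 1720 };
	int i;

	for (i = 0; i < 4; i++)
		printf("udp/%u %s\n", probe[i],
		       (probe[i] & mask) == (ras_port & mask) ?
		       "matches the helper" : "does not match");
	return 0;
}

With the low bit wildcarded, udp/1718 (gatekeeper discovery) and udp/1719 (RAS) both reach the same helper, which appears to be the point of masking with 0xFFFE rather than 0xFFFF.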
- DEBUGP("ip_ct_h323: init success\n"); - return 0; -} - -/****************************************************************************/ -module_init(init); -module_exit(fini); - -EXPORT_SYMBOL_GPL(get_h225_addr); -EXPORT_SYMBOL_GPL(ip_conntrack_h245_expect); -EXPORT_SYMBOL_GPL(ip_conntrack_q931_expect); -EXPORT_SYMBOL_GPL(set_h245_addr_hook); -EXPORT_SYMBOL_GPL(set_h225_addr_hook); -EXPORT_SYMBOL_GPL(set_sig_addr_hook); -EXPORT_SYMBOL_GPL(set_ras_addr_hook); -EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); -EXPORT_SYMBOL_GPL(nat_t120_hook); -EXPORT_SYMBOL_GPL(nat_h245_hook); -EXPORT_SYMBOL_GPL(nat_callforwarding_hook); -EXPORT_SYMBOL_GPL(nat_q931_hook); - -MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); -MODULE_DESCRIPTION("H.323 connection tracking helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c deleted file mode 100644 index 2b760c5cf70..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ /dev/null @@ -1,684 +0,0 @@ -/* - * ip_conntrack_pptp.c - Version 3.0 - * - * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * Limitations: - * - We blindly assume that control connections are always - * established in PNS->PAC direction. 
This is a violation - * of RFC 2637 - * - We can only support one single call within each session - * - * TODO: - * - testing of incoming PPTP calls - * - * Changes: - * 2002-02-05 - Version 1.3 - * - Call ip_conntrack_unexpect_related() from - * pptp_destroy_siblings() to destroy expectations in case - * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen - * (Philip Craig <philipc@snapgear.com>) - * - Add Version information at module loadtime - * 2002-02-10 - Version 1.6 - * - move to C99 style initializers - * - remove second expectation if first arrives - * 2004-10-22 - Version 2.0 - * - merge Mandrake's 2.6.x port with recent 2.6.x API changes - * - fix lots of linear skb assumptions from Mandrake's port - * 2005-06-10 - Version 2.1 - * - use ip_conntrack_expect_free() instead of kfree() on the - * expects (which are from the slab for quite some time) - * 2005-06-10 - Version 3.0 - * - port helper to post-2.6.11 API changes, - * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/) - * 2005-07-30 - Version 3.1 - * - port helper to 2.6.13 API changes - * - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -#define IP_CT_PPTP_VERSION "3.1" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); - -static DEFINE_SPINLOCK(ip_pptp_lock); - -int -(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -int -(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -void -(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig, - struct ip_conntrack_expect *expect_reply); - -void -(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp); - -#if 0 -/* PptpControlMessageType names */ -const char *pptp_msg_name[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" -}; -EXPORT_SYMBOL(pptp_msg_name); -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -#define SECS *HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS - -#define PPTP_GRE_TIMEOUT (10 MINS) -#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS) - -static void pptp_expectfn(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - typeof(ip_nat_pptp_hook_expectfn) ip_nat_pptp_expectfn; - - DEBUGP("increasing timeouts\n"); - - /* increase timeout of GRE data channel conntrack entry */ - ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; - ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; - - /* Can you see how rusty this code is, compared with the pre-2.6.11 - * one? 
That's what happened to my shiny newnat of 2002 ;( -HW */ - - rcu_read_lock(); - ip_nat_pptp_expectfn = rcu_dereference(ip_nat_pptp_hook_expectfn); - if (!ip_nat_pptp_expectfn) { - struct ip_conntrack_tuple inv_t; - struct ip_conntrack_expect *exp_other; - - /* obviously this tuple inversion only works until you do NAT */ - invert_tuplepr(&inv_t, &exp->tuple); - DEBUGP("trying to unexpect other dir: "); - DUMP_TUPLE(&inv_t); - - exp_other = ip_conntrack_expect_find_get(&inv_t); - if (exp_other) { - /* delete other expectation. */ - DEBUGP("found\n"); - ip_conntrack_unexpect_related(exp_other); - ip_conntrack_expect_put(exp_other); - } else { - DEBUGP("not found\n"); - } - } else { - /* we need more than simple inversion */ - ip_nat_pptp_expectfn(ct, exp); - } - rcu_read_unlock(); -} - -static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_expect *exp; - - DEBUGP("trying to timeout ct or exp for tuple "); - DUMP_TUPLE(t); - - h = ip_conntrack_find_get(t, NULL); - if (h) { - struct ip_conntrack *sibling = tuplehash_to_ctrack(h); - DEBUGP("setting timeout of conntrack %p to 0\n", sibling); - sibling->proto.gre.timeout = 0; - sibling->proto.gre.stream_timeout = 0; - if (del_timer(&sibling->timeout)) - sibling->timeout.function((unsigned long)sibling); - ip_conntrack_put(sibling); - return 1; - } else { - exp = ip_conntrack_expect_find_get(t); - if (exp) { - DEBUGP("unexpect_related of expect %p\n", exp); - ip_conntrack_unexpect_related(exp); - ip_conntrack_expect_put(exp); - return 1; - } - } - - return 0; -} - - -/* timeout GRE data connections */ -static void pptp_destroy_siblings(struct ip_conntrack *ct) -{ - struct ip_conntrack_tuple t; - - ip_ct_gre_keymap_destroy(ct); - /* Since ct->sibling_list has literally rusted away in 2.6.11, - * we now need another way to find out about our sibling - * conntrack and expects... 
-HW */ - - /* try original (pns->pac) tuple */ - memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); - t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = ct->help.ct_pptp_info.pns_call_id; - t.dst.u.gre.key = ct->help.ct_pptp_info.pac_call_id; - - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout original pns->pac ct/exp\n"); - - /* try reply (pac->pns) tuple */ - memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); - t.dst.protonum = IPPROTO_GRE; - t.src.u.gre.key = ct->help.ct_pptp_info.pac_call_id; - t.dst.u.gre.key = ct->help.ct_pptp_info.pns_call_id; - - if (!destroy_sibling_or_exp(&t)) - DEBUGP("failed to timeout reply pac->pns ct/exp\n"); -} - -/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ -static inline int -exp_gre(struct ip_conntrack *ct, - __be16 callid, - __be16 peer_callid) -{ - struct ip_conntrack_expect *exp_orig, *exp_reply; - int ret = 1; - typeof(ip_nat_pptp_hook_exp_gre) ip_nat_pptp_exp_gre; - - exp_orig = ip_conntrack_expect_alloc(ct); - if (exp_orig == NULL) - goto out; - - exp_reply = ip_conntrack_expect_alloc(ct); - if (exp_reply == NULL) - goto out_put_orig; - - /* original direction, PNS->PAC */ - exp_orig->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - exp_orig->tuple.src.u.gre.key = peer_callid; - exp_orig->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - exp_orig->tuple.dst.u.gre.key = callid; - exp_orig->tuple.dst.protonum = IPPROTO_GRE; - - exp_orig->mask.src.ip = htonl(0xffffffff); - exp_orig->mask.src.u.all = 0; - exp_orig->mask.dst.u.gre.key = htons(0xffff); - exp_orig->mask.dst.ip = htonl(0xffffffff); - exp_orig->mask.dst.protonum = 0xff; - - exp_orig->master = ct; - exp_orig->expectfn = pptp_expectfn; - exp_orig->flags = 0; - - /* both expectations are identical apart from tuple */ - memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); - - /* reply direction, PAC->PNS */ - exp_reply->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - exp_reply->tuple.src.u.gre.key = callid; - exp_reply->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - exp_reply->tuple.dst.u.gre.key = peer_callid; - exp_reply->tuple.dst.protonum = IPPROTO_GRE; - - ip_nat_pptp_exp_gre = rcu_dereference(ip_nat_pptp_hook_exp_gre); - if (ip_nat_pptp_exp_gre) - ip_nat_pptp_exp_gre(exp_orig, exp_reply); - if (ip_conntrack_expect_related(exp_orig) != 0) - goto out_put_both; - if (ip_conntrack_expect_related(exp_reply) != 0) - goto out_unexpect_orig; - - /* Add GRE keymap entries */ - if (ip_ct_gre_keymap_add(ct, &exp_orig->tuple, 0) != 0) - goto out_unexpect_both; - if (ip_ct_gre_keymap_add(ct, &exp_reply->tuple, 1) != 0) { - ip_ct_gre_keymap_destroy(ct); - goto out_unexpect_both; - } - ret = 0; - -out_put_both: - ip_conntrack_expect_put(exp_reply); -out_put_orig: - ip_conntrack_expect_put(exp_orig); -out: - return ret; - -out_unexpect_both: - ip_conntrack_unexpect_related(exp_reply); -out_unexpect_orig: - ip_conntrack_unexpect_related(exp_orig); - goto out_put_both; -} - -static inline int -pptp_inbound_pkt(struct sk_buff **pskb, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq, - unsigned int reqlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg; - __be16 cid = 0, pcid = 0; - typeof(ip_nat_pptp_hook_inbound) ip_nat_pptp_inbound; - - msg = ntohs(ctlh->messageType); - DEBUGP("inbound control message %s\n", pptp_msg_name[msg]); - - switch (msg) { - case PPTP_START_SESSION_REPLY: - /* 
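To make the call-ID bookkeeping in exp_gre() above concrete: the two expectations are identical except for their GRE keys, which are the PPTP call IDs mirrored per direction. A small userspace illustration of those assignments, with made-up call IDs (a sketch, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical IDs: callid assigned by the PAC, peer_callid by the PNS */
	uint16_t callid = 9, peer_callid = 7;

	/* original direction (PNS->PAC), as in exp_orig above */
	printf("expect orig : gre src key %u, dst key %u\n", peer_callid, callid);
	/* reply direction (PAC->PNS), as in exp_reply above */
	printf("expect reply: gre src key %u, dst key %u\n", callid, peer_callid);
	return 0;
}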
server confirms new control session */ - if (info->sstate < PPTP_SESSION_REQUESTED) - goto invalid; - if (pptpReq->srep.resultCode == PPTP_START_OK) - info->sstate = PPTP_SESSION_CONFIRMED; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_STOP_SESSION_REPLY: - /* server confirms end of control session */ - if (info->sstate > PPTP_SESSION_STOPREQ) - goto invalid; - if (pptpReq->strep.resultCode == PPTP_STOP_OK) - info->sstate = PPTP_SESSION_NONE; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_OUT_CALL_REPLY: - /* server accepted call, we now expect GRE frames */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - if (info->cstate != PPTP_CALL_OUT_REQ && - info->cstate != PPTP_CALL_OUT_CONF) - goto invalid; - - cid = pptpReq->ocack.callID; - pcid = pptpReq->ocack.peersCallID; - if (info->pns_call_id != pcid) - goto invalid; - DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); - - if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { - info->cstate = PPTP_CALL_OUT_CONF; - info->pac_call_id = cid; - exp_gre(ct, cid, pcid); - } else - info->cstate = PPTP_CALL_NONE; - break; - - case PPTP_IN_CALL_REQUEST: - /* server tells us about incoming call request */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - - cid = pptpReq->icreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->cstate = PPTP_CALL_IN_REQ; - info->pac_call_id = cid; - break; - - case PPTP_IN_CALL_CONNECT: - /* server tells us about incoming call established */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - if (info->cstate != PPTP_CALL_IN_REP && - info->cstate != PPTP_CALL_IN_CONF) - goto invalid; - - pcid = pptpReq->iccon.peersCallID; - cid = info->pac_call_id; - - if (info->pns_call_id != pcid) - goto invalid; - - DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); - info->cstate = PPTP_CALL_IN_CONF; - - /* we expect a GRE connection from PAC to PNS */ - exp_gre(ct, cid, pcid); - break; - - case PPTP_CALL_DISCONNECT_NOTIFY: - /* server confirms disconnect */ - cid = pptpReq->disc.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_destroy_siblings(ct); - break; - - case PPTP_WAN_ERROR_NOTIFY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - goto invalid; - } - - ip_nat_pptp_inbound = rcu_dereference(ip_nat_pptp_hook_inbound); - if (ip_nat_pptp_inbound) - return ip_nat_pptp_inbound(pskb, ct, ctinfo, ctlh, pptpReq); - return NF_ACCEPT; - -invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); - return NF_ACCEPT; -} - -static inline int -pptp_outbound_pkt(struct sk_buff **pskb, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq, - unsigned int reqlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg; - __be16 cid = 0, pcid = 0; - typeof(ip_nat_pptp_hook_outbound) ip_nat_pptp_outbound; - - msg = ntohs(ctlh->messageType); - DEBUGP("outbound control message %s\n", pptp_msg_name[msg]); - - switch (msg) { - case PPTP_START_SESSION_REQUEST: - /* client requests for new control session */ - if (info->sstate != PPTP_SESSION_NONE) - goto invalid; - info->sstate = PPTP_SESSION_REQUESTED; - break; - case PPTP_STOP_SESSION_REQUEST: - /* client requests end of control session */ - info->sstate = PPTP_SESSION_STOPREQ; - break; - - case PPTP_OUT_CALL_REQUEST: - /* client initiating connection to server */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - info->cstate = PPTP_CALL_OUT_REQ; - /* track PNS call id */ - cid = pptpReq->ocreq.callID; - DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); - info->pns_call_id = cid; - break; - case PPTP_IN_CALL_REPLY: - /* client answers incoming call */ - if (info->cstate != PPTP_CALL_IN_REQ && - info->cstate != PPTP_CALL_IN_REP) - goto invalid; - - cid = pptpReq->icack.callID; - pcid = pptpReq->icack.peersCallID; - if (info->pac_call_id != pcid) - goto invalid; - DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], - ntohs(cid), ntohs(pcid)); - - if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { - /* part two of the three-way handshake */ - info->cstate = PPTP_CALL_IN_REP; - info->pns_call_id = cid; - } else - info->cstate = PPTP_CALL_NONE; - break; - - case PPTP_CALL_CLEAR_REQUEST: - /* client requests hangup of call */ - if (info->sstate != PPTP_SESSION_CONFIRMED) - goto invalid; - /* FUTURE: iterate over all calls and check if - * call ID is valid. We don't do this without newnat, - * because we only know about last call */ - info->cstate = PPTP_CALL_CLEAR_REQ; - break; - case PPTP_SET_LINK_INFO: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - goto invalid; - } - - ip_nat_pptp_outbound = rcu_dereference(ip_nat_pptp_hook_outbound); - if (ip_nat_pptp_outbound) - return ip_nat_pptp_outbound(pskb, ct, ctinfo, ctlh, pptpReq); - return NF_ACCEPT; - -invalid: - DEBUGP("invalid %s: type=%d cid=%u pcid=%u " - "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", - msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : pptp_msg_name[0], - msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, - ntohs(info->pns_call_id), ntohs(info->pac_call_id)); - return NF_ACCEPT; -} - -static const unsigned int pptp_msg_size[] = { - [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest), - [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply), - [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest), - [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply), - [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest), - [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply), - [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest), - [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply), - [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected), - [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest), - [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify), - [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify), - [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo), -}; - -/* track caller id inside control connection, call expect_related */ -static int -conntrack_pptp_help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) - -{ - int dir = CTINFO2DIR(ctinfo); - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - struct tcphdr _tcph, *tcph; - struct pptp_pkt_hdr _pptph, *pptph; - struct PptpControlHeader _ctlh, *ctlh; - union pptp_ctrl_union _pptpReq, *pptpReq; - unsigned int tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4; - unsigned int datalen, reqlen, nexthdr_off; - int oldsstate, oldcstate; - int ret; - u_int16_t msg; - - /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ctinfo = %u, skipping\n", ctinfo); - return NF_ACCEPT; - } - - nexthdr_off = (*pskb)->nh.iph->ihl*4; - tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph); - BUG_ON(!tcph); - nexthdr_off += tcph->doff * 4; - datalen = tcplen - tcph->doff * 4; - - pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph); - if (!pptph) { - DEBUGP("no full PPTP header, can't track\n"); - return NF_ACCEPT; - } - nexthdr_off += sizeof(_pptph); - datalen -= sizeof(_pptph); - - /* if it's not a control message we can't do anything with it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a control packet\n"); - return NF_ACCEPT; - } - - ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh); - if (!ctlh) - return NF_ACCEPT; - nexthdr_off += sizeof(_ctlh); - datalen -= sizeof(_ctlh); - - reqlen = datalen; - msg = ntohs(ctlh->messageType); - if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg]) - return NF_ACCEPT; - if (reqlen > sizeof(*pptpReq)) - reqlen = sizeof(*pptpReq); - - pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq); - if (!pptpReq) - return NF_ACCEPT; - - oldsstate = info->sstate; - oldcstate = info->cstate; - - spin_lock_bh(&ip_pptp_lock); - - /* FIXME: We just blindly assume that the control connection is always - * established from PNS->PAC. 
However, the RFC makes no guarantee */ - if (dir == IP_CT_DIR_ORIGINAL) - /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, - ctinfo); - else - /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct, - ctinfo); - DEBUGP("sstate: %d->%d, cstate: %d->%d\n", - oldsstate, info->sstate, oldcstate, info->cstate); - spin_unlock_bh(&ip_pptp_lock); - - return ret; -} - -/* control protocol helper */ -static struct ip_conntrack_helper pptp = { - .list = { NULL, NULL }, - .name = "pptp", - .me = THIS_MODULE, - .max_expected = 2, - .timeout = 5 * 60, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = __constant_htons(0xffff) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xff - } - }, - .help = conntrack_pptp_help, - .destroy = pptp_destroy_siblings, -}; - -extern void ip_ct_proto_gre_fini(void); -extern int __init ip_ct_proto_gre_init(void); - -/* ip_conntrack_pptp initialization */ -static int __init ip_conntrack_helper_pptp_init(void) -{ - int retcode; - - retcode = ip_ct_proto_gre_init(); - if (retcode < 0) - return retcode; - - DEBUGP(" registering helper\n"); - if ((retcode = ip_conntrack_helper_register(&pptp))) { - printk(KERN_ERR "Unable to register conntrack application " - "helper for pptp: %d\n", retcode); - ip_ct_proto_gre_fini(); - return retcode; - } - - printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION); - return 0; -} - -static void __exit ip_conntrack_helper_pptp_fini(void) -{ - ip_conntrack_helper_unregister(&pptp); - ip_ct_proto_gre_fini(); - printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION); -} - -module_init(ip_conntrack_helper_pptp_init); -module_exit(ip_conntrack_helper_pptp_fini); - -EXPORT_SYMBOL(ip_nat_pptp_hook_outbound); -EXPORT_SYMBOL(ip_nat_pptp_hook_inbound); -EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre); -EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c deleted file mode 100644 index 053e591f407..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ /dev/null @@ -1,314 +0,0 @@ -/* IRC extension for IP connection tracking, Version 1.21 - * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> - * based on RR's ip_conntrack_ftp.c - * - * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - ** - * Module load syntax: - * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS> - * max_dcc_channels=n dcc_timeout=secs - * - * please give the ports of all IRC servers you wish to connect to. - * If you don't specify ports, the default will be port 6667. - * With max_dcc_channels you can define the maximum number of not - * yet answered DCC channels per IRC session (default 8). - * With dcc_timeout you can specify how long the system waits for - * an expected DCC channel (default 300 seconds). 
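For orientation before the code: a DCC request carries its rendezvous address as a decimal 32-bit IP and a decimal port inside the CTCP message. A minimal userspace sketch of that decoding (string, offsets and address invented for the example; the in-kernel version is parse_dcc() below):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* illustrative payload; on the wire it is wrapped in \1 ... \1 */
	const char *cmd = "DCC SEND file 3232235522 4000";
	char *p;
	unsigned long ip = strtoul(cmd + 14, &p, 10);	/* "3232235522" */
	unsigned long port = strtoul(p, NULL, 10);	/* strtoul skips the blank */

	printf("%lu.%lu.%lu.%lu:%lu\n",
	       (ip >> 24) & 0xff, (ip >> 16) & 0xff,
	       (ip >> 8) & 0xff, ip & 0xff, port);	/* 192.168.0.2:4000 */
	return 0;
}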
- * - */ - -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <net/checksum.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_irc.h> -#include <linux/moduleparam.h> - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -static unsigned int max_dcc_channels = 8; -static unsigned int dcc_timeout = 300; -/* This is slow, but it's simple. --RR */ -static char *irc_buffer; -static DEFINE_SPINLOCK(irc_buffer_lock); - -unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_irc_hook); - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); -MODULE_LICENSE("GPL"); -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of IRC servers"); -module_param(max_dcc_channels, uint, 0400); -MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); -module_param(dcc_timeout, uint, 0400); -MODULE_PARM_DESC(dcc_timeout, "timeout for unestablished DCC channels"); - -static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; -#define MINMATCHLEN 5 - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - -static int parse_dcc(char *data, char *data_end, u_int32_t *ip, - u_int16_t *port, char **ad_beg_p, char **ad_end_p) -/* tries to get the ip_addr and port out of a dcc command - return value: -1 on failure, 0 on success - data pointer to first byte of DCC command data - data_end pointer to last byte of dcc command data - ip returns parsed ip of dcc command - port returns parsed port of dcc command - ad_beg_p returns pointer to first byte of addr data - ad_end_p returns pointer to last byte of addr data */ -{ - - /* at least 12: "AAAAAAAA P\1\n" */ - while (*data++ != ' ') - if (data > data_end - 12) - return -1; - - *ad_beg_p = data; - *ip = simple_strtoul(data, &data, 10); - - /* skip blanks between ip and port */ - while (*data == ' ') { - if (data >= data_end) - return -1; - data++; - } - - *port = simple_strtoul(data, &data, 10); - *ad_end_p = data; - - return 0; -} - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff; - struct tcphdr _tcph, *th; - char *data, *data_limit, *ib_ptr; - int dir = CTINFO2DIR(ctinfo); - struct ip_conntrack_expect *exp; - u32 seq; - u_int32_t dcc_ip; - u_int16_t dcc_port; - int i, ret = NF_ACCEPT; - char *addr_beg_p, *addr_end_p; - typeof(ip_nat_irc_hook) ip_nat_irc; - - DEBUGP("entered\n"); - - /* If packet is coming from IRC server */ - if (dir == IP_CT_DIR_REPLY) - return NF_ACCEPT; - - /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { - DEBUGP("Conntrackinfo = %u\n", ctinfo); - return NF_ACCEPT; - } - - /* Not a full tcp header? */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return NF_ACCEPT; - - /* No data? 
*/ - dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; - if (dataoff >= (*pskb)->len) - return NF_ACCEPT; - - spin_lock_bh(&irc_buffer_lock); - ib_ptr = skb_header_pointer(*pskb, dataoff, - (*pskb)->len - dataoff, irc_buffer); - BUG_ON(ib_ptr == NULL); - - data = ib_ptr; - data_limit = ib_ptr + (*pskb)->len - dataoff; - - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < (data_limit - (19 + MINMATCHLEN))) { - if (memcmp(data, "\1DCC ", 5)) { - data++; - continue; - } - - data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ - - DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", - NIPQUAD((*pskb)->nh.iph->saddr), ntohs(th->source), - NIPQUAD((*pskb)->nh.iph->daddr), ntohs(th->dest)); - - for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { - if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { - /* no match */ - continue; - } - - DEBUGP("DCC %s detected\n", dccprotos[i]); - data += strlen(dccprotos[i]); - /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid - * data left (== 14/13 bytes) */ - if (parse_dcc((char *)data, data_limit, &dcc_ip, - &dcc_port, &addr_beg_p, &addr_end_p)) { - /* unable to parse */ - DEBUGP("unable to parse dcc command\n"); - continue; - } - DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", - HIPQUAD(dcc_ip), dcc_port); - - /* dcc_ip can be the internal OR external (NAT'ed) IP - * Tiago Sousa <mirage@kaotik.org> */ - if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip) - && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) { - if (net_ratelimit()) - printk(KERN_WARNING - "Forged DCC command from " - "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", - NIPQUAD(ct->tuplehash[dir].tuple.src.ip), - HIPQUAD(dcc_ip), dcc_port); - - continue; - } - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) { - ret = NF_DROP; - goto out; - } - - /* save position of address in dcc string, - * necessary for NAT */ - DEBUGP("tcph->seq = %u\n", th->seq); - seq = ntohl(th->seq) + (addr_beg_p - ib_ptr); - - /* We refer to the reverse direction ("!dir") - * tuples here, because we're expecting - * something in the other direction. - * Doesn't matter unless NAT is happening. */ - exp->tuple = ((struct ip_conntrack_tuple) - { { 0, { 0 } }, - { ct->tuplehash[!dir].tuple.dst.ip, - { .tcp = { htons(dcc_port) } }, - IPPROTO_TCP }}); - exp->mask = ((struct ip_conntrack_tuple) - { { 0, { 0 } }, - { htonl(0xFFFFFFFF), - { .tcp = { htons(0xFFFF) } }, 0xFF }}); - exp->expectfn = NULL; - exp->flags = 0; - ip_nat_irc = rcu_dereference(ip_nat_irc_hook); - if (ip_nat_irc) - ret = ip_nat_irc(pskb, ctinfo, - addr_beg_p - ib_ptr, - addr_end_p - addr_beg_p, - exp); - else if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - goto out; - } /* for .. NUM_DCCPROTO */ - } /* while data < ... 
*/ - - out: - spin_unlock_bh(&irc_buffer_lock); - return ret; -} - -static struct ip_conntrack_helper irc_helpers[MAX_PORTS]; -static char irc_names[MAX_PORTS][sizeof("irc-65535")]; - -static void ip_conntrack_irc_fini(void); - -static int __init ip_conntrack_irc_init(void) -{ - int i, ret; - struct ip_conntrack_helper *hlpr; - char *tmpname; - - if (max_dcc_channels < 1) { - printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); - return -EBUSY; - } - - irc_buffer = kmalloc(65536, GFP_KERNEL); - if (!irc_buffer) - return -ENOMEM; - - /* If no port given, default to standard irc port */ - if (ports_c == 0) - ports[ports_c++] = IRC_PORT; - - for (i = 0; i < ports_c; i++) { - hlpr = &irc_helpers[i]; - hlpr->tuple.src.u.tcp.port = htons(ports[i]); - hlpr->tuple.dst.protonum = IPPROTO_TCP; - hlpr->mask.src.u.tcp.port = htons(0xFFFF); - hlpr->mask.dst.protonum = 0xFF; - hlpr->max_expected = max_dcc_channels; - hlpr->timeout = dcc_timeout; - hlpr->me = THIS_MODULE; - hlpr->help = help; - - tmpname = &irc_names[i][0]; - if (ports[i] == IRC_PORT) - sprintf(tmpname, "irc"); - else - sprintf(tmpname, "irc-%d", i); - hlpr->name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret = ip_conntrack_helper_register(hlpr); - - if (ret) { - printk("ip_conntrack_irc: ERROR registering port %d\n", - ports[i]); - ip_conntrack_irc_fini(); - return -EBUSY; - } - } - return 0; -} - -/* This function is intentionally _NOT_ defined as __exit, because - * it is needed by the init function */ -static void ip_conntrack_irc_fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("unregistering port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&irc_helpers[i]); - } - kfree(irc_buffer); -} - -module_init(ip_conntrack_irc_init); -module_exit(ip_conntrack_irc_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c deleted file mode 100644 index cc6dd49c9da..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * NetBIOS name service broadcast connection tracking helper - * - * (c) 2005 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -/* - * This helper tracks locally originating NetBIOS name service - * requests by issuing permanent expectations (valid until - * timing out) matching all reply connections from the - * destination network. The only NetBIOS specific thing is - * actually the port number. 
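The mask-based expectation described above is easy to see in miniature: the expected reply tuple wildcards the host bits of the source address with the interface netmask, so any host on the broadcast subnet may answer from the NetBIOS port. A userspace sketch with invented addresses (the helper below sets exp->mask.src.ip to ifa_mask):

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	in_addr_t mask    = inet_addr("255.255.255.0");	/* plays ifa_mask */
	in_addr_t bcast   = inet_addr("192.168.1.255");	/* plays ifa_broadcast */
	in_addr_t replier = inet_addr("192.168.1.20");	/* some host on the LAN */

	/* same subnet test the masked expectation performs on replies */
	if ((replier & mask) == (bcast & mask))
		printf("udp/137 reply would match the permanent expectation\n");
	return 0;
}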
- */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_addr.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#define NMBD_PORT 137 - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); -MODULE_LICENSE("GPL"); - -static unsigned int timeout = 3; -module_param(timeout, uint, 0400); -MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); - -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) -{ - struct ip_conntrack_expect *exp; - struct iphdr *iph = (*pskb)->nh.iph; - struct rtable *rt = (struct rtable *)(*pskb)->dst; - struct in_device *in_dev; - __be32 mask = 0; - - /* we're only interested in locally generated packets */ - if ((*pskb)->sk == NULL) - goto out; - if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) - goto out; - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - goto out; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); - if (in_dev != NULL) { - for_primary_ifa(in_dev) { - if (ifa->ifa_broadcast == iph->daddr) { - mask = ifa->ifa_mask; - break; - } - } endfor_ifa(in_dev); - } - rcu_read_unlock(); - - if (mask == 0) - goto out; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - goto out; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = htons(NMBD_PORT); - - exp->mask.src.ip = mask; - exp->mask.src.u.udp.port = htons(0xFFFF); - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.udp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - - exp->expectfn = NULL; - exp->flags = IP_CT_EXPECT_PERMANENT; - - ip_conntrack_expect_related(exp); - ip_conntrack_expect_put(exp); - - ip_ct_refresh(ct, *pskb, timeout * HZ); -out: - return NF_ACCEPT; -} - -static struct ip_conntrack_helper helper = { - .name = "netbios-ns", - .tuple = { - .src = { - .u = { - .udp = { - .port = __constant_htons(NMBD_PORT), - } - } - }, - .dst = { - .protonum = IPPROTO_UDP, - }, - }, - .mask = { - .src = { - .u = { - .udp = { - .port = __constant_htons(0xFFFF), - } - } - }, - .dst = { - .protonum = 0xFF, - }, - }, - .max_expected = 1, - .me = THIS_MODULE, - .help = help, -}; - -static int __init ip_conntrack_netbios_ns_init(void) -{ - helper.timeout = timeout; - return ip_conntrack_helper_register(&helper); -} - -static void __exit ip_conntrack_netbios_ns_fini(void) -{ - ip_conntrack_helper_unregister(&helper); -} - -module_init(ip_conntrack_netbios_ns_init); -module_exit(ip_conntrack_netbios_ns_fini); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c deleted file mode 100644 index 9228b76ccd9..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ /dev/null @@ -1,1577 +0,0 @@ -/* Connection tracking via netlink socket. Allows for user space - * protocol helpers and general trouble making from userspace. - * - * (C) 2001 by Jay Schulist <jschlst@samba.org> - * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> - * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net> - * - * I've reworked this stuff to use attributes instead of conntrack - * structures. 
5.44 am. I need more tea. --pablo 05/07/11. - * - * Initial connection tracking via netlink development funded and - * generally made possible by Network Robots, Inc. (www.networkrobots.com) - * - * Further development of this code funded by Astaro AG (http://www.astaro.com) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/notifier.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -MODULE_LICENSE("GPL"); - -static char __initdata version[] = "0.90"; - -static inline int -ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *proto) -{ - int ret = 0; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); - - NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); - - if (likely(proto->tuple_to_nfattr)) - ret = proto->tuple_to_nfattr(skb, tuple); - - NFA_NEST_END(skb, nest_parms); - - return ret; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - struct nfattr *nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); - - NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(__be32), &tuple->src.ip); - NFA_PUT(skb, CTA_IP_V4_DST, sizeof(__be32), &tuple->dst.ip); - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_tuples(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple) -{ - int ret; - struct ip_conntrack_protocol *proto; - - ret = ctnetlink_dump_tuples_ip(skb, tuple); - if (unlikely(ret < 0)) - return ret; - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, tuple, proto); - ip_conntrack_proto_put(proto); - - return ret; -} - -static inline int -ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 status = htonl((u_int32_t) ct->status); - NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - long timeout_l = ct->timeout.expires - jiffies; - __be32 timeout; - - if (timeout_l < 0) - timeout = 0; - else - timeout = htonl(timeout_l / HZ); - - NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - - struct nfattr *nest_proto; - int ret; - - if (!proto->to_nfattr) { - ip_conntrack_proto_put(proto); - return 0; - } - - nest_proto = NFA_NEST(skb, CTA_PROTOINFO); - - ret = proto->to_nfattr(skb, nest_proto, ct); - - ip_conntrack_proto_put(proto); - - 
NFA_NEST_END(skb, nest_proto); - - return ret; - -nfattr_failure: - ip_conntrack_proto_put(proto); - return -1; -} - -static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - struct nfattr *nest_helper; - - if (!ct->helper) - return 0; - - nest_helper = NFA_NEST(skb, CTA_HELP); - NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name); - - if (ct->helper->to_nfattr) - ct->helper->to_nfattr(skb, ct); - - NFA_NEST_END(skb, nest_helper); - - return 0; - -nfattr_failure: - return -1; -} - -#ifdef CONFIG_IP_NF_CT_ACCT -static inline int -ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, - enum ip_conntrack_dir dir) -{ - enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; - struct nfattr *nest_count = NFA_NEST(skb, type); - __be32 tmp; - - tmp = htonl(ct->counters[dir].packets); - NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(__be32), &tmp); - - tmp = htonl(ct->counters[dir].bytes); - NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(__be32), &tmp); - - NFA_NEST_END(skb, nest_count); - - return 0; - -nfattr_failure: - return -1; -} -#else -#define ctnetlink_dump_counters(a, b, c) (0) -#endif - -#ifdef CONFIG_IP_NF_CONNTRACK_MARK -static inline int -ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 mark = htonl(ct->mark); - - NFA_PUT(skb, CTA_MARK, sizeof(__be32), &mark); - return 0; - -nfattr_failure: - return -1; -} -#else -#define ctnetlink_dump_mark(a, b) (0) -#endif - -static inline int -ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 id = htonl(ct->id); - NFA_PUT(skb, CTA_ID, sizeof(__be32), &id); - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) -{ - __be32 use = htonl(atomic_read(&ct->ct_general.use)); - - NFA_PUT(skb, CTA_USE, sizeof(__be32), &use); - return 0; - -nfattr_failure: - return -1; -} - -#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) - -static int -ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, int nowait, - const struct ip_conntrack *ct) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - unsigned char *b; - - b = skb->tail; - - event |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -nlmsg_failure: -nfattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static int ctnetlink_conntrack_event(struct notifier_block *this, - unsigned long events, void *ptr) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct nfattr *nest_parms; - struct ip_conntrack *ct = (struct ip_conntrack *)ptr; - struct sk_buff *skb; - unsigned int type; - unsigned char *b; - unsigned int flags = 0, group; - - /* ignore our fake conntrack entry */ - if (ct == &ip_conntrack_untracked) - return NOTIFY_DONE; - - if (events & IPCT_DESTROY) { - type = IPCTNL_MSG_CT_DELETE; - group = NFNLGRP_CONNTRACK_DESTROY; - } else if (events & (IPCT_NEW | IPCT_RELATED)) { - type = IPCTNL_MSG_CT_NEW; - flags = NLM_F_CREATE|NLM_F_EXCL; - group = NFNLGRP_CONNTRACK_NEW; - } else if (events & (IPCT_STATUS | IPCT_PROTOINFO)) { - type = IPCTNL_MSG_CT_NEW; - group = NFNLGRP_CONNTRACK_UPDATE; - } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(group)) - return NOTIFY_DONE; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; - - b = skb->tail; - - type |= NFNL_SUBSYS_CTNETLINK << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = flags; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) - goto nfattr_failure; - NFA_NEST_END(skb, nest_parms); - - if (events & IPCT_DESTROY) { - if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) - goto nfattr_failure; - } else { - if (ctnetlink_dump_status(skb, ct) < 0) - goto nfattr_failure; - - if (ctnetlink_dump_timeout(skb, ct) < 0) - goto nfattr_failure; - - if (events & IPCT_PROTOINFO - && ctnetlink_dump_protoinfo(skb, ct) < 0) - goto nfattr_failure; - - if ((events & IPCT_HELPER || ct->helper) - && ctnetlink_dump_helpinfo(skb, ct) < 0) - goto nfattr_failure; - -#ifdef CONFIG_IP_NF_CONNTRACK_MARK - if ((events & IPCT_MARK || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) - goto nfattr_failure; -#endif - - if (events & IPCT_COUNTER_FILLING && - (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)) - goto nfattr_failure; - } - - nlh->nlmsg_len = skb->tail - b; - 
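/* For orientation, a sketch of the event message assembled above; it has
 * the same nested shape ctnetlink_fill_info() produces:
 *
 *   nlmsghdr   type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW or _DELETE
 *   nfgenmsg   nfgen_family = AF_INET
 *   CTA_TUPLE_ORIG  { CTA_TUPLE_IP { CTA_IP_V4_SRC, CTA_IP_V4_DST }
 *                     CTA_TUPLE_PROTO { CTA_PROTO_NUM, per-protocol attrs } }
 *   CTA_TUPLE_REPLY { same nesting }
 *   plus, depending on the event bits, status, timeout, protoinfo, helper
 *   name, mark and/or counters, exactly as the branches above select.
 */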
nfnetlink_send(skb, 0, group, 0); - return NOTIFY_DONE; - -nlmsg_failure: -nfattr_failure: - kfree_skb(skb); - return NOTIFY_DONE; -} -#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ - -static int ctnetlink_done(struct netlink_callback *cb) -{ - if (cb->args[1]) - ip_conntrack_put((struct ip_conntrack *)cb->args[1]); - return 0; -} - -static int -ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ip_conntrack *ct, *last; - struct ip_conntrack_tuple_hash *h; - struct list_head *i; - - read_lock_bh(&ip_conntrack_lock); - last = (struct ip_conntrack *)cb->args[1]; - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { -restart: - list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { - h = (struct ip_conntrack_tuple_hash *) i; - if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) - continue; - ct = tuplehash_to_ctrack(h); - if (cb->args[1]) { - if (ct != last) - continue; - cb->args[1] = 0; - } - if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, - 1, ct) < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; - goto out; - } -#ifdef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == - IPCTNL_MSG_CT_GET_CTRZERO) - memset(&ct->counters, 0, sizeof(ct->counters)); -#endif - } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; - } - } -out: - read_unlock_bh(&ip_conntrack_lock); - if (last) - ip_conntrack_put(last); - - return skb->len; -} - -static const size_t cta_min_ip[CTA_IP_MAX] = { - [CTA_IP_V4_SRC-1] = sizeof(__be32), - [CTA_IP_V4_DST-1] = sizeof(__be32), -}; - -static inline int -ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) -{ - struct nfattr *tb[CTA_IP_MAX]; - - nfattr_parse_nested(tb, CTA_IP_MAX, attr); - - if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) - return -EINVAL; - - if (!tb[CTA_IP_V4_SRC-1]) - return -EINVAL; - tuple->src.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); - - if (!tb[CTA_IP_V4_DST-1]) - return -EINVAL; - tuple->dst.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]); - - return 0; -} - -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int8_t), - [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), - [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), - [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), -}; - -static inline int -ctnetlink_parse_tuple_proto(struct nfattr *attr, - struct ip_conntrack_tuple *tuple) -{ - struct nfattr *tb[CTA_PROTO_MAX]; - struct ip_conntrack_protocol *proto; - int ret = 0; - - nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) - return -EINVAL; - - if (!tb[CTA_PROTO_NUM-1]) - return -EINVAL; - tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - - if (likely(proto->nfattr_to_tuple)) - ret = proto->nfattr_to_tuple(tb, tuple); - - ip_conntrack_proto_put(proto); - - return ret; -} - -static inline int -ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, - enum ctattr_tuple type) -{ - struct nfattr *tb[CTA_TUPLE_MAX]; - int err; - - memset(tuple, 0, sizeof(*tuple)); - - nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); - - if (!tb[CTA_TUPLE_IP-1]) - return -EINVAL; - - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); - if (err < 0) - return err; - - if (!tb[CTA_TUPLE_PROTO-1]) - return -EINVAL; - - err = 
ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); - if (err < 0) - return err; - - /* orig and expect tuples get DIR_ORIGINAL */ - if (type == CTA_TUPLE_REPLY) - tuple->dst.dir = IP_CT_DIR_REPLY; - else - tuple->dst.dir = IP_CT_DIR_ORIGINAL; - - return 0; -} - -#ifdef CONFIG_IP_NF_NAT_NEEDED -static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { - [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), - [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), -}; - -static int ctnetlink_parse_nat_proto(struct nfattr *attr, - const struct ip_conntrack *ct, - struct ip_nat_range *range) -{ - struct nfattr *tb[CTA_PROTONAT_MAX]; - struct ip_nat_protocol *npt; - - nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - return -EINVAL; - - npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - - if (!npt->nfattr_to_range) { - ip_nat_proto_put(npt); - return 0; - } - - /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ - if (npt->nfattr_to_range(tb, range) > 0) - range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - - ip_nat_proto_put(npt); - - return 0; -} - -static const size_t cta_min_nat[CTA_NAT_MAX] = { - [CTA_NAT_MINIP-1] = sizeof(__be32), - [CTA_NAT_MAXIP-1] = sizeof(__be32), -}; - -static inline int -ctnetlink_parse_nat(struct nfattr *nat, - const struct ip_conntrack *ct, struct ip_nat_range *range) -{ - struct nfattr *tb[CTA_NAT_MAX]; - int err; - - memset(range, 0, sizeof(*range)); - - nfattr_parse_nested(tb, CTA_NAT_MAX, nat); - - if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) - return -EINVAL; - - if (tb[CTA_NAT_MINIP-1]) - range->min_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MINIP-1]); - - if (!tb[CTA_NAT_MAXIP-1]) - range->max_ip = range->min_ip; - else - range->max_ip = *(__be32 *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); - - if (range->min_ip) - range->flags |= IP_NAT_RANGE_MAP_IPS; - - if (!tb[CTA_NAT_PROTO-1]) - return 0; - - err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); - if (err < 0) - return err; - - return 0; -} -#endif - -static inline int -ctnetlink_parse_help(struct nfattr *attr, char **helper_name) -{ - struct nfattr *tb[CTA_HELP_MAX]; - - nfattr_parse_nested(tb, CTA_HELP_MAX, attr); - - if (!tb[CTA_HELP_NAME-1]) - return -EINVAL; - - *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); - - return 0; -} - -static const size_t cta_min[CTA_MAX] = { - [CTA_STATUS-1] = sizeof(__be32), - [CTA_TIMEOUT-1] = sizeof(__be32), - [CTA_MARK-1] = sizeof(__be32), - [CTA_USE-1] = sizeof(__be32), - [CTA_ID-1] = sizeof(__be32) -}; - -static int -ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - struct ip_conntrack *ct; - int err = 0; - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else { - /* Flush the whole table */ - ip_conntrack_flush(); - return 0; - } - - if (err < 0) - return err; - - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; - - ct = tuplehash_to_ctrack(h); - - if (cda[CTA_ID-1]) { - u_int32_t id = ntohl(*(__be32 *)NFA_DATA(cda[CTA_ID-1])); - if (ct->id != id) { - ip_conntrack_put(ct); - return -ENOENT; - } - } - if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - - ip_conntrack_put(ct); 
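The deletion idiom just above (zero the timeout and fire it instead of freeing the entry directly) is what a userspace ctnetlink client ends up triggering. As a usage illustration, a minimal sketch against libnetfilter_conntrack's conventional API, with invented addresses and ports (assuming nfct_open()/nfct_set_attr_*()/nfct_query() as provided by that library):

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

int main(void)
{
	struct nfct_handle *h = nfct_open(CONNTRACK, 0);
	struct nf_conntrack *ct = nfct_new();

	if (!h || !ct)
		return 1;

	/* identify the victim entry by its original tuple */
	nfct_set_attr_u8(ct, ATTR_L3PROTO, AF_INET);
	nfct_set_attr_u32(ct, ATTR_IPV4_SRC, inet_addr("192.168.0.2"));
	nfct_set_attr_u32(ct, ATTR_IPV4_DST, inet_addr("10.0.0.1"));
	nfct_set_attr_u8(ct, ATTR_L4PROTO, IPPROTO_TCP);
	nfct_set_attr_u16(ct, ATTR_PORT_SRC, htons(1025));
	nfct_set_attr_u16(ct, ATTR_PORT_DST, htons(80));

	/* sends IPCTNL_MSG_CT_DELETE; the kernel side is the function above */
	if (nfct_query(h, NFCT_Q_DESTROY, ct) < 0)
		perror("nfct_query");

	nfct_destroy(ct);
	nfct_close(h);
	return 0;
}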
- - return 0; -} - -static int -ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple_hash *h; - struct ip_conntrack_tuple tuple; - struct ip_conntrack *ct; - struct sk_buff *skb2 = NULL; - int err = 0; - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct nfgenmsg *msg = NLMSG_DATA(nlh); - u32 rlen; - - if (msg->nfgen_family != AF_INET) - return -EAFNOSUPPORT; - -#ifndef CONFIG_IP_NF_CT_ACCT - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) - return -ENOTSUPP; -#endif - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; - } - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else - return -EINVAL; - - if (err < 0) - return err; - - h = ip_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; - - ct = tuplehash_to_ctrack(h); - - err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) { - ip_conntrack_put(ct); - return -ENOMEM; - } - - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, - IPCTNL_MSG_CT_NEW, 1, ct); - ip_conntrack_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; - -free: - kfree_skb(skb2); -out: - return err; -} - -static inline int -ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - unsigned long d; - unsigned status = ntohl(*(__be32 *)NFA_DATA(cda[CTA_STATUS-1])); - d = ct->status ^ status; - - if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) - /* unchangeable */ - return -EINVAL; - - if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) - /* SEEN_REPLY bit can only be set */ - return -EINVAL; - - - if (d & IPS_ASSURED && !(status & IPS_ASSURED)) - /* ASSURED bit can only be set */ - return -EINVAL; - - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { -#ifndef CONFIG_IP_NF_NAT_NEEDED - return -EINVAL; -#else - struct ip_nat_range range; - - if (cda[CTA_NAT_DST-1]) { - if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, - &range) < 0) - return -EINVAL; - if (ip_nat_initialized(ct, - HOOK2MANIP(NF_IP_PRE_ROUTING))) - return -EEXIST; - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); - } - if (cda[CTA_NAT_SRC-1]) { - if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, - &range) < 0) - return -EINVAL; - if (ip_nat_initialized(ct, - HOOK2MANIP(NF_IP_POST_ROUTING))) - return -EEXIST; - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - } -#endif - } - - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * ip_nat_range. 
*/ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); - return 0; -} - - -static inline int -ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - struct ip_conntrack_helper *helper; - char *helpname; - int err; - - /* don't change helper of sibling connections */ - if (ct->master) - return -EINVAL; - - err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); - if (err < 0) - return err; - - helper = __ip_conntrack_helper_find_byname(helpname); - if (!helper) { - if (!strcmp(helpname, "")) - helper = NULL; - else - return -EINVAL; - } - - if (ct->helper) { - if (!helper) { - /* we had a helper before ... */ - ip_ct_remove_expectations(ct); - ct->helper = NULL; - } else { - /* need to zero data of old helper */ - memset(&ct->help, 0, sizeof(ct->help)); - } - } - - ct->helper = helper; - - return 0; -} - -static inline int -ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - u_int32_t timeout = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1])); - - if (!del_timer(&ct->timeout)) - return -ETIME; - - ct->timeout.expires = jiffies + timeout * HZ; - add_timer(&ct->timeout); - - return 0; -} - -static inline int -ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1]; - struct ip_conntrack_protocol *proto; - u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - int err = 0; - - nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); - - proto = ip_conntrack_proto_find_get(npt); - - if (proto->from_nfattr) - err = proto->from_nfattr(tb, ct); - ip_conntrack_proto_put(proto); - - return err; -} - -static int -ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) -{ - int err; - - if (cda[CTA_HELP-1]) { - err = ctnetlink_change_helper(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_TIMEOUT-1]) { - err = ctnetlink_change_timeout(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_STATUS-1]) { - err = ctnetlink_change_status(ct, cda); - if (err < 0) - return err; - } - - if (cda[CTA_PROTOINFO-1]) { - err = ctnetlink_change_protoinfo(ct, cda); - if (err < 0) - return err; - } - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); -#endif - - return 0; -} - -static int -ctnetlink_create_conntrack(struct nfattr *cda[], - struct ip_conntrack_tuple *otuple, - struct ip_conntrack_tuple *rtuple) -{ - struct ip_conntrack *ct; - int err = -EINVAL; - - ct = ip_conntrack_alloc(otuple, rtuple); - if (ct == NULL || IS_ERR(ct)) - return -ENOMEM; - - if (!cda[CTA_TIMEOUT-1]) - goto err; - ct->timeout.expires = ntohl(*(__be32 *)NFA_DATA(cda[CTA_TIMEOUT-1])); - - ct->timeout.expires = jiffies + ct->timeout.expires * HZ; - ct->status |= IPS_CONFIRMED; - - if (cda[CTA_STATUS-1]) { - err = ctnetlink_change_status(ct, cda); - if (err < 0) - goto err; - } - - if (cda[CTA_PROTOINFO-1]) { - err = ctnetlink_change_protoinfo(ct, cda); - if (err < 0) - goto err; - } - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(__be32 *)NFA_DATA(cda[CTA_MARK-1])); -#endif - - ct->helper = ip_conntrack_helper_find_get(rtuple); - - add_timer(&ct->timeout); - ip_conntrack_hash_insert(ct); - - if (ct->helper) - ip_conntrack_helper_put(ct->helper); - - return 0; - -err: - ip_conntrack_free(ct); - return err; -} - -static int -ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - 
struct ip_conntrack_tuple otuple, rtuple; - struct ip_conntrack_tuple_hash *h = NULL; - int err = 0; - - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; - - if (cda[CTA_TUPLE_ORIG-1]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); - if (err < 0) - return err; - } - - if (cda[CTA_TUPLE_REPLY-1]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); - if (err < 0) - return err; - } - - write_lock_bh(&ip_conntrack_lock); - if (cda[CTA_TUPLE_ORIG-1]) - h = __ip_conntrack_find(&otuple, NULL); - else if (cda[CTA_TUPLE_REPLY-1]) - h = __ip_conntrack_find(&rtuple, NULL); - - if (h == NULL) { - write_unlock_bh(&ip_conntrack_lock); - err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); - return err; - } - /* implicit 'else' */ - - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { - err = -EINVAL; - goto out_unlock; - } - - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ - err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) - err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); - -out_unlock: - write_unlock_bh(&ip_conntrack_lock); - return err; -} - -/*********************************************************************** - * EXPECT - ***********************************************************************/ - -static inline int -ctnetlink_exp_dump_tuple(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - enum ctattr_expect type) -{ - struct nfattr *nest_parms = NFA_NEST(skb, type); - - if (ctnetlink_dump_tuples(skb, tuple) < 0) - goto nfattr_failure; - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_exp_dump_mask(struct sk_buff *skb, - const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *mask) -{ - int ret; - struct ip_conntrack_protocol *proto; - struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK); - - ret = ctnetlink_dump_tuples_ip(skb, mask); - if (unlikely(ret < 0)) - goto nfattr_failure; - - proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - ret = ctnetlink_dump_tuples_proto(skb, mask, proto); - ip_conntrack_proto_put(proto); - if (unlikely(ret < 0)) - goto nfattr_failure; - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - return -1; -} - -static inline int -ctnetlink_exp_dump_expect(struct sk_buff *skb, - const struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *master = exp->master; - __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ); - __be32 id = htonl(exp->id); - - if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) - goto nfattr_failure; - if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) - goto nfattr_failure; - if (ctnetlink_exp_dump_tuple(skb, - &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - CTA_EXPECT_MASTER) < 0) - goto nfattr_failure; - - NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(__be32), &timeout); - NFA_PUT(skb, CTA_EXPECT_ID, sizeof(__be32), &id); - - return 0; - -nfattr_failure: - return -1; -} - -static int -ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, - int event, - int nowait, - const struct ip_conntrack_expect *exp) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - unsigned char *b; - - b = skb->tail; - - event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - 
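[Editor's note: the tail of ctnetlink_new_conntrack above condenses the standard netlink NEW-message contract: no existing entry plus NLM_F_CREATE means create, an existing entry plus NLM_F_EXCL means refuse with EEXIST, an existing entry without NLM_F_EXCL means update in place. The decision logic reduced to a sketch:

#include <errno.h>

#define NLM_F_CREATE 0x400             /* values as in linux/netlink.h */
#define NLM_F_EXCL   0x200

/* found: does a matching entry already exist?
 * Returns 0 to create, 1 to update, or a negative errno. */
static int new_msg_action(int found, unsigned int nlmsg_flags)
{
	if (!found)
		return (nlmsg_flags & NLM_F_CREATE) ? 0 : -ENOENT;
	if (nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;        /* caller demanded a fresh entry */
	return 1;                      /* change the existing one */
}
]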
nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -nlmsg_failure: -nfattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static int ctnetlink_expect_event(struct notifier_block *this, - unsigned long events, void *ptr) -{ - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; - struct sk_buff *skb; - unsigned int type; - unsigned char *b; - int flags = 0; - - if (events & IPEXP_NEW) { - type = IPCTNL_MSG_EXP_NEW; - flags = NLM_F_CREATE|NLM_F_EXCL; - } else - return NOTIFY_DONE; - - if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) - return NOTIFY_DONE; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); - if (!skb) - return NOTIFY_DONE; - - b = skb->tail; - - type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; - nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); - nfmsg = NLMSG_DATA(nlh); - - nlh->nlmsg_flags = flags; - nfmsg->nfgen_family = AF_INET; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - if (ctnetlink_exp_dump_expect(skb, exp) < 0) - goto nfattr_failure; - - nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); - return NOTIFY_DONE; - -nlmsg_failure: -nfattr_failure: - kfree_skb(skb); - return NOTIFY_DONE; -} -#endif - -static int -ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ip_conntrack_expect *exp = NULL; - struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[0]; - - read_lock_bh(&ip_conntrack_lock); - list_for_each_prev(i, &ip_conntrack_expect_list) { - exp = (struct ip_conntrack_expect *) i; - if (exp->id <= *id) - continue; - if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - IPCTNL_MSG_EXP_NEW, - 1, exp) < 0) - goto out; - *id = exp->id; - } -out: - read_unlock_bh(&ip_conntrack_lock); - - return skb->len; -} - -static const size_t cta_min_exp[CTA_EXPECT_MAX] = { - [CTA_EXPECT_TIMEOUT-1] = sizeof(__be32), - [CTA_EXPECT_ID-1] = sizeof(__be32) -}; - -static int -ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_expect *exp; - struct sk_buff *skb2; - int err = 0; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (nlh->nlmsg_flags & NLM_F_DUMP) { - struct nfgenmsg *msg = NLMSG_DATA(nlh); - u32 rlen; - - if (msg->nfgen_family != AF_INET) - return -EAFNOSUPPORT; - - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_exp_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; - } - - if (cda[CTA_EXPECT_MASTER-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); - else - return -EINVAL; - - if (err < 0) - return err; - - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb2) - goto out; - - err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, - 
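[Editor's note: ctnetlink_exp_dump_table above shows the usual cursor trick for resumable netlink dumps. cb->args[0] persists between invocations of the dump callback, so the last expectation id that fit into the skb becomes the resume point, and everything at or below it is skipped on the next pass. The pattern in miniature, with toy types:

struct item { unsigned int id; struct item *next; };

/* Emit up to 'room' ids greater than *cursor; the caller re-invokes with
 * the same cursor until the return value is 0, mirroring a netlink dump. */
static int dump_chunk(const struct item *head, unsigned int *cursor,
		      unsigned int *out, int room)
{
	const struct item *it;
	int n = 0;

	for (it = head; it && n < room; it = it->next) {
		if (it->id <= *cursor)
			continue;      /* already sent in a prior pass */
		out[n++] = it->id;
		*cursor = it->id;      /* progress survives across calls */
	}
	return n;
}
]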
nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, - 1, exp); - if (err <= 0) - goto free; - - ip_conntrack_expect_put(exp); - - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - -free: - kfree_skb(skb2); -out: - ip_conntrack_expect_put(exp); - return err; -} - -static int -ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_expect *exp, *tmp; - struct ip_conntrack_tuple tuple; - struct ip_conntrack_helper *h; - int err; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (cda[CTA_EXPECT_TUPLE-1]) { - /* delete a single expect by tuple */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - - /* bump usage count to 2 */ - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - __be32 id = - *(__be32 *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - /* after list removal, usage count == 1 */ - ip_conntrack_unexpect_related(exp); - /* have to put what we 'get' above. - * after this line usage count == 0 */ - ip_conntrack_expect_put(exp); - } else if (cda[CTA_EXPECT_HELP_NAME-1]) { - char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); - - /* delete all expectations for this helper */ - write_lock_bh(&ip_conntrack_lock); - h = __ip_conntrack_helper_find_byname(name); - if (!h) { - write_unlock_bh(&ip_conntrack_lock); - return -EINVAL; - } - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, - list) { - if (exp->master->helper == h - && del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - write_unlock_bh(&ip_conntrack_lock); - } else { - /* This basically means we have to flush everything*/ - write_lock_bh(&ip_conntrack_lock); - list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, - list) { - if (del_timer(&exp->timeout)) { - ip_ct_unlink_expect(exp); - ip_conntrack_expect_put(exp); - } - } - write_unlock_bh(&ip_conntrack_lock); - } - - return 0; -} -static int -ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) -{ - return -EOPNOTSUPP; -} - -static int -ctnetlink_create_expect(struct nfattr *cda[]) -{ - struct ip_conntrack_tuple tuple, mask, master_tuple; - struct ip_conntrack_tuple_hash *h = NULL; - struct ip_conntrack_expect *exp; - struct ip_conntrack *ct; - int err = 0; - - /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); - if (err < 0) - return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); - if (err < 0) - return err; - - /* Look for master conntrack of this expectation */ - h = ip_conntrack_find_get(&master_tuple, NULL); - if (!h) - return -ENOENT; - ct = tuplehash_to_ctrack(h); - - if (!ct->helper) { - /* such conntrack hasn't got any helper, abort */ - err = -EINVAL; - goto out; - } - - exp = ip_conntrack_expect_alloc(ct); - if (!exp) { - err = -ENOMEM; - goto out; - } - - exp->expectfn = NULL; - exp->flags = 0; - exp->master = ct; - memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); - memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); - - err = ip_conntrack_expect_related(exp); - ip_conntrack_expect_put(exp); - -out: - ip_conntrack_put(tuplehash_to_ctrack(h)); - return err; -} - -static int 
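[Editor's note: the running refcount commentary in ctnetlink_del_expect above is worth making explicit. find_get returns with the count at 2 (one reference held by the list, one by the caller); unexpect_related unlinks the entry and drops the list's reference to 1; the final put releases the caller's, letting the entry be freed. The same discipline as a toy, with hypothetical helpers:

#include <assert.h>
#include <stdlib.h>

struct exp { int refcnt; };

static void exp_get(struct exp *e) { e->refcnt++; }

static void exp_put(struct exp *e)
{
	assert(e->refcnt > 0);
	if (--e->refcnt == 0)
		free(e);               /* the last reference frees it */
}

static void delete_expect(struct exp *e)
{
	exp_get(e);                    /* find_get: list + us == 2      */
	exp_put(e);                    /* unlink: list reference == 1   */
	exp_put(e);                    /* our reference: 0, freed above */
}

int main(void)
{
	struct exp *e = malloc(sizeof(*e));
	e->refcnt = 1;                 /* the list's reference */
	delete_expect(e);
	return 0;
}
]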
-ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) -{ - struct ip_conntrack_tuple tuple; - struct ip_conntrack_expect *exp; - int err = 0; - - if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) - return -EINVAL; - - if (!cda[CTA_EXPECT_TUPLE-1] - || !cda[CTA_EXPECT_MASK-1] - || !cda[CTA_EXPECT_MASTER-1]) - return -EINVAL; - - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); - if (err < 0) - return err; - - write_lock_bh(&ip_conntrack_lock); - exp = __ip_conntrack_expect_find(&tuple); - - if (!exp) { - write_unlock_bh(&ip_conntrack_lock); - err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_expect(cda); - return err; - } - - err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) - err = ctnetlink_change_expect(exp, cda); - write_unlock_bh(&ip_conntrack_lock); - - return err; -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -static struct notifier_block ctnl_notifier = { - .notifier_call = ctnetlink_conntrack_event, -}; - -static struct notifier_block ctnl_notifier_exp = { - .notifier_call = ctnetlink_expect_event, -}; -#endif - -static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { - [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, }, - [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, }, -}; - -static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { - [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, }, - [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, }, - [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, }, -}; - -static struct nfnetlink_subsystem ctnl_subsys = { - .name = "conntrack", - .subsys_id = NFNL_SUBSYS_CTNETLINK, - .cb_count = IPCTNL_MSG_MAX, - .cb = ctnl_cb, -}; - -static struct nfnetlink_subsystem ctnl_exp_subsys = { - .name = "conntrack_expect", - .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, - .cb_count = IPCTNL_MSG_EXP_MAX, - .cb = ctnl_exp_cb, -}; - -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); - -static int __init ctnetlink_init(void) -{ - int ret; - - printk("ctnetlink v%s: registering with nfnetlink.\n", version); - ret = nfnetlink_subsys_register(&ctnl_subsys); - if (ret < 0) { - printk("ctnetlink_init: cannot register with nfnetlink.\n"); - goto err_out; - } - - ret = nfnetlink_subsys_register(&ctnl_exp_subsys); - if (ret < 0) { - printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); - goto err_unreg_subsys; - } - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - ret = ip_conntrack_register_notifier(&ctnl_notifier); - if (ret < 0) { - printk("ctnetlink_init: cannot register notifier.\n"); - goto err_unreg_exp_subsys; - } - - ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); - if (ret < 0) { - printk("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } -#endif - - return 0; - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -err_unreg_notifier: - ip_conntrack_unregister_notifier(&ctnl_notifier); -err_unreg_exp_subsys: - nfnetlink_subsys_unregister(&ctnl_exp_subsys); -#endif -err_unreg_subsys: - nfnetlink_subsys_unregister(&ctnl_subsys); -err_out: - return ret; -} - -static void __exit ctnetlink_exit(void) -{ 
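[Editor's note: ctnetlink_init above is the canonical goto-unwind shape: each step that can fail jumps to a label that undoes only the steps already completed, keeping setup and teardown in strict mirror order. The skeleton of the pattern, with placeholders standing in for nfnetlink_subsys_register() and the notifier registrations:

/* Placeholders, each returning 0 or a negative errno. */
static int register_a(void) { return 0; }
static void unregister_a(void) { }
static int register_b(void) { return -1; /* simulate failure */ }

static int init_all(void)
{
	int ret;

	ret = register_a();
	if (ret < 0)
		goto err_out;

	ret = register_b();
	if (ret < 0)
		goto err_unreg_a;      /* undo only what succeeded */

	return 0;

err_unreg_a:
	unregister_a();
err_out:
	return ret;
}
]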
- printk("ctnetlink: unregistering from nfnetlink.\n"); - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); - ip_conntrack_unregister_notifier(&ctnl_notifier); -#endif - - nfnetlink_subsys_unregister(&ctnl_exp_subsys); - nfnetlink_subsys_unregister(&ctnl_subsys); - return; -} - -module_init(ctnetlink_init); -module_exit(ctnetlink_exit); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c deleted file mode 100644 index 88af82e9865..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ /dev/null @@ -1,74 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_generic_timeout __read_mostly = 600*HZ; - -static int generic_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return 1; -} - -static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int generic_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return 0; -} - -/* Print out the private part of the conntrack. */ -static int generic_print_conntrack(struct seq_file *s, - const struct ip_conntrack *state) -{ - return 0; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) -{ - return 1; -} - -struct ip_conntrack_protocol ip_conntrack_generic_protocol = -{ - .proto = 0, - .name = "unknown", - .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, - .packet = packet, - .new = new, -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c deleted file mode 100644 index ac1c49ef36a..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * ip_conntrack_proto_gre.c - Version 3.0 - * - * Connection tracking protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. 
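[Editor's note: because a PPTP call carries a different Call ID in each direction, a GRE data packet only reveals its destination key; the source key for the conntrack tuple has to come from state recorded when the control connection set the call up. That is what the keymap list below provides. A simplified model of the lookup direction, with assumed toy types (the real comparison also matches on addresses and protocol):

struct keymap { unsigned short src_key, dst_key; struct keymap *next; };

/* The reply-direction key is not present in the data packet; recover it
 * from what the PPTP helper stored at call-setup time. 0 == unknown. */
static unsigned short lookup_src_key(const struct keymap *head,
				     unsigned short dst_key)
{
	const struct keymap *km;

	for (km = head; km; km = km->next)
		if (km->dst_key == dst_key)
			return km->src_key;
	return 0;
}
]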
- * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/in.h> -#include <linux/list.h> -#include <linux/seq_file.h> -#include <linux/interrupt.h> - -static DEFINE_RWLOCK(ip_ct_gre_lock); - -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE"); - -/* shamelessly stolen from ip_conntrack_proto_udp.c */ -#define GRE_TIMEOUT (30*HZ) -#define GRE_STREAM_TIMEOUT (180*HZ) - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \ - NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \ - NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key)) -#else -#define DEBUGP(x, args...) -#define DUMP_TUPLE_GRE(x) -#endif - -/* GRE KEYMAP HANDLING FUNCTIONS */ -static LIST_HEAD(gre_keymap_list); - -static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, - const struct ip_conntrack_tuple *t) -{ - return ((km->tuple.src.ip == t->src.ip) && - (km->tuple.dst.ip == t->dst.ip) && - (km->tuple.dst.protonum == t->dst.protonum) && - (km->tuple.dst.u.all == t->dst.u.all)); -} - -/* look up the source key for a given tuple */ -static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) -{ - struct ip_ct_gre_keymap *km; - __be16 key = 0; - - read_lock_bh(&ip_ct_gre_lock); - list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t)) { - key = km->tuple.src.u.gre.key; - break; - } - } - read_unlock_bh(&ip_ct_gre_lock); - - DEBUGP("lookup src key 0x%x up key for ", key); - DUMP_TUPLE_GRE(t); - - return key; -} - -/* add a single keymap entry, associate with specified master ct */ -int -ip_ct_gre_keymap_add(struct ip_conntrack *ct, - struct ip_conntrack_tuple *t, int reply) -{ - struct ip_ct_gre_keymap **exist_km, *km; - - if (!ct->helper || strcmp(ct->helper->name, "pptp")) { - DEBUGP("refusing to add GRE keymap to non-pptp session\n"); - return -1; - } - - if (!reply) - exist_km = &ct->help.ct_pptp_info.keymap_orig; - else - exist_km = &ct->help.ct_pptp_info.keymap_reply; - - if (*exist_km) { - /* check whether it's a retransmission */ - list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *exist_km) - return 0; - } - DEBUGP("trying to override keymap_%s for ct %p\n", - reply? 
"reply":"orig", ct); - return -EEXIST; - } - - km = kmalloc(sizeof(*km), GFP_ATOMIC); - if (!km) - return -ENOMEM; - - memcpy(&km->tuple, t, sizeof(*t)); - *exist_km = km; - - DEBUGP("adding new entry %p: ", km); - DUMP_TUPLE_GRE(&km->tuple); - - write_lock_bh(&ip_ct_gre_lock); - list_add_tail(&km->list, &gre_keymap_list); - write_unlock_bh(&ip_ct_gre_lock); - - return 0; -} - -/* destroy the keymap entries associated with specified master ct */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct) -{ - DEBUGP("entering for ct %p\n", ct); - - if (!ct->helper || strcmp(ct->helper->name, "pptp")) { - DEBUGP("refusing to destroy GRE keymap to non-pptp session\n"); - return; - } - - write_lock_bh(&ip_ct_gre_lock); - if (ct->help.ct_pptp_info.keymap_orig) { - DEBUGP("removing %p from list\n", - ct->help.ct_pptp_info.keymap_orig); - list_del(&ct->help.ct_pptp_info.keymap_orig->list); - kfree(ct->help.ct_pptp_info.keymap_orig); - ct->help.ct_pptp_info.keymap_orig = NULL; - } - if (ct->help.ct_pptp_info.keymap_reply) { - DEBUGP("removing %p from list\n", - ct->help.ct_pptp_info.keymap_reply); - list_del(&ct->help.ct_pptp_info.keymap_reply->list); - kfree(ct->help.ct_pptp_info.keymap_reply); - ct->help.ct_pptp_info.keymap_reply = NULL; - } - write_unlock_bh(&ip_ct_gre_lock); -} - - -/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ - -/* invert gre part of tuple */ -static int gre_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - - return 1; -} - -/* gre hdr info to tuple */ -static int gre_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct gre_hdr_pptp _pgrehdr, *pgrehdr; - __be16 srckey; - struct gre_hdr _grehdr, *grehdr; - - /* first only delinearize old RFC1701 GRE header */ - grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { - /* try to behave like "ip_conntrack_proto_generic" */ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - return 1; - } - - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); - if (!pgrehdr) - return 1; - - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - DEBUGP("GRE_VERSION_PPTP but unknown proto\n"); - return 0; - } - - tuple->dst.u.gre.key = pgrehdr->call_id; - srckey = gre_keymap_lookup(tuple); - tuple->src.u.gre.key = srckey; - - return 1; -} - -/* print gre part of tuple */ -static int gre_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "srckey=0x%x dstkey=0x%x ", - ntohs(tuple->src.u.gre.key), - ntohs(tuple->dst.u.gre.key)); -} - -/* print private data for conntrack */ -static int gre_print_conntrack(struct seq_file *s, - const struct ip_conntrack *ct) -{ - return seq_printf(s, "timeout=%u, stream_timeout=%u ", - (ct->proto.gre.timeout / HZ), - (ct->proto.gre.stream_timeout / HZ)); -} - -/* Returns verdict for packet, and may modify conntrack */ -static int gre_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) -{ - /* If we've seen traffic both ways, this is a GRE connection. - * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.stream_timeout); - /* Also, more likely to be important, and not a probe. 
*/ - set_bit(IPS_ASSURED_BIT, &ct->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } else - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int gre_new(struct ip_conntrack *ct, - const struct sk_buff *skb) -{ - DEBUGP(": "); - DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; - - return 1; -} - -/* Called when a conntrack entry has already been removed from the hashes - * and is about to be deleted from memory */ -static void gre_destroy(struct ip_conntrack *ct) -{ - struct ip_conntrack *master = ct->master; - DEBUGP(" entering\n"); - - if (!master) - DEBUGP("no master !?!\n"); - else - ip_ct_gre_keymap_destroy(master); -} - -/* protocol helper struct */ -static struct ip_conntrack_protocol gre = { - .proto = IPPROTO_GRE, - .name = "gre", - .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, - .print_conntrack = gre_print_conntrack, - .packet = gre_packet, - .new = gre_new, - .destroy = gre_destroy, - .me = THIS_MODULE, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; - -/* ip_conntrack_proto_gre initialization */ -int __init ip_ct_proto_gre_init(void) -{ - return ip_conntrack_protocol_register(&gre); -} - -/* This cannot be __exit, as it is invoked from ip_conntrack_helper_pptp.c's - * init() code on errors. - */ -void ip_ct_proto_gre_fini(void) -{ - struct list_head *pos, *n; - - /* delete all keymap entries */ - write_lock_bh(&ip_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - DEBUGP("deleting keymap %p at module unload time\n", pos); - list_del(pos); - kfree(pos); - } - write_unlock_bh(&ip_ct_gre_lock); - - ip_conntrack_protocol_unregister(&gre); -} - -EXPORT_SYMBOL(ip_ct_gre_keymap_add); -EXPORT_SYMBOL(ip_ct_gre_keymap_destroy); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c deleted file mode 100644 index ad70c81a21e..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ /dev/null @@ -1,315 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/icmp.h> -#include <linux/seq_file.h> -#include <linux/skbuff.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_icmp_timeout __read_mostly = 30*HZ; - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - -static int icmp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct icmphdr _hdr, *hp; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return 0; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return 1; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) - || !invmap[orig->dst.u.icmp.type]) - return 0; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int icmp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - -/* Print out the private part of the conntrack. */ -static int icmp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - return 0; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - /* Try to delete connection immediately after all replies: - won't actually vanish as we still have skb, and del_timer - means this will only run once even if count hits zero twice - (theoretically possible with SMP) */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - } else { - atomic_inc(&ct->proto.icmp.count); - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); - } - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int icmp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) - || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - DEBUGP("icmp: can't create new conn with type %u\n", - conntrack->tuplehash[0].tuple.dst.u.icmp.type); - DUMP_TUPLE(&conntrack->tuplehash[0].tuple); - return 0; - } - atomic_set(&conntrack->proto.icmp.count, 0); - return 1; -} - -static int -icmp_error_message(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct ip_conntrack_tuple innertuple, origtuple; - struct { - struct icmphdr icmp; - struct iphdr ip; - } _in, *inside; - struct ip_conntrack_protocol *innerproto; - struct ip_conntrack_tuple_hash *h; - int dataoff; - - IP_NF_ASSERT(skb->nfct == NULL); - - /* Not enough header? 
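[Editor's note: the "+ 1" in invmap above deserves a word. ICMP_ECHOREPLY is type 0, so a table storing inverse types directly could not distinguish "the inverse is type 0" from "no inverse known" in a zero-initialized array. Storing each inverse plus one and subtracting on lookup solves it:

#include <stdio.h>

#define ICMP_ECHOREPLY 0
#define ICMP_ECHO      8

/* Empty slots stay 0 even though a valid inverse may itself be type 0. */
static const unsigned char invmap[] = {
	[ICMP_ECHO]      = ICMP_ECHOREPLY + 1,
	[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
};

static int invert_type(unsigned int type, unsigned char *out)
{
	if (type >= sizeof(invmap) || !invmap[type])
		return -1;             /* no tracked inverse */
	*out = invmap[type] - 1;       /* undo the +1 offset */
	return 0;
}

int main(void)
{
	unsigned char t;
	if (invert_type(ICMP_ECHO, &t) == 0)
		printf("inverse of ECHO is %u\n", t);  /* prints 0 */
	return 0;
}
]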
*/ - inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); - if (inside == NULL) - return -NF_ACCEPT; - - /* Ignore ICMP's containing fragments (shouldn't happen) */ - if (inside->ip.frag_off & htons(IP_OFFSET)) { - DEBUGP("icmp_error_track: fragment of proto %u\n", - inside->ip.protocol); - return -NF_ACCEPT; - } - - innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; - /* Are they talking about one of our connections? */ - if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { - DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); - ip_conntrack_proto_put(innerproto); - return -NF_ACCEPT; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { - DEBUGP("icmp_error_track: Can't invert tuple\n"); - ip_conntrack_proto_put(innerproto); - return -NF_ACCEPT; - } - ip_conntrack_proto_put(innerproto); - - *ctinfo = IP_CT_RELATED; - - h = ip_conntrack_find_get(&innertuple, NULL); - if (!h) { - /* Locally generated ICMPs will match inverted if they - haven't been SNAT'ed yet */ - /* FIXME: NAT code has to handle half-done double NAT --RR */ - if (hooknum == NF_IP_LOCAL_OUT) - h = ip_conntrack_find_get(&origtuple, NULL); - - if (!h) { - DEBUGP("icmp_error_track: no match\n"); - return -NF_ACCEPT; - } - /* Reverse direction from that found */ - if (DIRECTION(h) != IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } else { - if (DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - skb->nfct = &tuplehash_to_ctrack(h)->ct_general; - skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct icmphdr _ih, *icmph; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); - if (icmph == NULL) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: short packet "); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad ICMP checksum "); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: invalid ICMP type "); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? 
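[Editor's note: both error handlers above return -NF_ACCEPT rather than NF_ACCEPT, and the sign carries information: the conntrack core treats a non-positive return from a protocol's error hook as a final verdict, negates it, and passes the packet through without creating or updating any tracking state. A condensed model of that convention, assumed from how ip_conntrack_in consumes the value:

#define NF_ACCEPT 1                    /* as in linux/netfilter.h */

/* proto_error_ret: what the ->error() hook returned.
 * Returns the final verdict, or -1 meaning "continue into tracking". */
static int consume_error_verdict(int proto_error_ret)
{
	if (proto_error_ret <= 0)
		return -proto_error_ret; /* -NF_ACCEPT -> NF_ACCEPT, untracked */
	return -1;                       /* positive: keep tracking the packet */
}
]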
*/ - if (icmph->type != ICMP_DEST_UNREACH - && icmph->type != ICMP_SOURCE_QUENCH - && icmph->type != ICMP_TIME_EXCEEDED - && icmph->type != ICMP_PARAMETERPROB - && icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(skb, ctinfo, hooknum); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -static int icmp_tuple_to_nfattr(struct sk_buff *skb, - const struct ip_conntrack_tuple *t) -{ - NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(__be16), - &t->src.u.icmp.id); - NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), - &t->dst.u.icmp.type); - NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), - &t->dst.u.icmp.code); - - return 0; - -nfattr_failure: - return -1; -} - -static int icmp_nfattr_to_tuple(struct nfattr *tb[], - struct ip_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1] - || !tb[CTA_PROTO_ICMP_ID-1]) - return -EINVAL; - - tuple->dst.u.icmp.type = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); - tuple->dst.u.icmp.code = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); - tuple->src.u.icmp.id = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) - || !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} -#endif - -struct ip_conntrack_protocol ip_conntrack_protocol_icmp = -{ - .proto = IPPROTO_ICMP, - .name = "icmp", - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .print_tuple = icmp_print_tuple, - .print_conntrack = icmp_print_conntrack, - .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = icmp_tuple_to_nfattr, - .nfattr_to_tuple = icmp_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c deleted file mode 100644 index e6942992b2f..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * Connection tracking protocol helper module for SCTP. - * - * SCTP is defined in RFC 2960. References to various sections in this code - * are to this RFC. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * Added support for proc manipulation of timeouts. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/interrupt.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/sctp.h> -#include <linux/string.h> -#include <linux/seq_file.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -#if 0 -#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) -#else -#define DEBUGP(format, args...) -#endif - -/* Protects conntrack->proto.sctp */ -static DEFINE_RWLOCK(sctp_lock); - -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. 
--RR - - And so for me for SCTP :D -Kiran */ - -static const char *sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", -}; - -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -static unsigned int ip_ct_sctp_timeout_closed __read_mostly = 10 SECS; -static unsigned int ip_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS; -static unsigned int ip_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS; -static unsigned int ip_ct_sctp_timeout_established __read_mostly = 5 DAYS; -static unsigned int ip_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000; -static unsigned int ip_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000; -static unsigned int ip_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS; - -static const unsigned int * sctp_timeouts[] -= { NULL, /* SCTP_CONNTRACK_NONE */ - &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ - &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ - &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ - &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ - &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ - &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ - &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ - }; - -#define sNO SCTP_CONNTRACK_NONE -#define sCL SCTP_CONNTRACK_CLOSED -#define sCW SCTP_CONNTRACK_COOKIE_WAIT -#define sCE SCTP_CONNTRACK_COOKIE_ECHOED -#define sES SCTP_CONNTRACK_ESTABLISHED -#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT -#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD -#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT -#define sIV SCTP_CONNTRACK_MAX - -/* - These are the descriptions of the states: - -NOTE: These state names are tantalizingly similar to the states of an -SCTP endpoint. But the interpretation of the states is a little different, -considering that these are the states of the connection and not of an end -point. Please note the subtleties. -Kiran - -NONE - Nothing so far. -COOKIE WAIT - We have seen an INIT chunk in the original direction, or also - an INIT_ACK chunk in the reply direction. -COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. -ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. -SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. -SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite - to that of the SHUTDOWN chunk. -CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - the SHUTDOWN chunk. Connection is closed. -*/ - -/* TODO - - I have assumed that the first INIT is in the original direction. - This messes things when an INIT comes in the reply direction in CLOSED - state. - - Check the error type in the reply dir before transitioning from -cookie echoed to closed. - - Sec 5.2.4 of RFC 2960 - - Multi Homing support. 
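[Editor's note: the SECS/MINS/HOURS/DAYS macros above work by bare token pasting, turning a literal into a jiffies expression that reads like a unit suffix: "5 DAYS" expands to "5 * 24 * 60 * 60 * HZ". Note also that "300 SECS / 1000" is deliberate, yielding 0.3 seconds' worth of ticks for the two shutdown timeouts. Expanded by hand, with an illustrative HZ:

#include <stdio.h>

#define HZ 100                         /* illustrative tick rate */
#define SECS  * HZ
#define MINS  * 60 SECS
#define HOURS * 60 MINS
#define DAYS  * 24 HOURS

int main(void)
{
	unsigned int established = 5 DAYS;          /* 5*24*60*60*HZ     */
	unsigned int shutdown    = 300 SECS / 1000; /* 300*HZ/1000 ticks */

	printf("%u %u\n", established, shutdown);   /* 43200000 30 */
	return 0;
}
]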
-*/ - -/* SCTP conntrack state transitions */ -static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { - { -/* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} - }, - { -/* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} - } -}; - -static int sctp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - sctp_sctphdr_t _hdr, *hp; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.sctp.port = hp->source; - tuple->dst.u.sctp.port = hp->dest; - return 1; -} - -static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - tuple->src.u.sctp.port = orig->dst.u.sctp.port; - tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int sctp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.sctp.port), - ntohs(tuple->dst.u.sctp.port)); -} - -/* Print out the private part of the conntrack. 
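[Editor's note: every transition above is a single array lookup, sctp_conntracks[direction][chunk row][current state], with sIV in a cell marking the combination invalid. Reading one cell as a function, dimensions as in the table and values assumed:

enum { DIR_ORIGINAL = 0, DIR_REPLY = 1 };

#define N_ROWS   9                     /* chunk classes: init..shutdown_comp */
#define N_STATES 8                     /* sNO..sSA; the value 8 doubles as sIV */

/* E.g. an INIT (row 0) in the original direction while in state 0 (sNO)
 * lands in COOKIE_WAIT; a cell holding 8 (sIV) means drop as invalid. */
static int sctp_next_state(const unsigned char t[2][N_ROWS][N_STATES],
			   int dir, int chunk_row, int cur_state)
{
	return t[dir][chunk_row][cur_state];
}
]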
*/ -static int sctp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - enum sctp_conntrack state; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - read_lock_bh(&sctp_lock); - state = conntrack->proto.sctp.state; - read_unlock_bh(&sctp_lock); - - return seq_printf(s, "%s ", sctp_conntrack_names[state]); -} - -#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \ -for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \ - offset < skb->len && \ - (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ - offset += (ntohs(sch->length) + 3) & ~3, count++) - -/* Some validity checks to make sure the chunks are fine */ -static int do_basic_checks(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - char *map) -{ - u_int32_t offset, count; - sctp_chunkhdr_t _sch, *sch; - int flag; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - flag = 0; - - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); - - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK - || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - flag = 1; - } - - /* - * Cookie Ack/Echo chunks not the first OR - * Init / Init Ack / Shutdown compl chunks not the only chunks - * OR zero-length. - */ - if (((sch->type == SCTP_CID_COOKIE_ACK - || sch->type == SCTP_CID_COOKIE_ECHO - || flag) - && count !=0) || !sch->length) { - DEBUGP("Basic checks failed\n"); - return 1; - } - - if (map) { - set_bit(sch->type, (void *)map); - } - } - - DEBUGP("Basic checks passed\n"); - return count == 0; -} - -static int new_state(enum ip_conntrack_dir dir, - enum sctp_conntrack cur_state, - int chunk_type) -{ - int i; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - DEBUGP("Chunk type: %d\n", chunk_type); - - switch (chunk_type) { - case SCTP_CID_INIT: - DEBUGP("SCTP_CID_INIT\n"); - i = 0; break; - case SCTP_CID_INIT_ACK: - DEBUGP("SCTP_CID_INIT_ACK\n"); - i = 1; break; - case SCTP_CID_ABORT: - DEBUGP("SCTP_CID_ABORT\n"); - i = 2; break; - case SCTP_CID_SHUTDOWN: - DEBUGP("SCTP_CID_SHUTDOWN\n"); - i = 3; break; - case SCTP_CID_SHUTDOWN_ACK: - DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); - i = 4; break; - case SCTP_CID_ERROR: - DEBUGP("SCTP_CID_ERROR\n"); - i = 5; break; - case SCTP_CID_COOKIE_ECHO: - DEBUGP("SCTP_CID_COOKIE_ECHO\n"); - i = 6; break; - case SCTP_CID_COOKIE_ACK: - DEBUGP("SCTP_CID_COOKIE_ACK\n"); - i = 7; break; - case SCTP_CID_SHUTDOWN_COMPLETE: - DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); - i = 8; break; - default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ - DEBUGP("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); - return cur_state; - } - - DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - - return sctp_conntracks[dir][i][cur_state]; -} - -/* Returns verdict for packet, or -1 for invalid. 
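[Editor's note: for_each_sctp_chunk above advances by "(ntohs(sch->length) + 3) & ~3" because SCTP pads every chunk to a four-byte boundary on the wire while the length field excludes that padding (RFC 2960 sec. 3.2). The rounding in isolation:

#include <stdio.h>

/* On-wire size of a chunk whose header says 'len': round up to 4. */
static unsigned int padded_len(unsigned int len)
{
	return (len + 3) & ~3u;
}

int main(void)
{
	printf("%u %u %u\n", padded_len(16), padded_len(17), padded_len(20));
	/* prints: 16 20 20 */
	return 0;
}
]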
*/ -static int sctp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - enum sctp_conntrack newconntrack, oldsctpstate; - struct iphdr *iph = skb->nh.iph; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; - u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); - if (sh == NULL) - return -1; - - if (do_basic_checks(conntrack, skb, map) != 0) - return -1; - - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) - && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) - && !test_bit(SCTP_CID_ABORT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) - && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - DEBUGP("Verification tag check failed\n"); - return -1; - } - - oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - write_lock_bh(&sctp_lock); - - /* Special cases of Verification tag check (Sec 8.5.1) */ - if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 (A) */ - if (sh->vtag != 0) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)] - && (sch->flags & 1))) { - write_unlock_bh(&sctp_lock); - return -1; - } - } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } - } - - oldsctpstate = conntrack->proto.sctp.state; - newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); - - /* Invalid */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", - CTINFO2DIR(ctinfo), sch->type, oldsctpstate); - write_unlock_bh(&sctp_lock); - return -1; - } - - /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK) { - sctp_inithdr_t _inithdr, *ih; - - ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) { - write_unlock_bh(&sctp_lock); - return -1; - } - DEBUGP("Setting vtag %x for dir %d\n", - ih->init_tag, !CTINFO2DIR(ctinfo)); - conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; - } - - conntrack->proto.sctp.state = newconntrack; - if (oldsctpstate != newconntrack) - ip_conntrack_event_cache(IPCT_PROTOINFO, skb); - write_unlock_bh(&sctp_lock); - } - - ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); - - if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED - && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY - && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { - DEBUGP("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &conntrack->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. 
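[Editor's note: the verification-tag special cases above (RFC 2960 sec. 8.5.1) boil down to a few rules: INIT must carry vtag 0; ABORT may carry the tag of either direction; SHUTDOWN_COMPLETE may carry the reflected tag only with the T flag set; everything else, COOKIE_ECHO included, must match the tag recorded for its own direction (SHUTDOWN_ACK is exempted from the blanket check by the chunk-map test). Condensed into one predicate, a sketch of the checks rather than kernel code:

enum chunk { INIT, ABORT, SHUTDOWN_COMPLETE, OTHER };

/* own: tag expected in the packet's direction; peer: the other one. */
static int vtag_ok(enum chunk c, int t_flag, unsigned int vtag,
		   unsigned int own, unsigned int peer)
{
	switch (c) {
	case INIT:
		return vtag == 0;                       /* 8.5.1 (A) */
	case ABORT:
		return vtag == own || vtag == peer;     /* 8.5.1 (B) */
	case SHUTDOWN_COMPLETE:
		return vtag == own ||
		       (vtag == peer && t_flag);        /* 8.5.1 (C) */
	default:
		return vtag == own;   /* 8.5 general rule, incl. (D) */
	}
}
]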
*/ -static int sctp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - enum sctp_conntrack newconntrack; - struct iphdr *iph = skb->nh.iph; - sctp_sctphdr_t _sctph, *sh; - sctp_chunkhdr_t _sch, *sch; - u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; - - DEBUGP(__FUNCTION__); - DEBUGP("\n"); - - sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); - if (sh == NULL) - return 0; - - if (do_basic_checks(conntrack, skb, map) != 0) - return 0; - - /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if ((test_bit (SCTP_CID_ABORT, (void *)map)) - || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) - || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { - return 0; - } - - newconntrack = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, count) { - /* Don't need lock here: this conntrack not in circulation yet */ - newconntrack = new_state (IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); - - /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); - return 0; - } - - /* Copy the vtag into the state info */ - if (sch->type == SCTP_CID_INIT) { - if (sh->vtag == 0) { - sctp_inithdr_t _inithdr, *ih; - - ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) - return 0; - - DEBUGP("Setting vtag %x for new conn\n", - ih->init_tag); - - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = - ih->init_tag; - } else { - /* Sec 8.5.1 (A) */ - return 0; - } - } - /* If it is a shutdown ack OOTB packet, we expect a return - shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ - else { - DEBUGP("Setting vtag %x for new conn OOTB\n", - sh->vtag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; - } - - conntrack->proto.sctp.state = newconntrack; - } - - return 1; -} - -static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { - .proto = IPPROTO_SCTP, - .name = "sctp", - .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, - .print_conntrack = sctp_print_conntrack, - .packet = sctp_packet, - .new = sctp_new, - .destroy = NULL, - .me = THIS_MODULE, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; - -#ifdef CONFIG_SYSCTL -static ctl_table ip_ct_sysctl_table[] = { - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, - .procname = "ip_conntrack_sctp_timeout_closed", - .data = &ip_ct_sctp_timeout_closed, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, - .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .data = &ip_ct_sctp_timeout_cookie_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, - .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .data = &ip_ct_sctp_timeout_cookie_echoed, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, - .procname = "ip_conntrack_sctp_timeout_established", - .data = &ip_ct_sctp_timeout_established, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = 
&proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, - .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .data = &ip_ct_sctp_timeout_shutdown_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, - .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .data = &ip_ct_sctp_timeout_shutdown_recd, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, - .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &ip_ct_sctp_timeout_shutdown_ack_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - .child = ip_ct_sysctl_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ip_ct_netfilter_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ip_ct_ipv4_table, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header *ip_ct_sysctl_header; -#endif - -static int __init ip_conntrack_proto_sctp_init(void) -{ - int ret; - - ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); - if (ret) { - printk("ip_conntrack_proto_sctp: protocol register failed\n"); - goto out; - } - -#ifdef CONFIG_SYSCTL - ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table); - if (ip_ct_sysctl_header == NULL) { - ret = -ENOMEM; - printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); - goto cleanup; - } -#endif - - return ret; - -#ifdef CONFIG_SYSCTL - cleanup: - ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); -#endif - out: - DEBUGP("SCTP conntrack module loading %s\n", - ret ? "failed": "succeeded"); - return ret; -} - -static void __exit ip_conntrack_proto_sctp_fini(void) -{ - ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ip_ct_sysctl_header); -#endif - DEBUGP("SCTP conntrack module unloaded\n"); -} - -module_init(ip_conntrack_proto_sctp_init); -module_exit(ip_conntrack_proto_sctp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c deleted file mode 100644 index 0a72eab1462..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ /dev/null @@ -1,1164 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
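[Editor's note: the four ctl_table arrays above chain through .child to spell the sysctl path net -> ipv4 -> netfilter -> leaf, so each timeout surfaces as e.g. /proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established, with proc_dointvec_jiffies translating between seconds in /proc and jiffies in the variable. A toy walker over the same .child shape (the real .child points at an array of siblings; a single child suffices here):

#include <stdio.h>

struct tbl { const char *procname; const struct tbl *child; };

static const struct tbl leaf = { "ip_conntrack_sctp_timeout_established", NULL };
static const struct tbl nf   = { "netfilter", &leaf };
static const struct tbl ipv4 = { "ipv4", &nf };
static const struct tbl net  = { "net", &ipv4 };

int main(void)
{
	const struct tbl *t;

	printf("/proc/sys");
	for (t = &net; t; t = t->child)
		printf("/%s", t->procname);   /* follow the .child chain */
	printf("\n");
	return 0;
}
]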
- * - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: - * - Real stateful connection tracking - * - Modified state transitions table - * - Window scaling support added - * - SACK support added - * - * Willy Tarreau: - * - State table bugfixes - * - More robust state changes - * - Tuning timer parameters - * - * version 2.2 - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/spinlock.h> - -#include <net/tcp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -#if 0 -#define DEBUGP printk -#define DEBUGP_VARS -#else -#define DEBUGP(format, args...) -#endif - -/* Protects conntrack->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); - -/* "Be conservative in what you do, - be liberal in what you accept from others." - If it's non-zero, we mark only out of window RST segments as INVALID. */ -int ip_ct_tcp_be_liberal __read_mostly = 0; - -/* If it is set to zero, we disable picking up already established - connections. */ -int ip_ct_tcp_loose __read_mostly = 1; - -/* Max number of the retransmitted packets without receiving an (acceptable) - ACK from the destination. If this number is reached, a shorter timer - will be started. */ -int ip_ct_tcp_max_retrans __read_mostly = 3; - - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. --RR */ - -static const char *tcp_conntrack_names[] = { - "NONE", - "SYN_SENT", - "SYN_RECV", - "ESTABLISHED", - "FIN_WAIT", - "CLOSE_WAIT", - "LAST_ACK", - "TIME_WAIT", - "CLOSE", - "LISTEN" -}; - -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -unsigned int ip_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -unsigned int ip_ct_tcp_timeout_established __read_mostly = 5 DAYS; -unsigned int ip_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -unsigned int ip_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -unsigned int ip_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -unsigned int ip_ct_tcp_timeout_close __read_mostly = 10 SECS; - -/* RFC1122 says the R2 limit should be at least 100 seconds. - Linux uses 15 packets as limit, which corresponds - to ~13-30min depending on RTO. 
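A note on the unit macros above: they are plain token substitution, so an initializer like `2 MINS` expands to `2 * 60 * HZ` jiffies. A minimal userspace sketch of the trick (with `HZ` stubbed to 100; the kernel supplies the real value):

```c
#include <stdio.h>

#define HZ 100                    /* stand-in; the kernel defines this */

#define SECS  * HZ
#define MINS  * 60 SECS
#define HOURS * 60 MINS
#define DAYS  * 24 HOURS

int main(void)
{
	/* 2 MINS expands to 2 * 60 * HZ, i.e. the jiffies in two minutes */
	unsigned int syn_sent    = 2 MINS;
	unsigned int established = 5 DAYS;

	printf("syn_sent=%u jiffies, established=%u jiffies\n",
	       syn_sent, established);
	return 0;
}
```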
*/ -unsigned int ip_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; - -static const unsigned int * tcp_timeouts[] -= { NULL, /* TCP_CONNTRACK_NONE */ - &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ - &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ - &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ - &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ - &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ - NULL, /* TCP_CONNTRACK_LISTEN */ - }; - -#define sNO TCP_CONNTRACK_NONE -#define sSS TCP_CONNTRACK_SYN_SENT -#define sSR TCP_CONNTRACK_SYN_RECV -#define sES TCP_CONNTRACK_ESTABLISHED -#define sFW TCP_CONNTRACK_FIN_WAIT -#define sCW TCP_CONNTRACK_CLOSE_WAIT -#define sLA TCP_CONNTRACK_LAST_ACK -#define sTW TCP_CONNTRACK_TIME_WAIT -#define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN -#define sIV TCP_CONNTRACK_MAX -#define sIG TCP_CONNTRACK_IGNORE - -/* What TCP flags are set from RST/SYN/FIN/ACK. */ -enum tcp_bit_set { - TCP_SYN_SET, - TCP_SYNACK_SET, - TCP_FIN_SET, - TCP_ACK_SET, - TCP_RST_SET, - TCP_NONE_SET, -}; - -/* - * The TCP state transition table needs a few words... - * - * We are the man in the middle. All the packets go through us - * but might get lost in transit to the destination. - * It is assumed that the destinations can't receive segments - * we haven't seen. - * - * The checked segment is in window, but our windows are *not* - * equivalent with the ones of the sender/receiver. We always - * try to guess the state of the current sender. - * - * The meaning of the states are: - * - * NONE: initial state - * SYN_SENT: SYN-only packet seen - * SYN_RECV: SYN-ACK packet seen - * ESTABLISHED: ACK packet seen - * FIN_WAIT: FIN packet seen - * CLOSE_WAIT: ACK seen (after FIN) - * LAST_ACK: FIN seen (after FIN) - * TIME_WAIT: last ACK seen - * CLOSE: closed connection - * - * LISTEN state is not used. - * - * Packets marked as IGNORED (sIG): - * if they may be either invalid or valid - * and the receiver may send back a connection - * closing RST or a SYN/ACK. - * - * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) - */ -static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { - { -/* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, -/* - * sNO -> sSS Initialize a new connection - * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. - * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sSS Reopened connection (RFC 1122). - * sCL -> sSS - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, -/* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). 
- */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, -/* - * sNO -> sIV Too late and no reason to do anything... - * sSS -> sIV Client might not send FIN in this state: - * we enforce waiting for a SYN/ACK reply first. - * sSR -> sFW Close started. - * sES -> sFW - * sFW -> sLA FIN seen in both directions, waiting for - * the last ACK. - * Might be a retransmitted FIN as well... - * sCW -> sLA - * sLA -> sLA Retransmitted FIN. Remain in the same state. - * sTW -> sTW - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, -/* - * sNO -> sES Assumed. - * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. - * sSR -> sES Established state is reached. - * sES -> sES :-) - * sFW -> sCW Normal close request answered by ACK. - * sCW -> sCW - * sLA -> sTW Last ACK detected. - * sTW -> sTW Retransmitted last ACK. Remain in the same state. - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, -/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } - }, - { -/* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, -/* - * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported. - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. - * sFW -> sIV - * sCW -> sIV - * sLA -> sIV - * sTW -> sIV Reopened connection, but server may not do it. - * sCL -> sIV - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, -/* - * sSS -> sSR Standard open. - * sSR -> sSR Retransmitted SYN/ACK. - * sES -> sIG Late retransmitted SYN/ACK? - * sFW -> sIG Might be SYN/ACK answering ignored SYN - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, -/* - * sSS -> sIV Server might not send FIN in this state. - * sSR -> sFW Close started. - * sES -> sFW - * sFW -> sLA FIN seen in both directions. - * sCW -> sLA - * sLA -> sLA Retransmitted FIN. - * sTW -> sTW - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, -/* - * sSS -> sIG Might be a half-open connection. - * sSR -> sSR Might answer late resent SYN. - * sES -> sES :-) - * sFW -> sCW Normal close request answered by ACK. - * sCW -> sCW - * sLA -> sTW Last ACK detected. - * sTW -> sTW Retransmitted last ACK. - * sCL -> sCL - */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, -/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } - } -}; - -static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct tcphdr _hdr, *hp; - - /* Actually only need first 8 bytes. 
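For reference, the table above is indexed as direction, then flag class, then current state. A compilable miniature of the same lookup path, using stand-in enums and a heavily reduced table (not the kernel's full matrix):

```c
#include <stdio.h>

/* Stand-ins for the kernel enums: fewer states, fewer flag classes. */
enum { sNO, sSS, sSR, sES, sIV, S_MAX };
enum { IDX_SYN, IDX_SYNACK, IDX_MAX };
enum { DIR_ORIGINAL, DIR_REPLY };

/* dir x flag-class x old-state -> new-state, as in tcp_conntracks[][][] */
static const int table[2][IDX_MAX][S_MAX] = {
	[DIR_ORIGINAL] = {
		[IDX_SYN]    = { [sNO] = sSS, [sSS] = sSS, [sSR] = sIV,
				 [sES] = sIV, [sIV] = sIV },
		[IDX_SYNACK] = { [sNO] = sIV, [sSS] = sIV, [sSR] = sIV,
				 [sES] = sIV, [sIV] = sIV },
	},
	[DIR_REPLY] = {
		[IDX_SYN]    = { [sNO] = sIV, [sSS] = sIV, [sSR] = sIV,
				 [sES] = sIV, [sIV] = sIV },
		[IDX_SYNACK] = { [sNO] = sIV, [sSS] = sSR, [sSR] = sSR,
				 [sES] = sIV, [sIV] = sIV },
	},
};

int main(void)
{
	/* An original-direction SYN on a fresh entry starts the handshake... */
	int state = table[DIR_ORIGINAL][IDX_SYN][sNO];	/* -> sSS */
	/* ...and the reply-direction SYN/ACK advances it to SYN_RECV. */
	state = table[DIR_REPLY][IDX_SYNACK][state];	/* -> sSR */
	printf("state=%d (sSR=%d)\n", state, sSR);
	return 0;
}
```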
*/ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.tcp.port = hp->source; - tuple->dst.u.tcp.port = hp->dest; - - return 1; -} - -static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.tcp.port = orig->dst.u.tcp.port; - tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int tcp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.tcp.port), - ntohs(tuple->dst.u.tcp.port)); -} - -/* Print out the private part of the conntrack. */ -static int tcp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - enum tcp_conntrack state; - - read_lock_bh(&tcp_lock); - state = conntrack->proto.tcp.state; - read_unlock_bh(&tcp_lock); - - return seq_printf(s, "%s ", tcp_conntrack_names[state]); -} - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, - const struct ip_conntrack *ct) -{ - struct nfattr *nest_parms; - - read_lock_bh(&tcp_lock); - nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); - NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), - &ct->proto.tcp.state); - read_unlock_bh(&tcp_lock); - - NFA_NEST_END(skb, nest_parms); - - return 0; - -nfattr_failure: - read_unlock_bh(&tcp_lock); - return -1; -} - -static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { - [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), -}; - -static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) -{ - struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; - struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - - /* updates could not contain anything about the private - * protocol info, in that case skip the parsing */ - if (!attr) - return 0; - - nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); - - if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) - return -EINVAL; - - if (!tb[CTA_PROTOINFO_TCP_STATE-1]) - return -EINVAL; - - write_lock_bh(&tcp_lock); - ct->proto.tcp.state = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); - write_unlock_bh(&tcp_lock); - - return 0; -} -#endif - -static unsigned int get_conntrack_index(const struct tcphdr *tcph) -{ - if (tcph->rst) return TCP_RST_SET; - else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); - else if (tcph->fin) return TCP_FIN_SET; - else if (tcph->ack) return TCP_ACK_SET; - else return TCP_NONE_SET; -} - -/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering - in IP Filter' by Guido van Rooij. - - http://www.nluug.nl/events/sane2000/papers.html - http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz - - The boundaries and the conditions are changed according to RFC793: - the packet must intersect the window (i.e. segments may be - after the right or before the left edge) and thus receivers may ACK - segments after the right edge of the window. - - td_maxend = max(sack + max(win,1)) seen in reply packets - td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets - td_maxwin += seq + len - sender.td_maxend - if seq + len > sender.td_maxend - td_end = max(seq + len) seen in sent packets - - I. Upper bound for valid data: seq <= sender.td_maxend - II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin - III. Upper bound for valid ack: sack <= receiver.td_end - IV. 
Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW - - where sack is the highest right edge of sack block found in the packet. - - The upper bound limit for a valid ack is not ignored - - we don't have to deal with fragments. -*/ - -static inline __u32 segment_seq_plus_len(__u32 seq, - size_t len, - struct iphdr *iph, - struct tcphdr *tcph) -{ - return (seq + len - (iph->ihl + tcph->doff)*4 - + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); -} - -/* Fixme: what about big packets? */ -#define MAXACKWINCONST 66000 -#define MAXACKWINDOW(sender) \ - ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ - : MAXACKWINCONST) - -/* - * Simplified tcp_parse_options routine from tcp_input.c - */ -static void tcp_options(const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph, - struct ip_ct_tcp_state *state) -{ - unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; - int length = (tcph->doff*4) - sizeof(struct tcphdr); - - if (!length) - return; - - ptr = skb_header_pointer(skb, - (iph->ihl * 4) + sizeof(struct tcphdr), - length, buff); - BUG_ON(ptr == NULL); - - state->td_scale = - state->flags = 0; - - while (length > 0) { - int opcode=*ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - - if (opcode == TCPOPT_SACK_PERM - && opsize == TCPOLEN_SACK_PERM) - state->flags |= IP_CT_TCP_FLAG_SACK_PERM; - else if (opcode == TCPOPT_WINDOW - && opsize == TCPOLEN_WINDOW) { - state->td_scale = *(u_int8_t *)ptr; - - if (state->td_scale > 14) { - /* See RFC1323 */ - state->td_scale = 14; - } - state->flags |= - IP_CT_TCP_FLAG_WINDOW_SCALE; - } - ptr += opsize - 2; - length -= opsize; - } - } -} - -static void tcp_sack(const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph, - __u32 *sack) -{ - unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; - int length = (tcph->doff*4) - sizeof(struct tcphdr); - __u32 tmp; - - if (!length) - return; - - ptr = skb_header_pointer(skb, - (iph->ihl * 4) + sizeof(struct tcphdr), - length, buff); - BUG_ON(ptr == NULL); - - /* Fast path for timestamp-only option */ - if (length == TCPOLEN_TSTAMP_ALIGNED*4 - && *(__be32 *)ptr == - __constant_htonl((TCPOPT_NOP << 24) - | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return; - - while (length > 0) { - int opcode=*ptr++; - int opsize, i; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - - if (opcode == TCPOPT_SACK - && opsize >= (TCPOLEN_SACK_BASE - + TCPOLEN_SACK_PERBLOCK) - && !((opsize - TCPOLEN_SACK_BASE) - % TCPOLEN_SACK_PERBLOCK)) { - for (i = 0; - i < (opsize - TCPOLEN_SACK_BASE); - i += TCPOLEN_SACK_PERBLOCK) { - tmp = ntohl(*((__be32 *)(ptr+i)+1)); - - if (after(tmp, *sack)) - *sack = tmp; - } - return; - } - ptr += opsize - 2; - length -= opsize; - } - } -} - -static int tcp_in_window(struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct ip_ct_tcp_state *sender = &state->seen[dir]; - struct ip_ct_tcp_state *receiver = &state->seen[!dir]; - __u32 
seq, ack, sack, end, win, swin; - int res; - - /* - * Get the required data from the packet. - */ - seq = ntohl(tcph->seq); - ack = sack = ntohl(tcph->ack_seq); - win = ntohs(tcph->window); - end = segment_seq_plus_len(seq, skb->len, iph, tcph); - - if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) - tcp_sack(skb, iph, tcph, &sack); - - DEBUGP("tcp_in_window: START\n"); - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack=%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - if (sender->td_end == 0) { - /* - * Initialize sender data. - */ - if (tcph->syn && tcph->ack) { - /* - * Outgoing SYN-ACK in reply to a SYN. - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, iph, tcph, sender); - /* - * RFC 1323: - * Both sides must send the Window Scale option - * to enable window scaling in either direction. - */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE - && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) - sender->td_scale = - receiver->td_scale = 0; - } else { - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - sender->td_end = end; - sender->td_maxwin = (win == 0 ? 1 : win); - sender->td_maxend = end + sender->td_maxwin; - } - } else if (((state->state == TCP_CONNTRACK_SYN_SENT - && dir == IP_CT_DIR_ORIGINAL) - || (state->state == TCP_CONNTRACK_SYN_RECV - && dir == IP_CT_DIR_REPLY)) - && after(end, sender->td_end)) { - /* - * RFC 793: "if a TCP is reinitialized ... then it need - * not wait at all; it must only be sure to use sequence - * numbers larger than those recently used." - */ - sender->td_end = - sender->td_maxend = end; - sender->td_maxwin = (win == 0 ? 1 : win); - - tcp_options(skb, iph, tcph, sender); - } - - if (!(tcph->ack)) { - /* - * If there is no ACK, just pretend it was set and OK. - */ - ack = sack = receiver->td_end; - } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == - (TCP_FLAG_ACK|TCP_FLAG_RST)) - && (ack == 0)) { - /* - * Broken TCP stacks that set ACK in RST packets as well - * with zero ack value. - */ - ack = sack = receiver->td_end; - } - - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) - /* - * Packet contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN). 
- */ - seq = end = sender->td_end; - - DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "seq=%u ack=%u sack =%u win=%u end=%u\n", - NIPQUAD(iph->saddr), ntohs(tcph->source), - NIPQUAD(iph->daddr), ntohs(tcph->dest), - seq, ack, sack, win, end); - DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - - DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", - before(seq, sender->td_maxend + 1), - after(end, sender->td_end - receiver->td_maxwin - 1), - before(sack, receiver->td_end + 1), - after(ack, receiver->td_end - MAXACKWINDOW(sender))); - - if (before(seq, sender->td_maxend + 1) && - after(end, sender->td_end - receiver->td_maxwin - 1) && - before(sack, receiver->td_end + 1) && - after(ack, receiver->td_end - MAXACKWINDOW(sender))) { - /* - * Take into account window scaling (RFC 1323). - */ - if (!tcph->syn) - win <<= sender->td_scale; - - /* - * Update sender data. - */ - swin = win + (sack - ack); - if (sender->td_maxwin < swin) - sender->td_maxwin = swin; - if (after(end, sender->td_end)) - sender->td_end = end; - /* - * Update receiver data. - */ - if (after(end, sender->td_maxend)) - receiver->td_maxwin += end - sender->td_maxend; - if (after(sack + win, receiver->td_maxend - 1)) { - receiver->td_maxend = sack + win; - if (win == 0) - receiver->td_maxend++; - } - - /* - * Check retransmissions. - */ - if (index == TCP_ACK_SET) { - if (state->last_dir == dir - && state->last_seq == seq - && state->last_ack == ack - && state->last_end == end - && state->last_win == win) - state->retrans++; - else { - state->last_dir = dir; - state->last_seq = seq; - state->last_ack = ack; - state->last_end = end; - state->last_win = win; - state->retrans = 0; - } - } - res = 1; - } else { - res = 0; - if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - ip_ct_tcp_be_liberal) - res = 1; - if (!res && LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: %s ", - before(seq, sender->td_maxend + 1) ? - after(end, sender->td_end - receiver->td_maxwin - 1) ? - before(sack, receiver->td_end + 1) ? - after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" - : "ACK is under the lower bound (possible overly delayed ACK)" - : "ACK is over the upper bound (ACKed data not seen yet)" - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); - } - - DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " - "receiver end=%u maxend=%u maxwin=%u\n", - res, sender->td_end, sender->td_maxend, sender->td_maxwin, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin); - - return res; -} - -#ifdef CONFIG_IP_NF_NAT_NEEDED -/* Update sender->td_end after NAT successfully mangled the packet */ -void ip_conntrack_tcp_update(struct sk_buff *skb, - struct ip_conntrack *conntrack, - enum ip_conntrack_dir dir) -{ - struct iphdr *iph = skb->nh.iph; - struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; - __u32 end; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; -#endif - - end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); - - write_lock_bh(&tcp_lock); - /* - * We have to worry for the ack in the reply packet only... 
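The `before()`/`after()` chain above is exactly invariants I–IV from the header comment. A standalone restatement, with the wrap-safe sequence-number comparators written out the way include/net/tcp.h defines them (reduced state struct, sample numbers only):

```c
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe seq-number compares, as in include/net/tcp.h */
static int before(uint32_t s1, uint32_t s2) { return (int32_t)(s1 - s2) < 0; }
static int after(uint32_t s1, uint32_t s2)  { return (int32_t)(s2 - s1) < 0; }

struct td_state {			/* reduced ip_ct_tcp_state */
	uint32_t td_end, td_maxend, td_maxwin;
};

#define MAXACKWINDOW(s) ((s)->td_maxwin > 66000 ? (s)->td_maxwin : 66000)

/* I..IV from the comment: data must intersect the window, ack must be sane */
static int in_window(const struct td_state *snd, const struct td_state *rcv,
		     uint32_t seq, uint32_t end, uint32_t ack, uint32_t sack)
{
	return before(seq, snd->td_maxend + 1) &&		/* I   */
	       after(end, snd->td_end - rcv->td_maxwin - 1) &&	/* II  */
	       before(sack, rcv->td_end + 1) &&			/* III */
	       after(ack, rcv->td_end - MAXACKWINDOW(snd));	/* IV  */
}

int main(void)
{
	struct td_state snd = { .td_end = 1000, .td_maxend = 2000, .td_maxwin = 500 };
	struct td_state rcv = { .td_end = 5000, .td_maxend = 6000, .td_maxwin = 500 };

	printf("%d\n", in_window(&snd, &rcv, 900, 1100, 4900, 4900));	/* 1 */
	printf("%d\n", in_window(&snd, &rcv, 3000, 3100, 4900, 4900));	/* 0: I fails */
	return 0;
}
```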
- */ - if (after(end, conntrack->proto.tcp.seen[dir].td_end)) - conntrack->proto.tcp.seen[dir].td_end = end; - conntrack->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); - DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); -} - -#endif - -#define TH_FIN 0x01 -#define TH_SYN 0x02 -#define TH_RST 0x04 -#define TH_PUSH 0x08 -#define TH_ACK 0x10 -#define TH_URG 0x20 -#define TH_ECE 0x40 -#define TH_CWR 0x80 - -/* table of valid flag combinations - ECE and CWR are always valid */ -static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = -{ - [TH_SYN] = 1, - [TH_SYN|TH_PUSH] = 1, - [TH_SYN|TH_URG] = 1, - [TH_SYN|TH_PUSH|TH_URG] = 1, - [TH_SYN|TH_ACK] = 1, - [TH_SYN|TH_ACK|TH_PUSH] = 1, - [TH_RST] = 1, - [TH_RST|TH_ACK] = 1, - [TH_RST|TH_ACK|TH_PUSH] = 1, - [TH_FIN|TH_ACK] = 1, - [TH_ACK] = 1, - [TH_ACK|TH_PUSH] = 1, - [TH_ACK|TH_URG] = 1, - [TH_ACK|TH_URG|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_URG] = 1, - [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, -}; - -/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct iphdr *iph = skb->nh.iph; - struct tcphdr _tcph, *th; - unsigned int tcplen = skb->len - iph->ihl * 4; - u_int8_t tcpflags; - - /* Smaller than minimal TCP header? */ - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: short packet "); - return -NF_ACCEPT; - } - - /* Not whole TCP header or malformed packet */ - if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: truncated/malformed packet "); - return -NF_ACCEPT; - } - - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because it is assumed to be correct. - */ - /* FIXME: Source route IP option packets --RR */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: bad TCP checksum "); - return -NF_ACCEPT; - } - - /* Check TCP flags. */ - tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); - if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid TCP flag combination "); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -/* Returns verdict for packet, or -1 for invalid. 
*/ -static int tcp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - enum tcp_conntrack new_state, old_state; - enum ip_conntrack_dir dir; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th, _tcph; - unsigned long timeout; - unsigned int index; - - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - write_lock_bh(&tcp_lock); - old_state = conntrack->proto.tcp.state; - dir = CTINFO2DIR(ctinfo); - index = get_conntrack_index(th); - new_state = tcp_conntracks[dir][index][old_state]; - - switch (new_state) { - case TCP_CONNTRACK_IGNORE: - /* Ignored packets: - * - * a) SYN in ORIGINAL - * b) SYN/ACK in REPLY - * c) ACK in reply direction after initial SYN in original. - */ - if (index == TCP_SYNACK_SET - && conntrack->proto.tcp.last_index == TCP_SYN_SET - && conntrack->proto.tcp.last_dir != dir - && ntohl(th->ack_seq) == - conntrack->proto.tcp.last_end) { - /* This SYN/ACK acknowledges a SYN that we earlier - * ignored as invalid. This means that the client and - * the server are both in sync, while the firewall is - * not. We kill this session and block the SYN/ACK so - * that the client cannot but retransmit its SYN and - * thus initiate a clean new session. - */ - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - NULL, "ip_ct_tcp: " - "killing out of sync session "); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_DROP; - } - conntrack->proto.tcp.last_index = index; - conntrack->proto.tcp.last_dir = dir; - conntrack->proto.tcp.last_seq = ntohl(th->seq); - conntrack->proto.tcp.last_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); - - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid packet ignored "); - return NF_ACCEPT; - case TCP_CONNTRACK_MAX: - /* Invalid packet */ - DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", - dir, get_conntrack_index(th), - old_state); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_tcp: invalid state "); - return -NF_ACCEPT; - case TCP_CONNTRACK_SYN_SENT: - if (old_state < TCP_CONNTRACK_TIME_WAIT) - break; - if ((conntrack->proto.tcp.seen[dir].flags & - IP_CT_TCP_FLAG_CLOSE_INIT) - || after(ntohl(th->seq), - conntrack->proto.tcp.seen[dir].td_end)) { - /* Attempt to reopen a closed connection. - * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_REPEAT; - } else { - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - NULL, "ip_ct_tcp: invalid SYN"); - return -NF_ACCEPT; - } - case TCP_CONNTRACK_CLOSE: - if (index == TCP_RST_SET - && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET) - || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_ACK_SET)) - && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN or ACK we had let through - * at a) and c) above: - * - * a) SYN was in window then - * c) we hold a half-open connection. - * - * Delete our connection entry. - * We skip window checking, because packet might ACK - * segments we ignored. 
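The `del_timer()` plus `timeout.function()` pair used on the kill paths above is the 2.6-era idiom for tearing a conntrack down ahead of schedule: whoever wins the race to deactivate the pending timer runs the expiry handler by hand, so destruction happens exactly once. A userspace caricature of the pattern (simplified timer struct, hypothetical names):

```c
#include <stdio.h>

/* Caricature of the 2.6-era struct timer_list interface. */
struct timer {
	int pending;
	void (*function)(unsigned long data);
	unsigned long data;
};

/* Returns 1 if the timer was still pending and we deactivated it. */
static int del_timer(struct timer *t)
{
	int was_pending = t->pending;
	t->pending = 0;
	return was_pending;
}

static void conntrack_expired(unsigned long data)
{
	printf("conntrack %lu destroyed\n", data);
}

int main(void)
{
	struct timer timeout = { 1, conntrack_expired, 42 };

	/* Kill the entry now: only the caller that stops the timer
	 * invokes the expiry handler, so there is no double free. */
	if (del_timer(&timeout))
		timeout.function(timeout.data);
	return 0;
}
```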
*/ - goto in_window; - } - /* Just fall through */ - default: - /* Keep compilers happy. */ - break; - } - - if (!tcp_in_window(&conntrack->proto.tcp, dir, index, - skb, iph, th)) { - write_unlock_bh(&tcp_lock); - return -NF_ACCEPT; - } - in_window: - /* From now on we have got in-window packets */ - conntrack->proto.tcp.last_index = index; - - DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " - "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest), - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - - conntrack->proto.tcp.state = new_state; - if (old_state != new_state - && (new_state == TCP_CONNTRACK_FIN_WAIT - || new_state == TCP_CONNTRACK_CLOSE)) - conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans - ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - - ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - if (new_state != old_state) - ip_conntrack_event_cache(IPCT_PROTOINFO, skb); - - if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - /* If the only reply is a RST, we can consider ourselves not to - have an established connection: this is a fairly common - problem case, so we can delete the conntrack - immediately. --RR */ - if (th->rst) { - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return NF_ACCEPT; - } - } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && (old_state == TCP_CONNTRACK_SYN_RECV - || old_state == TCP_CONNTRACK_ESTABLISHED) - && new_state == TCP_CONNTRACK_ESTABLISHED) { - /* Set ASSURED if we see valid ack in ESTABLISHED - after SYN_RECV or a valid answer for a picked up - connection. */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); - ip_conntrack_event_cache(IPCT_STATUS, skb); - } - ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol is found. */ -static int tcp_new(struct ip_conntrack *conntrack, - const struct sk_buff *skb) -{ - enum tcp_conntrack new_state; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th, _tcph; -#ifdef DEBUGP_VARS - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; -#endif - - th = skb_header_pointer(skb, iph->ihl * 4, - sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - /* Don't need lock here: this conntrack not in circulation yet */ - new_state - = tcp_conntracks[0][get_conntrack_index(th)] - [TCP_CONNTRACK_NONE]; - - /* Invalid: delete conntrack */ - if (new_state >= TCP_CONNTRACK_MAX) { - DEBUGP("ip_ct_tcp: invalid new deleting.\n"); - return 0; - } - - if (new_state == TCP_CONNTRACK_SYN_SENT) { - /* SYN packet */ - conntrack->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - iph, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end; - - tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); - conntrack->proto.tcp.seen[1].flags = 0; - } else if (ip_ct_tcp_loose == 0) { - /* Don't try to pick up connections. 
*/ - return 0; - } else { - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - conntrack->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - iph, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end + - conntrack->proto.tcp.seen[0].td_maxwin; - conntrack->proto.tcp.seen[0].td_scale = 0; - - /* We assume SACK and liberal window checking to handle - * window scaling */ - conntrack->proto.tcp.seen[0].flags = - conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; - } - - conntrack->proto.tcp.seen[1].td_end = 0; - conntrack->proto.tcp.seen[1].td_maxend = 0; - conntrack->proto.tcp.seen[1].td_maxwin = 1; - conntrack->proto.tcp.seen[1].td_scale = 0; - - /* tcp_packet will set them */ - conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; - conntrack->proto.tcp.last_index = TCP_NONE_SET; - - DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - return 1; -} - -struct ip_conntrack_protocol ip_conntrack_protocol_tcp = -{ - .proto = IPPROTO_TCP, - .name = "tcp", - .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, - .print_conntrack = tcp_print_conntrack, - .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .to_nfattr = tcp_to_nfattr, - .from_nfattr = nfattr_to_tcp, - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c deleted file mode 100644 index 14c30c646c7..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ /dev/null @@ -1,148 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/seq_file.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> - -unsigned int ip_ct_udp_timeout __read_mostly = 30*HZ; -unsigned int ip_ct_udp_timeout_stream __read_mostly = 180*HZ; - -static int udp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct udphdr _hdr, *hp; - - /* Actually only need first 8 bytes. 
*/ - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return 0; - - tuple->src.u.udp.port = hp->source; - tuple->dst.u.udp.port = hp->dest; - - return 1; -} - -static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return 1; -} - -/* Print out the per-protocol part of the tuple. */ -static int udp_print_tuple(struct seq_file *s, - const struct ip_conntrack_tuple *tuple) -{ - return seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); -} - -/* Print out the private part of the conntrack. */ -static int udp_print_conntrack(struct seq_file *s, - const struct ip_conntrack *conntrack) -{ - return 0; -} - -/* Returns verdict for packet, and may modify conntrack type */ -static int udp_packet(struct ip_conntrack *conntrack, - const struct sk_buff *skb, - enum ip_conntrack_info ctinfo) -{ - /* If we've seen traffic both ways, this is some kind of UDP - stream. Extend timeout. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh_acct(conntrack, ctinfo, skb, - ip_ct_udp_timeout_stream); - /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) - ip_conntrack_event_cache(IPCT_STATUS, skb); - } else - ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol is found. */ -static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) -{ - return 1; -} - -static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct iphdr *iph = skb->nh.iph; - unsigned int udplen = skb->len - iph->ihl * 4; - struct udphdr _hdr, *hdr; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); - if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: short packet "); - return -NF_ACCEPT; - } - - /* Truncated/malformed packets */ - if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: truncated/malformed packet "); - return -NF_ACCEPT; - } - - /* Packet with no checksum */ - if (!hdr->check) - return NF_ACCEPT; - - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because the checksum is assumed to be correct. 
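udp_packet() above implements a two-tier timeout: one-way traffic keeps the short timer, while a reply in the opposite direction upgrades the entry to the stream timeout and sets ASSURED exactly once. The decision in miniature (constants and flags stubbed):

```c
#include <stdio.h>

#define HZ 100
#define UDP_TIMEOUT        (30 * HZ)	/* probe / one-way traffic */
#define UDP_TIMEOUT_STREAM (180 * HZ)	/* reply seen: a real stream */

struct ct { int seen_reply; int assured; };

/* Pick the refresh interval the way udp_packet() does. */
static unsigned int udp_refresh(struct ct *ct)
{
	if (ct->seen_reply) {
		if (!ct->assured) {
			ct->assured = 1;	/* and fire an IPCT_STATUS event */
			printf("connection assured\n");
		}
		return UDP_TIMEOUT_STREAM;
	}
	return UDP_TIMEOUT;
}

int main(void)
{
	struct ct ct = { 0, 0 };
	printf("%u\n", udp_refresh(&ct));	/* 3000: no reply yet */
	ct.seen_reply = 1;
	printf("%u\n", udp_refresh(&ct));	/* 18000: stream timeout */
	return 0;
}
```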
- * FIXME: Source route IP option packets --RR */ - if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { - if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_udp: bad UDP checksum "); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -struct ip_conntrack_protocol ip_conntrack_protocol_udp = -{ - .proto = IPPROTO_UDP, - .name = "udp", - .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, - .packet = udp_packet, - .new = udp_new, - .error = udp_error, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, - .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, -#endif -}; diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c deleted file mode 100644 index c59a962c1f6..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_sip.c +++ /dev/null @@ -1,520 +0,0 @@ -/* SIP extension for IP connection tracking. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_conntrack_ftp.c and other modules. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/ctype.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_sip.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP connection tracking helper"); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of sip servers"); - -static unsigned int sip_timeout = SIP_TIMEOUT; -module_param(sip_timeout, uint, 0600); -MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); - -unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr); -EXPORT_SYMBOL_GPL(ip_nat_sip_hook); - -unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp, - const char *dptr); -EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); - -static int digits_len(const char *dptr, const char *limit, int *shift); -static int epaddr_len(const char *dptr, const char *limit, int *shift); -static int skp_digits_len(const char *dptr, const char *limit, int *shift); -static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); - -struct sip_header_nfo { - const char *lname; - const char *sname; - const char *ln_str; - size_t lnlen; - size_t snlen; - size_t ln_strlen; - int case_sensitive; - int (*match_len)(const char *, const char *, int *); -}; - -static struct sip_header_nfo ct_sip_hdrs[] = { - [POS_REG_REQ_URI] = { /* SIP REGISTER request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = epaddr_len - }, - [POS_REQ_URI] = { /* SIP request URI */ - .lname = "sip:", - .lnlen = sizeof("sip:") - 1, - .ln_str = "@", - .ln_strlen = sizeof("@") - 1, - .match_len = epaddr_len - }, - [POS_FROM] = { /* SIP From header */ - .lname = "From:", - .lnlen = sizeof("From:") - 1, - .sname = "\r\nf:", - .snlen = sizeof("\r\nf:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len, - }, - [POS_TO] = { /* SIP To header */ - .lname = "To:", - .lnlen = sizeof("To:") - 1, - .sname = "\r\nt:", - .snlen = sizeof("\r\nt:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len, - }, - [POS_VIA] = { /* SIP Via header */ - .lname = "Via:", - .lnlen = sizeof("Via:") - 1, - .sname = "\r\nv:", - .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ - .ln_str = "UDP ", - .ln_strlen = sizeof("UDP ") - 1, - .match_len = epaddr_len, - }, - [POS_CONTACT] = { /* SIP Contact header */ - .lname = "Contact:", - .lnlen = sizeof("Contact:") - 1, - .sname = "\r\nm:", - .snlen = sizeof("\r\nm:") - 1, - .ln_str = "sip:", - .ln_strlen = sizeof("sip:") - 1, - .match_len = skp_epaddr_len - }, - [POS_CONTENT] = { /* SIP Content length header */ - .lname = "Content-Length:", - .lnlen = sizeof("Content-Length:") - 1, - .sname = "\r\nl:", - .snlen = sizeof("\r\nl:") - 1, - .ln_str = ":", - .ln_strlen = sizeof(":") - 1, - .match_len = skp_digits_len - }, - [POS_MEDIA] = { /* SDP media info */ - .case_sensitive = 1, - .lname = "\nm=", - .lnlen = sizeof("\nm=") - 1, - .sname = "\rm=", - .snlen = sizeof("\rm=") - 1, - .ln_str = "audio ", - .ln_strlen = sizeof("audio ") - 1, - .match_len = digits_len - }, - [POS_OWNER] = { /* SDP owner address*/ - .case_sensitive = 1, - .lname = "\no=", - .lnlen = sizeof("\no=") - 1, - .sname = "\ro=", - .snlen = sizeof("\ro=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_CONNECTION] = { 
/* SDP connection info */ - .case_sensitive = 1, - .lname = "\nc=", - .lnlen = sizeof("\nc=") - 1, - .sname = "\rc=", - .snlen = sizeof("\rc=") - 1, - .ln_str = "IN IP4 ", - .ln_strlen = sizeof("IN IP4 ") - 1, - .match_len = epaddr_len - }, - [POS_SDP_HEADER] = { /* SDP version header */ - .case_sensitive = 1, - .lname = "\nv=", - .lnlen = sizeof("\nv=") - 1, - .sname = "\rv=", - .snlen = sizeof("\rv=") - 1, - .ln_str = "=", - .ln_strlen = sizeof("=") - 1, - .match_len = digits_len - } -}; - -/* get line length until first CR or LF seen. */ -int ct_sip_lnlen(const char *line, const char *limit) -{ - const char *k = line; - - while ((line <= limit) && (*line == '\r' || *line == '\n')) - line++; - - while (line <= limit) { - if (*line == '\r' || *line == '\n') - break; - line++; - } - return line - k; -} -EXPORT_SYMBOL_GPL(ct_sip_lnlen); - -/* Linear string search, case sensitive. */ -const char *ct_sip_search(const char *needle, const char *haystack, - size_t needle_len, size_t haystack_len, - int case_sensitive) -{ - const char *limit = haystack + (haystack_len - needle_len); - - while (haystack <= limit) { - if (case_sensitive) { - if (strncmp(haystack, needle, needle_len) == 0) - return haystack; - } else { - if (strnicmp(haystack, needle, needle_len) == 0) - return haystack; - } - haystack++; - } - return NULL; -} -EXPORT_SYMBOL_GPL(ct_sip_search); - -static int digits_len(const char *dptr, const char *limit, int *shift) -{ - int len = 0; - while (dptr <= limit && isdigit(*dptr)) { - dptr++; - len++; - } - return len; -} - -/* get digits length, skipping blank spaces. */ -static int skp_digits_len(const char *dptr, const char *limit, int *shift) -{ - for (; dptr <= limit && *dptr == ' '; dptr++) - (*shift)++; - - return digits_len(dptr, limit, shift); -} - -/* Simple ipaddr parser. */ -static int parse_ipaddr(const char *cp, const char **endp, - __be32 *ipaddr, const char *limit) -{ - unsigned long int val; - int i, digit = 0; - - for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { - digit = 0; - if (!isdigit(*cp)) - break; - - val = simple_strtoul(cp, (char **)&cp, 10); - if (val > 0xFF) - return -1; - - ((u_int8_t *)ipaddr)[i] = val; - digit = 1; - - if (*cp != '.') - break; - cp++; - } - if (!digit) - return -1; - - if (endp) - *endp = cp; - - return 0; -} - -/* skip ip address. returns its length. */ -static int epaddr_len(const char *dptr, const char *limit, int *shift) -{ - const char *aux = dptr; - __be32 ip; - - if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { - DEBUGP("ip: %s parse failed.!\n", dptr); - return 0; - } - - /* Port number */ - if (*dptr == ':') { - dptr++; - dptr += digits_len(dptr, limit, shift); - } - return dptr - aux; -} - -/* get address length, skipping user info. */ -static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) -{ - int s = *shift; - - /* Search for @, but stop at the end of the line. - * We are inside a sip: URI, so we don't need to worry about - * continuation lines. */ - while (dptr <= limit && - *dptr != '@' && *dptr != '\r' && *dptr != '\n') { - (*shift)++; - dptr++; - } - - if (dptr <= limit && *dptr == '@') { - dptr++; - (*shift)++; - } else - *shift = s; - - return epaddr_len(dptr, limit, shift); -} - -/* Returns 0 if not found, -1 on parse error. 
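parse_ipaddr() above is a permissive dotted-quad scanner: up to four decimal octets, values over 255 rejected, and the caller is told where parsing stopped so the port can be read next. A userspace equivalent for illustration (the limit check is dropped for brevity):

```c
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdint.h>

/* Userspace analogue of parse_ipaddr(): fills ip[] byte by byte,
 * returns 0 on success and sets *endp to the first unparsed char. */
static int parse_ipaddr(const char *cp, const char **endp, uint8_t ip[4])
{
	int i, digit = 0;

	for (i = 0; i < 4; i++) {
		char *next;
		unsigned long val;

		digit = 0;
		if (!isdigit((unsigned char)*cp))
			break;
		val = strtoul(cp, &next, 10);
		if (val > 0xFF)
			return -1;
		ip[i] = (uint8_t)val;
		digit = 1;
		cp = next;
		if (*cp != '.')
			break;
		cp++;
	}
	if (!digit)
		return -1;
	if (endp)
		*endp = cp;
	return 0;
}

int main(void)
{
	uint8_t ip[4] = { 0 };
	const char *end;

	if (parse_ipaddr("192.168.0.12:5060 SIP/2.0", &end, ip) == 0)
		printf("%u.%u.%u.%u rest=\"%s\"\n",
		       ip[0], ip[1], ip[2], ip[3], end);
	return 0;
}
```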
*/ -int ct_sip_get_info(const char *dptr, size_t dlen, - unsigned int *matchoff, - unsigned int *matchlen, - enum sip_header_pos pos) -{ - struct sip_header_nfo *hnfo = &ct_sip_hdrs[pos]; - const char *limit, *aux, *k = dptr; - int shift = 0; - - limit = dptr + (dlen - hnfo->lnlen); - - while (dptr <= limit) { - if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && - (hnfo->sname == NULL || - strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { - dptr++; - continue; - } - aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, - ct_sip_lnlen(dptr, limit), - hnfo->case_sensitive); - if (!aux) { - DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, - hnfo->lname); - return -1; - } - aux += hnfo->ln_strlen; - - *matchlen = hnfo->match_len(aux, limit, &shift); - if (!*matchlen) - return -1; - - *matchoff = (aux - k) + shift; - - DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, - *matchlen); - return 1; - } - DEBUGP("%s header not found.\n", hnfo->lname); - return 0; -} -EXPORT_SYMBOL_GPL(ct_sip_get_info); - -static int set_expected_rtp(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - __be32 ipaddr, u_int16_t port, - const char *dptr) -{ - struct ip_conntrack_expect *exp; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - int ret; - typeof(ip_nat_sdp_hook) ip_nat_sdp; - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - return NF_DROP; - - exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; - exp->tuple.src.u.udp.port = 0; - exp->tuple.dst.ip = ipaddr; - exp->tuple.dst.u.udp.port = htons(port); - exp->tuple.dst.protonum = IPPROTO_UDP; - - exp->mask.src.ip = htonl(0xFFFFFFFF); - exp->mask.src.u.udp.port = 0; - exp->mask.dst.ip = htonl(0xFFFFFFFF); - exp->mask.dst.u.udp.port = htons(0xFFFF); - exp->mask.dst.protonum = 0xFF; - - exp->expectfn = NULL; - exp->flags = 0; - - ip_nat_sdp = rcu_dereference(ip_nat_sdp_hook); - if (ip_nat_sdp) - ret = ip_nat_sdp(pskb, ctinfo, exp, dptr); - else { - if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - else - ret = NF_ACCEPT; - } - ip_conntrack_expect_put(exp); - - return ret; -} - -static int sip_help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dataoff, datalen; - const char *dptr; - int ret = NF_ACCEPT; - int matchoff, matchlen; - __be32 ipaddr; - u_int16_t port; - typeof(ip_nat_sip_hook) ip_nat_sip; - - /* No Data ? */ - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - if (dataoff >= (*pskb)->len) { - DEBUGP("skb->len = %u\n", (*pskb)->len); - return NF_ACCEPT; - } - - ip_ct_refresh(ct, *pskb, sip_timeout * HZ); - - if (!skb_is_nonlinear(*pskb)) - dptr = (*pskb)->data + dataoff; - else { - DEBUGP("Copy of skbuff not supported yet.\n"); - goto out; - } - - ip_nat_sip = rcu_dereference(ip_nat_sip_hook); - if (ip_nat_sip) { - if (!ip_nat_sip(pskb, ctinfo, ct, &dptr)) { - ret = NF_DROP; - goto out; - } - } - - /* After this point NAT could have mangled the skb, so - we need to recalculate the payload length. */ - datalen = (*pskb)->len - dataoff; - - if (datalen < (sizeof("SIP/2.0 200") - 1)) - goto out; - - /* RTP info only in some SDP pkts */ - if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && - memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { - goto out; - } - /* Get ip and port address from SDP packet. */ - if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, - POS_CONNECTION) > 0) { - - /* We'll drop only if there are parse problems. 
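set_expected_rtp() above is the classic expectation idiom: the tuple records what the SDP payload promised, while the mask wildcards the field that cannot be known yet (the RTP source port). A reduced model of how a masked tuple matches a packet (example addresses, simplified struct):

```c
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Reduced tuple: just the fields the SIP helper fills in. */
struct tuple { uint32_t src_ip, dst_ip; uint16_t sport, dport; };

/* An expectation matches a packet field-wise under the mask. */
static int tuple_match(const struct tuple *pkt, const struct tuple *t,
		       const struct tuple *mask)
{
	return (pkt->src_ip & mask->src_ip) == (t->src_ip & mask->src_ip) &&
	       (pkt->dst_ip & mask->dst_ip) == (t->dst_ip & mask->dst_ip) &&
	       (pkt->sport  & mask->sport)  == (t->sport  & mask->sport) &&
	       (pkt->dport  & mask->dport)  == (t->dport  & mask->dport);
}

int main(void)
{
	/* Tuple learned from SDP (example): media to 10.0.0.9:8000. */
	struct tuple t    = { inet_addr("10.0.0.2"), inet_addr("10.0.0.9"),
			      0, htons(8000) };
	/* Mask: pin both IPs and the destination port, wildcard sport. */
	struct tuple mask = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFF };
	/* RTP packet arriving later, from a source port we never saw. */
	struct tuple rtp  = { inet_addr("10.0.0.2"), inet_addr("10.0.0.9"),
			      htons(31337), htons(8000) };

	printf("match=%d\n", tuple_match(&rtp, &t, &mask));	/* 1 */
	return 0;
}
```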
*/ - if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, - dptr + datalen) < 0) { - ret = NF_DROP; - goto out; - } - if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, - POS_MEDIA) > 0) { - - port = simple_strtoul(dptr + matchoff, NULL, 10); - if (port < 1024) { - ret = NF_DROP; - goto out; - } - ret = set_expected_rtp(pskb, ct, ctinfo, - ipaddr, port, dptr); - } - } -out: - return ret; -} - -static struct ip_conntrack_helper sip[MAX_PORTS]; -static char sip_names[MAX_PORTS][10]; - -static void fini(void) -{ - int i; - for (i = 0; i < ports_c; i++) { - DEBUGP("unregistering helper for port %d\n", ports[i]); - ip_conntrack_helper_unregister(&sip[i]); - } -} - -static int __init init(void) -{ - int i, ret; - char *tmpname; - - if (ports_c == 0) - ports[ports_c++] = SIP_PORT; - - for (i = 0; i < ports_c; i++) { - /* Create helper structure */ - memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); - - sip[i].tuple.dst.protonum = IPPROTO_UDP; - sip[i].tuple.src.u.udp.port = htons(ports[i]); - sip[i].mask.src.u.udp.port = htons(0xFFFF); - sip[i].mask.dst.protonum = 0xFF; - sip[i].max_expected = 2; - sip[i].timeout = 3 * 60; /* 3 minutes */ - sip[i].me = THIS_MODULE; - sip[i].help = sip_help; - - tmpname = &sip_names[i][0]; - if (ports[i] == SIP_PORT) - sprintf(tmpname, "sip"); - else - sprintf(tmpname, "sip-%d", i); - sip[i].name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret = ip_conntrack_helper_register(&sip[i]); - if (ret) { - printk("ERROR registering helper for port %d\n", - ports[i]); - fini(); - return ret; - } - } - return 0; -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c deleted file mode 100644 index 56b2f7546d1..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ /dev/null @@ -1,962 +0,0 @@ -/* This file contains all the functions required for the standalone - ip_conntrack module. - - These are not required by the compatibility layer. -*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/percpu.h> -#ifdef CONFIG_SYSCTL -#include <linux/sysctl.h> -#endif -#include <net/checksum.h> -#include <net/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - -MODULE_LICENSE("GPL"); - -extern atomic_t ip_conntrack_count; -DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); - -static int kill_proto(struct ip_conntrack *i, void *data) -{ - return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == - *((u_int8_t *) data)); -} - -#ifdef CONFIG_PROC_FS -static int -print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *proto) -{ - seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", - NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); - return proto->print_tuple(s, tuple); -} - -#ifdef CONFIG_IP_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif - -struct ct_iter_state { - unsigned int bucket; -}; - -static struct list_head *ct_get_first(struct seq_file *seq) -{ - struct ct_iter_state *st = seq->private; - - for (st->bucket = 0; - st->bucket < ip_conntrack_htable_size; - st->bucket++) { - if (!list_empty(&ip_conntrack_hash[st->bucket])) - return ip_conntrack_hash[st->bucket].next; - } - return NULL; -} - -static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) -{ - struct ct_iter_state *st = seq->private; - - head = head->next; - while (head == &ip_conntrack_hash[st->bucket]) { - if (++st->bucket >= ip_conntrack_htable_size) - return NULL; - head = ip_conntrack_hash[st->bucket].next; - } - return head; -} - -static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct list_head *head = ct_get_first(seq); - - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *ct_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock_bh(&ip_conntrack_lock); - return ct_get_idx(seq, *pos); -} - -static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return ct_get_next(s, v); -} - -static void ct_seq_stop(struct seq_file *s, void *v) -{ - read_unlock_bh(&ip_conntrack_lock); -} - -static int ct_seq_show(struct seq_file *s, void *v) -{ - const struct ip_conntrack_tuple_hash *hash = v; - const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); - struct ip_conntrack_protocol *proto; - - IP_NF_ASSERT(conntrack); - - /* we only want to print DIR_ORIGINAL */ - if (DIRECTION(hash)) - return 0; - - proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - IP_NF_ASSERT(proto); - - if (seq_printf(s, "%-8s %u %ld ", - proto->name, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, - timer_pending(&conntrack->timeout) - ? 
(long)(conntrack->timeout.expires - jiffies)/HZ - : 0) != 0) - return -ENOSPC; - - if (proto->print_conntrack(s, conntrack)) - return -ENOSPC; - - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - proto)) - return -ENOSPC; - - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) - return -ENOSPC; - - if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) - if (seq_printf(s, "[UNREPLIED] ")) - return -ENOSPC; - - if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, - proto)) - return -ENOSPC; - - if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) - return -ENOSPC; - - if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) - if (seq_printf(s, "[ASSURED] ")) - return -ENOSPC; - -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%u ", conntrack->mark)) - return -ENOSPC; -#endif - -#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", conntrack->secmark)) - return -ENOSPC; -#endif - - if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) - return -ENOSPC; - - return 0; -} - -static struct seq_operations ct_seq_ops = { - .start = ct_seq_start, - .next = ct_seq_next, - .stop = ct_seq_stop, - .show = ct_seq_show -}; - -static int ct_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - struct ct_iter_state *st; - int ret; - - st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &ct_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - memset(st, 0, sizeof(struct ct_iter_state)); - return ret; -out_free: - kfree(st); - return ret; -} - -static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, - .open = ct_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -/* expects */ -static void *exp_seq_start(struct seq_file *s, loff_t *pos) -{ - struct list_head *e = &ip_conntrack_expect_list; - loff_t i; - - /* strange seq_file api calls stop even if we fail, - * thus we need to grab lock since stop unlocks */ - read_lock_bh(&ip_conntrack_lock); - - if (list_empty(e)) - return NULL; - - for (i = 0; i <= *pos; i++) { - e = e->next; - if (e == &ip_conntrack_expect_list) - return NULL; - } - return e; -} - -static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct list_head *e = v; - - ++*pos; - e = e->next; - - if (e == &ip_conntrack_expect_list) - return NULL; - - return e; -} - -static void exp_seq_stop(struct seq_file *s, void *v) -{ - read_unlock_bh(&ip_conntrack_lock); -} - -static int exp_seq_show(struct seq_file *s, void *v) -{ - struct ip_conntrack_expect *expect = v; - - if (expect->timeout.function) - seq_printf(s, "%ld ", timer_pending(&expect->timeout) - ? 
(long)(expect->timeout.expires - jiffies)/HZ : 0); - else - seq_printf(s, "- "); - - seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); - - print_tuple(s, &expect->tuple, - __ip_conntrack_proto_find(expect->tuple.dst.protonum)); - return seq_putc(s, '\n'); -} - -static struct seq_operations exp_seq_ops = { - .start = exp_seq_start, - .next = exp_seq_next, - .stop = exp_seq_stop, - .show = exp_seq_show -}; - -static int exp_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &exp_seq_ops); -} - -static const struct file_operations exp_file_ops = { - .owner = THIS_MODULE, - .open = exp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; - -static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - int cpu; - - if (*pos == 0) - return SEQ_START_TOKEN; - - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return &per_cpu(ip_conntrack_stat, cpu); - } - - return NULL; -} - -static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - int cpu; - - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return &per_cpu(ip_conntrack_stat, cpu); - } - - return NULL; -} - -static void ct_cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int ct_cpu_seq_show(struct seq_file *seq, void *v) -{ - unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); - struct ip_conntrack_stat *st = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); - return 0; - } - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x \n", - nr_conntracks, - st->searched, - st->found, - st->new, - st->invalid, - st->ignore, - st->delete, - st->delete_list, - st->insert, - st->insert_failed, - st->drop, - st->early_drop, - st->error, - - st->expect_new, - st->expect_create, - st->expect_delete - ); - return 0; -} - -static struct seq_operations ct_cpu_seq_ops = { - .start = ct_cpu_seq_start, - .next = ct_cpu_seq_next, - .stop = ct_cpu_seq_stop, - .show = ct_cpu_seq_show, -}; - -static int ct_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ct_cpu_seq_ops); -} - -static const struct file_operations ct_cpu_seq_fops = { - .owner = THIS_MODULE, - .open = ct_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -#endif - -static unsigned int ip_confirm(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* We've seen it coming out the other side: confirm it */ - return ip_conntrack_confirm(pskb); -} - -static unsigned int ip_conntrack_help(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - /* This is where we call the helper: as the packet goes out. 
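   (A sketch of the callback shape, hypothetical helper, not from
   this file; the signature matches tftp_help() further down in this
   patch:

       static int noop_help(struct sk_buff **pskb,
                            struct ip_conntrack *ct,
                            enum ip_conntrack_info ctinfo)
       {
               // inspect *pskb here; queue up expectations with
               // ip_conntrack_expect_related() if the payload
               // announces a related flow
               return NF_ACCEPT;
       }

   Any verdict other than NF_ACCEPT from ->help() becomes the hook
   verdict, as the code below shows.)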
*/ - ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { - unsigned int ret; - ret = ct->helper->help(pskb, ct, ctinfo); - if (ret != NF_ACCEPT) - return ret; - } - return NF_ACCEPT; -} - -static unsigned int ip_conntrack_defrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) - /* Previously seen (loopback)? Ignore. Do this before - fragment check. */ - if ((*pskb)->nfct) - return NF_ACCEPT; -#endif - - /* Gather fragments. */ - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { - *pskb = ip_ct_gather_frags(*pskb, - hooknum == NF_IP_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT); - if (!*pskb) - return NF_STOLEN; - } - return NF_ACCEPT; -} - -static unsigned int ip_conntrack_local(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ipt_hook: happy cracking.\n"); - return NF_ACCEPT; - } - return ip_conntrack_in(hooknum, pskb, in, out, okfn); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static struct nf_hook_ops ip_conntrack_ops[] = { - { - .hook = ip_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { - .hook = ip_conntrack_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ip_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { - .hook = ip_conntrack_local, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ip_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ip_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ip_confirm, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ip_confirm, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Sysctl support */ - -int ip_conntrack_checksum __read_mostly = 1; - -#ifdef CONFIG_SYSCTL - -/* From ip_conntrack_core.c */ -extern int ip_conntrack_max; -extern unsigned int ip_conntrack_htable_size; - -/* From ip_conntrack_proto_tcp.c */ -extern unsigned int ip_ct_tcp_timeout_syn_sent; -extern unsigned int ip_ct_tcp_timeout_syn_recv; -extern unsigned int ip_ct_tcp_timeout_established; -extern unsigned int ip_ct_tcp_timeout_fin_wait; -extern unsigned int ip_ct_tcp_timeout_close_wait; -extern unsigned int ip_ct_tcp_timeout_last_ack; -extern unsigned int ip_ct_tcp_timeout_time_wait; -extern unsigned int ip_ct_tcp_timeout_close; -extern unsigned int ip_ct_tcp_timeout_max_retrans; -extern int ip_ct_tcp_loose; -extern int ip_ct_tcp_be_liberal; -extern int ip_ct_tcp_max_retrans; - -/* From ip_conntrack_proto_udp.c */ 
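/* A note on units (sketch, assuming the usual HZ-based jiffies
 * clock): the timeout knobs below are kept in jiffies but exposed in
 * seconds, because their sysctl entries use proc_dointvec_jiffies.
 * Writing "300" to
 * /proc/sys/net/ipv4/netfilter/ip_conntrack_udp_timeout_stream, for
 * instance, stores 300 * HZ in ip_ct_udp_timeout_stream.
 */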
-extern unsigned int ip_ct_udp_timeout; -extern unsigned int ip_ct_udp_timeout_stream; - -/* From ip_conntrack_proto_icmp.c */ -extern unsigned int ip_ct_icmp_timeout; - -/* From ip_conntrack_proto_generic.c */ -extern unsigned int ip_ct_generic_timeout; - -/* Log invalid packets of a given protocol */ -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; - -static struct ctl_table_header *ip_ct_sysctl_header; - -static ctl_table ip_ct_sysctl_table[] = { - { - .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, - .procname = "ip_conntrack_max", - .data = &ip_conntrack_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, - .procname = "ip_conntrack_count", - .data = &ip_conntrack_count, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, - .procname = "ip_conntrack_buckets", - .data = &ip_conntrack_htable_size, - .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, - .procname = "ip_conntrack_checksum", - .data = &ip_conntrack_checksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, - .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &ip_ct_tcp_timeout_syn_sent, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, - .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &ip_ct_tcp_timeout_syn_recv, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, - .procname = "ip_conntrack_tcp_timeout_established", - .data = &ip_ct_tcp_timeout_established, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, - .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &ip_ct_tcp_timeout_fin_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, - .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &ip_ct_tcp_timeout_close_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, - .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &ip_ct_tcp_timeout_last_ack, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, - .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &ip_ct_tcp_timeout_time_wait, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, - .procname = "ip_conntrack_tcp_timeout_close", - .data = &ip_ct_tcp_timeout_close, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, - .procname = "ip_conntrack_udp_timeout", - .data = &ip_ct_udp_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = 
NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, - .procname = "ip_conntrack_udp_timeout_stream", - .data = &ip_ct_udp_timeout_stream, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, - .procname = "ip_conntrack_icmp_timeout", - .data = &ip_ct_icmp_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, - .procname = "ip_conntrack_generic_timeout", - .data = &ip_ct_generic_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, - .procname = "ip_conntrack_log_invalid", - .data = &ip_ct_log_invalid, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, - .procname = "ip_conntrack_tcp_timeout_max_retrans", - .data = &ip_ct_tcp_timeout_max_retrans, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, - .procname = "ip_conntrack_tcp_loose", - .data = &ip_ct_tcp_loose, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, - .procname = "ip_conntrack_tcp_be_liberal", - .data = &ip_ct_tcp_be_liberal, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, - .procname = "ip_conntrack_tcp_max_retrans", - .data = &ip_ct_tcp_max_retrans, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -#define NET_IP_CONNTRACK_MAX 2089 - -static ctl_table ip_ct_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - .child = ip_ct_sysctl_table, - }, - { - .ctl_name = NET_IP_CONNTRACK_MAX, - .procname = "ip_conntrack_max", - .data = &ip_conntrack_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ip_ct_netfilter_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table ip_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ip_ct_ipv4_table, - }, - { .ctl_name = 0 } -}; - -EXPORT_SYMBOL(ip_ct_log_invalid); -#endif /* CONFIG_SYSCTL */ - -/* FIXME: Allow NULL functions and sub in pointers to generic for - them. 
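   (Registration sketch, hypothetical layer-4 tracker, not from this
   file; only .proto and .name are shown, the callback members are
   omitted:

       static struct ip_conntrack_protocol my_proto = {
               .proto = IPPROTO_SCTP,          // hypothetical
               .name  = "sctp",
       };

       err = ip_conntrack_protocol_register(&my_proto);  // -EBUSY if
                                                         // the slot
                                                         // is taken

   and ip_conntrack_protocol_unregister() below also flushes existing
   entries for that protocol via kill_proto().)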
--RR */ -int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) -{ - int ret = 0; - - write_lock_bh(&ip_conntrack_lock); - if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { - ret = -EBUSY; - goto out; - } - rcu_assign_pointer(ip_ct_protos[proto->proto], proto); - out: - write_unlock_bh(&ip_conntrack_lock); - return ret; -} - -void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) -{ - write_lock_bh(&ip_conntrack_lock); - rcu_assign_pointer(ip_ct_protos[proto->proto], - &ip_conntrack_generic_protocol); - write_unlock_bh(&ip_conntrack_lock); - synchronize_rcu(); - - /* Remove all contrack entries for this protocol */ - ip_ct_iterate_cleanup(kill_proto, &proto->proto); -} - -static int __init ip_conntrack_standalone_init(void) -{ -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc, *proc_exp, *proc_stat; -#endif - int ret = 0; - - ret = ip_conntrack_init(); - if (ret < 0) - return ret; - -#ifdef CONFIG_PROC_FS - ret = -ENOMEM; - proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); - if (!proc) goto cleanup_init; - - proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, - &exp_file_ops); - if (!proc_exp) goto cleanup_proc; - - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); - if (!proc_stat) - goto cleanup_proc_exp; - - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; -#endif - - ret = nf_register_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); - if (ret < 0) { - printk("ip_conntrack: can't register hooks.\n"); - goto cleanup_proc_stat; - } -#ifdef CONFIG_SYSCTL - ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table); - if (ip_ct_sysctl_header == NULL) { - printk("ip_conntrack: can't register to sysctl.\n"); - ret = -ENOMEM; - goto cleanup_hooks; - } -#endif - return ret; - -#ifdef CONFIG_SYSCTL - cleanup_hooks: - nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); -#endif - cleanup_proc_stat: -#ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); - cleanup_proc_exp: - proc_net_remove("ip_conntrack_expect"); - cleanup_proc: - proc_net_remove("ip_conntrack"); - cleanup_init: -#endif /* CONFIG_PROC_FS */ - ip_conntrack_cleanup(); - return ret; -} - -static void __exit ip_conntrack_standalone_fini(void) -{ - synchronize_net(); -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ip_ct_sysctl_header); -#endif - nf_unregister_hooks(ip_conntrack_ops, ARRAY_SIZE(ip_conntrack_ops)); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ip_conntrack", proc_net_stat); - proc_net_remove("ip_conntrack_expect"); - proc_net_remove("ip_conntrack"); -#endif /* CONFIG_PROC_FS */ - ip_conntrack_cleanup(); -} - -module_init(ip_conntrack_standalone_init); -module_exit(ip_conntrack_standalone_fini); - -/* Some modules need us, but don't depend directly on any symbol. - They should call this. 
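   E.g. a module that must keep conntrack loaded but touches none of
   its data symbols simply calls, from its own init path:

       need_conntrack();       // creates the module dependency and
                               // nothing else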
*/ -void need_conntrack(void) -{ -} - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -EXPORT_SYMBOL_GPL(ip_conntrack_chain); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); -EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); -EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); -EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); -EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); -#endif -EXPORT_SYMBOL(ip_conntrack_protocol_register); -EXPORT_SYMBOL(ip_conntrack_protocol_unregister); -EXPORT_SYMBOL(ip_ct_get_tuple); -EXPORT_SYMBOL(invert_tuplepr); -EXPORT_SYMBOL(ip_conntrack_alter_reply); -EXPORT_SYMBOL(ip_conntrack_destroyed); -EXPORT_SYMBOL(need_conntrack); -EXPORT_SYMBOL(ip_conntrack_helper_register); -EXPORT_SYMBOL(ip_conntrack_helper_unregister); -EXPORT_SYMBOL(ip_ct_iterate_cleanup); -EXPORT_SYMBOL(__ip_ct_refresh_acct); - -EXPORT_SYMBOL(ip_conntrack_expect_alloc); -EXPORT_SYMBOL(ip_conntrack_expect_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); -EXPORT_SYMBOL(ip_conntrack_expect_related); -EXPORT_SYMBOL(ip_conntrack_unexpect_related); -EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); -EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); - -EXPORT_SYMBOL(ip_conntrack_tuple_taken); -EXPORT_SYMBOL(ip_ct_gather_frags); -EXPORT_SYMBOL(ip_conntrack_htable_size); -EXPORT_SYMBOL(ip_conntrack_lock); -EXPORT_SYMBOL(ip_conntrack_hash); -EXPORT_SYMBOL(ip_conntrack_untracked); -EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -#ifdef CONFIG_IP_NF_NAT_NEEDED -EXPORT_SYMBOL(ip_conntrack_tcp_update); -#endif - -EXPORT_SYMBOL_GPL(ip_conntrack_flush); -EXPORT_SYMBOL_GPL(__ip_conntrack_find); - -EXPORT_SYMBOL_GPL(ip_conntrack_alloc); -EXPORT_SYMBOL_GPL(ip_conntrack_free); -EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); - -EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); - -EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); - -EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); -EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); -EXPORT_SYMBOL_GPL(ip_conntrack_checksum); -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); -EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); -#endif diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c deleted file mode 100644 index 76e175e7a97..00000000000 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ /dev/null @@ -1,161 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Version: 0.0.7 - * - * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> - * - port to newnat API - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> -#include <linux/moduleparam.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("tftp connection tracking helper"); -MODULE_LICENSE("GPL"); - -#define MAX_PORTS 8 -static unsigned short ports[MAX_PORTS]; -static int ports_c; -module_param_array(ports, ushort, &ports_c, 0400); -MODULE_PARM_DESC(ports, "port numbers of tftp servers"); - -#if 0 -#define DEBUGP(format, args...) printk("%s:%s:" format, \ - __FILE__, __FUNCTION__ , ## args) -#else -#define DEBUGP(format, args...) -#endif - -unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp); -EXPORT_SYMBOL_GPL(ip_nat_tftp_hook); - -static int tftp_help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct tftphdr _tftph, *tfh; - struct ip_conntrack_expect *exp; - unsigned int ret = NF_ACCEPT; - typeof(ip_nat_tftp_hook) ip_nat_tftp; - - tfh = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), - sizeof(_tftph), &_tftph); - if (tfh == NULL) - return NF_ACCEPT; - - switch (ntohs(tfh->opcode)) { - /* RRQ and WRQ works the same way */ - case TFTP_OPCODE_READ: - case TFTP_OPCODE_WRITE: - DEBUGP(""); - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - exp = ip_conntrack_expect_alloc(ct); - if (exp == NULL) - return NF_DROP; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->mask.src.ip = htonl(0xffffffff); - exp->mask.src.u.udp.port = 0; - exp->mask.dst.ip = htonl(0xffffffff); - exp->mask.dst.u.udp.port = htons(0xffff); - exp->mask.dst.protonum = 0xff; - exp->expectfn = NULL; - exp->flags = 0; - - DEBUGP("expect: "); - DUMP_TUPLE(&exp->tuple); - DUMP_TUPLE(&exp->mask); - ip_nat_tftp = rcu_dereference(ip_nat_tftp_hook); - if (ip_nat_tftp) - ret = ip_nat_tftp(pskb, ctinfo, exp); - else if (ip_conntrack_expect_related(exp) != 0) - ret = NF_DROP; - ip_conntrack_expect_put(exp); - break; - case TFTP_OPCODE_DATA: - case TFTP_OPCODE_ACK: - DEBUGP("Data/ACK opcode\n"); - break; - case TFTP_OPCODE_ERROR: - DEBUGP("Error opcode\n"); - break; - default: - DEBUGP("Unknown opcode\n"); - } - return NF_ACCEPT; -} - -static struct ip_conntrack_helper tftp[MAX_PORTS]; -static char tftp_names[MAX_PORTS][sizeof("tftp-65535")]; - -static void ip_conntrack_tftp_fini(void) -{ - int i; - - for (i = 0 ; i < ports_c; i++) { - DEBUGP("unregistering helper for port %d\n", - ports[i]); - ip_conntrack_helper_unregister(&tftp[i]); - } -} - -static int __init ip_conntrack_tftp_init(void) -{ - int i, ret; - char *tmpname; - - if (ports_c == 0) - ports[ports_c++] = TFTP_PORT; - - for (i = 0; i < ports_c; i++) { - /* Create helper structure */ - memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper)); - - tftp[i].tuple.dst.protonum = IPPROTO_UDP; - tftp[i].tuple.src.u.udp.port = htons(ports[i]); - tftp[i].mask.dst.protonum = 0xFF; - tftp[i].mask.src.u.udp.port = htons(0xFFFF); - tftp[i].max_expected = 1; - tftp[i].timeout = 5 * 60; /* 5 minutes */ - tftp[i].me = THIS_MODULE; - tftp[i].help = tftp_help; - - tmpname = &tftp_names[i][0]; - if (ports[i] == TFTP_PORT) - sprintf(tmpname, 
"tftp"); - else - sprintf(tmpname, "tftp-%d", i); - tftp[i].name = tmpname; - - DEBUGP("port #%d: %d\n", i, ports[i]); - - ret=ip_conntrack_helper_register(&tftp[i]); - if (ret) { - printk("ERROR registering helper for port %d\n", - ports[i]); - ip_conntrack_tftp_fini(); - return(ret); - } - } - return(0); -} - -module_init(ip_conntrack_tftp_init); -module_exit(ip_conntrack_tftp_fini); diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c deleted file mode 100644 index 85df1a9aed3..00000000000 --- a/net/ipv4/netfilter/ip_nat_amanda.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Amanda extension for TCP NAT alteration. - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on a copy of HW's ip_nat_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Module load syntax: - * insmod ip_nat_amanda.o - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/tcp.h> -#include <net/udp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> - - -MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp) -{ - char buffer[sizeof("65535")]; - u_int16_t port; - unsigned int ret; - - /* Connection comes from client. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_ORIGINAL; - - /* When you see the packet, we need to NAT it the same as the - * this one (ie. same IP: it will be TCP and master is UDP). */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - sprintf(buffer, "%u", port); - ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, - matchoff, matchlen, - buffer, strlen(buffer)); - if (ret != NF_ACCEPT) - ip_conntrack_unexpect_related(exp); - return ret; -} - -static void __exit ip_nat_amanda_fini(void) -{ - rcu_assign_pointer(ip_nat_amanda_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_amanda_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_amanda_hook)); - rcu_assign_pointer(ip_nat_amanda_hook, help); - return 0; -} - -module_init(ip_nat_amanda_init); -module_exit(ip_nat_amanda_fini); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c deleted file mode 100644 index 40737fdbe9a..00000000000 --- a/net/ipv4/netfilter/ip_nat_core.c +++ /dev/null @@ -1,634 +0,0 @@ -/* NAT for netfilter; shared with compatibility layer. 
*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv4.h> -#include <linux/vmalloc.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> /* For tcp_prot in getorigdst */ -#include <linux/icmp.h> -#include <linux/udp.h> -#include <linux/jhash.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -DEFINE_RWLOCK(ip_nat_lock); - -/* Calculated at init based on memory size */ -static unsigned int ip_nat_htable_size; - -static struct list_head *bysource; - -#define MAX_IP_NAT_PROTO 256 -static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; - -static inline struct ip_nat_protocol * -__ip_nat_proto_find(u_int8_t protonum) -{ - return rcu_dereference(ip_nat_protos[protonum]); -} - -struct ip_nat_protocol * -ip_nat_proto_find_get(u_int8_t protonum) -{ - struct ip_nat_protocol *p; - - rcu_read_lock(); - p = __ip_nat_proto_find(protonum); - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); - -void -ip_nat_proto_put(struct ip_nat_protocol *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(ip_nat_proto_put); - -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct ip_conntrack_tuple *tuple) -{ - /* Original src, to ensure we map it consistently if poss. */ - return jhash_3words((__force u32)tuple->src.ip, tuple->src.u.all, - tuple->dst.protonum, 0) % ip_nat_htable_size; -} - -/* Noone using conntrack by the time this called. */ -static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) -{ - if (!(conn->status & IPS_NAT_DONE_MASK)) - return; - - write_lock_bh(&ip_nat_lock); - list_del(&conn->nat.info.bysource); - write_unlock_bh(&ip_nat_lock); -} - -/* Is this tuple already taken? (not by us) */ -int -ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, - const struct ip_conntrack *ignored_conntrack) -{ - /* Conntrack tracking doesn't keep track of outgoing tuples; only - incoming ones. NAT means they don't have a fixed mapping, - so we invert the tuple and look for the incoming reply. - - We could keep a separate hash if this proves too slow. */ - struct ip_conntrack_tuple reply; - - invert_tuplepr(&reply, tuple); - return ip_conntrack_tuple_taken(&reply, ignored_conntrack); -} -EXPORT_SYMBOL(ip_nat_used_tuple); - -/* If we source map this tuple so reply looks like reply_tuple, will - * that meet the constraints of range. 
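   Worked case (sketch): with IP_NAT_RANGE_MAP_IPS set and
   min_ip/max_ip spanning 10.0.0.1 to 10.0.0.8, a candidate tuple
   whose src.ip is 10.0.0.5 passes the IP test; if
   IP_NAT_RANGE_PROTO_SPECIFIED is also set, the protocol's
   in_range() must additionally accept the source port against
   range->min/range->max, otherwise the candidate is rejected.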
*/ -static int -in_range(const struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range) -{ - struct ip_nat_protocol *proto; - int ret = 0; - - /* If we are supposed to map IPs, then we must be in the - range specified, otherwise let this drag us onto a new src IP. */ - if (range->flags & IP_NAT_RANGE_MAP_IPS) { - if (ntohl(tuple->src.ip) < ntohl(range->min_ip) - || ntohl(tuple->src.ip) > ntohl(range->max_ip)) - return 0; - } - - rcu_read_lock(); - proto = __ip_nat_proto_find(tuple->dst.protonum); - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) - || proto->in_range(tuple, IP_NAT_MANIP_SRC, - &range->min, &range->max)) - ret = 1; - rcu_read_unlock(); - - return ret; -} - -static inline int -same_src(const struct ip_conntrack *ct, - const struct ip_conntrack_tuple *tuple) -{ - return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum - == tuple->dst.protonum - && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip - == tuple->src.ip - && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all - == tuple->src.u.all); -} - -/* Only called for SRC manip */ -static int -find_appropriate_src(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_tuple *result, - const struct ip_nat_range *range) -{ - unsigned int h = hash_by_src(tuple); - struct ip_conntrack *ct; - - read_lock_bh(&ip_nat_lock); - list_for_each_entry(ct, &bysource[h], nat.info.bysource) { - if (same_src(ct, tuple)) { - /* Copy source part from reply tuple. */ - invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(result, range)) { - read_unlock_bh(&ip_nat_lock); - return 1; - } - } - } - read_unlock_bh(&ip_nat_lock); - return 0; -} - -/* For [FUTURE] fragmentation handling, we want the least-used - src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus - if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports - 1-65535, we don't do pro-rata allocation based on ports; we choose - the ip with the lowest src-ip/dst-ip/proto usage. -*/ -static void -find_best_ips_proto(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - const struct ip_conntrack *conntrack, - enum ip_nat_manip_type maniptype) -{ - __be32 *var_ipp; - /* Host order */ - u_int32_t minip, maxip, j; - - /* No IP mapping? Do nothing. */ - if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) - return; - - if (maniptype == IP_NAT_MANIP_SRC) - var_ipp = &tuple->src.ip; - else - var_ipp = &tuple->dst.ip; - - /* Fast path: only one choice. */ - if (range->min_ip == range->max_ip) { - *var_ipp = range->min_ip; - return; - } - - /* Hashing source and destination IPs gives a fairly even - * spread in practice (if there are a small number of IPs - * involved, there usually aren't that many connections - * anyway). The consistency means that servers see the same - * client coming from the same IP (some Internet Banking sites - * like this), even across reboots. */ - minip = ntohl(range->min_ip); - maxip = ntohl(range->max_ip); - j = jhash_2words((__force u32)tuple->src.ip, (__force u32)tuple->dst.ip, 0); - *var_ipp = htonl(minip + j % (maxip - minip + 1)); -} - -/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, - * we change the source to map into the range. For NF_IP_PRE_ROUTING - * and NF_IP_LOCAL_OUT, we change the destination to map into the - * range. It might not be possible to get a unique tuple, but we try. - * At worst (or if we race), we will end up with a final duplicate in - * __ip_conntrack_confirm and drop the packet. 
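   For step 2 of get_unique_tuple() below, find_best_ips_proto()
   picks the address deterministically.  Worked case (sketch): a
   range of 10.0.0.1 to 10.0.0.4 leaves maxip - minip + 1 == 4
   candidates, and

       j = jhash_2words((__force u32)tuple->src.ip,
                        (__force u32)tuple->dst.ip, 0);
       *var_ipp = htonl(minip + j % 4);

   so a given src/dst pair always lands on the same one of the four
   addresses, across flows and across reboots.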
*/ -static void -get_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig_tuple, - const struct ip_nat_range *range, - struct ip_conntrack *conntrack, - enum ip_nat_manip_type maniptype) -{ - struct ip_nat_protocol *proto; - - /* 1) If this srcip/proto/src-proto-part is currently mapped, - and that same mapping gives a unique tuple within the given - range, use that. - - This is only required for source (ie. NAT/masq) mappings. - So far, we don't do local source mappings, so multiple - manips not an issue. */ - if (maniptype == IP_NAT_MANIP_SRC) { - if (find_appropriate_src(orig_tuple, tuple, range)) { - DEBUGP("get_unique_tuple: Found current src map\n"); - if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) - if (!ip_nat_used_tuple(tuple, conntrack)) - return; - } - } - - /* 2) Select the least-used IP/proto combination in the given - range. */ - *tuple = *orig_tuple; - find_best_ips_proto(tuple, range, conntrack, maniptype); - - /* 3) The per-protocol part of the manip is made to map into - the range to make a unique tuple. */ - - rcu_read_lock(); - proto = __ip_nat_proto_find(orig_tuple->dst.protonum); - - /* Change protocol info to have some randomization */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) { - proto->unique_tuple(tuple, range, maniptype, conntrack); - goto out; - } - - /* Only bother mapping if it's not already in range and unique */ - if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) - || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) - goto out; - - /* Last change: get protocol to try to obtain unique tuple. */ - proto->unique_tuple(tuple, range, maniptype, conntrack); -out: - rcu_read_unlock(); -} - -unsigned int -ip_nat_setup_info(struct ip_conntrack *conntrack, - const struct ip_nat_range *range, - unsigned int hooknum) -{ - struct ip_conntrack_tuple curr_tuple, new_tuple; - struct ip_nat_info *info = &conntrack->nat.info; - int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); - enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); - - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING - || hooknum == NF_IP_LOCAL_IN - || hooknum == NF_IP_LOCAL_OUT); - BUG_ON(ip_nat_initialized(conntrack, maniptype)); - - /* What we've got will look like inverse of reply. Normally - this is what is in the conntrack, except for prior - manipulations (future optimization: if num_manips == 0, - orig_tp = - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ - invert_tuplepr(&curr_tuple, - &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); - - get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype); - - if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) { - struct ip_conntrack_tuple reply; - - /* Alter conntrack table so will recognize replies. */ - invert_tuplepr(&reply, &new_tuple); - ip_conntrack_alter_reply(conntrack, &reply); - - /* Non-atomic: we own this at the moment. */ - if (maniptype == IP_NAT_MANIP_SRC) - conntrack->status |= IPS_SRC_NAT; - else - conntrack->status |= IPS_DST_NAT; - } - - /* Place in source hash if this is the first time. */ - if (have_to_hash) { - unsigned int srchash - = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple); - write_lock_bh(&ip_nat_lock); - list_add(&info->bysource, &bysource[srchash]); - write_unlock_bh(&ip_nat_lock); - } - - /* It's done. 
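   Caller sketch (hypothetical SNAT-style target; compare
   ip_nat_follow_master() later in this patch): map the source to a
   single address and let the hook number select the manip type:

       struct ip_nat_range range = {
               .flags  = IP_NAT_RANGE_MAP_IPS,
               .min_ip = newsrc,               // hypothetical __be32
               .max_ip = newsrc,
       };
       ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);

   HOOK2MANIP(NF_IP_POST_ROUTING) yields IP_NAT_MANIP_SRC, so this is
   a source mapping.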
*/ - if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status); - else - set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status); - - return NF_ACCEPT; -} -EXPORT_SYMBOL(ip_nat_setup_info); - -/* Returns true if succeeded. */ -static int -manip_pkt(u_int16_t proto, - struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *target, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph; - struct ip_nat_protocol *p; - - if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) - return 0; - - iph = (void *)(*pskb)->data + iphdroff; - - /* Manipulate protcol part. */ - - /* rcu_read_lock()ed by nf_hook_slow */ - p = __ip_nat_proto_find(proto); - if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) - return 0; - - iph = (void *)(*pskb)->data + iphdroff; - - if (maniptype == IP_NAT_MANIP_SRC) { - nf_csum_replace4(&iph->check, iph->saddr, target->src.ip); - iph->saddr = target->src.ip; - } else { - nf_csum_replace4(&iph->check, iph->daddr, target->dst.ip); - iph->daddr = target->dst.ip; - } - return 1; -} - -/* Do packet manipulations according to ip_nat_setup_info. */ -unsigned int ip_nat_packet(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff **pskb) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum); - - if (mtype == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - /* Non-atomic: these bits don't change. */ - if (ct->status & statusbit) { - struct ip_conntrack_tuple target; - - /* We are aiming to look like inverse of other direction. */ - invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - - if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype)) - return NF_DROP; - } - return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(ip_nat_packet); - -/* Dir is direction ICMP is coming from (opposite to packet it contains) */ -int ip_nat_icmp_reply_translation(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, - struct sk_buff **pskb) -{ - struct { - struct icmphdr icmp; - struct iphdr ip; - } *inside; - struct ip_conntrack_protocol *proto; - struct ip_conntrack_tuple inner, target; - int hdrlen = (*pskb)->nh.iph->ihl * 4; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned long statusbit; - enum ip_nat_manip_type manip = HOOK2MANIP(hooknum); - - if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) - return 0; - - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - - /* We're actually going to mangle it beyond trivial checksum - adjustment, so make sure the current checksum is correct. */ - if (nf_ip_checksum(*pskb, hooknum, hdrlen, 0)) - return 0; - - /* Must be RELATED */ - IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED || - (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); - - /* Redirects on non-null nats must be dropped, else they'll - start talking to each other without our translation, and be - confused... --RR */ - if (inside->icmp.type == ICMP_REDIRECT) { - /* If NAT isn't finished, assume it and drop. */ - if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) - return 0; - - if (ct->status & IPS_NAT_MASK) - return 0; - } - - DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n", - *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? 
"ORIG" : "REPLY"); - - /* rcu_read_lock()ed by nf_hook_slow */ - proto = __ip_conntrack_proto_find(inside->ip.protocol); - if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + - sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, proto)) - return 0; - - /* Change inner back to look like incoming packet. We do the - opposite manip on this hook to normal, because it might not - pass all hooks (locally-generated ICMP). Consider incoming - packet: PREROUTING (DST manip), routing produces ICMP, goes - through POSTROUTING (which must correct the DST manip). */ - if (!manip_pkt(inside->ip.protocol, pskb, - (*pskb)->nh.iph->ihl*4 - + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, - !manip)) - return 0; - - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - inside->icmp.checksum = 0; - inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, - (*pskb)->len - hdrlen, - 0)); - } - - /* Change outer to look the reply to an incoming packet - * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, pskb, 0, &target, manip)) - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation); - -/* Protocol registration. */ -int ip_nat_protocol_register(struct ip_nat_protocol *proto) -{ - int ret = 0; - - write_lock_bh(&ip_nat_lock); - if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { - ret = -EBUSY; - goto out; - } - rcu_assign_pointer(ip_nat_protos[proto->protonum], proto); - out: - write_unlock_bh(&ip_nat_lock); - return ret; -} -EXPORT_SYMBOL(ip_nat_protocol_register); - -/* Noone stores the protocol anywhere; simply delete it. */ -void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) -{ - write_lock_bh(&ip_nat_lock); - rcu_assign_pointer(ip_nat_protos[proto->protonum], - &ip_nat_unknown_protocol); - write_unlock_bh(&ip_nat_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL(ip_nat_protocol_unregister); - -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) -int -ip_nat_port_range_to_nfattr(struct sk_buff *skb, - const struct ip_nat_range *range) -{ - NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), - &range->min.tcp.port); - NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), - &range->max.tcp.port); - - return 0; - -nfattr_failure: - return -1; -} - -int -ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) -{ - int ret = 0; - - /* we have to return whether we actually parsed something or not */ - - if (tb[CTA_PROTONAT_PORT_MIN-1]) { - ret = 1; - range->min.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); - } - - if (!tb[CTA_PROTONAT_PORT_MAX-1]) { - if (ret) - range->max.tcp.port = range->min.tcp.port; - } else { - ret = 1; - range->max.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); - } - - return ret; -} -EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range); -EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); -#endif - -static int __init ip_nat_init(void) -{ - size_t i; - - /* Leave them the same for the moment. 
*/ - ip_nat_htable_size = ip_conntrack_htable_size; - - /* One vmalloc for both hash tables */ - bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); - if (!bysource) - return -ENOMEM; - - /* Sew in builtin protocols. */ - write_lock_bh(&ip_nat_lock); - for (i = 0; i < MAX_IP_NAT_PROTO; i++) - rcu_assign_pointer(ip_nat_protos[i], &ip_nat_unknown_protocol); - rcu_assign_pointer(ip_nat_protos[IPPROTO_TCP], &ip_nat_protocol_tcp); - rcu_assign_pointer(ip_nat_protos[IPPROTO_UDP], &ip_nat_protocol_udp); - rcu_assign_pointer(ip_nat_protos[IPPROTO_ICMP], &ip_nat_protocol_icmp); - write_unlock_bh(&ip_nat_lock); - - for (i = 0; i < ip_nat_htable_size; i++) { - INIT_LIST_HEAD(&bysource[i]); - } - - /* FIXME: Man, this is a hack. <SIGH> */ - IP_NF_ASSERT(rcu_dereference(ip_conntrack_destroyed) == NULL); - rcu_assign_pointer(ip_conntrack_destroyed, ip_nat_cleanup_conntrack); - - /* Initialize fake conntrack so that NAT will skip it */ - ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; - return 0; -} - -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct ip_conntrack *i, void *data) -{ - memset(&i->nat, 0, sizeof(i->nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); - return 0; -} - -static void __exit ip_nat_cleanup(void) -{ - ip_ct_iterate_cleanup(&clean_nat, NULL); - rcu_assign_pointer(ip_conntrack_destroyed, NULL); - synchronize_rcu(); - vfree(bysource); -} - -MODULE_LICENSE("GPL"); - -module_init(ip_nat_init); -module_exit(ip_nat_cleanup); diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c deleted file mode 100644 index 32e01d8dffc..00000000000 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ /dev/null @@ -1,180 +0,0 @@ -/* FTP extension for TCP NAT alteration. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/moduleparam.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp NAT helper"); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -/* FIXME: Time out? 
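   (Worked example for mangle_rfc959_packet() just below: RFC 959
   encodes "PORT h1,h2,h3,h4,p1,p2" with p1 = port >> 8 and
   p2 = port & 0xFF, so 10.0.0.1 port 6275 becomes

       sprintf(buffer, "%u,%u,%u,%u,%u,%u",
               NIPQUAD(newip), port >> 8, port & 0xFF);
       // buffer now holds "10,0,0,1,24,131", since
       // 6275 == 24 * 256 + 131

   and that buffer then replaces the matched text in the payload.)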
--RR */ - -static int -mangle_rfc959_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; - - sprintf(buffer, "%u,%u,%u,%u,%u,%u", - NIPQUAD(newip), port>>8, port&0xFF); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_eprt_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("|1|255.255.255.255|65535|")]; - - sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_epsv_packet(struct sk_buff **pskb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - u32 *seq) -{ - char buffer[sizeof("|||65535|")]; - - sprintf(buffer, "|||%u|", port); - - DEBUGP("calling ip_nat_mangle_tcp_packet\n"); - - *seq += strlen(buffer) - matchlen; - return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -static int (*mangle[])(struct sk_buff **, __be32, u_int16_t, - unsigned int, - unsigned int, - struct ip_conntrack *, - enum ip_conntrack_info, - u32 *seq) -= { [IP_CT_FTP_PORT] = mangle_rfc959_packet, - [IP_CT_FTP_PASV] = mangle_rfc959_packet, - [IP_CT_FTP_EPRT] = mangle_eprt_packet, - [IP_CT_FTP_EPSV] = mangle_epsv_packet -}; - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_ftp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq) -{ - __be32 newip; - u_int16_t port; - int dir = CTINFO2DIR(ctinfo); - struct ip_conntrack *ct = exp->master; - - DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); - - /* Connection will come from wherever this packet goes, hence !dir */ - newip = ct->tuplehash[!dir].tuple.dst.ip; - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - * this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, - seq)) { - ip_conntrack_unexpect_related(exp); - return NF_DROP; - } - return NF_ACCEPT; -} - -static void __exit ip_nat_ftp_fini(void) -{ - rcu_assign_pointer(ip_nat_ftp_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_ftp_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_ftp_hook)); - rcu_assign_pointer(ip_nat_ftp_hook, ip_nat_ftp); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. 
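   (One subtlety in the retry loop of ip_nat_ftp() above: port is a
   u_int16_t, so port++ wraps from 65535 to 0; "port == 0" after the
   loop therefore means every candidate up to 65535 was already
   taken, and the packet is dropped.)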
No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(ip_nat_ftp_init); -module_exit(ip_nat_ftp_fini); diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c deleted file mode 100644 index dc778cfef58..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ /dev/null @@ -1,436 +0,0 @@ -/* ip_nat_helper.c - generic support functions for NAT helpers - * - * (C) 2000-2002 Harald Welte <laforge@netfilter.org> - * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>: - * - add support for SACK adjustment - * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>: - * - merge SACK support into newnat API - * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>: - * - make ip_nat_resize_packet more generic (TCP and UDP) - * - add ip_nat_mangle_udp_packet - */ -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv4.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/udp.h> - -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> - -#if 0 -#define DEBUGP printk -#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos); -#else -#define DEBUGP(format, args...) -#define DUMP_OFFSET(x) -#endif - -static DEFINE_SPINLOCK(ip_nat_seqofs_lock); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - int dir; - struct ip_nat_seq *this_way, *other_way; - - DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n", - (*skb)->len, new_size); - - dir = CTINFO2DIR(ctinfo); - - this_way = &ct->nat.info.seq[dir]; - other_way = &ct->nat.info.seq[!dir]; - - DEBUGP("ip_nat_resize_packet: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&ip_nat_seqofs_lock); - - /* SYN adjust. If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after - || before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&ip_nat_seqofs_lock); - - DEBUGP("ip_nat_resize_packet: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Frobs data inside this packet, which is linear. 
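   (Worked example for adjust_tcp_sequence() above, as a sketch: say
   a helper grows a packet by 3 bytes at sequence 1000 and a later
   one by 5 bytes at sequence 2000.  After the first call:

       correction_pos = 1000, offset_before = 0, offset_after = 3

   and after the second, since before(1000, 2000) holds:

       correction_pos = 2000, offset_before = 3, offset_after = 8

   ip_nat_seq_adjust() then shifts sequence numbers at or before 2000
   by +3 and anything after 2000 by +8.)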
*/ -static void mangle_contents(struct sk_buff *skb, - unsigned int dataoff, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - unsigned char *data; - - BUG_ON(skb_is_nonlinear(skb)); - data = (unsigned char *)skb->nh.iph + dataoff; - - /* move post-replacement */ - memmove(data + match_offset + rep_len, - data + match_offset + match_len, - skb->tail - (data + match_offset + match_len)); - - /* insert data from buffer */ - memcpy(data + match_offset, rep_buffer, rep_len); - - /* update skb info */ - if (rep_len > match_len) { - DEBUGP("ip_nat_mangle_packet: Extending packet by " - "%u from %u bytes\n", rep_len - match_len, - skb->len); - skb_put(skb, rep_len - match_len); - } else { - DEBUGP("ip_nat_mangle_packet: Shrinking packet from " - "%u from %u bytes\n", match_len - rep_len, - skb->len); - __skb_trim(skb, skb->len + rep_len - match_len); - } - - /* fix IP hdr checksum information */ - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); -} - -/* Unusual, but possible case. */ -static int enlarge_skb(struct sk_buff **pskb, unsigned int extra) -{ - struct sk_buff *nskb; - - if ((*pskb)->len + extra > 65535) - return 0; - - nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC); - if (!nskb) - return 0; - - /* Transfer socket to new skb. */ - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - return 1; -} - -/* Generic function for mangling variable-length address changes inside - * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX - * command in FTP). - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * */ -int -ip_nat_mangle_tcp_packet(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - struct iphdr *iph; - struct tcphdr *tcph; - int oldlen, datalen; - - if (!skb_make_writable(pskb, (*pskb)->len)) - return 0; - - if (rep_len > match_len - && rep_len - match_len > skb_tailroom(*pskb) - && !enlarge_skb(pskb, rep_len - match_len)) - return 0; - - SKB_LINEAR_ASSERT(*pskb); - - iph = (*pskb)->nh.iph; - tcph = (void *)iph + iph->ihl*4; - - oldlen = (*pskb)->len - iph->ihl*4; - mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4, - match_offset, match_len, rep_buffer, rep_len); - - datalen = (*pskb)->len - iph->ihl*4; - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial((char *)tcph, - datalen, 0)); - } else - nf_proto_csum_replace2(&tcph->check, *pskb, - htons(oldlen), htons(datalen), 1); - - if (rep_len != match_len) { - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(tcph->seq), - (int)rep_len - (int)match_len, - ct, ctinfo); - /* Tell TCP window tracking about seq change */ - ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo)); - } - return 1; -} -EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); - -/* Generic function for mangling variable-length address changes inside - * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX - * command in the Amanda protocol) - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * XXX - This function could be merged with ip_nat_mangle_tcp_packet which - * should be fairly easy to do. 
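   A caller sketch (cf. mangle_rfc959_packet() elsewhere in this
   patch): replace matchlen bytes of payload at matchoff with a new
   string and let the helper fix the packet length, the checksums
   and, for TCP, the recorded sequence shift:

       if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
                                     matchoff, matchlen,
                                     buffer, strlen(buffer)))
               return NF_DROP;         // hypothetical caller policy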
- */ -int -ip_nat_mangle_udp_packet(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) -{ - struct iphdr *iph; - struct udphdr *udph; - int datalen, oldlen; - - /* UDP helpers might accidentally mangle the wrong packet */ - iph = (*pskb)->nh.iph; - if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + - match_offset + match_len) - return 0; - - if (!skb_make_writable(pskb, (*pskb)->len)) - return 0; - - if (rep_len > match_len - && rep_len - match_len > skb_tailroom(*pskb) - && !enlarge_skb(pskb, rep_len - match_len)) - return 0; - - iph = (*pskb)->nh.iph; - udph = (void *)iph + iph->ihl*4; - - oldlen = (*pskb)->len - iph->ihl*4; - mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph), - match_offset, match_len, rep_buffer, rep_len); - - /* update the length of the UDP packet */ - datalen = (*pskb)->len - iph->ihl*4; - udph->len = htons(datalen); - - /* fix udp checksum if udp checksum was previously calculated */ - if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL) - return 1; - - if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial((char *)udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } else - nf_proto_csum_replace2(&udph->check, *pskb, - htons(oldlen), htons(datalen), 1); - return 1; -} -EXPORT_SYMBOL(ip_nat_mangle_udp_packet); - -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct ip_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - nf_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - nf_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -ip_nat_sack_adjust(struct sk_buff **pskb, - struct tcphdr *tcph, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - - optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); - optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - - if (!skb_make_writable(pskb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. 
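   E.g. (sketch) a SACK option carrying one block is laid out as

       kind = 5 (TCPOPT_SACK), len = 10, start_seq:4, end_seq:4

   i.e. op[1] == 2 + 1 * TCPOLEN_SACK_PERBLOCK, which is exactly what
   the length checks below accept; TCPOPT_EOL and TCPOPT_NOP are the
   only single-byte options, which is why the switch below
   special-cases them before the generic length check.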
*/ - unsigned char *op = (*pskb)->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend - || optoff + op[1] > optend - || op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK - && op[1] >= 2+TCPOLEN_SACK_PERBLOCK - && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(*pskb, tcph, optoff+2, - optoff+op[1], - &ct->nat.info.seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ -int -ip_nat_seq_adjust(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - struct ip_nat_seq *this_way, *other_way; - - dir = CTINFO2DIR(ctinfo); - - this_way = &ct->nat.info.seq[dir]; - other_way = &ct->nat.info.seq[!dir]; - - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) - return 0; - - tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; - if (after(ntohl(tcph->seq), this_way->correction_pos)) - newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); - else - newseq = htonl(ntohl(tcph->seq) + this_way->offset_before); - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after); - else - newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before); - - nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0); - nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0); - - DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo)) - return 0; - - ip_conntrack_tcp_update(*pskb, ct, dir); - - return 1; -} -EXPORT_SYMBOL(ip_nat_seq_adjust); - -/* Setup NAT on this expected conntrack so it follows master. */ -/* If we fail to get a free NAT slot, we'll get dropped on confirm */ -void ip_nat_follow_master(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - struct ip_nat_range range; - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.ip; - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = exp->saved_proto; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.ip; - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); -} -EXPORT_SYMBOL(ip_nat_follow_master); diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c deleted file mode 100644 index bdc99ef6159..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ /dev/null @@ -1,611 +0,0 @@ -/* - * H.323 extension for NAT alteration. - * - * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> - * - * This source code is licensed under General Public License version 2. 
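Both sack_adjust() and ip_nat_seq_adjust() above apply one rule: sequence numbers past the point where the payload size changed get offset_after, earlier ones get offset_before. A condensed sketch, with a stand-alone struct in place of the kernel's struct ip_nat_seq:

#include <stdint.h>

struct nat_seq {
	uint32_t correction_pos;  /* seq where the size change happened */
	int16_t offset_before;    /* applies to seqs before that point  */
	int16_t offset_after;     /* applies to seqs beyond it          */
};

/* wrap-safe "a is before b" on the 32-bit sequence space */
static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static uint32_t nat_translate_seq(uint32_t seq, const struct nat_seq *ns)
{
	if (seq_before(ns->correction_pos, seq))  /* after(seq, pos) */
		return seq + ns->offset_after;
	return seq + ns->offset_before;
}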
- * - * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/moduleparam.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> -#include <linux/netfilter_ipv4/ip_conntrack_h323.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - -/****************************************************************************/ -static int set_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - unsigned int addroff, __be32 ip, u_int16_t port) -{ - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo); - struct { - __be32 ip; - __be16 port; - } __attribute__ ((__packed__)) buf; - struct tcphdr _tcph, *th; - - buf.ip = ip; - buf.port = htons(port); - addroff += dataoff; - - if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) { - if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - addroff, sizeof(buf), - (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - printk("ip_nat_h323: ip_nat_mangle_tcp_packet" - " error\n"); - return -1; - } - - /* Relocate data pointer */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); - if (th == NULL) - return -1; - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - th->doff * 4 + dataoff; - } else { - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - addroff, sizeof(buf), - (char *) &buf, sizeof(buf))) { - if (net_ratelimit()) - printk("ip_nat_h323: ip_nat_mangle_udp_packet" - " error\n"); - return -1; - } - /* ip_nat_mangle_udp_packet uses skb_make_writable() to copy - * or pull everything in a linear buffer, so we can safely - * use the skb pointers now */ - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - sizeof(struct udphdr); - } - - return 0; -} - -/****************************************************************************/ -static int set_h225_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - TransportAddress * addr, - __be32 ip, u_int16_t port) -{ - return set_addr(pskb, data, dataoff, addr->ipAddress.ip, ip, port); -} - -/****************************************************************************/ -static int set_h245_addr(struct sk_buff **pskb, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - __be32 ip, u_int16_t port) -{ - return set_addr(pskb, data, dataoff, - addr->unicastAddress.iPAddress.network, ip, port); -} - -/****************************************************************************/ -static int set_sig_addr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int i; - __be32 ip; - u_int16_t port; - - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port)) { - if (ip == ct->tuplehash[dir].tuple.src.ip && - port == info->sig_port[dir]) { - /* GW->GK */ - - /* Fix for Gnomemeeting */ - if (i > 0 && - get_h225_addr(*data, &addr[0], - &ip, &port) && - (ntohl(ip) & 0xff000000) == 0x7f000000) - i = 0; - - DEBUGP - ("ip_nat_ras: set signal address " - 
"%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst. - ip), info->sig_port[!dir]); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir]. - tuple.dst.ip, - info->sig_port[!dir]); - } else if (ip == ct->tuplehash[dir].tuple.dst.ip && - port == info->sig_port[dir]) { - /* GK->GW */ - DEBUGP - ("ip_nat_ras: set signal address " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.src. - ip), info->sig_port[!dir]); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir]. - tuple.src.ip, - info->sig_port[!dir]); - } - } - } - - return 0; -} - -/****************************************************************************/ -static int set_ras_addr(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, - TransportAddress * addr, int count) -{ - int dir = CTINFO2DIR(ctinfo); - int i; - __be32 ip; - u_int16_t port; - - for (i = 0; i < count; i++) { - if (get_h225_addr(*data, &addr[i], &ip, &port) && - ip == ct->tuplehash[dir].tuple.src.ip && - port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) { - DEBUGP("ip_nat_ras: set rasAddress " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(ip), port, - NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), - ntohs(ct->tuplehash[!dir].tuple.dst.u.udp. - port)); - return set_h225_addr(pskb, data, 0, &addr[i], - ct->tuplehash[!dir].tuple.dst.ip, - ntohs(ct->tuplehash[!dir].tuple. - dst.u.udp.port)); - } - } - - return 0; -} - -/****************************************************************************/ -static int nat_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, - u_int16_t port, u_int16_t rtp_port, - struct ip_conntrack_expect *rtp_exp, - struct ip_conntrack_expect *rtcp_exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - int i; - u_int16_t nated_port; - - /* Set expectations for NAT */ - rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; - rtp_exp->expectfn = ip_nat_follow_master; - rtp_exp->dir = !dir; - rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; - rtcp_exp->expectfn = ip_nat_follow_master; - rtcp_exp->dir = !dir; - - /* Lookup existing expects */ - for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) { - if (info->rtp_port[i][dir] == rtp_port) { - /* Expected */ - - /* Use allocated ports first. This will refresh - * the expects */ - rtp_exp->tuple.dst.u.udp.port = - htons(info->rtp_port[i][dir]); - rtcp_exp->tuple.dst.u.udp.port = - htons(info->rtp_port[i][dir] + 1); - break; - } else if (info->rtp_port[i][dir] == 0) { - /* Not expected */ - break; - } - } - - /* Run out of expectations */ - if (i >= H323_RTP_CHANNEL_MAX) { - if (net_ratelimit()) - printk("ip_nat_h323: out of expectations\n"); - return 0; - } - - /* Try to get a pair of ports. 
*/ - for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); - nated_port != 0; nated_port += 2) { - rtp_exp->tuple.dst.u.udp.port = htons(nated_port); - if (ip_conntrack_expect_related(rtp_exp) == 0) { - rtcp_exp->tuple.dst.u.udp.port = - htons(nated_port + 1); - if (ip_conntrack_expect_related(rtcp_exp) == 0) - break; - ip_conntrack_unexpect_related(rtp_exp); - } - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_h323: out of RTP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h245_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - (port & 1) ? nated_port + 1 : nated_port) == 0) { - /* Save ports */ - info->rtp_port[i][dir] = rtp_port; - info->rtp_port[i][!dir] = nated_port; - } else { - ip_conntrack_unexpect_related(rtp_exp); - ip_conntrack_unexpect_related(rtcp_exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtp_exp->tuple.src.ip), - ntohs(rtp_exp->tuple.src.u.udp.port), - NIPQUAD(rtp_exp->tuple.dst.ip), - ntohs(rtp_exp->tuple.dst.u.udp.port)); - DEBUGP("ip_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(rtcp_exp->tuple.src.ip), - ntohs(rtcp_exp->tuple.src.u.udp.port), - NIPQUAD(rtcp_exp->tuple.dst.ip), - ntohs(rtcp_exp->tuple.dst.u.udp.port)); - - return 0; -} - -/****************************************************************************/ -static int nat_t120(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - H245_TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_follow_master; - exp->dir = !dir; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_h323: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h245_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, nated_port) < 0) { - ip_conntrack_unexpect_related(exp); - return -1; - } - - DEBUGP("ip_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/**************************************************************************** - * This conntrack expect function replaces ip_conntrack_h245_expect() - * which was set by ip_conntrack_helper_h323.c. 
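The replacement expect function described in the comment that follows is plain callback wrapping: run the NAT binding first, then the conntrack helper's ordinary tracking. A sketch with hypothetical stand-ins for both callbacks:

/* hypothetical stand-ins for the two callbacks the comment names */
extern void nat_follow_master(void *new_ct, void *exp);
extern void h245_conntrack_expect(void *new_ct, void *exp);

/* installed as the expectfn: NAT setup first, then normal tracking
 * of the expected connection */
static void nat_h245_expect_sketch(void *new_ct, void *exp)
{
	nat_follow_master(new_ct, exp);
	h245_conntrack_expect(new_ct, exp);
}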
It calls both - * ip_nat_follow_master() and ip_conntrack_h245_expect() - ****************************************************************************/ -static void ip_nat_h245_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - ip_nat_follow_master(new, this); - ip_conntrack_h245_expect(new, this); -} - -/****************************************************************************/ -static int nat_h245(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_h245_expect; - exp->dir = !dir; - - /* Check existing expects */ - if (info->sig_port[dir] == port) - nated_port = info->sig_port[!dir]; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_q931: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h225_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = nated_port; - } else { - ip_conntrack_unexpect_related(exp); - return -1; - } - - DEBUGP("ip_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/**************************************************************************** - * This conntrack expect function replaces ip_conntrack_q931_expect() - * which was set by ip_conntrack_helper_h323.c. - ****************************************************************************/ -static void ip_nat_q931_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - struct ip_nat_range range; - - if (this->tuple.src.ip != 0) { /* Only accept calls from GK */ - ip_nat_follow_master(new, this); - goto out; - } - - /* This must be a fresh one. */ - BUG_ON(new->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. 
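The two range setups in the expect function above form the general pattern: one source binding that follows where the master connection sends, one destination binding back to the saved port. A sketch; struct binding and setup_binding() are hypothetical stand-ins for struct ip_nat_range and ip_nat_setup_info():

#include <stdint.h>

struct binding {
	uint32_t ip;
	uint16_t port;
	int port_specified;
};

/* hypothetical stand-in for ip_nat_setup_info(); src_manip picks the
 * source vs destination half of the mapping */
extern void setup_binding(void *ct, const struct binding *b, int src_manip);

static void bind_expected(void *ct, uint32_t where_master_sends,
			  uint32_t where_master_is, uint16_t saved_port)
{
	struct binding b = { 0 };

	/* SNAT: the new connection's source follows the master's path */
	b.ip = where_master_sends;
	setup_binding(ct, &b, 1);

	/* DNAT: destination mapped back to the expected address/port */
	b.ip = where_master_is;
	b.port = saved_port;
	b.port_specified = 1;
	setup_binding(ct, &b, 0);
}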
*/ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = - new->master->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); - - out: - ip_conntrack_q931_expect(new, this); -} - -/****************************************************************************/ -static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, TransportAddress * addr, int idx, - u_int16_t port, struct ip_conntrack_expect *exp) -{ - struct ip_ct_h323_master *info = &ct->help.ct_h323_info; - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port = port; - __be32 ip; - - /* Set expectations for NAT */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_q931_expect; - exp->dir = !dir; - - /* Check existing expects */ - if (info->sig_port[dir] == port) - nated_port = info->sig_port[!dir]; - - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_ras: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (set_h225_addr(pskb, data, 0, &addr[idx], - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - /* Save ports */ - info->sig_port[dir] = port; - info->sig_port[!dir] = nated_port; - - /* Fix for Gnomemeeting */ - if (idx > 0 && - get_h225_addr(*data, &addr[0], &ip, &port) && - (ntohl(ip) & 0xff000000) == 0x7f000000) { - set_h225_addr_hook(pskb, data, 0, &addr[0], - ct->tuplehash[!dir].tuple.dst.ip, - info->sig_port[!dir]); - } - } else { - ip_conntrack_unexpect_related(exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/****************************************************************************/ -static void ip_nat_callforwarding_expect(struct ip_conntrack *new, - struct ip_conntrack_expect *this) -{ - struct ip_nat_range range; - - /* This must be a fresh one. */ - BUG_ON(new->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; - - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. 
*/ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = this->saved_proto; - range.min_ip = range.max_ip = this->saved_ip; - - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); - - ip_conntrack_q931_expect(new, this); -} - -/****************************************************************************/ -static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - unsigned char **data, int dataoff, - TransportAddress * addr, u_int16_t port, - struct ip_conntrack_expect *exp) -{ - int dir = CTINFO2DIR(ctinfo); - u_int16_t nated_port; - - /* Set expectations for NAT */ - exp->saved_ip = exp->tuple.dst.ip; - exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->expectfn = ip_nat_callforwarding_expect; - exp->dir = !dir; - - /* Try to get same port: if not, try to change it. */ - for (nated_port = port; nated_port != 0; nated_port++) { - exp->tuple.dst.u.tcp.port = htons(nated_port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (nated_port == 0) { /* No port available */ - if (net_ratelimit()) - printk("ip_nat_q931: out of TCP ports\n"); - return 0; - } - - /* Modify signal */ - if (!set_h225_addr(pskb, data, dataoff, addr, - ct->tuplehash[!dir].tuple.dst.ip, - nated_port) == 0) { - ip_conntrack_unexpect_related(exp); - return -1; - } - - /* Success */ - DEBUGP("ip_nat_q931: expect Call Forwarding " - "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", - NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), - NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); - - return 0; -} - -/****************************************************************************/ -static int __init init(void) -{ - BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL); - BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL); - BUG_ON(rcu_dereference(nat_t120_hook) != NULL); - BUG_ON(rcu_dereference(nat_h245_hook) != NULL); - BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL); - BUG_ON(rcu_dereference(nat_q931_hook) != NULL); - - rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); - rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); - rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); - rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); - rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); - rcu_assign_pointer(nat_t120_hook, nat_t120); - rcu_assign_pointer(nat_h245_hook, nat_h245); - rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); - rcu_assign_pointer(nat_q931_hook, nat_q931); - - DEBUGP("ip_nat_h323: init success\n"); - return 0; -} - -/****************************************************************************/ -static void __exit fini(void) -{ - rcu_assign_pointer(set_h245_addr_hook, NULL); - rcu_assign_pointer(set_h225_addr_hook, NULL); - rcu_assign_pointer(set_sig_addr_hook, NULL); - rcu_assign_pointer(set_ras_addr_hook, NULL); - rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); - rcu_assign_pointer(nat_t120_hook, NULL); - rcu_assign_pointer(nat_h245_hook, NULL); - rcu_assign_pointer(nat_callforwarding_hook, NULL); - rcu_assign_pointer(nat_q931_hook, NULL); - synchronize_rcu(); -} - -/****************************************************************************/ -module_init(init); 
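init() and fini() above publish the helper entry points through bare function pointers, with synchronize_rcu() guaranteeing no reader still holds the old pointer when the module text goes away. A simplified user-space analogue, using C11 atomics in place of the kernel's RCU primitives:

#include <stdatomic.h>
#include <stddef.h>

typedef int (*nat_hook_t)(void *pkt);

static _Atomic(nat_hook_t) nat_hook;

static void hook_register(nat_hook_t fn)
{
	/* publish only after the helper is fully initialised */
	atomic_store_explicit(&nat_hook, fn, memory_order_release);
}

static void hook_unregister(void)
{
	atomic_store_explicit(&nat_hook, (nat_hook_t)NULL,
			      memory_order_release);
	/* the kernel would synchronize_rcu() here before unloading */
}

static int hook_call(void *pkt)
{
	nat_hook_t fn = atomic_load_explicit(&nat_hook,
					     memory_order_acquire);
	return fn ? fn(pkt) : 0;  /* no helper loaded: accept */
}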
-module_exit(fini); - -MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); -MODULE_DESCRIPTION("H.323 NAT helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c deleted file mode 100644 index 24ce4a5023d..00000000000 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * ip_nat_pptp.c - Version 3.0 - * - * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * TODO: - NAT to a unique tuple, not to TCP source port - * (needs netfilter tuple reservation) - * - * Changes: - * 2002-02-10 - Version 1.3 - * - Use ip_nat_mangle_tcp_packet() because of cloned skb's - * in local connections (Philip Craig <philipc@snapgear.com>) - * - add checks for magicCookie and pptp version - * - make argument list of pptp_{out,in}bound_packet() shorter - * - move to C99 style initializers - * - print version number at module loadtime - * 2003-09-22 - Version 1.5 - * - use SNATed tcp sourceport as callid, since we get called before - * TCP header is mangled (Philip Craig <philipc@snapgear.com>) - * 2004-10-22 - Version 2.0 - * - kernel 2.6.x version - * 2005-06-10 - Version 3.0 - * - kernel >= 2.6.11 version, - * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/) - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <net/tcp.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_pptp.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> -#include <linux/netfilter_ipv4/ip_conntrack_pptp.h> - -#define IP_NAT_PPTP_VERSION "3.0" - -#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off))) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); - - -#if 0 -extern const char *pptp_msg_name[]; -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \ - __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -static void pptp_nat_expected(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *master = ct->master; - struct ip_conntrack_expect *other_exp; - struct ip_conntrack_tuple t; - struct ip_ct_pptp_master *ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info; - struct ip_nat_range range; - - ct_pptp_info = &master->help.ct_pptp_info; - nat_pptp_info = &master->nat.help.nat_pptp_info; - - /* And here goes the grand finale of corrosion... 
*/ - - if (exp->dir == IP_CT_DIR_ORIGINAL) { - DEBUGP("we are PNS->PAC\n"); - /* therefore, build tuple for PAC->PNS */ - t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - t.src.u.gre.key = master->help.ct_pptp_info.pac_call_id; - t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - t.dst.u.gre.key = master->help.ct_pptp_info.pns_call_id; - t.dst.protonum = IPPROTO_GRE; - } else { - DEBUGP("we are PAC->PNS\n"); - /* build tuple for PNS->PAC */ - t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - t.src.u.gre.key = master->nat.help.nat_pptp_info.pns_call_id; - t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - t.dst.u.gre.key = master->nat.help.nat_pptp_info.pac_call_id; - t.dst.protonum = IPPROTO_GRE; - } - - DEBUGP("trying to unexpect other dir: "); - DUMP_TUPLE(&t); - other_exp = ip_conntrack_expect_find_get(&t); - if (other_exp) { - ip_conntrack_unexpect_related(other_exp); - ip_conntrack_expect_put(other_exp); - DEBUGP("success\n"); - } else { - DEBUGP("not found!\n"); - } - - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); - - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.ip; - if (exp->dir == IP_CT_DIR_ORIGINAL) { - range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; - } - /* hook doesn't matter, but it has to do source manip */ - ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); - - /* For DST manip, map port here to where it's expected. */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.src.ip; - if (exp->dir == IP_CT_DIR_REPLY) { - range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; - range.min = range.max = exp->saved_proto; - } - /* hook doesn't matter, but it has to do destination manip */ - ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); -} - -/* outbound packets == from PNS to PAC */ -static int -pptp_outbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) - -{ - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - u_int16_t msg; - __be16 new_callid; - unsigned int cid_off; - - new_callid = ct_pptp_info->pns_call_id; - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REQUEST: - cid_off = offsetof(union pptp_ctrl_union, ocreq.callID); - /* FIXME: ideally we would want to reserve a call ID - * here. current netfilter NAT core is not able to do - * this :( For now we use TCP source port. This breaks - * multiple calls within one control session */ - - /* save original call ID in nat_info */ - nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; - - /* don't use tcph->source since we are at a DSTmanip - * hook (e.g. PREROUTING) and pkt is not mangled yet */ - new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - /* save new call ID in ct info */ - ct_pptp_info->pns_call_id = new_callid; - break; - case PPTP_IN_CALL_REPLY: - cid_off = offsetof(union pptp_ctrl_union, icack.callID); - break; - case PPTP_CALL_CLEAR_REQUEST: - cid_off = offsetof(union pptp_ctrl_union, clrreq.callID); - break; - default: - DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, - (msg <= PPTP_MSG_MAX)? 
- pptp_msg_name[msg]:pptp_msg_name[0]); - /* fall through */ - - case PPTP_SET_LINK_INFO: - /* only need to NAT in case PAC is behind NAT box */ - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass - * down to here */ - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); - - /* mangle packet */ - if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - cid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)) == 0) - return NF_DROP; - - return NF_ACCEPT; -} - -static void -pptp_exp_gre(struct ip_conntrack_expect *expect_orig, - struct ip_conntrack_expect *expect_reply) -{ - struct ip_conntrack *ct = expect_orig->master; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - /* save original PAC call ID in nat_info */ - nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; - - /* alter expectation for PNS->PAC direction */ - expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id; - expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id; - expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id; - expect_orig->dir = IP_CT_DIR_ORIGINAL; - - /* alter expectation for PAC->PNS direction */ - expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id; - expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id; - expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id; - expect_reply->dir = IP_CT_DIR_REPLY; -} - -/* inbound packets == from PAC to PNS */ -static int -pptp_inbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) -{ - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - u_int16_t msg; - __be16 new_pcid; - unsigned int pcid_off; - - new_pcid = nat_pptp_info->pns_call_id; - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REPLY: - pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID); - break; - case PPTP_IN_CALL_CONNECT: - pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID); - break; - case PPTP_IN_CALL_REQUEST: - /* only need to nat in case PAC is behind NAT box */ - return NF_ACCEPT; - case PPTP_WAN_ERROR_NOTIFY: - pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID); - break; - case PPTP_CALL_DISCONNECT_NOTIFY: - pcid_off = offsetof(union pptp_ctrl_union, disc.callID); - break; - case PPTP_SET_LINK_INFO: - pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); - break; - - default: - DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)? 
- pptp_msg_name[msg]:pptp_msg_name[0]); - /* fall through */ - - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST, - * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */ - - /* mangle packet */ - DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", - ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); - - if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - pcid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)) == 0) - return NF_DROP; - return NF_ACCEPT; -} - - -extern int __init ip_nat_proto_gre_init(void); -extern void __exit ip_nat_proto_gre_fini(void); - -static int __init ip_nat_helper_pptp_init(void) -{ - int ret; - - DEBUGP("%s: registering NAT helper\n", __FILE__); - - ret = ip_nat_proto_gre_init(); - if (ret < 0) - return ret; - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_outbound)); - rcu_assign_pointer(ip_nat_pptp_hook_outbound, pptp_outbound_pkt); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_inbound)); - rcu_assign_pointer(ip_nat_pptp_hook_inbound, pptp_inbound_pkt); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_exp_gre)); - rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, pptp_exp_gre); - - BUG_ON(rcu_dereference(ip_nat_pptp_hook_expectfn)); - rcu_assign_pointer(ip_nat_pptp_hook_expectfn, pptp_nat_expected); - - printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION); - return 0; -} - -static void __exit ip_nat_helper_pptp_fini(void) -{ - DEBUGP("cleanup_module\n" ); - - rcu_assign_pointer(ip_nat_pptp_hook_expectfn, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_exp_gre, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_inbound, NULL); - rcu_assign_pointer(ip_nat_pptp_hook_outbound, NULL); - synchronize_rcu(); - - ip_nat_proto_gre_fini(); - - printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION); -} - -module_init(ip_nat_helper_pptp_init); -module_exit(ip_nat_helper_pptp_fini); diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c deleted file mode 100644 index cfaeea38314..00000000000 --- a/net/ipv4/netfilter/ip_nat_irc.c +++ /dev/null @@ -1,122 +0,0 @@ -/* IRC extension for TCP NAT alteration. - * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> - * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * based on a copy of RR's ip_nat_ftp.c - * - * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/kernel.h> -#include <net/tcp.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_conntrack_irc.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/moduleparam.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("IRC (DCC) NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp) -{ - u_int16_t port; - unsigned int ret; - - /* "4294967296 65635 " */ - char buffer[18]; - - DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n", - expect->seq, exp_irc_info->len, - ntohl(tcph->seq)); - - /* Reply comes from server. */ - exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; - exp->dir = IP_CT_DIR_REPLY; - - /* When you see the packet, we need to NAT it the same as the - * this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - exp->tuple.dst.u.tcp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 - * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 - * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 - * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 - * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 - * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, - * 255.255.255.255==4294967296, 10 digits) - * P: bound port (min 1 d, max 5d (65635)) - * F: filename (min 1 d ) - * S: size (min 1 d ) - * 0x01, \n: terminators - */ - - /* AAA = "us", ie. where server normally talks to. */ - sprintf(buffer, "%u %u", - ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip), - port); - DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n", - buffer, NIPQUAD(exp->tuple.src.ip), port); - - ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, - matchoff, matchlen, buffer, - strlen(buffer)); - if (ret != NF_ACCEPT) - ip_conntrack_unexpect_related(exp); - return ret; -} - -static void __exit ip_nat_irc_fini(void) -{ - rcu_assign_pointer(ip_nat_irc_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_irc_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_irc_hook)); - rcu_assign_pointer(ip_nat_irc_hook, help); - return 0; -} - -/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); - return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(ip_nat_irc_init); -module_exit(ip_nat_irc_fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c deleted file mode 100644 index 95810202d84..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * ip_nat_proto_gre.c - Version 2.0 - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. 
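Back in the IRC helper a few hunks up, char buffer[18] holds the rewritten DCC payload "<ip-as-decimal> <port>". The widest case is "4294967295 65535", 16 characters plus the NUL, so 18 bytes suffice; the source comment's "4294967296" and "65635" read as typos for those two limits. A sketch of the formatting step:

#include <stdint.h>
#include <stdio.h>

/* worst case "4294967295 65535": 10 + 1 + 5 chars + NUL = 17 bytes */
static int format_dcc_addr(char *buf, size_t len, uint32_t ip, uint16_t port)
{
	return snprintf(buf, len, "%lu %u",
			(unsigned long)ip, (unsigned)port);
}

/* usage: char buf[18]; format_dcc_addr(buf, sizeof(buf), ip, port); */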
- * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include <linux/module.h> -#include <linux/ip.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \ - __FUNCTION__, ## args) -#else -#define DEBUGP(x, args...) -#endif - -/* is key in given range between min and max */ -static int -gre_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 key; - - if (maniptype == IP_NAT_MANIP_SRC) - key = tuple->src.u.gre.key; - else - key = tuple->dst.u.gre.key; - - return ntohs(key) >= ntohs(min->gre.key) - && ntohs(key) <= ntohs(max->gre.key); -} - -/* generate unique tuple ... */ -static int -gre_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t key; - __be16 *keyptr; - unsigned int min, i, range_size; - - if (maniptype == IP_NAT_MANIP_SRC) - keyptr = &tuple->src.u.gre.key; - else - keyptr = &tuple->dst.u.gre.key; - - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - DEBUGP("%p: NATing GRE PPTP\n", conntrack); - min = 1; - range_size = 0xffff; - } else { - min = ntohs(range->min.gre.key); - range_size = ntohs(range->max.gre.key) - min + 1; - } - - DEBUGP("min = %u, range_size = %u\n", min, range_size); - - for (i = 0; i < range_size; i++, key++) { - *keyptr = htons(min + key % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - - DEBUGP("%p: no NAT mapping\n", conntrack); - - return 0; -} - -/* manipulate a GRE packet according to maniptype */ -static int -gre_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - unsigned int hdroff = iphdroff + iph->ihl*4; - - /* pgreh includes two optional 32bit fields which are not required - * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8)) - return 0; - - greh = (void *)(*pskb)->data + hdroff; - pgreh = (struct gre_hdr_pptp *) greh; - - /* we only have destination manip of a packet, since 'source key' - * is not present in the packet itself */ - if (maniptype == IP_NAT_MANIP_DST) { - /* key manipulation is always dest */ - switch (greh->version) { - case 0: - if (!greh->key) { - DEBUGP("can't nat GRE w/o key\n"); - break; - } - if (greh->csum) { - /* FIXME: Never tested this code... 
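gre_unique_tuple() below probes candidate keys with a static counter taken modulo the range size, so each new connection resumes the search where the previous one stopped. A sketch, assuming 0 is never a valid key (true here, since the unspecified-range case starts at 1); tuple_in_use() is a hypothetical stand-in for ip_nat_used_tuple():

#include <stdint.h>

extern int tuple_in_use(uint16_t key);

static uint16_t pick_free_key(uint16_t min, uint16_t range_size)
{
	static uint16_t counter;  /* persists, so searches resume here */
	uint16_t i;

	for (i = 0; i < range_size; i++, counter++) {
		uint16_t key = min + counter % range_size;

		if (!tuple_in_use(key))
			return key;
	}
	return 0;  /* whole range in use */
}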
*/ - nf_proto_csum_replace4(gre_csum(greh), *pskb, - *(gre_key(greh)), - tuple->dst.u.gre.key, 0); - } - *(gre_key(greh)) = tuple->dst.u.gre.key; - break; - case GRE_VERSION_PPTP: - DEBUGP("call_id -> 0x%04x\n", - ntohs(tuple->dst.u.gre.key)); - pgreh->call_id = tuple->dst.u.gre.key; - break; - default: - DEBUGP("can't nat unknown GRE version\n"); - return 0; - break; - } - } - return 1; -} - -/* nat helper struct */ -static struct ip_nat_protocol gre = { - .name = "GRE", - .protonum = IPPROTO_GRE, - .manip_pkt = gre_manip_pkt, - .in_range = gre_in_range, - .unique_tuple = gre_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; - -int __init ip_nat_proto_gre_init(void) -{ - return ip_nat_protocol_register(&gre); -} - -void __exit ip_nat_proto_gre_fini(void) -{ - ip_nat_protocol_unregister(&gre); -} diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c deleted file mode 100644 index 22a528ae038..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ /dev/null @@ -1,87 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/icmp.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int -icmp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && - ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - -static int -icmp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t id; - unsigned int range_size; - unsigned int i; - - range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; - /* If no range specified... 
*/ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) - range_size = 0xFFFF; - - for (i = 0; i < range_size; i++, id++) { - tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + - (id % range_size)); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - return 0; -} - -static int -icmp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct icmphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - - if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) - return 0; - - hdr = (struct icmphdr *)((*pskb)->data + hdroff); - nf_proto_csum_replace2(&hdr->checksum, *pskb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); - hdr->un.echo.id = tuple->src.u.icmp.id; - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_icmp = { - .name = "ICMP", - .protonum = IPPROTO_ICMP, - .me = THIS_MODULE, - .manip_pkt = icmp_manip_pkt, - .in_range = icmp_in_range, - .unique_tuple = icmp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c deleted file mode 100644 index 14ff24f53a7..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ /dev/null @@ -1,154 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/if.h> -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> - -static int -tcp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.tcp.port; - else - port = tuple->dst.u.tcp.port; - - return ntohs(port) >= ntohs(min->tcp.port) - && ntohs(port) <= ntohs(max->tcp.port); -} - -static int -tcp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.tcp.port; - else - portptr = &tuple->dst.u.tcp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - /* Map privileged onto privileged. 
*/ - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.tcp.port); - range_size = ntohs(range->max.tcp.port) - min + 1; - } - - /* Start from random port to avoid prediction */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) { - return 1; - } - } - return 0; -} - -static int -tcp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct tcphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport, oldport; - int hdrsize = 8; /* TCP connection tracking guarantees this much */ - - /* this could be a inner header returned in icmp packet; in such - cases we cannot update the checksum field since it is outside of - the 8 bytes of transport layer headers we are guaranteed */ - if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) - hdrsize = sizeof(struct tcphdr); - - if (!skb_make_writable(pskb, hdroff + hdrsize)) - return 0; - - iph = (struct iphdr *)((*pskb)->data + iphdroff); - hdr = (struct tcphdr *)((*pskb)->data + hdroff); - - if (maniptype == IP_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.ip; - newport = tuple->src.u.tcp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.ip; - newport = tuple->dst.u.tcp.port; - portptr = &hdr->dest; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return 1; - - nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, *pskb, oldport, newport, 0); - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_tcp = { - .name = "TCP", - .protonum = IPPROTO_TCP, - .me = THIS_MODULE, - .manip_pkt = tcp_manip_pkt, - .in_range = tcp_in_range, - .unique_tuple = tcp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c deleted file mode 100644 index dfd52167289..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ /dev/null @@ -1,144 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
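tcp_manip_pkt() below can only trust 8 bytes of transport header when the packet is a TCP header quoted inside an ICMP error: both ports fit in those 8 bytes, but the checksum at offset 16 may simply not exist, so its fixup is skipped. A sketch of that guarded rewrite; csum_replace2() is the incremental update sketched after the TCP-mangling hunk:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* incremental ones-complement update, as sketched earlier */
extern void csum_replace2(uint16_t *check, uint16_t old, uint16_t new_val);

/* rewrite one 16-bit port; avail is how much TCP header really exists */
static void rewrite_port(uint8_t *tcph, size_t avail, size_t port_off,
			 uint16_t new_port_be)
{
	uint16_t old_be, csum;

	memcpy(&old_be, tcph + port_off, 2);
	memcpy(tcph + port_off, &new_port_be, 2);

	if (avail < 18)  /* ICMP-quoted stub: no checksum bytes */
		return;
	memcpy(&csum, tcph + 16, 2);
	csum_replace2(&csum, old_be, new_port_be);
	memcpy(tcph + 16, &csum, 2);
}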
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/netfilter.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int -udp_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.udp.port; - else - port = tuple->dst.u.udp.port; - - return ntohs(port) >= ntohs(min->udp.port) - && ntohs(port) <= ntohs(max->udp.port); -} - -static int -udp_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.udp.port; - else - portptr = &tuple->dst.u.udp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.udp.port); - range_size = ntohs(range->max.udp.port) - min + 1; - } - - /* Start from random port to avoid prediction */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - return 0; -} - -static int -udp_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); - struct udphdr *hdr; - unsigned int hdroff = iphdroff + iph->ihl*4; - __be32 oldip, newip; - __be16 *portptr, newport; - - if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) - return 0; - - iph = (struct iphdr *)((*pskb)->data + iphdroff); - hdr = (struct udphdr *)((*pskb)->data + hdroff); - - if (maniptype == IP_NAT_MANIP_SRC) { - /* Get rid of src ip and src pt */ - oldip = iph->saddr; - newip = tuple->src.ip; - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst ip and dst pt */ - oldip = iph->daddr; - newip = tuple->dst.ip; - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - - if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) { - nf_proto_csum_replace4(&hdr->check, *pskb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, *pskb, *portptr, newport, 0); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - } - *portptr = newport; - return 1; -} - -struct ip_nat_protocol ip_nat_protocol_udp = { - .name = "UDP", - .protonum = IPPROTO_UDP, - .me = THIS_MODULE, - .manip_pkt = udp_manip_pkt, - .in_range = udp_in_range, - .unique_tuple = udp_unique_tuple, -#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ - defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - .range_to_nfattr = ip_nat_port_range_to_nfattr, - .nfattr_to_range = 
ip_nat_port_nfattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c deleted file mode 100644 index 3bf04951724..00000000000 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ /dev/null @@ -1,55 +0,0 @@ -/* The "unknown" protocol. This is what is used for protocols we - * don't understand. It's returned by ip_ct_find_proto(). - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/netfilter.h> -#include <linux/if.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> - -static int unknown_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type manip_type, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - return 1; -} - -static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - /* Sorry: we can't help you; if it's not unique, we can't frob - anything. */ - return 0; -} - -static int -unknown_manip_pkt(struct sk_buff **pskb, - unsigned int iphdroff, - const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype) -{ - return 1; -} - -struct ip_nat_protocol ip_nat_unknown_protocol = { - .name = "unknown", - /* .me isn't set: getting a ref to this cannot fail. */ - .manip_pkt = unknown_manip_pkt, - .in_range = unknown_in_range, - .unique_tuple = unknown_unique_tuple, -}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c deleted file mode 100644 index 080eb1d9220..00000000000 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ /dev/null @@ -1,314 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* Everything about the rules for NAT. */ -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/bitops.h> - -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
-#endif - -#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) - -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} nat_initial_table __initdata -= { { "nat", NAT_VALID_HOOKS, 4, - sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - { [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, - { [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, - 0, NULL, { } }, - { - /* PRE_ROUTING */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } }, - /* POST_ROUTING */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } }, - /* LOCAL_OUT */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_standard), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, - -NF_ACCEPT - 1 } } - }, - /* ERROR */ - { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, - 0, - sizeof(struct ipt_entry), - sizeof(struct ipt_error), - 0, { 0, 0 }, { } }, - { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, - { } }, - "ERROR" - } - } -}; - -static struct xt_table nat_table = { - .name = "nat", - .valid_hooks = NAT_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE, - .af = AF_INET, -}; - -/* Source NAT */ -static unsigned int ipt_snat_target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - const struct ip_nat_multi_range_compat *mr = targinfo; - - IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* Connection must be valid and new. */ - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); - IP_NF_ASSERT(out); - - return ip_nat_setup_info(ct, &mr->range[0], hooknum); -} - -/* Before 2.6.11 we did implicit source NAT if required. Warn about change. 
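warn_if_extra_mangle(), coming up in the next hunk, uses a static flag so the behaviour-change notice prints for the first affected packet only, not per packet (the unsynchronized flag is a benign race there). The bare pattern:

#include <stdio.h>

static void warn_once(const char *msg)
{
	static int warned;

	if (!warned) {
		warned = 1;
		fprintf(stderr, "NAT: %s\n", msg);
	}
}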
*/ -static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) -{ - static int warned = 0; - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; - struct rtable *rt; - - if (ip_route_output_key(&rt, &fl) != 0) - return; - - if (rt->rt_src != srcip && !warned) { - printk("NAT: no longer support implicit source local NAT\n"); - printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n", - NIPQUAD(srcip), NIPQUAD(dstip)); - warned = 1; - } - ip_rt_put(rt); -} - -static unsigned int ipt_dnat_target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - const struct ip_nat_multi_range_compat *mr = targinfo; - - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_LOCAL_OUT); - - ct = ip_conntrack_get(*pskb, &ctinfo); - - /* Connection must be valid and new. */ - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - if (hooknum == NF_IP_LOCAL_OUT - && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle((*pskb)->nh.iph->daddr, - mr->range[0].min_ip); - - return ip_nat_setup_info(ct, &mr->range[0], hooknum); -} - -static int ipt_snat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct ip_nat_multi_range_compat *mr = targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - printk("SNAT: multiple ranges no longer supported\n"); - return 0; - } - return 1; -} - -static int ipt_dnat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct ip_nat_multi_range_compat *mr = targinfo; - - /* Must be a valid range */ - if (mr->rangesize != 1) { - printk("DNAT: multiple ranges no longer supported\n"); - return 0; - } - if (mr->range[0].flags & IP_NAT_RANGE_PROTO_RANDOM) { - printk("DNAT: port randomization not supported\n"); - return 0; - } - return 1; -} - -inline unsigned int -alloc_null_binding(struct ip_conntrack *conntrack, - struct ip_nat_info *info, - unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - Use reply in case it's already been mangled (eg local packet). - */ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - struct ip_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; - - DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, - NIPQUAD(ip)); - return ip_nat_setup_info(conntrack, &range, hooknum); -} - -unsigned int -alloc_null_binding_confirmed(struct ip_conntrack *conntrack, - struct ip_nat_info *info, - unsigned int hooknum) -{ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - u_int16_t all - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? 
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all - : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all); - struct ip_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } }; - - DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", - conntrack, NIPQUAD(ip)); - return ip_nat_setup_info(conntrack, &range, hooknum); -} - -int ip_nat_rule_find(struct sk_buff **pskb, - unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct ip_conntrack *ct, - struct ip_nat_info *info) -{ - int ret; - - ret = ipt_do_table(pskb, hooknum, in, out, &nat_table); - - if (ret == NF_ACCEPT) { - if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - /* NUL mapping */ - ret = alloc_null_binding(ct, info, hooknum); - } - return ret; -} - -static struct xt_target ipt_snat_reg = { - .name = "SNAT", - .family = AF_INET, - .target = ipt_snat_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), - .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, - .checkentry = ipt_snat_checkentry, -}; - -static struct xt_target ipt_dnat_reg = { - .name = "DNAT", - .family = AF_INET, - .target = ipt_dnat_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), - .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), - .checkentry = ipt_dnat_checkentry, -}; - -int __init ip_nat_rule_init(void) -{ - int ret; - - ret = ipt_register_table(&nat_table, &nat_initial_table.repl); - if (ret != 0) - return ret; - ret = xt_register_target(&ipt_snat_reg); - if (ret != 0) - goto unregister_table; - - ret = xt_register_target(&ipt_dnat_reg); - if (ret != 0) - goto unregister_snat; - - return ret; - - unregister_snat: - xt_unregister_target(&ipt_snat_reg); - unregister_table: - xt_unregister_table(&nat_table); - - return ret; -} - -void ip_nat_rule_cleanup(void) -{ - xt_unregister_target(&ipt_dnat_reg); - xt_unregister_target(&ipt_snat_reg); - ipt_unregister_table(&nat_table); -} diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c deleted file mode 100644 index 325c5a9dc2e..00000000000 --- a/net/ipv4/netfilter/ip_nat_sip.c +++ /dev/null @@ -1,282 +0,0 @@ -/* SIP extension for UDP NAT alteration. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_nat_ftp.c and other modules. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_sip.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP NAT helper"); - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
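ip_nat_rule_init() above is a textbook instance of the kernel's goto-based error unwind: each registration that can fail jumps to a label that undoes everything set up so far, in reverse order, so there is a single cleanup path. A hypothetical standalone sketch of the same shape, where register_table()/register_snat()/register_dnat() stand in for ipt_register_table() and xt_register_target():

#include <stdio.h>

static int register_table(void)	{ puts("table registered"); return 0; }
static void unregister_table(void)	{ puts("table unregistered"); }
static int register_snat(void)	{ puts("SNAT target registered"); return 0; }
static void unregister_snat(void)	{ puts("SNAT target unregistered"); }
static int register_dnat(void)	{ puts("DNAT target failed"); return -1; }

static int nat_rule_init(void)
{
	int ret;

	ret = register_table();
	if (ret != 0)
		return ret;		/* nothing to unwind yet */

	ret = register_snat();
	if (ret != 0)
		goto err_unregister_table;

	ret = register_dnat();
	if (ret != 0)
		goto err_unregister_snat;

	return 0;

 err_unregister_snat:			/* unwind in reverse order of setup */
	unregister_snat();
 err_unregister_table:
	unregister_table();
	return ret;
}

int main(void)
{
	return nat_rule_init() == 0 ? 0 : 1;
}

ip_nat_rule_cleanup() is then simply the full unwind sequence run unconditionally.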
-#endif - -struct addr_map { - struct { - char src[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - char dst[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned int srclen, srciplen; - unsigned int dstlen, dstiplen; - } addr[IP_CT_DIR_MAX]; -}; - -static void addr_map_init(struct ip_conntrack *ct, struct addr_map *map) -{ - struct ip_conntrack_tuple *t; - enum ip_conntrack_dir dir; - unsigned int n; - - for (dir = 0; dir < IP_CT_DIR_MAX; dir++) { - t = &ct->tuplehash[dir].tuple; - - n = sprintf(map->addr[dir].src, "%u.%u.%u.%u", - NIPQUAD(t->src.ip)); - map->addr[dir].srciplen = n; - n += sprintf(map->addr[dir].src + n, ":%u", - ntohs(t->src.u.udp.port)); - map->addr[dir].srclen = n; - - n = sprintf(map->addr[dir].dst, "%u.%u.%u.%u", - NIPQUAD(t->dst.ip)); - map->addr[dir].dstiplen = n; - n += sprintf(map->addr[dir].dst + n, ":%u", - ntohs(t->dst.u.udp.port)); - map->addr[dir].dstlen = n; - } -} - -static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, const char **dptr, size_t dlen, - enum sip_header_pos pos, struct addr_map *map) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff, addrlen; - char *addr; - - if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0) - return 1; - - if ((matchlen == map->addr[dir].srciplen || - matchlen == map->addr[dir].srclen) && - memcmp(*dptr + matchoff, map->addr[dir].src, matchlen) == 0) { - addr = map->addr[!dir].dst; - addrlen = map->addr[!dir].dstlen; - } else if ((matchlen == map->addr[dir].dstiplen || - matchlen == map->addr[dir].dstlen) && - memcmp(*dptr + matchoff, map->addr[dir].dst, matchlen) == 0) { - addr = map->addr[!dir].src; - addrlen = map->addr[!dir].srclen; - } else - return 1; - - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, addr, addrlen)) - return 0; - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - return 1; - -} - -static unsigned int ip_nat_sip(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr) -{ - enum sip_header_pos pos; - struct addr_map map; - int dataoff, datalen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - datalen = (*pskb)->len - dataoff; - if (datalen < sizeof("SIP/2.0") - 1) - return NF_DROP; - - addr_map_init(ct, &map); - - /* Basic rules: requests and responses. */ - if (strncmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) != 0) { - /* 10.2: Constructing the REGISTER Request: - * - * The "userinfo" and "@" components of the SIP URI MUST NOT - * be present. 
- */ - if (datalen >= sizeof("REGISTER") - 1 && - strncmp(*dptr, "REGISTER", sizeof("REGISTER") - 1) == 0) - pos = POS_REG_REQ_URI; - else - pos = POS_REQ_URI; - - if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, pos, &map)) - return NF_DROP; - } - - if (!map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_FROM, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_TO, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_VIA, &map) || - !map_sip_addr(pskb, ctinfo, ct, dptr, datalen, POS_CONTACT, &map)) - return NF_DROP; - return NF_ACCEPT; -} - -static unsigned int mangle_sip_packet(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char **dptr, size_t dlen, - char *buffer, int bufflen, - enum sip_header_pos pos) -{ - unsigned int matchlen, matchoff; - - if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, pos) <= 0) - return 0; - - if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, buffer, bufflen)) - return 0; - - /* We need to reload this. Thanks Patrick. */ - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - return 1; -} - -static int mangle_content_len(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - const char *dptr) -{ - unsigned int dataoff, matchoff, matchlen; - char buffer[sizeof("65536")]; - int bufflen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - - /* Get actual SDP lenght */ - if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, - &matchlen, POS_SDP_HEADER) > 0) { - - /* since ct_sip_get_info() give us a pointer passing 'v=' - we need to add 2 bytes in this count. */ - int c_len = (*pskb)->len - dataoff - matchoff + 2; - - /* Now, update SDP lenght */ - if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, - &matchlen, POS_CONTENT) > 0) { - - bufflen = sprintf(buffer, "%u", c_len); - - return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, - matchoff, matchlen, - buffer, bufflen); - } - } - return 0; -} - -static unsigned int mangle_sdp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack *ct, - __be32 newip, u_int16_t port, - const char *dptr) -{ - char buffer[sizeof("nnn.nnn.nnn.nnn")]; - unsigned int dataoff, bufflen; - - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); - - /* Mangle owner and contact info. */ - bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_OWNER)) - return 0; - - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_CONNECTION)) - return 0; - - /* Mangle media port. */ - bufflen = sprintf(buffer, "%u", port); - if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, - buffer, bufflen, POS_MEDIA)) - return 0; - - return mangle_content_len(pskb, ctinfo, ct, dptr); -} - -/* So, this packet has hit the connection tracking matching code. - Mangle it, and change the expectation to match the new version. 
*/ -static unsigned int ip_nat_sdp(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp, - const char *dptr) -{ - struct ip_conntrack *ct = exp->master; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - __be32 newip; - u_int16_t port; - - DEBUGP("ip_nat_sdp():\n"); - - /* Connection will come from reply */ - newip = ct->tuplehash[!dir].tuple.dst.ip; - - exp->tuple.dst.ip = newip; - exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - this one. */ - exp->expectfn = ip_nat_follow_master; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { - exp->tuple.dst.u.udp.port = htons(port); - if (ip_conntrack_expect_related(exp) == 0) - break; - } - - if (port == 0) - return NF_DROP; - - if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { - ip_conntrack_unexpect_related(exp); - return NF_DROP; - } - return NF_ACCEPT; -} - -static void __exit fini(void) -{ - rcu_assign_pointer(ip_nat_sip_hook, NULL); - rcu_assign_pointer(ip_nat_sdp_hook, NULL); - synchronize_rcu(); -} - -static int __init init(void) -{ - BUG_ON(rcu_dereference(ip_nat_sip_hook)); - BUG_ON(rcu_dereference(ip_nat_sdp_hook)); - rcu_assign_pointer(ip_nat_sip_hook, ip_nat_sip); - rcu_assign_pointer(ip_nat_sdp_hook, ip_nat_sdp); - return 0; -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c deleted file mode 100644 index e41d0efae51..00000000000 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ /dev/null @@ -1,1333 +0,0 @@ -/* - * ip_nat_snmp_basic.c - * - * Basic SNMP Application Layer Gateway - * - * This IP NAT module is intended for use with SNMP network - * discovery and monitoring applications where target networks use - * conflicting private address realms. - * - * Static NAT is used to remap the networks from the view of the network - * management system at the IP layer, and this module remaps some application - * layer addresses to match. - * - * The simplest form of ALG is performed, where only tagged IP addresses - * are modified. The module does not need to be MIB aware and only scans - * messages at the ASN.1/BER level. - * - * Currently, only SNMPv1 and SNMPv2 are supported. - * - * More information on ALG and associated issues can be found in - * RFC 2962 - * - * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory - * McLean & Jochen Friedrich, stripped down for use in the kernel. - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Author: James Morris <jmorris@intercode.com.au> - * - * Updates: - * 2000-08-06: Convert to new helper API (Harald Welte). 
- * - */ -#include <linux/in.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/moduleparam.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> -#include <asm/uaccess.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); - -#define SNMP_PORT 161 -#define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)n) - -static int debug; -static DEFINE_SPINLOCK(snmp_lock); - -/* - * Application layer address mapping mimics the NAT mapping, but - * only for the first octet in this case (a more flexible system - * can be implemented if needed). - */ -struct oct1_map -{ - u_int8_t from; - u_int8_t to; -}; - - -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -/* - * ASN.1 context. 
- */ -struct asn1_ctx -{ - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr -{ - unsigned char *data; - unsigned int len; -}; - -static void asn1_open(struct asn1_ctx *ctx, - unsigned char *buf, - unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do - { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char asn1_length_decode(struct asn1_ctx *ctx, - unsigned int *def, - unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = (unsigned char) (ch & 0x7F); - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - return 1; -} - -static unsigned char asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned int def, len; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - def = len = 0; - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - unsigned char ch; - - if (eoc == 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } - return 1; - } -} - -static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 
0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, - unsigned int *len) -{ - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) { - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} - -static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, - unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long **oid, - unsigned int *len) -{ - unsigned long subid; - unsigned int size; - unsigned long *optr; - - size = eoc - ctx->pointer + 1; - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) { - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr [0] = 0; - optr [1] = subid; - } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; - } else { - optr [0] = 2; - optr [1] = subid - 80; - } - - *len = 2; - optr += 2; - - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } - - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } - } - return 1; -} - -/***************************************************************************** - * - * SNMP decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* SNMP Versions */ -#define SNMP_V1 0 -#define SNMP_V2C 1 -#define SNMP_V2 2 -#define SNMP_V3 3 - -/* Default Sizes */ -#define SNMP_SIZE_COMM 256 -#define SNMP_SIZE_OBJECTID 128 -#define SNMP_SIZE_BUFCHR 256 -#define SNMP_SIZE_BUFINT 128 -#define SNMP_SIZE_SMALLOBJECTID 16 - -/* Requests */ -#define SNMP_PDU_GET 0 -#define SNMP_PDU_NEXT 1 -#define SNMP_PDU_RESPONSE 2 -#define SNMP_PDU_SET 3 -#define SNMP_PDU_TRAP1 4 -#define SNMP_PDU_BULK 5 -#define SNMP_PDU_INFORM 6 -#define SNMP_PDU_TRAP2 7 - -/* Errors */ -#define SNMP_NOERROR 0 -#define SNMP_TOOBIG 1 -#define SNMP_NOSUCHNAME 2 -#define SNMP_BADVALUE 3 -#define SNMP_READONLY 4 -#define SNMP_GENERROR 5 -#define SNMP_NOACCESS 6 -#define SNMP_WRONGTYPE 7 -#define SNMP_WRONGLENGTH 8 -#define SNMP_WRONGENCODING 9 -#define SNMP_WRONGVALUE 10 
-#define SNMP_NOCREATION 11 -#define SNMP_INCONSISTENTVALUE 12 -#define SNMP_RESOURCEUNAVAILABLE 13 -#define SNMP_COMMITFAILED 14 -#define SNMP_UNDOFAILED 15 -#define SNMP_AUTHORIZATIONERROR 16 -#define SNMP_NOTWRITABLE 17 -#define SNMP_INCONSISTENTNAME 18 - -/* General SNMP V1 Traps */ -#define SNMP_TRAP_COLDSTART 0 -#define SNMP_TRAP_WARMSTART 1 -#define SNMP_TRAP_LINKDOWN 2 -#define SNMP_TRAP_LINKUP 3 -#define SNMP_TRAP_AUTFAILURE 4 -#define SNMP_TRAP_EQPNEIGHBORLOSS 5 -#define SNMP_TRAP_ENTSPECIFIC 6 - -/* SNMPv1 Types */ -#define SNMP_NULL 0 -#define SNMP_INTEGER 1 /* l */ -#define SNMP_OCTETSTR 2 /* c */ -#define SNMP_DISPLAYSTR 2 /* c */ -#define SNMP_OBJECTID 3 /* ul */ -#define SNMP_IPADDR 4 /* uc */ -#define SNMP_COUNTER 5 /* ul */ -#define SNMP_GAUGE 6 /* ul */ -#define SNMP_TIMETICKS 7 /* ul */ -#define SNMP_OPAQUE 8 /* c */ - -/* Additional SNMPv2 Types */ -#define SNMP_UINTEGER 5 /* ul */ -#define SNMP_BITSTR 9 /* uc */ -#define SNMP_NSAP 10 /* uc */ -#define SNMP_COUNTER64 11 /* ul */ -#define SNMP_NOSUCHOBJECT 12 -#define SNMP_NOSUCHINSTANCE 13 -#define SNMP_ENDOFMIBVIEW 14 - -union snmp_syntax -{ - unsigned char uc[0]; /* 8 bit unsigned */ - char c[0]; /* 8 bit signed */ - unsigned long ul[0]; /* 32 bit unsigned */ - long l[0]; /* 32 bit signed */ -}; - -struct snmp_object -{ - unsigned long *id; - unsigned int id_len; - unsigned short type; - unsigned int syntax_len; - union snmp_syntax syntax; -}; - -struct snmp_request -{ - unsigned long id; - unsigned int error_status; - unsigned int error_index; -}; - -struct snmp_v1_trap -{ - unsigned long *id; - unsigned int id_len; - unsigned long ip_address; /* pointer */ - unsigned int general; - unsigned int specific; - unsigned long time; -}; - -/* SNMP types */ -#define SNMP_IPA 0 -#define SNMP_CNT 1 -#define SNMP_GGE 2 -#define SNMP_TIT 3 -#define SNMP_OPQ 4 -#define SNMP_C64 6 - -/* SNMP errors */ -#define SERR_NSO 0 -#define SERR_NSI 1 -#define SERR_EOM 2 - -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check); -struct snmp_cnv -{ - unsigned int class; - unsigned int tag; - int syntax; -}; - -static struct snmp_cnv snmp_conv [] = -{ - {ASN1_UNI, ASN1_NUL, SNMP_NULL}, - {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, - {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, - {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, - {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, - {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, - {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ - {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ - {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, - {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, - - /* SNMPv2 data types and errors */ - {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, - {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, - {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, - {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, - {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, - {0, 0, -1} -}; - -static unsigned char snmp_tag_cls2syntax(unsigned int tag, - unsigned int cls, - unsigned short *syntax) -{ - struct snmp_cnv *cnv; - - cnv = snmp_conv; - - while (cnv->syntax != -1) { - if (cnv->tag == tag && cnv->class == cls) { - *syntax = cnv->syntax; - return 1; - } - cnv++; - } - return 0; -} - -static unsigned char snmp_object_decode(struct asn1_ctx *ctx, - struct snmp_object **obj) -{ - unsigned int cls, con, tag, len, idlen; - unsigned short type; - unsigned char *eoc, *end, *p; - unsigned long *lp, *id; - unsigned long ul; - long l; - - *obj = NULL; - id = NULL; - - if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) - return 0; - - 
if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &id, &idlen)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { - kfree(id); - return 0; - } - - if (con != ASN1_PRI) { - kfree(id); - return 0; - } - - type = 0; - if (!snmp_tag_cls2syntax(tag, cls, &type)) { - kfree(id); - return 0; - } - - l = 0; - switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, - GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.c, p, len); - kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); - kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); - kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - if (net_ratelimit()) - printk("OOM in bsalg (%d)\n", __LINE__); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: - kfree(id); - return 0; - } - - (*obj)->syntax_len = len; - (*obj)->type = type; - (*obj)->id = id; - (*obj)->id_len = idlen; - - if (!asn1_eoc_decode(ctx, eoc)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - return 1; -} - -static unsigned char snmp_request_decode(struct asn1_ctx *ctx, - struct snmp_request *request) -{ - unsigned int cls, con, tag; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_ulong_decode(ctx, end, &request->id)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != 
ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_status)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_index)) - return 0; - - return 1; -} - -/* - * Fast checksum update for possibly oddly-aligned UDP byte, from the - * code example in the draft. - */ -static void fast_csum(__sum16 *csum, - const unsigned char *optr, - const unsigned char *nptr, - int offset) -{ - unsigned char s[4]; - - if (offset & 1) { - s[0] = s[2] = 0; - s[1] = ~*optr; - s[3] = *nptr; - } else { - s[1] = s[3] = 0; - s[0] = ~*optr; - s[2] = *nptr; - } - - *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); -} - -/* - * Mangle IP address. - * - begin points to the start of the snmp messgae - * - addr points to the start of the address - */ -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check) -{ - if (map->from == NOCT1(addr)) { - u_int32_t old; - - if (debug) - memcpy(&old, (unsigned char *)addr, sizeof(old)); - - *addr = map->to; - - /* Update UDP checksum if being used */ - if (*check) { - fast_csum(check, - &map->from, &map->to, addr - begin); - } - - if (debug) - printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " - "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); - } -} - -static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, - struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned int cls, con, tag, len; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_id_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) - goto err_id_free; - - if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) - goto err_id_free; - - /* IPv4 only */ - if (len != 4) - goto err_addr_free; - - mangle_address(ctx->begin, ctx->pointer - 4, map, check); - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->general)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->specific)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) - goto err_addr_free; - - if (!asn1_ulong_decode(ctx, end, &trap->time)) - goto err_addr_free; - - return 1; - -err_addr_free: - kfree((unsigned long *)trap->ip_address); - -err_id_free: - kfree(trap->id); - - return 0; -} - -/***************************************************************************** - * - * Misc. 
routines - * - *****************************************************************************/ - -static void hex_dump(unsigned char *buf, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (i && !(i % 16)) - printk("\n"); - printk("%02x ", *(buf + i)); - } - printk("\n"); -} - -/* - * Parse and mangle SNMP message according to mapping. - * (And this is the fucking 'basic' method). - */ -static int snmp_parse_mangle(unsigned char *msg, - u_int16_t len, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned char *eoc, *end; - unsigned int cls, con, tag, vers, pdutype; - struct asn1_ctx ctx; - struct asn1_octstr comm; - struct snmp_object **obj; - - if (debug > 1) - hex_dump(msg, len); - - asn1_open(&ctx, msg, len); - - /* - * Start of SNMP message. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - /* - * Version 1 or 2 handled. - */ - if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - if (!asn1_uint_decode (&ctx, end, &vers)) - return 0; - if (debug > 1) - printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); - if (vers > 1) - return 1; - - /* - * Community. - */ - if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) - return 0; - if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) - return 0; - if (debug > 1) { - unsigned int i; - - printk(KERN_DEBUG "bsalg: community: "); - for (i = 0; i < comm.len; i++) - printk("%c", comm.data[i]); - printk("\n"); - } - kfree(comm.data); - - /* - * PDU type - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) - return 0; - if (cls != ASN1_CTX || con != ASN1_CON) - return 0; - if (debug > 1) { - unsigned char *pdus[] = { - [SNMP_PDU_GET] = "get", - [SNMP_PDU_NEXT] = "get-next", - [SNMP_PDU_RESPONSE] = "response", - [SNMP_PDU_SET] = "set", - [SNMP_PDU_TRAP1] = "trapv1", - [SNMP_PDU_BULK] = "bulk", - [SNMP_PDU_INFORM] = "inform", - [SNMP_PDU_TRAP2] = "trapv2" - }; - - if (pdutype > SNMP_PDU_TRAP2) - printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); - else - printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); - } - if (pdutype != SNMP_PDU_RESPONSE && - pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) - return 1; - - /* - * Request header or v1 trap - */ - if (pdutype == SNMP_PDU_TRAP1) { - struct snmp_v1_trap trap; - unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); - - if (ret) { - kfree(trap.id); - kfree((unsigned long *)trap.ip_address); - } else - return ret; - - } else { - struct snmp_request req; - - if (!snmp_request_decode(&ctx, &req)) - return 0; - - if (debug > 1) - printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " - "error_index=%u\n", req.id, req.error_status, - req.error_index); - } - - /* - * Loop through objects, look for IP addresses to mangle. 
- */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (obj == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); - return 0; - } - - while (!asn1_eoc_decode(&ctx, eoc)) { - unsigned int i; - - if (!snmp_object_decode(&ctx, obj)) { - if (*obj) { - kfree((*obj)->id); - kfree(*obj); - } - kfree(obj); - return 0; - } - - if (debug > 1) { - printk(KERN_DEBUG "bsalg: object: "); - for (i = 0; i < (*obj)->id_len; i++) { - if (i > 0) - printk("."); - printk("%lu", (*obj)->id[i]); - } - printk(": type=%u\n", (*obj)->type); - - } - - if ((*obj)->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4 , map, check); - - kfree((*obj)->id); - kfree(*obj); - } - kfree(obj); - - if (!asn1_eoc_decode(&ctx, eoc)) - return 0; - - return 1; -} - -/***************************************************************************** - * - * NAT routines. - * - *****************************************************************************/ - -/* - * SNMP translation routine. - */ -static int snmp_translate(struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - u_int16_t udplen = ntohs(udph->len); - u_int16_t paylen = udplen - sizeof(struct udphdr); - int dir = CTINFO2DIR(ctinfo); - struct oct1_map map; - - /* - * Determine mappping for application layer addresses based - * on NAT manipulations for the packet. - */ - if (dir == IP_CT_DIR_ORIGINAL) { - /* SNAT traps */ - map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip); - map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip); - } else { - /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); - map.to = NOCT1(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip); - } - - if (map.from == map.to) - return NF_ACCEPT; - - if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), - paylen, &map, &udph->check)) { - if (net_ratelimit()) - printk(KERN_WARNING "bsalg: parser failed\n"); - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We don't actually set up expectations, just adjust internal IP - * addresses if this is being NATted */ -static int help(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo) -{ - int dir = CTINFO2DIR(ctinfo); - unsigned int ret; - struct iphdr *iph = (*pskb)->nh.iph; - struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); - - /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) - return NF_ACCEPT; - if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* No NAT? */ - if (!(ct->status & IPS_NAT_MASK)) - return NF_ACCEPT; - - /* - * Make sure the packet length is ok. So far, we were only guaranteed - * to have a valid length IP header plus 8 bytes, which means we have - * enough room for a UDP header. Just verify the UDP length field so we - * can mess around with the payload. 
- */ - if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) { - if (net_ratelimit()) - printk(KERN_WARNING "SNMP: dropping malformed packet " - "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - return NF_DROP; - } - - if (!skb_make_writable(pskb, (*pskb)->len)) - return NF_DROP; - - spin_lock_bh(&snmp_lock); - ret = snmp_translate(ct, ctinfo, pskb); - spin_unlock_bh(&snmp_lock); - return ret; -} - -static struct ip_conntrack_helper snmp_helper = { - .max_expected = 0, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "snmp", - - .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}, - }, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}, - }, -}; - -static struct ip_conntrack_helper snmp_trap_helper = { - .max_expected = 0, - .timeout = 180, - .me = THIS_MODULE, - .help = help, - .name = "snmp_trap", - - .tuple = {.src = {.u = {.udp = {.port = __constant_htons(SNMP_TRAP_PORT)}}}, - .dst = {.protonum = IPPROTO_UDP}, - }, - .mask = {.src = {.u = {0xFFFF}}, - .dst = {.protonum = 0xFF}, - }, -}; - -/***************************************************************************** - * - * Module stuff. - * - *****************************************************************************/ - -static int __init ip_nat_snmp_basic_init(void) -{ - int ret = 0; - - ret = ip_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; - ret = ip_conntrack_helper_register(&snmp_trap_helper); - if (ret < 0) { - ip_conntrack_helper_unregister(&snmp_helper); - return ret; - } - return ret; -} - -static void __exit ip_nat_snmp_basic_fini(void) -{ - ip_conntrack_helper_unregister(&snmp_helper); - ip_conntrack_helper_unregister(&snmp_trap_helper); -} - -module_init(ip_nat_snmp_basic_init); -module_exit(ip_nat_snmp_basic_fini); - -module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c deleted file mode 100644 index 6bcfdf6dfcc..00000000000 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ /dev/null @@ -1,388 +0,0 @@ -/* This file contains all the functions required for the standalone - ip_nat module. - - These are not required by the compatibility layer. -*/ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> - * - new API and handling of conntrack/nat helpers - * - now capable of multiple expectations for one master - * */ - -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/spinlock.h> - -#include <linux/netfilter_ipv4/ip_nat.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/netfilter_ipv4/ip_nat_protocol.h> -#include <linux/netfilter_ipv4/ip_nat_core.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(format, args...) 
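A detail worth noting in the SNMP ALG deleted above: when it rewrites a single address octet it never recomputes the UDP checksum over the whole datagram — fast_csum() folds in only the difference, an incremental one's-complement update in the spirit of RFC 1624. A self-contained sketch of the underlying 16-bit form (simplified relative to the kernel's byte-lane variant; all names here are illustrative):

#include <stdint.h>
#include <stdio.h>

/* One's-complement sum of 16-bit big-endian words, odd tail byte padded. */
static uint32_t ones_sum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

static uint16_t checksum(const uint8_t *buf, size_t len)
{
	return (uint16_t)~ones_sum(buf, len);
}

/* RFC 1624 eq. 3: given old checksum C and one 16-bit word changing
 * from m to m_new, the new checksum is ~(~C + ~m + m_new). */
static uint16_t csum_incremental(uint16_t check, uint16_t m, uint16_t m_new)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~m;
	sum += m_new;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Pretend the payload carries the address 192.168.1.7 and the
	 * ALG rewrites the first octet to 10. */
	uint8_t msg[6] = { 0x45, 0x00, 192, 168, 1, 7 };
	uint16_t before, incr, full;

	before = checksum(msg, sizeof(msg));
	/* Octet at even offset 2 is the high byte of word (192 << 8 | 168). */
	incr = csum_incremental(before, 192 << 8 | 168, 10 << 8 | 168);
	msg[2] = 10;
	full = checksum(msg, sizeof(msg));

	printf("full=%04x incremental=%04x (%s)\n", full, incr,
	       full == incr ? "match" : "MISMATCH");
	return full != incr;
}

The kernel version performs the same arithmetic without assuming 16-bit alignment: the old and new bytes go into a 4-byte scratch buffer in the lane selected by the offset's parity, which is then summed with csum_partial().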
-#endif - -#ifdef CONFIG_XFRM -static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) -{ - struct ip_conntrack *ct; - struct ip_conntrack_tuple *t; - enum ip_conntrack_info ctinfo; - enum ip_conntrack_dir dir; - unsigned long statusbit; - - ct = ip_conntrack_get(skb, &ctinfo); - if (ct == NULL) - return; - dir = CTINFO2DIR(ctinfo); - t = &ct->tuplehash[dir].tuple; - - if (dir == IP_CT_DIR_ORIGINAL) - statusbit = IPS_DST_NAT; - else - statusbit = IPS_SRC_NAT; - - if (ct->status & statusbit) { - fl->fl4_dst = t->dst.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) - fl->fl_ip_dport = t->dst.u.tcp.port; - } - - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - fl->fl4_src = t->src.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) - fl->fl_ip_sport = t->src.u.tcp.port; - } -} -#endif - -static unsigned int -ip_nat_fn(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - struct ip_nat_info *info; - /* maniptype == SRC for postrouting. */ - enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - and local-out, and ip_nat_out protects post-routing. */ - IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off - & htons(IP_MF|IP_OFFSET))); - - ct = ip_conntrack_get(*pskb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - have dropped it. Hence it's the user's responsibilty to - packet filter it out, or implement conntrack/NAT for that - protocol. 8) --RR */ - if (!ct) { - /* Exception: ICMP redirect to new connection (not in - hash table yet). We must not let this through, in - case we're doing NAT to the same network. */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { - struct icmphdr _hdr, *hp; - - hp = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4, - sizeof(_hdr), &_hdr); - if (hp != NULL && - hp->type == ICMP_REDIRECT) - return NF_DROP; - } - return NF_ACCEPT; - } - - /* Don't try to NAT if this packet is not conntracked */ - if (ct == &ip_conntrack_untracked) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { - if (!ip_nat_icmp_reply_translation(ct, ctinfo, - hooknum, pskb)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - info = &ct->nat.info; - - /* Seen it before? This can happen for loopback, retrans, - or local packets.. */ - if (!ip_nat_initialized(ct, maniptype)) { - unsigned int ret; - - if (unlikely(is_confirmed(ct))) - /* NAT module was loaded late */ - ret = alloc_null_binding_confirmed(ct, info, - hooknum); - else if (hooknum == NF_IP_LOCAL_IN) - /* LOCAL_IN hook doesn't have a chain! */ - ret = alloc_null_binding(ct, info, hooknum); - else - ret = ip_nat_rule_find(pskb, hooknum, - in, out, ct, - info); - - if (ret != NF_ACCEPT) { - return ret; - } - } else - DEBUGP("Already setup manip %s for ct %p\n", - maniptype == IP_NAT_MANIP_SRC ? 
"SRC" : "DST", - ct); - break; - - default: - /* ESTABLISHED */ - IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED - || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); - info = &ct->nat.info; - } - - IP_NF_ASSERT(info); - return ip_nat_packet(ct, ctinfo, hooknum, pskb); -} - -static unsigned int -ip_nat_in(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - unsigned int ret; - __be32 daddr = (*pskb)->nh.iph->daddr; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN - && daddr != (*pskb)->nh.iph->daddr) { - dst_release((*pskb)->dst); - (*pskb)->dst = NULL; - } - return ret; -} - -static unsigned int -ip_nat_out(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#ifdef CONFIG_XFRM - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN - && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.src.ip != - ct->tuplehash[!dir].tuple.dst.ip - || ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all - ) - return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP; - } -#endif - return ret; -} - -static unsigned int -ip_nat_local_fn(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - - /* root is playing with raw sockets. */ - if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN - && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.ip != - ct->tuplehash[!dir].tuple.src.ip) { - if (ip_route_me_harder(pskb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (ip_xfrm_me_harder(pskb)) - ret = NF_DROP; -#endif - - } - return ret; -} - -static unsigned int -ip_nat_adjust(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - - ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { - DEBUGP("ip_nat_standalone: adjusting sequence number\n"); - if (!ip_nat_seq_adjust(pskb, ct, ctinfo)) - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We must be after connection tracking and before packet filtering. 
*/ - -static struct nf_hook_ops ip_nat_ops[] = { - /* Before packet filtering, change destination */ - { - .hook = ip_nat_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = ip_nat_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* After conntrack, adjust sequence number */ - { - .hook = ip_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, - /* Before packet filtering, change destination */ - { - .hook = ip_nat_local_fn, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = ip_nat_fn, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* After conntrack, adjust sequence number */ - { - .hook = ip_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, -}; - -static int __init ip_nat_standalone_init(void) -{ - int ret = 0; - - need_conntrack(); - -#ifdef CONFIG_XFRM - BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; -#endif - ret = ip_nat_rule_init(); - if (ret < 0) { - printk("ip_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; - } - ret = nf_register_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); - if (ret < 0) { - printk("ip_nat_init: can't register hooks.\n"); - goto cleanup_rule_init; - } - return ret; - - cleanup_rule_init: - ip_nat_rule_cleanup(); - cleanup_decode_session: -#ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; - synchronize_net(); -#endif - return ret; -} - -static void __exit ip_nat_standalone_fini(void) -{ - nf_unregister_hooks(ip_nat_ops, ARRAY_SIZE(ip_nat_ops)); - ip_nat_rule_cleanup(); -#ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; - synchronize_net(); -#endif -} - -module_init(ip_nat_standalone_init); -module_exit(ip_nat_standalone_fini); - -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c deleted file mode 100644 index 604793536fc..00000000000 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ /dev/null @@ -1,70 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Version: 0.0.7 - * - * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> - * - Port to newnat API - * - * This module currently supports DNAT: - * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y - * - * and SNAT: - * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x } - * - * It has not been tested with - * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip - * If you do test this please let me know if it works or not. 
- * - */ - -#include <linux/module.h> -#include <linux/netfilter_ipv4.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> -#include <linux/netfilter_ipv4/ip_nat_helper.h> -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#include <linux/moduleparam.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("tftp NAT helper"); -MODULE_LICENSE("GPL"); - -static unsigned int help(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp) -{ - struct ip_conntrack *ct = exp->master; - - exp->saved_proto.udp.port - = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; - exp->dir = IP_CT_DIR_REPLY; - exp->expectfn = ip_nat_follow_master; - if (ip_conntrack_expect_related(exp) != 0) - return NF_DROP; - return NF_ACCEPT; -} - -static void __exit ip_nat_tftp_fini(void) -{ - rcu_assign_pointer(ip_nat_tftp_hook, NULL); - synchronize_rcu(); -} - -static int __init ip_nat_tftp_init(void) -{ - BUG_ON(rcu_dereference(ip_nat_tftp_hook)); - rcu_assign_pointer(ip_nat_tftp_hook, help); - return 0; -} - -module_init(ip_nat_tftp_init); -module_exit(ip_nat_tftp_fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index a14798a850d..702d94db19b 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -8,18 +8,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). - * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). - * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian - * Zander). - * 2000-08-01: Added Nick Williams' MAC support. - * 2002-06-25: Code cleanup. - * 2005-01-10: Added /proc counter for dropped packets; fixed so - * packets aren't delivered to user space if they're going - * to be dropped. 
- * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) - * */ #include <linux/module.h> #include <linux/skbuff.h> @@ -191,12 +179,13 @@ ipq_flush(int verdict) static struct sk_buff * ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) { - unsigned char *old_tail; + sk_buff_data_t old_tail; size_t size = 0; size_t data_len = 0; struct sk_buff *skb; struct ipq_packet_msg *pmsg; struct nlmsghdr *nlh; + struct timeval tv; read_lock_bh(&queue_lock); @@ -234,15 +223,16 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) if (!skb) goto nlmsg_failure; - old_tail= skb->tail; + old_tail = skb->tail; nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); pmsg = NLMSG_DATA(nlh); memset(pmsg, 0, sizeof(*pmsg)); pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->tstamp.off_sec; - pmsg->timestamp_usec = entry->skb->tstamp.off_usec; + tv = ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; @@ -378,7 +368,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; - memcpy(e->skb->data, v->payload, v->data_len); + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; @@ -495,7 +485,7 @@ ipq_rcv_skb(struct sk_buff *skb) if (skblen < sizeof(*nlh)) return; - nlh = (struct nlmsghdr *)skb->data; + nlh = nlmsg_hdr(skb); nlmsglen = nlh->nlmsg_len; if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) return; @@ -678,7 +668,7 @@ static int __init ip_queue_init(void) netlink_register_notifier(&ipq_nl_notifier); ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, - THIS_MODULE); + NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 50cc4b92e28..e3f83bf160d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -7,12 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> - * - increase module usage count as soon as we have rules inside - * a table - * 08 Oct 2005 Harald Welte <lafore@netfilter.org> - * - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables" */ #include <linux/cache.h> #include <linux/capability.h> @@ -198,7 +192,7 @@ int do_match(struct ipt_entry_match *m, { /* Stop iteration if it doesn't match */ if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, skb->nh.iph->ihl*4, hotdrop)) + offset, ip_hdrlen(skb), hotdrop)) return 1; else return 0; @@ -231,7 +225,7 @@ ipt_do_table(struct sk_buff **pskb, struct xt_table_info *private; /* Initialization */ - ip = (*pskb)->nh.iph; + ip = ip_hdr(*pskb); datalen = (*pskb)->len - ip->ihl * 4; indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; @@ -320,7 +314,7 @@ ipt_do_table(struct sk_buff **pskb, = 0x57acc001; #endif /* Target might have changed stuff. 
*/ - ip = (*pskb)->nh.iph; + ip = ip_hdr(*pskb); datalen = (*pskb)->len - ip->ihl * 4; if (verdict == IPT_CONTINUE) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 42b08029e86..40e27342139 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -21,15 +21,12 @@ #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> - -#include <net/checksum.h> - #include <linux/netfilter_arp.h> - #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/checksum.h> #define CLUSTERIP_VERSION "0.8" @@ -240,7 +237,7 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) static inline u_int32_t clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); unsigned long hashval; u_int16_t sport, dport; u_int16_t *ports; @@ -310,15 +307,16 @@ target(struct sk_buff **pskb, const void *targinfo) { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; - u_int32_t *mark, hash; + u_int32_t hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - mark = nf_ct_get_mark((*pskb), &ctinfo); - if (mark == NULL) { + ct = nf_ct_get(*pskb, &ctinfo); + if (ct == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -328,7 +326,7 @@ target(struct sk_buff **pskb, /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) return XT_CONTINUE; @@ -341,7 +339,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - *mark = hash; + ct->mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -358,7 +356,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, *mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; @@ -521,7 +519,7 @@ arp_mangle(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct arphdr *arp = (*pskb)->nh.arph; + struct arphdr *arp = arp_hdr(*pskb); struct arp_payload *payload; struct clusterip_config *c; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 4f565633631..918ca92e534 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -5,14 +5,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
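The CLUSTERIP conversion above drops the nf_ct_get_mark() compatibility wrapper in favour of nf_ct_get() plus a direct ct->mark access. The helper itself just unpacks the two conntrack references stashed in the skb; roughly, per the generic nf_conntrack API of this period:

static inline struct nf_conn *
nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        *ctinfo = skb->nfctinfo;                /* NEW/RELATED/ESTABLISHED... */
        return (struct nf_conn *)skb->nfct;     /* NULL for untracked packets */
}

A NULL return is the "no conntrack!" case the target complains about above.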
- * - * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp */ #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/tcp.h> #include <net/checksum.h> @@ -29,13 +28,13 @@ MODULE_DESCRIPTION("iptables ECN modification module"); static inline int set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) { - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); oldtos = iph->tos; iph->tos &= ~IPT_ECN_IP_MASK; iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); @@ -52,7 +51,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) __be16 oldval; /* Not enought header? */ - tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + tcph = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_tcph), &_tcph); if (!tcph) return 0; @@ -63,9 +62,9 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph))) return 0; - tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + tcph = (void *)ip_hdr(*pskb) + ip_hdrlen(*pskb); oldval = ((__be16 *)tcph)[6]; if (einfo->operation & IPT_ECN_OP_SET_ECE) @@ -93,7 +92,7 @@ target(struct sk_buff **pskb, return NF_DROP; if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) - && (*pskb)->nh.iph->protocol == IPPROTO_TCP) + && ip_hdr(*pskb)->protocol == IPPROTO_TCP) if (!set_ect_tcp(pskb, einfo)) return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index d9c37fd9422..a42c5cd968b 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -399,9 +399,9 @@ ipt_log_packet(unsigned int pf, /* MAC logging for input chain only. 
*/ printk("MAC="); if (skb->dev && skb->dev->hard_header_len - && skb->mac.raw != (void*)skb->nh.iph) { + && skb->mac_header != skb->network_header) { int i; - unsigned char *p = skb->mac.raw; + const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) printk("%02x%c", *p, i==skb->dev->hard_header_len - 1 @@ -477,14 +477,10 @@ static int __init ipt_log_init(void) ret = xt_register_target(&ipt_log_reg); if (ret < 0) return ret; - if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { - printk(KERN_WARNING "ipt_LOG: not logging via system console " - "since somebody else already registered for PF_INET\n"); - /* we cannot make module load fail here, since otherwise - * iptables userspace would abort */ - } - - return 0; + ret = nf_log_register(PF_INET, &ipt_log_logger); + if (ret < 0 && ret != -EEXIST) + xt_unregister_target(&ipt_log_reg); + return ret; } static void __exit ipt_log_fini(void) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index b5955f3a3f8..d4f2d777533 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -19,12 +19,8 @@ #include <net/ip.h> #include <net/checksum.h> #include <net/route.h> -#include <linux/netfilter_ipv4.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); @@ -48,7 +44,7 @@ masquerade_check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { DEBUGP("masquerade_check: bad MAP_IPS.\n"); @@ -69,33 +65,26 @@ masquerade_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { -#ifdef CONFIG_NF_NAT_NEEDED + struct nf_conn *ct; struct nf_conn_nat *nat; -#endif - struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; - struct ip_nat_range newrange; - const struct ip_nat_multi_range_compat *mr; + struct nf_nat_range newrange; + const struct nf_nat_multi_range_compat *mr; struct rtable *rt; __be32 newsrc; - IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); - ct = ip_conntrack_get(*pskb, &ctinfo); -#ifdef CONFIG_NF_NAT_NEEDED + ct = nf_ct_get(*pskb, &ctinfo); nat = nfct_nat(ct); -#endif - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ -#ifdef CONFIG_NF_NAT_NEEDED if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) -#else - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0) -#endif return NF_ACCEPT; mr = targinfo; @@ -107,40 +96,30 @@ masquerade_target(struct sk_buff **pskb, } write_lock_bh(&masq_lock); -#ifdef CONFIG_NF_NAT_NEEDED nat->masq_index = out->ifindex; -#else - ct->nat.masq_index = out->ifindex; -#endif write_unlock_bh(&masq_lock); /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, newsrc, newsrc, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. 
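Every converted target in this series ends the same way: build an nf_nat_range as a compound literal and hand it to nf_nat_setup_info(), with the hook number selecting source versus destination manip. For reference, a sketch of the range descriptor these initializers fill in (cf. include/net/netfilter/nf_nat.h of this period):

struct nf_nat_range {
        unsigned int flags;                     /* IP_NAT_RANGE_MAP_IPS, ... */
        __be32 min_ip, max_ip;                  /* inclusive, network order */
        union nf_conntrack_man_proto min, max;  /* port/ICMP-id range */
};

MASQUERADE above sets min_ip and max_ip to the single address of the outgoing interface it just routed through, so the "range" collapses to one source IP.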
*/ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static inline int -device_cmp(struct ip_conntrack *i, void *ifindex) +device_cmp(struct nf_conn *i, void *ifindex) { - int ret; -#ifdef CONFIG_NF_NAT_NEEDED struct nf_conn_nat *nat = nfct_nat(i); + int ret; if (!nat) return 0; -#endif read_lock_bh(&masq_lock); -#ifdef CONFIG_NF_NAT_NEEDED ret = (nat->masq_index == (int)(long)ifindex); -#else - ret = (i->nat.masq_index == (int)(long)ifindex); -#endif read_unlock_bh(&masq_lock); return ret; @@ -156,9 +135,9 @@ static int masq_device_event(struct notifier_block *this, /* Device was downed. Search entire table for conntracks which were associated with that device, and forget them. */ - IP_NF_ASSERT(dev->ifindex != 0); + NF_CT_ASSERT(dev->ifindex != 0); - ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); } return NOTIFY_DONE; @@ -174,9 +153,9 @@ static int masq_inet_event(struct notifier_block *this, /* IP address was deleted. Search entire table for conntracks which were associated with that device, and forget them. */ - IP_NF_ASSERT(dev->ifindex != 0); + NF_CT_ASSERT(dev->ifindex != 0); - ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); } return NOTIFY_DONE; @@ -194,7 +173,7 @@ static struct xt_target masquerade = { .name = "MASQUERADE", .family = AF_INET, .target = masquerade_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = 1 << NF_IP_POST_ROUTING, .checkentry = masquerade_check, diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index fd7aaa347cd..068c69bce30 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -16,11 +16,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif #define MODULENAME "NETMAP" MODULE_LICENSE("GPL"); @@ -40,7 +36,7 @@ check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { DEBUGP(MODULENAME":check: bad MAP_IPS.\n"); @@ -61,39 +57,39 @@ target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; - const struct ip_nat_multi_range_compat *mr = targinfo; - struct ip_nat_range newrange; + const struct nf_nat_multi_range_compat *mr = targinfo; + struct nf_nat_range newrange; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_OUT); - ct = ip_conntrack_get(*pskb, &ctinfo); + ct = nf_ct_get(*pskb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) - new_ip = (*pskb)->nh.iph->daddr & ~netmask; + new_ip = ip_hdr(*pskb)->daddr & ~netmask; else - new_ip = (*pskb)->nh.iph->saddr & ~netmask; + new_ip = ip_hdr(*pskb)->saddr & ~netmask; new_ip |= mr->range[0].min_ip & netmask; - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { 
mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, new_ip, new_ip, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target target_module = { .name = MODULENAME, .family = AF_INET, .target = target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_OUT), diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index c2b6b80670f..68cc76a198e 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -19,11 +19,7 @@ #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -43,7 +39,7 @@ redirect_check(const char *tablename, void *targinfo, unsigned int hook_mask) { - const struct ip_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { DEBUGP("redirect_check: bad MAP_IPS.\n"); @@ -64,17 +60,17 @@ redirect_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - const struct ip_nat_multi_range_compat *mr = targinfo; - struct ip_nat_range newrange; + const struct nf_nat_multi_range_compat *mr = targinfo; + struct nf_nat_range newrange; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT); - ct = ip_conntrack_get(*pskb, &ctinfo); - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + ct = nf_ct_get(*pskb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ if (hooknum == NF_IP_LOCAL_OUT) @@ -96,20 +92,20 @@ redirect_target(struct sk_buff **pskb, } /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, newdst, newdst, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target redirect_reg = { .name = "REDIRECT", .family = AF_INET, .target = redirect_target, - .targetsize = sizeof(struct ip_nat_multi_range_compat), + .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), .checkentry = redirect_check, diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 80f739e2182..9041e0741f6 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -1,7 +1,5 @@ /* * This is a module which is used for rejecting packets. - * Added support for customized reject packets (Jozsef Kadlecsik). - * Added support for ICMP type-3-code-13 (Maciej Soltysiak). 
[RFC 1812] */ /* (C) 1999-2001 Paul `Rusty' Russell @@ -43,7 +41,7 @@ MODULE_DESCRIPTION("iptables REJECT target module"); static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct iphdr *iph = oldskb->nh.iph; + struct iphdr *niph; struct tcphdr _otcph, *oth, *tcph; __be16 tmp_port; __be32 tmp_addr; @@ -51,10 +49,10 @@ static void send_reset(struct sk_buff *oldskb, int hook) unsigned int addr_type; /* IP header checks: fragment. */ - if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return; - oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), sizeof(_otcph), &_otcph); if (oth == NULL) return; @@ -64,7 +62,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) return; /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, iph->ihl * 4, IPPROTO_TCP)) + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; /* We need a linear, writeable skb. We also need to expand @@ -84,20 +82,21 @@ static void send_reset(struct sk_buff *oldskb, int hook) skb_shinfo(nskb)->gso_segs = 0; skb_shinfo(nskb)->gso_type = 0; - tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); + tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb)); /* Swap source and dest */ - tmp_addr = nskb->nh.iph->saddr; - nskb->nh.iph->saddr = nskb->nh.iph->daddr; - nskb->nh.iph->daddr = tmp_addr; + niph = ip_hdr(nskb); + tmp_addr = niph->saddr; + niph->saddr = niph->daddr; + niph->daddr = tmp_addr; tmp_port = tcph->source; tcph->source = tcph->dest; tcph->dest = tmp_port; /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; - skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr)); - nskb->nh.iph->tot_len = htons(nskb->len); + skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr)); + niph->tot_len = htons(nskb->len); if (tcph->ack) { needs_ack = 0; @@ -105,9 +104,9 @@ static void send_reset(struct sk_buff *oldskb, int hook) tcph->ack_seq = 0; } else { needs_ack = 1; - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin - + oldskb->len - oldskb->nh.iph->ihl*4 - - (oth->doff<<2)); + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); tcph->seq = 0; } @@ -122,14 +121,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* Adjust TCP checksum */ tcph->check = 0; tcph->check = tcp_v4_check(sizeof(struct tcphdr), - nskb->nh.iph->saddr, - nskb->nh.iph->daddr, + niph->saddr, niph->daddr, csum_partial((char *)tcph, sizeof(struct tcphdr), 0)); /* Set DF, id = 0 */ - nskb->nh.iph->frag_off = htons(IP_DF); - nskb->nh.iph->id = 0; + niph->frag_off = htons(IP_DF); + niph->id = 0; addr_type = RTN_UNSPEC; if (hook != NF_IP_FORWARD @@ -145,12 +143,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->ip_summed = CHECKSUM_NONE; /* Adjust IP TTL */ - nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); + niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); /* Adjust IP checksum */ - nskb->nh.iph->check = 0; - nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, - nskb->nh.iph->ihl); + niph->check = 0; + niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl); /* "Never happens" */ if (nskb->len > dst_mtu(nskb->dst)) @@ -182,7 +179,7 @@ static unsigned int reject(struct sk_buff **pskb, /* Our naive response construction doesn't deal with IP options, and probably shouldn't try. 
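One detail worth calling out in send_reset() above: when the offending segment carried no ACK, the generated RST must instead acknowledge everything that segment consumed in sequence space, and SYN and FIN each consume one sequence number on top of the payload bytes. That is all the reformatted ack_seq computation does; spelled out with a helper variable (illustrative only, the variable does not exist in the source):

        /* sequence space consumed by the offending segment */
        unsigned int seglen = oldskb->len - ip_hdrlen(oldskb)
                              - (oth->doff << 2);       /* payload bytes */

        tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + seglen);
        tcph->seq = 0;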
*/ - if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) + if (ip_hdrlen(*pskb) != sizeof(struct iphdr)) return NF_DROP; /* WARNING: This code causes reentry within iptables. diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c index bd4404e5c68..511e5ff8493 100644 --- a/net/ipv4/netfilter/ipt_SAME.c +++ b/net/ipv4/netfilter/ipt_SAME.c @@ -7,21 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 010320 Martin Josefsson <gandalf@wlug.westbo.se> - * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things. - * 010728 Martin Josefsson <gandalf@wlug.westbo.se> - * * added --nodst to not include destination-ip in new source - * calculations. - * * added some more sanity-checks. - * 010729 Martin Josefsson <gandalf@wlug.westbo.se> - * * fixed a buggy if-statement in same_check(), should have - * used ntohl() but didn't. - * * added support for multiple ranges. IPT_SAME_MAX_RANGE is - * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h - * and is currently set to 10. - * * added support for 1-address range, nice to have now that - * we have multiple ranges. */ #include <linux/types.h> #include <linux/ip.h> @@ -35,11 +20,7 @@ #include <net/checksum.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> -#ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_rule.h> -#else -#include <linux/netfilter_ipv4/ip_nat_rule.h> -#endif #include <linux/netfilter_ipv4/ipt_SAME.h> MODULE_LICENSE("GPL"); @@ -138,17 +119,17 @@ same_target(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ip_conntrack *ct; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t tmpip, aindex; __be32 new_ip; const struct ipt_same_info *same = targinfo; - struct ip_nat_range newrange; - const struct ip_conntrack_tuple *t; + struct nf_nat_range newrange; + const struct nf_conntrack_tuple *t; - IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || + NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING); - ct = ip_conntrack_get(*pskb, &ctinfo); + ct = nf_ct_get(*pskb, &ctinfo); t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; @@ -157,17 +138,10 @@ same_target(struct sk_buff **pskb, Here we calculate the index in same->iparray which holds the ipaddress we should use */ -#ifdef CONFIG_NF_NAT_NEEDED tmpip = ntohl(t->src.u3.ip); if (!(same->info & IPT_SAME_NODST)) tmpip += ntohl(t->dst.u3.ip); -#else - tmpip = ntohl(t->src.ip); - - if (!(same->info & IPT_SAME_NODST)) - tmpip += ntohl(t->dst.ip); -#endif aindex = tmpip % same->ipnum; new_ip = htonl(same->iparray[aindex]); @@ -178,13 +152,13 @@ same_target(struct sk_buff **pskb, NIPQUAD(new_ip)); /* Transfer from original range. */ - newrange = ((struct ip_nat_range) + newrange = ((struct nf_nat_range) { same->range[0].flags, new_ip, new_ip, /* FIXME: Use ports from correct range! */ same->range[0].min, same->range[0].max }); /* Hand modified range to generic setup. 
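What makes the SAME target sticky is visible right above: the array index is a pure function of the original tuple, so a given source (optionally folded with the destination) always lands on the same address across connections. Annotated restatement, values hypothetical:

        tmpip = ntohl(t->src.u3.ip);            /* e.g. 192.168.0.10 */
        if (!(same->info & IPT_SAME_NODST))
                tmpip += ntohl(t->dst.u3.ip);   /* fold in dst unless --nodst */
        aindex = tmpip % same->ipnum;           /* stable index into iparray */
        new_ip = htonl(same->iparray[aindex]);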
*/ - return ip_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, hooknum); } static struct xt_target same_reg = { diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index cedf9f7d9d6..0ad02f24983 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -29,13 +29,13 @@ target(struct sk_buff **pskb, const void *targinfo) { const struct ipt_tos_target_info *tosinfo = targinfo; - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { __u8 oldtos; if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); oldtos = iph->tos; iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 64be31c22ba..a991ec7bd4e 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -32,7 +32,7 @@ ipt_ttl_target(struct sk_buff **pskb, if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); switch (info->mode) { case IPT_TTL_SET: diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 9acc018766f..23b607b33b3 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -2,20 +2,6 @@ * netfilter module for userspace packet logging daemons * * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> - * - * 2000/09/22 ulog-cprange feature added - * 2001/01/04 in-kernel queue as proposed by Sebastian Zander - * <zander@fokus.gmd.de> - * 2001/01/30 per-rule nlgroup conflicts with global queue. - * nlgroup now global (sysctl) - * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at - * module loadtime -HW - * 2002/07/07 remove broken nflog_rcv() function -HW - * 2002/08/29 fix shifted/unshifted nlgroup bug -HW - * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen> - * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT - * resulting in bogus 'error during NLMSG_PUT' messages. - * * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * @@ -42,8 +28,6 @@ * flushtimeout: * Specify, after how many hundredths of a second the queue should be * flushed even if it is not full yet. - * - * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp */ #include <linux/module.h> @@ -187,6 +171,7 @@ static void ipt_ulog_packet(unsigned int hooknum, ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; + struct timeval tv; /* ffs == find first bit set, necessary because userspace * is already shifting groupnumber, but we need unshifted. @@ -232,13 +217,14 @@ static void ipt_ulog_packet(unsigned int hooknum, pm = NLMSG_DATA(nlh); /* We might not have a timestamp, get one */ - if (skb->tstamp.off_sec == 0) + if (skb->tstamp.tv64 == 0) __net_timestamp((struct sk_buff *)skb); /* copy hook, prefix, timestamp, payload, etc. 
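The tv64 test above is a representation change, not a behaviour change: skb->tstamp is now a ktime_t, a 64-bit nanosecond count on most configurations, so "never stamped" reads tv64 == 0 where it used to read off_sec == 0. The sec/usec pair still expected by the ulog ABI is then recovered with ktime_to_timeval() (from include/linux/ktime.h), as in the surrounding hunk:

        if (skb->tstamp.tv64 == 0)                      /* not stamped yet */
                __net_timestamp((struct sk_buff *)skb); /* stamp it now */

        tv = ktime_to_timeval(skb->tstamp);             /* ns -> sec/usec */
        put_unaligned(tv.tv_sec, &pm->timestamp_sec);
        put_unaligned(tv.tv_usec, &pm->timestamp_usec);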
*/ pm->data_len = copy_len; - put_unaligned(skb->tstamp.off_sec, &pm->timestamp_sec); - put_unaligned(skb->tstamp.off_usec, &pm->timestamp_usec); + tv = ktime_to_timeval(skb->tstamp); + put_unaligned(tv.tv_sec, &pm->timestamp_sec); + put_unaligned(tv.tv_usec, &pm->timestamp_usec); put_unaligned(skb->mark, &pm->mark); pm->hook = hooknum; if (prefix != NULL) @@ -249,9 +235,9 @@ static void ipt_ulog_packet(unsigned int hooknum, *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 - && skb->mac.raw != (void *) skb->nh.iph + && skb->mac_header != skb->network_header && in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb->mac.raw, in->hard_header_len); + memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); pm->mac_len = in->hard_header_len; } else pm->mac_len = 0; @@ -363,12 +349,52 @@ static int ipt_ulog_checkentry(const char *tablename, return 1; } +#ifdef CONFIG_COMPAT +struct compat_ipt_ulog_info { + compat_uint_t nl_group; + compat_size_t copy_range; + compat_size_t qthreshold; + char prefix[ULOG_PREFIX_LEN]; +}; + +static void compat_from_user(void *dst, void *src) +{ + struct compat_ipt_ulog_info *cl = src; + struct ipt_ulog_info l = { + .nl_group = cl->nl_group, + .copy_range = cl->copy_range, + .qthreshold = cl->qthreshold, + }; + + memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); + memcpy(dst, &l, sizeof(l)); +} + +static int compat_to_user(void __user *dst, void *src) +{ + struct ipt_ulog_info *l = src; + struct compat_ipt_ulog_info cl = { + .nl_group = l->nl_group, + .copy_range = l->copy_range, + .qthreshold = l->qthreshold, + }; + + memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); + return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + static struct xt_target ipt_ulog_reg = { .name = "ULOG", .family = AF_INET, .target = ipt_ulog_target, .targetsize = sizeof(struct ipt_ulog_info), .checkentry = ipt_ulog_checkentry, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ipt_ulog_info), + .compat_from_user = compat_from_user, + .compat_to_user = compat_to_user, +#endif .me = THIS_MODULE, }; @@ -390,14 +416,11 @@ static int __init ipt_ulog_init(void) } /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - init_timer(&ulog_buffers[i].timer); - ulog_buffers[i].timer.function = ulog_timer; - ulog_buffers[i].timer.data = i; - } + for (i = 0; i < ULOG_MAXNLGROUPS; i++) + setup_timer(&ulog_buffers[i].timer, ulog_timer, i); nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - THIS_MODULE); + NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index cfa0472617f..a652a145155 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -33,7 +33,7 @@ static int match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_addrtype_info *info = matchinfo; - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); int ret = 1; if (info->source) diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 37508b2cfea..26218122f86 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -1,7 +1,5 @@ /* IP tables module for matching the value of the IPv4 and TCP ECN bits * - * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp - * * (C) 2002 by Harald Welte <laforge@gnumonks.org> * * This program is free software; you can redistribute it and/or modify @@ -11,6 +9,7 @@ #include <linux/in.h> #include 
<linux/ip.h> +#include <net/ip.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/tcp.h> @@ -26,7 +25,7 @@ MODULE_LICENSE("GPL"); static inline int match_ip(const struct sk_buff *skb, const struct ipt_ecn_info *einfo) { - return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect); + return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; } static inline int match_tcp(const struct sk_buff *skb, @@ -38,8 +37,7 @@ static inline int match_tcp(const struct sk_buff *skb, /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ - th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (th == NULL) { *hotdrop = 0; return 0; @@ -80,7 +78,7 @@ static int match(const struct sk_buff *skb, return 0; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { - if (skb->nh.iph->protocol != IPPROTO_TCP) + if (ip_hdr(skb)->protocol != IPPROTO_TCP) return 0; if (!match_tcp(skb, info, hotdrop)) return 0; diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c index bc5d5e6091e..33af9e94088 100644 --- a/net/ipv4/netfilter/ipt_iprange.c +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -32,7 +32,7 @@ match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_iprange_info *info = matchinfo; - const struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); if (info->flags & IPRANGE_SRC) { if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index aecb9c48e15..15a9e8bbb7c 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -183,11 +183,11 @@ ipt_recent_match(const struct sk_buff *skb, int ret = info->invert; if (info->side == IPT_RECENT_DEST) - addr = skb->nh.iph->daddr; + addr = ip_hdr(skb)->daddr; else - addr = skb->nh.iph->saddr; + addr = ip_hdr(skb)->saddr; - ttl = skb->nh.iph->ttl; + ttl = ip_hdr(skb)->ttl; /* use TTL as seen before forwarding */ if (out && !skb->sk) ttl++; diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c index 5d33b51d49d..d314844af12 100644 --- a/net/ipv4/netfilter/ipt_tos.c +++ b/net/ipv4/netfilter/ipt_tos.c @@ -30,7 +30,7 @@ match(const struct sk_buff *skb, { const struct ipt_tos_info *info = matchinfo; - return (skb->nh.iph->tos == info->tos) ^ info->invert; + return (ip_hdr(skb)->tos == info->tos) ^ info->invert; } static struct xt_match tos_match = { diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index 1eca9f40037..ab02d9e3139 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -1,7 +1,5 @@ /* IP tables module for matching the value of the TTL * - * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp - * * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> * * This program is free software; you can redistribute it and/or modify @@ -26,19 +24,20 @@ static int match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ipt_ttl_info *info = matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; switch (info->mode) { case IPT_TTL_EQ: - return (skb->nh.iph->ttl == info->ttl); + return (ttl == info->ttl); break; case IPT_TTL_NE: - return (!(skb->nh.iph->ttl == info->ttl)); + return (!(ttl == info->ttl)); break; case IPT_TTL_LT: - return (skb->nh.iph->ttl < info->ttl); + return (ttl < info->ttl); break; case IPT_TTL_GT: - return (skb->nh.iph->ttl > 
info->ttl); + return (ttl > info->ttl); break; default: printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index d1d61e97b97..42728909eba 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -102,7 +103,7 @@ ipt_local_out_hook(unsigned int hook, { /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 98b66ef0c71..9278802f274 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -7,8 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * Extended to all five netfilter hooks by Brad Chapman & Harald Welte */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -17,6 +15,7 @@ #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> +#include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -130,13 +129,14 @@ ipt_local_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { unsigned int ret; + const struct iphdr *iph; u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; @@ -144,19 +144,23 @@ ipt_local_hook(unsigned int hook, /* Save things which could affect route */ mark = (*pskb)->mark; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - tos = (*pskb)->nh.iph->tos; + iph = ip_hdr(*pskb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; ret = ipt_do_table(pskb, hook, in, out, &packet_mangler); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr - || (*pskb)->mark != mark - || (*pskb)->nh.iph->tos != tos)) - if (ip_route_me_harder(pskb, RTN_UNSPEC)) - ret = NF_DROP; + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + iph = ip_hdr(*pskb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + (*pskb)->mark != mark || + iph->tos != tos) + if (ip_route_me_harder(pskb, RTN_UNSPEC)) + ret = NF_DROP; + } return ret; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8f3e92d20df..0654eaae70c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -4,14 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
- * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - move L3 protocol dependent part to this file. - * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - add get_features() to support various size of conntrack - * structures. - * - * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c */ #include <linux/types.h> @@ -87,7 +79,7 @@ nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) local_bh_enable(); if (skb) - ip_send_check(skb->nh.iph); + ip_send_check(ip_hdr(skb)); return skb; } @@ -97,16 +89,16 @@ ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, u_int8_t *protonum) { /* Never happen */ - if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (ip_hdr(*pskb)->frag_off & htons(IP_OFFSET)) { if (net_ratelimit()) { printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", - (*pskb)->nh.iph->protocol, hooknum); + ip_hdr(*pskb)->protocol, hooknum); } return -NF_DROP; } - *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; - *protonum = (*pskb)->nh.iph->protocol; + *dataoff = skb_network_offset(*pskb) + ip_hdrlen(*pskb); + *protonum = ip_hdr(*pskb)->protocol; return NF_ACCEPT; } @@ -152,9 +144,8 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, return NF_ACCEPT; return help->helper->help(pskb, - (*pskb)->nh.raw - (*pskb)->data - + (*pskb)->nh.iph->ihl*4, - ct, ctinfo); + skb_network_offset(*pskb) + ip_hdrlen(*pskb), + ct, ctinfo); } static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, @@ -171,7 +162,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #endif /* Gather fragments. */ - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + if (ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET)) { *pskb = nf_ct_ipv4_gather_frags(*pskb, hooknum == NF_IP_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN : @@ -199,7 +190,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, { /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5fd1e5363c1..f4fc657c198 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -4,11 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with Layer 3 protocol independent connection tracking. - * - * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c */ #include <linux/types.h> @@ -158,7 +153,7 @@ icmp_error_message(struct sk_buff *skb, NF_CT_ASSERT(skb->nfct == NULL); /* Not enough header? */ - inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + inside = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_in), &_in); if (inside == NULL) return -NF_ACCEPT; @@ -172,7 +167,7 @@ icmp_error_message(struct sk_buff *skb, /* rcu_read_lock()ed by nf_hook_slow */ innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + dataoff = ip_hdrlen(skb) + sizeof(inside->icmp); /* Are they talking about one of our connections? 
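For orientation, icmp_error_message() above walks the embedded packet in three hops: the outer IP header (ip_hdrlen bytes), the ICMP header, then the quoted inner IP header whose own IHL locates the first bytes of the inner transport header. Annotated restatement of the offsets, names as in the hunk:

        /* [outer IP][ICMP hdr][inner IP][>= 8 bytes of inner L4]
         *            ^ ip_hdrlen(skb)   ^ dataoff + inside->ip.ihl * 4
         *                      ^ dataoff                             */
        dataoff = ip_hdrlen(skb) + sizeof(inside->icmp);
        nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl * 4,
                        PF_INET, inside->ip.protocol, &origtuple,
                        l3proto, innerproto);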
*/ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, inside->ip.protocol, &origtuple, @@ -227,7 +222,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, struct icmphdr _ih, *icmph; /* Not enough header? */ - icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 452e9d32668..ea02f00d2da 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -431,7 +431,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, } *inside; struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple inner, target; - int hdrlen = (*pskb)->nh.iph->ihl * 4; + int hdrlen = ip_hdrlen(*pskb); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned long statusbit; enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); @@ -439,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside = (void *)(*pskb)->data + ip_hdrlen(*pskb); /* We're actually going to mangle it beyond trivial checksum adjustment, so make sure the current checksum is correct. */ @@ -469,9 +469,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); if (!nf_ct_get_tuple(*pskb, - (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr), - (*pskb)->nh.iph->ihl*4 + - sizeof(struct icmphdr) + inside->ip.ihl*4, + ip_hdrlen(*pskb) + sizeof(struct icmphdr), + (ip_hdrlen(*pskb) + + sizeof(struct icmphdr) + inside->ip.ihl * 4), (u_int16_t)AF_INET, inside->ip.protocol, &inner, l3proto, l4proto)) @@ -483,14 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, packet: PREROUTING (DST manip), routing produces ICMP, goes through POSTROUTING (which must correct the DST manip). */ if (!manip_pkt(inside->ip.protocol, pskb, - (*pskb)->nh.iph->ihl*4 + sizeof(inside->icmp), + ip_hdrlen(*pskb) + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt inner. 
*/ - inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside = (void *)(*pskb)->data + ip_hdrlen(*pskb); inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9cbf3f9be13..fcebc968d37 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -33,7 +33,7 @@ static int set_addr(struct sk_buff **pskb, unsigned int addroff, __be32 ip, __be16 port) { enum ip_conntrack_info ctinfo; - struct nf_conn *ct = ip_conntrack_get(*pskb, &ctinfo); + struct nf_conn *ct = nf_ct_get(*pskb, &ctinfo); struct { __be32 ip; __be16 port; @@ -44,7 +44,7 @@ static int set_addr(struct sk_buff **pskb, buf.port = port; addroff += dataoff; - if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_TCP) { if (!nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, addroff, sizeof(buf), (char *) &buf, sizeof(buf))) { @@ -55,11 +55,11 @@ static int set_addr(struct sk_buff **pskb, } /* Relocate data pointer */ - th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, + th = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_tcph), &_tcph); if (th == NULL) return -1; - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + + *data = (*pskb)->data + ip_hdrlen(*pskb) + th->doff * 4 + dataoff; } else { if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, @@ -73,8 +73,8 @@ static int set_addr(struct sk_buff **pskb, /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ - *data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 + - sizeof(struct udphdr); + *data = ((*pskb)->data + ip_hdrlen(*pskb) + + sizeof(struct udphdr)); } return 0; @@ -383,7 +383,7 @@ static int nat_h245(struct sk_buff **pskb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct ip_nat_range range; + struct nf_nat_range range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 49a90c39ffc..15b6e5ce3a0 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -87,12 +87,13 @@ static void mangle_contents(struct sk_buff *skb, unsigned char *data; BUG_ON(skb_is_nonlinear(skb)); - data = (unsigned char *)skb->nh.iph + dataoff; + data = skb_network_header(skb) + dataoff; /* move post-replacement */ memmove(data + match_offset + rep_len, data + match_offset + match_len, - skb->tail - (data + match_offset + match_len)); + skb->tail - (skb->network_header + dataoff + + match_offset + match_len)); /* insert data from buffer */ memcpy(data + match_offset, rep_buffer, rep_len); @@ -111,8 +112,8 @@ static void mangle_contents(struct sk_buff *skb, } /* fix IP hdr checksum information */ - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); } /* Unusual, but possible case. 
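The reworked memmove bound in mangle_contents() above leans on the same sk_buff_data_t scheme: on 64-bit builds skb->tail and skb->network_header are both byte offsets from skb->head, so subtracting one compound offset from the other still yields a plain byte count, and on 32-bit builds both stay pointers and the identical expression works unchanged. Spelled out (illustrative):

        /* bytes between the end of the matched region and skb->tail */
        moved = skb->tail - (skb->network_header + dataoff
                             + match_offset + match_len);
        memmove(data + match_offset + rep_len,          /* new tail start */
                data + match_offset + match_len,        /* old tail start */
                moved);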
*/ @@ -152,6 +153,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, const char *rep_buffer, unsigned int rep_len) { + struct rtable *rt = (struct rtable *)(*pskb)->dst; struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -166,7 +168,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, SKB_LINEAR_ASSERT(*pskb); - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); tcph = (void *)iph + iph->ihl*4; oldlen = (*pskb)->len - iph->ihl*4; @@ -175,11 +177,22 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, datalen = (*pskb)->len - iph->ihl*4; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial((char *)tcph, - datalen, 0)); + if (!(rt->rt_flags & RTCF_LOCAL) && + (*pskb)->dev->features & NETIF_F_ALL_CSUM) { + (*pskb)->ip_summed = CHECKSUM_PARTIAL; + (*pskb)->csum_start = skb_headroom(*pskb) + + skb_network_offset(*pskb) + + iph->ihl * 4; + (*pskb)->csum_offset = offsetof(struct tcphdr, check); + tcph->check = ~tcp_v4_check(datalen, + iph->saddr, iph->daddr, 0); + } else { + tcph->check = 0; + tcph->check = tcp_v4_check(datalen, + iph->saddr, iph->daddr, + csum_partial((char *)tcph, + datalen, 0)); + } } else nf_proto_csum_replace2(&tcph->check, *pskb, htons(oldlen), htons(datalen), 1); @@ -190,7 +203,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb, (int)rep_len - (int)match_len, ct, ctinfo); /* Tell TCP window tracking about seq change */ - nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, + nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb), ct, CTINFO2DIR(ctinfo)); } return 1; @@ -216,12 +229,13 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, const char *rep_buffer, unsigned int rep_len) { + struct rtable *rt = (struct rtable *)(*pskb)->dst; struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; /* UDP helpers might accidentally mangle the wrong packet */ - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + match_offset + match_len) return 0; @@ -234,7 +248,7 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, !enlarge_skb(pskb, rep_len - match_len)) return 0; - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); udph = (void *)iph + iph->ihl*4; oldlen = (*pskb)->len - iph->ihl*4; @@ -250,13 +264,25 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb, return 1; if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial((char *)udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; + if (!(rt->rt_flags & RTCF_LOCAL) && + (*pskb)->dev->features & NETIF_F_ALL_CSUM) { + (*pskb)->ip_summed = CHECKSUM_PARTIAL; + (*pskb)->csum_start = skb_headroom(*pskb) + + skb_network_offset(*pskb) + + iph->ihl * 4; + (*pskb)->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + 0); + } else { + udph->check = 0; + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + csum_partial((char *)udph, + datalen, 0)); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } } else nf_proto_csum_replace2(&udph->check, *pskb, htons(oldlen), htons(datalen), 1); @@ -318,8 +344,8 @@ nf_nat_sack_adjust(struct sk_buff **pskb, unsigned int dir, optoff, optend; struct nf_conn_nat *nat = nfct_nat(ct); - optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); - optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; + optoff = ip_hdrlen(*pskb) + sizeof(struct tcphdr); + 
optend = ip_hdrlen(*pskb) + tcph->doff * 4; if (!skb_make_writable(pskb, optend)) return 0; @@ -371,10 +397,10 @@ nf_nat_seq_adjust(struct sk_buff **pskb, this_way = &nat->info.seq[dir]; other_way = &nat->info.seq[!dir]; - if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph))) return 0; - tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + tcph = (void *)(*pskb)->data + ip_hdrlen(*pskb); if (after(ntohl(tcph->seq), this_way->correction_pos)) newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); else @@ -399,7 +425,7 @@ nf_nat_seq_adjust(struct sk_buff **pskb, if (!nf_nat_sack_adjust(pskb, tcph, ct, ctinfo)) return 0; - nf_conntrack_tcp_update(*pskb, (*pskb)->nh.iph->ihl*4, ct, dir); + nf_conntrack_tcp_update(*pskb, ip_hdrlen(*pskb), ct, dir); return 1; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 7ba341c22ea..a66888749ce 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -53,7 +53,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t; struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; - struct ip_nat_range range; + struct nf_nat_range range; ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index e5a34c17d92..c3908bc5a70 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -72,6 +72,11 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, __be16 *keyptr; unsigned int min, i, range_size; + /* If there is no master conntrack we are not PPTP, + do not change tuples */ + if (!conntrack->master) + return 0; + if (maniptype == IP_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else @@ -122,18 +127,9 @@ gre_manip_pkt(struct sk_buff **pskb, unsigned int iphdroff, if (maniptype != IP_NAT_MANIP_DST) return 1; switch (greh->version) { - case 0: - if (!greh->key) { - DEBUGP("can't nat GRE w/o key\n"); - break; - } - if (greh->csum) { - /* FIXME: Never tested this code... */ - nf_proto_csum_replace4(gre_csum(greh), *pskb, - *(gre_key(greh)), - tuple->dst.u.gre.key, 0); - } - *(gre_key(greh)) = tuple->dst.u.gre.key; + case GRE_VERSION_1701: + /* We do not currently NAT any GREv0 packets. 
+ * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_PPTP: DEBUGP("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 147a4370cf0..2534f718ab9 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -191,7 +191,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb, if (hooknum == NF_IP_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle((*pskb)->nh.iph->daddr, + warn_if_extra_mangle(ip_hdr(*pskb)->daddr, mr->range[0].min_ip); return nf_nat_setup_info(ct, &mr->range[0], hooknum); @@ -226,10 +226,6 @@ static int ipt_dnat_checkentry(const char *tablename, printk("DNAT: multiple ranges no longer supported\n"); return 0; } - if (mr->range[0].flags & IP_NAT_RANGE_PROTO_RANDOM) { - printk("DNAT: port randomization not supported\n"); - return 0; - } return 1; } diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index b12cd7c314c..fac97cf51ae 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/udp.h> #include <net/netfilter/nf_nat.h> @@ -92,7 +93,7 @@ static int map_sip_addr(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, if (!nf_nat_mangle_udp_packet(pskb, ct, ctinfo, matchoff, matchlen, addr, addrlen)) return 0; - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr); return 1; } @@ -106,7 +107,7 @@ static unsigned int ip_nat_sip(struct sk_buff **pskb, struct addr_map map; int dataoff, datalen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); datalen = (*pskb)->len - dataoff; if (datalen < sizeof("SIP/2.0") - 1) return NF_DROP; @@ -155,7 +156,7 @@ static unsigned int mangle_sip_packet(struct sk_buff **pskb, return 0; /* We need to reload this. Thanks Patrick. */ - *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + *dptr = (*pskb)->data + ip_hdrlen(*pskb) + sizeof(struct udphdr); return 1; } @@ -168,7 +169,7 @@ static int mangle_content_len(struct sk_buff **pskb, char buffer[sizeof("65536")]; int bufflen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); /* Get actual SDP lenght */ if (ct_sip_get_info(ct, dptr, (*pskb)->len - dataoff, &matchoff, @@ -200,7 +201,7 @@ static unsigned int mangle_sdp(struct sk_buff **pskb, char buffer[sizeof("nnn.nnn.nnn.nnn")]; unsigned int dataoff, bufflen; - dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); /* Mangle owner and contact info. */ bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); @@ -221,6 +222,29 @@ static unsigned int mangle_sdp(struct sk_buff **pskb, return mangle_content_len(pskb, ctinfo, ct, dptr); } +static void ip_nat_sdp_expect(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. 
*/ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + /* hook doesn't matter, but it has to do source manip */ + nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip = exp->saved_ip; + /* hook doesn't matter, but it has to do destination manip */ + nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); +} + /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ static unsigned int ip_nat_sdp(struct sk_buff **pskb, @@ -238,13 +262,14 @@ static unsigned int ip_nat_sdp(struct sk_buff **pskb, /* Connection will come from reply */ newip = ct->tuplehash[!dir].tuple.dst.u3.ip; + exp->saved_ip = exp->tuple.dst.u3.ip; exp->tuple.dst.u3.ip = newip; exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; exp->dir = !dir; /* When you see the packet, we need to NAT it the same as the this one. */ - exp->expectfn = nf_nat_follow_master; + exp->expectfn = ip_nat_sdp_expect; /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ce5c4939a6e..6e88505d616 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -38,10 +38,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: James Morris <jmorris@intercode.com.au> - * - * Updates: - * 2000-08-06: Convert to new helper API (Harald Welte). - * */ #include <linux/module.h> #include <linux/moduleparam.h> @@ -1194,7 +1190,7 @@ static int snmp_translate(struct nf_conn *ct, enum ip_conntrack_info ctinfo, struct sk_buff **pskb) { - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); u_int16_t udplen = ntohs(udph->len); u_int16_t paylen = udplen - sizeof(struct udphdr); @@ -1235,7 +1231,7 @@ static int help(struct sk_buff **pskb, unsigned int protoff, { int dir = CTINFO2DIR(ctinfo); unsigned int ret; - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); /* SNMP replies and originating SNMP traps get mangled */ diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 15aa3db8cb3..64bbed2ba78 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -86,8 +86,7 @@ nf_nat_fn(unsigned int hooknum, /* We never see fragments: conntrack defrags on pre-routing and local-out, and nf_nat_out protects post-routing. */ - NF_CT_ASSERT(!((*pskb)->nh.iph->frag_off - & htons(IP_MF|IP_OFFSET))); + NF_CT_ASSERT(!(ip_hdr(*pskb)->frag_off & htons(IP_MF | IP_OFFSET))); ct = nf_ct_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would @@ -98,11 +97,10 @@ nf_nat_fn(unsigned int hooknum, /* Exception: ICMP redirect to new connection (not in hash table yet). We must not let this through, in case we're doing NAT to the same network. 
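ip_nat_sdp_expect(), added a little further up, performs two nf_nat_setup_info() calls on the same expected connection: the first, given the POST_ROUTING hook number, installs the source manip (src becomes whatever the master's reply goes to), and the second, given PRE_ROUTING, installs the destination manip toward the endpoint saved in the expectation. As the in-source comments note, the hook argument only selects the manip type here. Condensed:

        range.flags = IP_NAT_RANGE_MAP_IPS;
        range.min_ip = range.max_ip =
                ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
        nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);  /* SRC manip */

        range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
        range.min = range.max = exp->saved_proto;
        range.min_ip = range.max_ip = exp->saved_ip;
        nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);   /* DST manip */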
*/ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) { struct icmphdr _hdr, *hp; - hp = skb_header_pointer(*pskb, - (*pskb)->nh.iph->ihl*4, + hp = skb_header_pointer(*pskb, ip_hdrlen(*pskb), sizeof(_hdr), &_hdr); if (hp != NULL && hp->type == ICMP_REDIRECT) @@ -122,7 +120,7 @@ nf_nat_fn(unsigned int hooknum, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (ip_hdr(*pskb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(ct, ctinfo, hooknum, pskb)) return NF_DROP; @@ -177,11 +175,11 @@ nf_nat_in(unsigned int hooknum, int (*okfn)(struct sk_buff *)) { unsigned int ret; - __be32 daddr = (*pskb)->nh.iph->daddr; + __be32 daddr = ip_hdr(*pskb)->daddr; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && - daddr != (*pskb)->nh.iph->daddr) { + daddr != ip_hdr(*pskb)->daddr) { dst_release((*pskb)->dst); (*pskb)->dst = NULL; } @@ -203,7 +201,7 @@ nf_nat_out(unsigned int hooknum, /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) || - (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + ip_hdrlen(*pskb) < sizeof(struct iphdr)) return NF_ACCEPT; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); @@ -236,7 +234,7 @@ nf_nat_local_fn(unsigned int hooknum, /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) || - (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + ip_hdrlen(*pskb) < sizeof(struct iphdr)) return NF_ACCEPT; ret = nf_nat_fn(hooknum, pskb, in, out, okfn); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae68a691e8c..37ab5802ca0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -87,19 +87,6 @@ static const struct file_operations sockstat_seq_fops = { .release = single_release, }; -static unsigned long -fold_field(void *mib[], int offt) -{ - unsigned long res = 0; - int i; - - for_each_possible_cpu(i) { - res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); - } - return res; -} - /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), @@ -266,8 +253,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) ip_statistics, - snmp4_ipstats_list[i].entry)); + snmp_fold_field((void **)ip_statistics, + snmp4_ipstats_list[i].entry)); seq_puts(seq, "\nIcmp:"); for (i = 0; snmp4_icmp_list[i].name != NULL; i++) @@ -276,8 +263,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIcmp:"); for (i = 0; snmp4_icmp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) icmp_statistics, - snmp4_icmp_list[i].entry)); + snmp_fold_field((void **)icmp_statistics, + snmp4_icmp_list[i].entry)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) @@ -288,12 +275,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - fold_field((void **) tcp_statistics, - snmp4_tcp_list[i].entry)); + snmp_fold_field((void **)tcp_statistics, + snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - fold_field((void **) tcp_statistics, - snmp4_tcp_list[i].entry)); + snmp_fold_field((void **)tcp_statistics, + snmp4_tcp_list[i].entry)); } seq_puts(seq, "\nUdp:"); @@ -303,8 +290,8 @@ 
static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) udp_statistics, - snmp4_udp_list[i].entry)); + snmp_fold_field((void **)udp_statistics, + snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); @@ -314,8 +301,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) udplite_statistics, - snmp4_udp_list[i].entry) ); + snmp_fold_field((void **)udplite_statistics, + snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); return 0; @@ -348,8 +335,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - fold_field((void **) net_statistics, - snmp4_net_list[i].entry)); + snmp_fold_field((void **)net_statistics, + snmp4_net_list[i].entry)); seq_putc(seq, '\n'); return 0; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index da70fef82c9..971ab9356e5 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -45,7 +45,7 @@ #include <net/ipip.h> #include <linux/igmp.h> -struct net_protocol *inet_protos[MAX_INET_PROTOS]; +struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; static DEFINE_SPINLOCK(inet_proto_lock); /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 87e9c161810..24d7c9f3191 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -132,7 +132,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct icmphdr))) return 1; - type = skb->h.icmph->type; + type = icmp_hdr(skb)->type; if (type < 32) { __u32 data = raw_sk(sk)->filter.data; @@ -184,8 +184,8 @@ out: void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int err = 0; int harderr = 0; @@ -256,7 +256,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) } nf_reset(skb); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); raw_rcv_skb(sk, skb); return 0; @@ -291,11 +291,13 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; - skb->h.raw = skb->nh.raw; + skb->transport_header = skb->network_header; err = memcpy_fromiovecend((void *)iph, from, 0, length); if (err) goto error_fault; @@ -613,7 +615,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address. 
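 * A raw socket carries no transport-level port, so the source address comes straight from the IP header and sin_port is left zero.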
*/ if (sin) { sin->sin_family = AF_INET; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } @@ -887,7 +889,7 @@ static int raw_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations raw_seq_ops = { +static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 37e0d4d5cf9..cb76e3c725a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -82,7 +82,6 @@ #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> @@ -104,6 +103,7 @@ #include <net/xfrm.h> #include <net/ip_mp_alg.h> #include <net/netevent.h> +#include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -364,7 +364,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations rt_cache_seq_ops = { +static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, @@ -470,7 +470,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations rt_cpu_seq_ops = { +static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, @@ -1519,7 +1519,7 @@ static void ipv4_link_failure(struct sk_buff *skb) static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", - NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), + NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), skb->dev ? skb->dev->name : "?"); kfree_skb(skb); return 0; @@ -1698,9 +1698,9 @@ static void ip_handle_martian_source(struct net_device *dev, printk(KERN_WARNING "martian source %u.%u.%u.%u from " "%u.%u.%u.%u, on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); - if (dev->hard_header_len && skb->mac.raw) { + if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; - unsigned char *p = skb->mac.raw; + const unsigned char *p = skb_mac_header(skb); printk(KERN_WARNING "ll header: "); for (i = 0; i < dev->hard_header_len; i++, p++) { printk("%02x", *p); @@ -2134,7 +2134,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, - skb->nh.iph->protocol); + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -2396,7 +2396,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) + if ((dev_out == NULL) && !(sysctl_ip_nonlocal_bind)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2407,7 +2407,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) of another iface. 
--ANK */ - if (oldflp->oif == 0 + if (dev_out && oldflp->oif == 0 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface @@ -2683,7 +2683,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, id = rt->peer->ip_id_count; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; - tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; + tsage = get_seconds() - rt->peer->tcp_ts_stamp; } } @@ -2721,7 +2721,7 @@ nla_put_failure: return -EMSGSIZE; } -int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; @@ -2747,10 +2747,11 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. */ - skb->mac.raw = skb->nh.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ - skb->nh.iph->protocol = IPPROTO_ICMP; + ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; @@ -3193,6 +3194,8 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + return rc; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 33016cc90f0..2da1be0589a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -125,10 +125,11 @@ static __u16 const msstab[] = { __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) { struct tcp_sock *tp = tcp_sk(sk); + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tp->last_synq_overflow = jiffies; /* XXX sort msstab[] by probability? Binary search? */ @@ -138,9 +139,8 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->h.th->source, skb->h.th->dest, - ntohl(skb->h.th->seq), + return secure_tcp_syn_cookie(iph->saddr, iph->daddr, + th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } @@ -157,14 +157,13 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) */ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) { - __u32 seq; - __u32 mssind; - - seq = ntohl(skb->h.th->seq)-1; - mssind = check_tcp_syn_cookie(cookie, - skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->h.th->source, skb->h.th->dest, - seq, jiffies / (HZ * 60), COUNTER_TRIES); + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); return mssind < NUM_MSS ? 
msstab[mssind] + 1 : 0; } @@ -191,14 +190,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); - __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; int mss; struct rtable *rt; __u8 rcv_wscale; - if (!sysctl_tcp_syncookies || !skb->h.th->ack) + if (!sysctl_tcp_syncookies || !th->ack) goto out; if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || @@ -220,12 +220,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } ireq = inet_rsk(req); treq = tcp_rsk(req); - treq->rcv_isn = ntohl(skb->h.th->seq) - 1; + treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; - ireq->rmt_port = skb->h.th->source; - ireq->loc_addr = skb->nh.iph->daddr; - ireq->rmt_addr = skb->nh.iph->saddr; + ireq->rmt_port = th->source; + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->opt = NULL; /* We throwed the options of the initial SYN away, so we hope @@ -261,8 +261,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, .tos = RT_CONN_FLAGS(sk) } }, .proto = IPPROTO_TCP, .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } } }; + { .sport = th->dest, + .dport = th->source } } }; security_req_classify_flow(req, &fl); if (ip_route_output_key(&rt, &fl)) { reqsk_free(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0aa304711a9..6817d6485df 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -647,6 +647,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_TCP_FRTO_RESPONSE, + .procname = "tcp_frto_response", + .data = &sysctl_tcp_frto_response, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_TCP_LOW_LATENCY, .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -803,6 +811,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_allowed_congestion_control, .strategy = &strategy_allowed_congestion_control, }, + { + .ctl_name = NET_TCP_MAX_SSTHRESH, + .procname = "tcp_max_ssthresh", + .data = &sysctl_tcp_max_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3834b10b511..8b124eafbb9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -297,7 +297,7 @@ EXPORT_SYMBOL(tcp_sockets_allocated); * All the sk_stream_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure; +int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); @@ -425,7 +425,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* Subtract 1, if FIN is in queue. 
*/ if (answ && !skb_queue_empty(&sk->sk_receive_queue)) answ -= - ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; + tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -444,7 +444,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) break; default: return -ENOIOCTLCMD; - }; + } return put_user(answ, (int __user *)arg); } @@ -460,9 +460,9 @@ static inline int forced_push(struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static inline void skb_entail(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); skb->csum = 0; @@ -470,10 +470,8 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, tcb->flags = TCPCB_FLAG_ACK; tcb->sacked = 0; skb_header_release(skb); - __skb_queue_tail(&sk->sk_write_queue, skb); + tcp_add_write_queue_tail(sk, skb); sk_charge_skb(sk, skb); - if (!sk->sk_send_head) - sk->sk_send_head = skb; if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } @@ -488,15 +486,17 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, } } -static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, - int mss_now, int nonagle) +static inline void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle) { - if (sk->sk_send_head) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_send_head(sk)) { + struct sk_buff *skb = tcp_write_queue_tail(sk); if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags, skb); - __tcp_push_pending_frames(sk, tp, mss_now, + __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? 
TCP_NAGLE_CORK : nonagle); } } @@ -526,13 +526,13 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse goto do_error; while (psize > 0) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct sk_buff *skb = tcp_write_queue_tail(sk); struct page *page = pages[poffset / PAGE_SIZE]; int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -542,7 +542,7 @@ new_segment: if (!skb) goto wait_for_memory; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -588,8 +588,8 @@ new_segment: if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -597,7 +597,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -608,7 +608,7 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); return copied; do_error: @@ -639,8 +639,9 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk, struct tcp_sock *tp) +static inline int select_size(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -704,9 +705,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, while (seglen > 0) { int copy; - skb = sk->sk_write_queue.prev; + skb = tcp_write_queue_tail(sk); - if (!sk->sk_send_head || + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: @@ -716,7 +717,7 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), + skb = sk_stream_alloc_pskb(sk, select_size(sk), 0, sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -727,7 +728,7 @@ new_segment: if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_PARTIAL; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -832,8 +833,8 @@ new_segment: if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -841,7 +842,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -853,16 +854,18 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); TCP_CHECK_TIMER(sk); release_sock(sk); return copied; do_fault: if (!skb->len) { - if (sk->sk_send_head == skb) - 
sk->sk_send_head = NULL; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); + /* It is the one place in all of TCP, except connection + * reset, where we can be unlinking the send_head. + */ + tcp_check_send_head(sk, skb); sk_stream_free_skb(sk, skb); } @@ -1016,9 +1019,9 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) skb_queue_walk(&sk->sk_receive_queue, skb) { offset = seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; - if (offset < skb->len || skb->h.th->fin) { + if (offset < skb->len || tcp_hdr(skb)->fin) { *off = offset; return skb; } @@ -1070,7 +1073,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset != skb->len) break; } - if (skb->h.th->fin) { + if (tcp_hdr(skb)->fin) { sk_eat_skb(sk, skb, 0); ++seq; break; @@ -1174,11 +1177,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; } offset = *seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; if (offset < skb->len) goto found_ok_skb; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; BUG_TRAP(flags & MSG_PEEK); skb = skb->next; @@ -1389,12 +1392,12 @@ do_prequeue: skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); } if (used + offset < skb->len) continue; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -1563,21 +1566,19 @@ void tcp_close(struct sock *sk, long timeout) */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - skb->h.th->fin; + tcp_hdr(skb)->fin; data_was_unread += len; __kfree_skb(skb); } sk_stream_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. 
*/ @@ -1732,7 +1733,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); - sk_stream_writequeue_purge(sk); + tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); #ifdef CONFIG_NET_DMA __skb_queue_purge(&sk->sk_async_wait_queue); @@ -1758,9 +1759,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); - sk->sk_send_head = NULL; - tp->rx_opt.saw_tstamp = 0; - tcp_sack_reset(&tp->rx_opt); + tcp_init_send_head(sk); + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); BUG_TRAP(!inet->num || icsk->icsk_bind_hash); @@ -1830,7 +1830,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, * for currently queued segments. */ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } else { tp->nonagle &= ~TCP_NAGLE_OFF; } @@ -1854,7 +1854,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle&TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } break; @@ -1954,7 +1954,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, default: err = -ENOPROTOOPT; break; - }; + } + release_sock(sk); return err; } @@ -2124,7 +2125,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; default: return -ENOPROTOOPT; - }; + } if (put_user(len, optlen)) return -EFAULT; @@ -2170,7 +2171,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) if (!pskb_may_pull(skb, sizeof(*th))) goto out; - th = skb->h.th; + th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; @@ -2210,7 +2211,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) delta = htonl(oldlen + (thlen + len)); skb = segs; - th = skb->h.th; + th = tcp_hdr(skb); seq = ntohl(th->seq); do { @@ -2219,23 +2220,25 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = + csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); seq += len; skb = skb->next; - th = skb->h.th; + th = tcp_hdr(skb); th->seq = htonl(seq); th->cwr = 0; } while (skb->next); - delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); + delta = htonl(oldlen + (skb->tail - skb->transport_header) + + skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); out: return segs; @@ -2372,6 +2375,23 @@ void __tcp_put_md5sig_pool(void) EXPORT_SYMBOL(__tcp_put_md5sig_pool); #endif +void tcp_done(struct sock *sk) +{ + if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_done); + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c 
index 5730333cd0a..281c9f91325 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -206,7 +206,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 34ae3f13483..86b26539e54 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -12,6 +12,8 @@ #include <linux/list.h> #include <net/tcp.h> +int sysctl_tcp_max_ssthresh = 0; + static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); @@ -124,7 +126,7 @@ int tcp_set_default_congestion_control(const char *name) #endif if (ca) { - ca->non_restricted = 1; /* default is always allowed */ + ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ list_move(&ca->list, &tcp_cong_list); ret = 0; } @@ -179,7 +181,7 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (!ca->non_restricted) + if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", @@ -210,16 +212,16 @@ int tcp_set_allowed_congestion_control(char *val) } } - /* pass 2 clear */ + /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) - ca->non_restricted = 0; + ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) - ca->non_restricted = 1; + ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); @@ -254,7 +256,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (!ca) err = -ENOENT; - else if (!(ca->non_restricted || capable(CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) @@ -274,10 +276,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) /* - * Linear increase during slow start + * Slow start (exponential increase) with + * RFC3742 Limited Slow Start (fast linear increase) support. 
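+ * e.g. with tcp_max_ssthresh = 100 and snd_cwnd = 200, the code below adds max_ssthresh/2 = 50 to snd_cwnd_cnt on each ACK, so cwnd grows by one segment every four ACKs, i.e. by about max_ssthresh/2 segments per RTT instead of doubling.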
*/ void tcp_slow_start(struct tcp_sock *tp) { + int cnt = 0; + if (sysctl_tcp_abc) { /* RFC3465: Slow Start * TCP sender SHOULD increase cwnd by the number of @@ -286,17 +291,25 @@ void tcp_slow_start(struct tcp_sock *tp) */ if (tp->bytes_acked < tp->mss_cache) return; - - /* We MAY increase by 2 if discovered delayed ack */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } } + + if (sysctl_tcp_max_ssthresh > 0 && + tp->snd_cwnd > sysctl_tcp_max_ssthresh) + cnt += sysctl_tcp_max_ssthresh>>1; + else + cnt += tp->snd_cwnd; + + /* RFC3465: We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) + cnt <<= 1; tp->bytes_acked = 0; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; + tp->snd_cwnd_cnt += cnt; + while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd_cnt -= tp->snd_cwnd; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -358,8 +371,8 @@ u32 tcp_reno_min_cwnd(const struct sock *sk) EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); struct tcp_congestion_ops tcp_reno = { + .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", - .non_restricted = 1, .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 9a582fb4ef9..14224487b16 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -1,5 +1,5 @@ /* - * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * TCP CUBIC: Binary Increase Congestion control for TCP v2.1 * * This is from the implementation of CUBIC TCP in * Injong Rhee, Lisong Xu. @@ -51,8 +51,6 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_ module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); -#include <asm/div64.h> - /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ @@ -93,50 +91,51 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* 64bit divisor, dividend and result. dynamic precision */ -static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) -{ - u_int32_t d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - /* avoid 64 bit division if possible */ - if (dividend >> 32) - do_div(dividend, d); - else - dividend = (uint32_t) dividend / d; - - return dividend; -} - -/* - * calculate the cubic root of x using Newton-Raphson +/* calculate the cubic root of x using a table lookup followed by one + * Newton-Raphson iteration. + * Avg err ~= 0.195% */ static u32 cubic_root(u64 a) { - u32 x, x1; - - /* Initial estimate is based on: - * cbrt(x) = exp(log(x) / 3) + u32 x, b, shift; + /* + * cbrt(x) MSB values for x MSB values in [0..63]. 
+ * Precomputed then refined by hand - Willy Tarreau + * + * For x in [0..63], + * v = cbrt(x << 18) - 1 + * cbrt(x) = (v[x] + 10) >> 6 */ - x = 1u << (fls64(a)/3); + static const u8 v[] = { + /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, + /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, + /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, + /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, + /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, + /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, + /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, + /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, + }; + + b = fls64(a); + if (b < 7) { + /* a in [0..63] */ + return ((u32)v[(u32)a] + 35) >> 6; + } + + b = ((b * 84) >> 8) - 1; + shift = (a >> (b * 3)); + + x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; /* - * Iteration based on: + * Newton-Raphson iteration * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ - do { - x1 = x; - x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; - } while (abs(x1 - x) > 1); - + x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1))); + x = ((x * 341) >> 10); return x; } @@ -215,7 +214,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) if (ca->delay_min > 0) { /* max increment = Smax * rtt / 0.1 */ min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); - if (ca->cnt < min_cnt) + + /* use concave growth when the target is above the origin */ + if (ca->cnt < min_cnt && t >= ca->bic_K) ca->cnt = min_cnt; } @@ -333,7 +334,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -401,4 +402,4 @@ module_exit(cubictcp_unregister); MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CUBIC TCP"); -MODULE_VERSION("2.0"); +MODULE_VERSION("2.1"); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index a291097fcc0..43d624e5043 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -97,10 +97,6 @@ struct hstcp { u32 ai; }; -static int max_ssthresh = 100; -module_param(max_ssthresh, int, 0644); -MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); - static void hstcp_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -122,23 +118,9 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* RFC3742: limited slow start - * the window is increased by 1/K MSS for each arriving ACK, - * for K = int(cwnd/(0.5 max_ssthresh)) - */ - if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { - u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); - if (++tp->snd_cwnd_cnt >= k) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - } else { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters. 
* * We want to guarantee that: diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1020eb48d8d..4ba4a7ae0a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -98,7 +98,7 @@ static inline void measure_rtt(struct sock *sk) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 59e691d26f6..e5be3511722 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -144,7 +144,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->snd_cwnd_cents += odd; /* check when fractions goes >=128 and increase cwnd by 1. */ - while(ca->snd_cwnd_cents >= 128) { + while (ca->snd_cwnd_cents >= 128) { tp->snd_cwnd++; ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c new file mode 100644 index 00000000000..4adc47c5535 --- /dev/null +++ b/net/ipv4/tcp_illinois.c @@ -0,0 +1,356 @@ +/* + * TCP Illinois congestion control. + * Home page: + * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + * + * The algorithm is described in: + * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm + * for High-Speed Networks" + * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf + * + * Implemented from description in paper and ns-2 simulation. + * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> +#include <asm/div64.h> +#include <net/tcp.h> + +#define ALPHA_SHIFT 7 +#define ALPHA_SCALE (1u<<ALPHA_SHIFT) +#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ +#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ +#define ALPHA_BASE ALPHA_SCALE /* 1.0 */ +#define U32_MAX ((u32)~0U) +#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ + +#define BETA_SHIFT 6 +#define BETA_SCALE (1u<<BETA_SHIFT) +#define BETA_MIN (BETA_SCALE/8) /* 0.125 */ +#define BETA_MAX (BETA_SCALE/2) /* 0.5 */ +#define BETA_BASE BETA_MAX + +static int win_thresh __read_mostly = 15; +module_param(win_thresh, int, 0); +MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing"); + +static int theta __read_mostly = 5; +module_param(theta, int, 0); +MODULE_PARM_DESC(theta, "# of fast RTT's before full growth"); + +/* TCP Illinois Parameters */ +struct illinois { + u64 sum_rtt; /* sum of rtt's measured within last rtt */ + u16 cnt_rtt; /* # of rtts measured within last rtt */ + u32 base_rtt; /* min of all rtt in usec */ + u32 max_rtt; /* max of all rtt in usec */ + u32 end_seq; /* right edge of current RTT */ + u32 alpha; /* Additive increase */ + u32 beta; /* Multiplicative decrease */ + u16 acked; /* # packets acked by current ACK */ + u8 rtt_above; /* average rtt has gone above threshold */ + u8 rtt_low; /* # of rtt measurements below threshold */ +}; + +static void rtt_reset(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + ca->end_seq = tp->snd_nxt; + ca->cnt_rtt = 0; + ca->sum_rtt = 0; + + /* TODO: age max_rtt?
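+ * (a stale maximum inflates dm = max_rtt - base_rtt, biasing alpha() and beta() toward their uncongested values; one option would be to decay max_rtt toward base_rtt once per window)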
*/ +} + +static void tcp_illinois_init(struct sock *sk) +{ + struct illinois *ca = inet_csk_ca(sk); + + ca->alpha = ALPHA_MAX; + ca->beta = BETA_BASE; + ca->base_rtt = 0x7fffffff; + ca->max_rtt = 0; + + ca->acked = 0; + ca->rtt_low = 0; + ca->rtt_above = 0; + + rtt_reset(sk); +} + +/* Measure RTT for each ack. */ +static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +{ + struct illinois *ca = inet_csk_ca(sk); + u32 rtt; + + ca->acked = pkts_acked; + + rtt = ktime_to_us(net_timedelta(last)); + + /* ignore bogus values, this prevents wraparound in alpha math */ + if (rtt > RTT_MAX) + rtt = RTT_MAX; + + /* keep track of minimum RTT seen so far */ + if (ca->base_rtt > rtt) + ca->base_rtt = rtt; + + /* and max */ + if (ca->max_rtt < rtt) + ca->max_rtt = rtt; + + ++ca->cnt_rtt; + ca->sum_rtt += rtt; +} + +/* Maximum queuing delay */ +static inline u32 max_delay(const struct illinois *ca) +{ + return ca->max_rtt - ca->base_rtt; +} + +/* Average queuing delay */ +static inline u32 avg_delay(const struct illinois *ca) +{ + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + return t - ca->base_rtt; +} + +/* + * Compute value of alpha used for additive increase. + * If small window then use 1.0, equivalent to Reno. + * + * For larger windows, adjust based on average delay. + * A. If average delay is at minimum (we are uncongested), + * then use large alpha (10.0) to increase faster. + * B. If average delay is at maximum (getting congested) + * then use small alpha (0.3) + * + * The result is a convex window growth curve. + */ +static u32 alpha(struct illinois *ca, u32 da, u32 dm) +{ + u32 d1 = dm / 100; /* Low threshold */ + + if (da <= d1) { + /* If never got out of low delay zone, then use max */ + if (!ca->rtt_above) + return ALPHA_MAX; + + /* Wait for 5 good RTT's before allowing alpha to go alpha max. + * This prevents one good RTT from causing sudden window increase. + */ + if (++ca->rtt_low < theta) + return ca->alpha; + + ca->rtt_low = 0; + ca->rtt_above = 0; + return ALPHA_MAX; + } + + ca->rtt_above = 1; + + /* + * Based on: + * + * (dm - d1) amin amax + * k1 = ------------------- + * amax - amin + * + * (dm - d1) amin + * k2 = ---------------- - d1 + * amax - amin + * + * k1 + * alpha = ---------- + * k2 + da + */ + + dm -= d1; + da -= d1; + return (dm * ALPHA_MAX) / + (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN); +} + +/* + * Beta used for multiplicative decrease. 
+ * For small window sizes returns same value as Reno (0.5) + * + * If delay is small (10% of max) then beta = 1/8 + * If delay is up to 80% of max then beta = 1/2 + * In between is a linear function + */ +static u32 beta(u32 da, u32 dm) +{ + u32 d2, d3; + + d2 = dm / 10; + if (da <= d2) + return BETA_MIN; + + d3 = (8 * dm) / 10; + if (da >= d3 || d3 <= d2) + return BETA_MAX; + + /* + * Based on: + * + * bmin d3 - bmax d2 + * k3 = ------------------- + * d3 - d2 + * + * bmax - bmin + * k4 = ------------- + * d3 - d2 + * + * b = k3 + k4 da + */ + return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da) + / (d3 - d2); +} + +/* Update alpha and beta values once per RTT */ +static void update_params(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (tp->snd_cwnd < win_thresh) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + } else if (ca->cnt_rtt > 0) { + u32 dm = max_delay(ca); + u32 da = avg_delay(ca); + + ca->alpha = alpha(ca, da, dm); + ca->beta = beta(da, dm); + } + + rtt_reset(sk); +} + +/* + * In case of loss, reset to default values + */ +static void tcp_illinois_state(struct sock *sk, u8 new_state) +{ + struct illinois *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + ca->rtt_low = 0; + ca->rtt_above = 0; + rtt_reset(sk); + } +} + +/* + * Increase window in response to successful acknowledgment. + */ +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (after(ack, ca->end_seq)) + update_params(sk); + + /* RFC2861 only increase cwnd if fully utilized */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* In slow start */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else { + u32 delta; + + /* snd_cwnd_cnt is # of packets since last cwnd increment */ + tp->snd_cwnd_cnt += ca->acked; + ca->acked = 1; + + /* This is close approximation of: + * tp->snd_cwnd += alpha/tp->snd_cwnd + */ + delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; + if (delta >= tp->snd_cwnd) { + tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, + (u32) tp->snd_cwnd_clamp); + tp->snd_cwnd_cnt = 0; + } + } +} + +static u32 tcp_illinois_ssthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + /* Multiplicative decrease */ + return max((tp->snd_cwnd * ca->beta) >> BETA_SHIFT, 2U); +} + + +/* Extract info for Tcp socket info provided via netlink. 
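+ * The counters are exported through the existing struct tcpvegas_info / INET_DIAG_VEGASINFO attribute, so userspace that already understands the Vegas block can display them unchanged.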
*/ +static void tcp_illinois_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct illinois *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rttcnt = ca->cnt_rtt, + .tcpv_minrtt = ca->base_rtt, + }; + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + info.tcpv_rtt = t; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops tcp_illinois = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_illinois_init, + .ssthresh = tcp_illinois_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = tcp_illinois_cong_avoid, + .set_state = tcp_illinois_state, + .get_info = tcp_illinois_info, + .pkts_acked = tcp_illinois_acked, + + .owner = THIS_MODULE, + .name = "illinois", +}; + +static int __init tcp_illinois_register(void) +{ + BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_illinois); +} + +static void __exit tcp_illinois_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_illinois); +} + +module_init(tcp_illinois_register); +module_exit(tcp_illinois_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Shao Liu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Illinois"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a14191687a..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -86,6 +86,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; @@ -100,6 +101,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -110,6 +112,8 @@ int sysctl_tcp_abc __read_mostly; #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define IsSackFrto() (sysctl_tcp_frto == 0x2) + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* Adapt the MSS value used to make delayed ack decision to the @@ -136,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * * "len" is invariant segment length, including TCP header. */ - len += skb->data - skb->h.raw; + len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. @@ -144,7 +148,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && - !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. @@ -231,9 +235,9 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. 
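 * tcp_grow_window() below only takes the cheap 2*advmss increment when the skb's window-scaled true size fits within its data length; otherwise this slow path scales the check by the skb's real memory footprint before letting rcv_ssthresh grow.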
*/ -static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, - const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; @@ -248,9 +252,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && @@ -263,7 +269,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2*tp->advmss; else - incr = __tcp_grow_window(sk, tp, skb); + incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); @@ -326,8 +332,9 @@ static void tcp_init_buffer_space(struct sock *sk) } /* 5. Recalculate window clamp after socket hit its memory bounds. */ -static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +static void tcp_clamp_window(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; @@ -499,8 +506,9 @@ new_measure: * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; @@ -541,7 +549,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) - tcp_grow_window(sk, tp, skb); + tcp_grow_window(sk, skb); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -574,7 +582,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 
8) */ - if(m == 0) + if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ @@ -759,15 +767,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) } /* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk) +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -934,7 +944,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; @@ -1038,7 +1049,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_skb = tp->fastpath_skb_hint; cached_fack_count = tp->fastpath_cnt_hint; if (!cached_skb) { - cached_skb = sk->sk_write_queue.next; + cached_skb = tcp_write_queue_head(sk); cached_fack_count = 0; } @@ -1055,10 +1066,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + if (skb == tcp_send_head(sk)) + break; + cached_skb = skb; cached_fack_count = fack_count; if (i == first_sack_index) { @@ -1159,6 +1173,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). 
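+ * An ACK that SACKs only segments sent before the RTO and never retransmitted is evidence that the original transmissions survived, i.e. that the timeout was spurious.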
+ * Clearing correct due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -1195,7 +1221,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) @@ -1224,7 +1252,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 @@ -1236,9 +1265,49 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* RTO occurred, but do not yet enter loss state. Instead, transmit two new - * segments to see from the next ACKs whether any data was really missing. - * If the RTO was spurious, new ACKs should arrive. +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ +int tcp_use_frto(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto) + return 0; + + if (IsSackFrto()) + return 1; + + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = tcp_write_queue_head(sk); + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + break; + } + return 1; +} + +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, other than head + * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window." 
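+ * + * Rough timeline: RTO fires -> tcp_enter_frto() (frto_counter = 1) and the head is retransmitted as usual; on the next ACK, tcp_process_frto() sends new data instead of retransmitting; what the ACK after that covers decides between a spurious-RTO undo and tcp_enter_frto_loss().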
*/ void tcp_enter_frto(struct sock *sk) { @@ -1246,39 +1315,69 @@ void tcp_enter_frto(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - tp->frto_counter = 1; - - if (icsk->icsk_ca_state <= TCP_CA_Disorder || + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || - (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + /* Our state is too optimistic in ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous FRTO + * recovery has not yet completed. Pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal with these incompatibility + * issues if they receive CA_EVENT_FRTO and frto_counter != 0 + */ tcp_ca_event(sk, CA_EVENT_FRTO); } - /* Have to clear retransmission markers here to keep the bookkeeping - * in shape, even though we are not yet in Loss state. - * If something was really lost, it is eventually caught up - * in tcp_enter_frto_loss. - */ - tp->retrans_out = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - sk_stream_for_retrans_queue(skb, sk) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); } tcp_sync_left_out(tp); - tcp_set_ca_state(sk, TCP_CA_Open); - tp->frto_highmark = tp->snd_nxt; + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in the tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; + tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission.
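 * The code below resets cwnd to packets in flight plus allowed_segments and marks every non-SACKed segment below frto_highmark as lost; recovery then proceeds as after an ordinary RTO.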
*/ -static void tcp_enter_frto_loss(struct sock *sk) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1287,10 +1386,23 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->sacked_out = 0; tp->lost_out = 0; tp->fackets_out = 0; + tp->retrans_out = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were @@ -1308,7 +1420,7 @@ static void tcp_enter_frto_loss(struct sock *sk) } tcp_sync_left_out(tp); - tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -1366,7 +1478,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (!how) tp->undo_marker = tp->snd_una; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; @@ -1401,14 +1515,14 @@ static int tcp_check_sack_reneging(struct sock *sk) * receiver _host_ is heavily congested (or buggy). * Do processing similar to RTO timeout. */ - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + if ((skb = tcp_write_queue_head(sk)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -1426,10 +1540,12 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } -static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +static inline int tcp_head_timedout(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + return tp->packets_out && - tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1525,10 +1641,15 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +static int tcp_time_to_recover(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + /* Do not perform any recovery during FRTO algorithm */ + if (tp->frto_counter) + return 0; + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; @@ -1540,7 +1661,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. 
*/ - if (tcp_head_timedout(sk, tp)) + if (tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -1549,7 +1670,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && - !tcp_may_send_now(sk, tp)) { + !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ @@ -1589,8 +1710,10 @@ static void tcp_add_reno_sack(struct sock *sk) /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); + if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ if (acked-1 >= tp->sacked_out) @@ -1609,9 +1732,10 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) } /* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, +static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt; @@ -1620,11 +1744,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -1638,12 +1764,11 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* clear xmit_retransmit_queue hints * if this is beyond hint */ - if(tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { - + if (tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; - } + } } tcp_sync_left_out(tp); @@ -1651,15 +1776,17 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +static void tcp_update_scoreboard(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (IsFack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + tcp_mark_head_lost(sk, lost, tp->high_seq); } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + tcp_mark_head_lost(sk, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1667,13 +1794,15 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : sk->sk_write_queue.next; + : tcp_write_queue_head(sk); - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (!tcp_skb_timedout(sk, skb)) break; @@ -1745,9 +1874,11 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp) /* Undo procedures. 
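
The recurring tcp_for_write_queue() / break-at-tcp_send_head() pattern in these hunks replaces the old sk_stream_for_retrans_queue() macro, which stopped at sk_send_head implicitly. A sketch of the accessors this series relies on, reconstructed from the open-coded lines they replace (the exact definitions live in the new include/net/tcp.h helpers):

	static inline struct sk_buff *tcp_write_queue_head(struct sock *sk)
	{
		struct sk_buff *skb = sk->sk_write_queue.next;

		/* the write queue is circular; an empty list points back
		 * at its own head */
		if (skb == (struct sk_buff *)&sk->sk_write_queue)
			return NULL;
		return skb;
	}

	static inline struct sk_buff *tcp_send_head(struct sock *sk)
	{
		return sk->sk_send_head;	/* first never-sent skb, or NULL */
	}

	#define tcp_for_write_queue(skb, sk)				\
		for (skb = (sk)->sk_write_queue.next;			\
		     (skb) != (struct sk_buff *)&(sk)->sk_write_queue;	\
		     skb = skb->next)
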
*/ #if FASTRETRANS_DEBUG > 1 -static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +static void DBGUNDO(struct sock *sk, const char *msg) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), @@ -1793,13 +1924,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_recovery(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); @@ -1819,10 +1952,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +static void tcp_try_undo_dsack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->undo_marker && !tp->undo_retrans) { - DBGUNDO(sk, tp, "D-SACK"); + DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); @@ -1831,9 +1966,9 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) /* Undo during fast recovery after partial ACK. */ -static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, - int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ int failed = IsReno(tp) || tp->fackets_out>tp->reordering; @@ -1846,7 +1981,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - DBGUNDO(sk, tp, "Hoe"); + DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); @@ -1860,17 +1995,21 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, } /* Undo during loss recovery after partial ACK. 
*/ -static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_loss(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } clear_all_retrans_hints(tp); - DBGUNDO(sk, tp, "partial loss"); + DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); @@ -1892,15 +2031,17 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +static void tcp_try_to_open(struct sock *sk, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + tp->left_out = tp->sacked_out; if (tp->retrans_out == 0) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -1987,7 +2128,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -1997,14 +2138,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; break; @@ -2018,7 +2158,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Disorder: - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. 
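
Every undo path above funnels through the same pair of predicates (tcp_packet_delayed() is visible only as hunk context here). For reference, a sketch of both, assuming the unchanged 2.6.21-era bodies: undo is allowed when nothing is marked retransmitted, or when the echoed timestamp predates the first retransmission, proving the original transmission was the one that got ACKed.

	static inline int tcp_packet_delayed(struct tcp_sock *tp)
	{
		return !tp->retrans_stamp ||
			(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
			 (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
	}

	static inline int tcp_may_undo(struct tcp_sock *tp)
	{
		return tp->undo_marker &&
			(!tp->undo_retrans || tcp_packet_delayed(tp));
	}
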
*/ @@ -2031,7 +2171,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, case TCP_CA_Recovery: if (IsReno(tp)) tcp_reset_reno_sack(tp); - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; @@ -2047,14 +2187,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, acked); - is_dupack = tcp_try_undo_partial(sk, tp, acked); + tcp_remove_reno_sacks(sk, acked); + is_dupack = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (!tcp_try_undo_loss(sk, tp)) { + if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; @@ -2071,10 +2211,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } if (icsk->icsk_ca_state == TCP_CA_Disorder) - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk, tp)) { - tcp_try_to_open(sk, tp, flag); + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); return; } @@ -2113,8 +2253,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk, tp)) - tcp_update_scoreboard(sk, tp); + if (is_dupack || tcp_head_timedout(sk)) + tcp_update_scoreboard(sk); tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -2190,8 +2330,10 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -2255,14 +2397,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); -} - /* Remove acknowledged frames from the retransmission queue. 
*/ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2273,12 +2407,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; + ktime_t last_ackt = ktime_set(0,0); - while ((skb = skb_peek(&sk->sk_write_queue)) && - skb != sk->sk_send_head) { + while ((skb = tcp_write_queue_head(sk)) && + skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -2318,13 +2450,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (sacked) { if (sacked & TCPCB_RETRANS) { - if(sacked & TCPCB_SACKED_RETRANS) + if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2337,23 +2469,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk, tp); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); + tcp_ack_packets_out(sk); - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, last_ackt); } #if FASTRETRANS_DEBUG > 0 @@ -2390,7 +2523,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); @@ -2433,13 +2566,14 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb, u32 ack, u32 ack_seq) +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) { + struct tcp_sock *tp = tcp_sk(sk); int flag = 0; - u32 nwin = ntohs(skb->h.th->window); + u32 nwin = ntohs(tcp_hdr(skb)->window); - if (likely(!skb->h.th->syn)) + if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { @@ -2453,7 +2587,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, * fast path is recovered for sending TCP. 
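
With tcp_usrtt() and the rtt_sample hook gone, a congestion module that wants wall-clock RTT samples now derives them from the ktime_t handed to pkts_acked(). A minimal consumer sketch (sample_pkts_acked is a hypothetical hook name; the tcp_lp conversion later in this patch does exactly this with net_timedelta()):

	static void sample_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last)
	{
		/* last is the transmit timestamp of the newest cleanly ACKed
		 * skb; it stays zero when only retransmitted data was ACKed,
		 * so those samples are skipped.
		 */
		if (ktime_to_ns(last) != 0) {
			u32 rtt_us = ktime_to_us(net_timedelta(last));

			/* feed rtt_us into the module's estimator here */
		}
	}
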
 */
 		tp->pred_flags = 0;
-		tcp_fast_path_check(sk, tp);
+		tcp_fast_path_check(sk);
 
 		if (nwin > tp->max_window) {
 			tp->max_window = nwin;
@@ -2467,39 +2601,139 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
 	return flag;
 }
 
-static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
+/* A very conservative spurious RTO response algorithm: reduce cwnd and
+ * continue in congestion avoidance.
+ */
+static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+{
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd_cnt = 0;
+	tcp_moderate_cwnd(tp);
+}
+
+/* A conservative spurious RTO response algorithm: reduce cwnd using
+ * rate halving and continue in congestion avoidance.
+ */
+static void tcp_ratehalving_spur_to_response(struct sock *sk)
+{
+	tcp_enter_cwr(sk, 0);
+}
+
+static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+{
+	if (flag&FLAG_ECE)
+		tcp_ratehalving_spur_to_response(sk);
+	else
+		tcp_undo_cwr(sk, 1);
+}
+
+/* F-RTO spurious RTO detection algorithm (RFC4138)
+ *
+ * F-RTO is in effect during the two new ACKs following an RTO (well, almost,
+ * see inline comments). State (the ACK number) is kept in frto_counter. When
+ * an ACK advances the window (but not to or beyond the highest sequence sent
+ * before the RTO):
+ *   On the First ACK,  send two new segments out.
+ *   On the Second ACK, the RTO was likely spurious. Do the spurious response
+ *                      (the response algorithm is not part of the F-RTO
+ *                      detection algorithm given in RFC4138 but can be
+ *                      selected separately).
+ * Otherwise (basically on a duplicate ACK), the RTO was (likely) caused by a
+ * loss and TCP falls back to conventional RTO recovery. F-RTO allows
+ * overriding of Nagle; this is done using frto_counter states 2 and 3: when
+ * a new data segment of any size is sent during F-RTO, state 2 is upgraded
+ * to 3.
+ *
+ * Rationale: if the RTO was spurious, new ACKs should arrive from the
+ * original window even after we transmit two new data segments.
+ *
+ * SACK version:
+ *   on the first step, wait until the first cumulative ACK arrives, then
+ *   move to the second step. In the second step, the next ACK decides.
+ *
+ * F-RTO is implemented (mainly) in four functions:
+ *   - tcp_use_frto() is used to determine if TCP can use F-RTO
+ *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used; it is
+ *     called when tcp_use_frto() has shown the green light
+ *   - tcp_process_frto() handles incoming ACKs during the F-RTO algorithm
+ *   - tcp_enter_frto_loss() is called if there is not enough evidence
+ *     to prove that the RTO is indeed spurious. It transfers control
+ *     from F-RTO to conventional RTO recovery
+ */
+static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tcp_sync_left_out(tp);
 
-	if (tp->snd_una == prior_snd_una ||
-	    !before(tp->snd_una, tp->frto_highmark)) {
-		/* RTO was caused by loss, start retransmitting in
-		 * go-back-N slow start
+	/* Duplicate the behavior from Loss state (fastretrans_alert) */
+	if (flag&FLAG_DATA_ACKED)
+		inet_csk(sk)->icsk_retransmits = 0;
+
+	if (!before(tp->snd_una, tp->frto_highmark)) {
+		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ?
2 : 3), flag); + return 1; + } + + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate */ - tcp_enter_frto_loss(sk); - return; + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; + + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. */ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter >= 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { - /* First ACK after RTO advances the window: allow two new - * segments out. - */ + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; + return 1; } else { - /* Also the second ACK after RTO advances the window. - * The RTO was likely spurious. Reduce cwnd and continue - * in congestion avoidance - */ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tcp_moderate_cwnd(tp); + switch (sysctl_tcp_frto_response) { + case 2: + tcp_undo_spur_to_response(sk, flag); + break; + case 1: + tcp_conservative_spur_to_response(tp); + break; + default: + tcp_ratehalving_spur_to_response(sk); + break; + } + tp->frto_counter = 0; } - - /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavior is back to normal. - */ - tp->frto_counter = (tp->frto_counter + 1) % 3; + return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -2513,6 +2747,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; s32 seq_rtt; int prior_packets; + int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2549,12 +2784,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); @@ -2575,15 +2810,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una); + frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. 
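
Taken together, tcp_use_frto(), tcp_enter_frto(), tcp_process_frto() and tcp_enter_frto_loss() make a spurious-timeout episode on a non-SACK flow run as follows (assuming nothing was actually lost): the RTO fires, the head segment alone is retransmitted, and frto_counter becomes 1; the first new ACK advances snd_una, so cwnd opens to packets-in-flight plus 2 and frto_counter moves to 2, letting two never-before-sent segments probe the path; if the next ACK advances the window again, it can only have been triggered by an original transmission, the timeout is declared spurious, and the selected sysctl_tcp_frto_response policy (undo, conservative, or rate-halving as above) restores or re-moderates cwnd. A duplicate ACK at either step instead drops the connection into tcp_enter_frto_loss() and ordinary go-back-N recovery.
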
*/ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } @@ -2599,7 +2835,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (sk->sk_send_head) + if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; @@ -2620,13 +2856,13 @@ uninteresting_ack: void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; - while(length>0) { + while (length > 0) { int opcode=*ptr++; int opsize; @@ -2642,9 +2878,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, return; if (opsize > length) return; /* don't parse partial options */ - switch(opcode) { + switch (opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn && !estab) { + if (opsize==TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) @@ -2654,12 +2890,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (opsize==TCPOLEN_WINDOW && th->syn && !estab) if (sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *) ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if(net_ratelimit()) + if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); @@ -2669,7 +2905,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { + if (opsize==TCPOLEN_TIMESTAMP) { if ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps)) { opt_rx->saw_tstamp = 1; @@ -2679,7 +2915,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { if (sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); @@ -2688,7 +2924,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; @@ -2701,10 +2937,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - }; + } + ptr+=opsize-2; length-=opsize; - }; + } } } @@ -2737,7 +2974,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -2750,8 +2987,8 @@ static 
inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } @@ -2782,7 +3019,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2803,7 +3040,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff * { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } @@ -2910,7 +3147,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __FUNCTION__, sk->sk_state); break; - }; + } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. @@ -3009,7 +3246,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + for (i=this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i+1]; continue; } @@ -3062,7 +3299,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.num_sacks--; sp--; } - for(; this_sack > 0; this_sack--, sp--) + for (; this_sack > 0; this_sack--, sp--) *sp = *(sp-1); new_sack: @@ -3088,7 +3325,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for(this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks; ) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3144,8 +3381,8 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); } } @@ -3153,7 +3390,7 @@ static int tcp_prune_queue(struct sock *sk); static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -3210,9 +3447,9 @@ queue_and_out: __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->len) - tcp_event_data_recv(sk, tp, skb); - if(th->fin) + if (skb->len) + tcp_event_data_recv(sk, skb); + if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -3228,7 +3465,7 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); @@ -3392,7 +3629,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, * - bloated or contains data before "start" or * overlaps to the next one. 
*/ - if (!skb->h.th->syn && !skb->h.th->fin && + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start) || (skb->next != tail && @@ -3403,7 +3640,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start = TCP_SKB_CB(skb)->end_seq; skb = skb->next; } - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -3419,11 +3656,14 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); - nskb->h.raw = nskb->head + (skb->h.raw-skb->head); - nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); @@ -3449,7 +3689,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) return; } } @@ -3514,7 +3756,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); + tcp_clamp_window(sk); else if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -3583,8 +3825,10 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + /* If the user specified a specific send buffer setting, do * not modify it. 
*/ @@ -3616,7 +3860,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk, tp)) { + if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, @@ -3640,9 +3884,9 @@ static void tcp_check_space(struct sock *sk) } } -static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk) { - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); tcp_check_space(sk); } @@ -3790,7 +4034,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int err; local_bh_enable(); - if (skb->ip_summed==CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); else err = skb_copy_and_csum_datagram_iovec(skb, hlen, @@ -3822,7 +4066,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { - return skb->ip_summed != CHECKSUM_UNNECESSARY && + return !skb_csum_unnecessary(skb) && __tcp_checksum_complete_user(sk, skb); } @@ -3840,7 +4084,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = get_softnet_dma(); - if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); @@ -3856,7 +4100,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen tcp_rcv_space_adjust(sk); if ((tp->ucopy.len == 0) || - (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); @@ -3976,7 +4220,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -4047,12 +4291,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... 
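
skb_csum_unnecessary(), which these hunks substitute for the open-coded ip_summed comparison, is at this point in the series just the old test behind a name that can later absorb more receive-checksum states; a sketch of the helper as assumed here:

	static inline int skb_csum_unnecessary(const struct sk_buff *skb)
	{
		return skb->ip_summed == CHECKSUM_UNNECESSARY;
	}
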
*/ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -4109,7 +4353,7 @@ slow_path: goto discard; } - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4124,7 +4368,7 @@ slow_path: } step5: - if(th->ack) + if (th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); tcp_rcv_rtt_measure_ts(sk, skb); @@ -4135,7 +4379,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return 0; @@ -4412,13 +4656,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_LISTEN: - if(th->ack) + if (th->ack) return 1; - if(th->rst) + if (th->rst) goto discard; - if(th->syn) { + if (th->syn) { if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -4452,7 +4696,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } @@ -4474,7 +4718,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } /* step 2: check RST bit */ - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4497,7 +4741,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); - switch(sk->sk_state) { + switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; @@ -4644,7 +4888,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0ba74bbe7d3..5a3e7f839fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,7 +88,7 @@ int sysctl_tcp_low_latency __read_mostly; #define ICMP_MIN_LENGTH 8 /* Socket used for sending RSTs */ -static struct socket *tcp_socket; +static struct socket *tcp_socket __read_mostly; void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); @@ -125,10 +125,10 @@ void tcp_unhash(struct sock *sk) static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) { - return secure_tcp_sequence_number(skb->nh.iph->daddr, - skb->nh.iph->saddr, - skb->h.th->dest, - skb->h.th->source); + return secure_tcp_sequence_number(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -149,7 +149,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) */ if (tcptw->tw_ts_recent_stamp && (twp == NULL || (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -224,7 +224,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. 
*/ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } @@ -354,8 +354,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; __u32 seq; int err; @@ -499,11 +499,12 @@ out: void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { th->check = ~tcp_v4_check(len, inet->saddr, inet->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { th->check = tcp_v4_check(len, inet->saddr, inet->daddr, @@ -515,17 +516,18 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) int tcp_v4_gso_send_check(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; struct tcphdr *th; if (!pskb_may_pull(skb, sizeof(*th))) return -EINVAL; - iph = skb->nh.iph; - th = skb->h.th; + iph = ip_hdr(skb); + th = tcp_hdr(skb); th->check = 0; th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; return 0; @@ -546,7 +548,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; #ifdef CONFIG_TCP_MD5SIG @@ -585,7 +587,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG - key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL; + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -597,14 +599,14 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1], key, - skb->nh.iph->daddr, - skb->nh.iph->saddr, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th, IPPROTO_TCP, arg.iov[0].iov_len); } #endif - arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, - skb->nh.iph->saddr, /* XXX */ + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; @@ -622,7 +624,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) @@ -670,7 +672,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, * skb->sk) holds true, but we program defensively. 
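
The csum_start assignment added to tcp_v4_send_check() pairs with the existing csum_offset to tell the transmit path where to finish a CHECKSUM_PARTIAL packet: the device (or the software fallback) checksums from csum_start onward and stores the result csum_offset bytes further in. Roughly, and as a sketch of the convention rather than of any single function in this patch:

	static __sum16 *csum_final_location(const struct sk_buff *skb)
	{
		/* tcp_v4_send_check() set csum_start to the transport-header
		 * offset and csum_offset to offsetof(struct tcphdr, check),
		 * after seeding th->check with the pseudo-header sum.
		 */
		return (__sum16 *)(skb->head + skb->csum_start +
				   skb->csum_offset);
	}
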
*/ if (!twsk && skb->sk) { - key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr); + key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr); } else if (twsk && twsk->tw_md5_keylen) { tw_key.key = twsk->tw_md5_key; tw_key.keylen = twsk->tw_md5_keylen; @@ -690,14 +692,14 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset], key, - skb->nh.iph->daddr, - skb->nh.iph->saddr, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, &rep.th, IPPROTO_TCP, arg.iov[0].iov_len); } #endif - arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, - skb->nh.iph->saddr, /* XXX */ + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; @@ -745,7 +747,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, skb = tcp_make_synack(sk, dst, req); if (skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); th->check = tcp_v4_check(skb->len, ireq->loc_addr, @@ -781,7 +783,7 @@ static void syn_flood_warning(struct sk_buff *skb) warntime = jiffies; printk(KERN_INFO "possible SYN flooding on port %d. Sending cookies.\n", - ntohs(skb->h.th->dest)); + ntohs(tcp_hdr(skb)->dest)); } } #endif @@ -1133,8 +1135,8 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) */ __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; - struct iphdr *iph = skb->nh.iph; - struct tcphdr *th = skb->h.th; + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); int length = (th->doff << 2) - sizeof(struct tcphdr); int genhash; unsigned char *ptr; @@ -1251,8 +1253,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) struct inet_request_sock *ireq; struct tcp_options_received tmp_opt; struct request_sock *req; - __be32 saddr = skb->nh.iph->saddr; - __be32 daddr = skb->nh.iph->daddr; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES @@ -1327,7 +1329,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->rmt_addr = saddr; ireq->opt = tcp_v4_save_options(sk, skb); if (!want_cookie) - TCP_ECN_create_request(req, skb->h.th); + TCP_ECN_create_request(req, tcp_hdr(skb)); if (want_cookie) { #ifdef CONFIG_SYN_COOKIES @@ -1351,7 +1353,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1375,7 +1377,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " "request from %u.%u.%u.%u/%u\n", NIPQUAD(saddr), - ntohs(skb->h.th->source)); + ntohs(tcp_hdr(skb)->source)); dst_release(dst); goto drop_and_free; } @@ -1439,7 +1441,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = skb->nh.iph->ttl; + newinet->mc_ttl = ip_hdr(skb)->ttl; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; @@ -1481,8 +1483,8 @@ exit: static struct sock *tcp_v4_hnd_req(struct sock *sk, 
struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct iphdr *iph = skb->nh.iph; + struct tcphdr *th = tcp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ @@ -1491,9 +1493,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, - th->source, skb->nh.iph->daddr, - th->dest, inet_iif(skb)); + nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source, + iph->daddr, th->dest, inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1513,15 +1514,17 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) { + const struct iphdr *iph = ip_hdr(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v4_check(skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) { + if (!tcp_v4_check(skb->len, iph->saddr, + iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } } - skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, IPPROTO_TCP, 0); if (skb->len <= 76) { @@ -1555,7 +1558,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) { + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } @@ -1563,7 +1566,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1581,7 +1584,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } TCP_CHECK_TIMER(sk); - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) { + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } @@ -1610,6 +1613,7 @@ csum_err: int tcp_v4_rcv(struct sk_buff *skb) { + const struct iphdr *iph; struct tcphdr *th; struct sock *sk; int ret; @@ -1623,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; - th = skb->h.th; + th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; @@ -1634,23 +1638,21 @@ int tcp_v4_rcv(struct sk_buff *skb) * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. 
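
The CHECKSUM_COMPLETE branch of tcp_v4_checksum_init() above follows the standard pseudo-header recipe: skb->csum already holds the one's-complement sum over the TCP segment, so folding the pseudo-header into it must yield zero for a valid packet. An equivalent sketch (tcp_v4_csum_valid is a hypothetical name; tcp_v4_check(len, s, d, base) is csum_tcpudp_magic() with the TCP protocol number):

	static int tcp_v4_csum_valid(struct sk_buff *skb, const struct iphdr *iph)
	{
		return csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
					 IPPROTO_TCP, skb->csum) == 0;
	}
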
*/ - if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb))) + if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) goto bad_packet; - th = skb->h.th; + th = tcp_hdr(skb); + iph = ip_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, th->dest, - inet_iif(skb)); - + sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source, + iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1724,8 +1726,7 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, - skb->nh.iph->daddr, - th->dest, + iph->daddr, th->dest, inet_iif(skb)); if (sk2) { inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); @@ -1770,7 +1771,7 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; @@ -1791,7 +1792,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; @@ -1890,7 +1891,7 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. */ - sk_stream_writequeue_purge(sk); + tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ __skb_queue_purge(&tp->out_of_order_queue); @@ -2293,13 +2294,13 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, req); } -static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) +static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i) { int timer_active; unsigned long timer_expires; - struct tcp_sock *tp = tcp_sk(sp); - const struct inet_connection_sock *icsk = inet_csk(sp); - struct inet_sock *inet = inet_sk(sp); + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); __be32 dest = inet->daddr; __be32 src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); @@ -2311,9 +2312,9 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; - } else if (timer_pending(&sp->sk_timer)) { + } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = sk->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2321,17 +2322,17 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %p %u %u %u %u %d", - i, src, srcp, dest, destp, sp->sk_state, + i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, - sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog : + sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sp), + sock_i_uid(sk), icsk->icsk_probes_out, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, + sock_i_ino(sk), + atomic_read(&sk->sk_refcnt), sk, icsk->icsk_rto, icsk->icsk_ack.ato, (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index f0ebaf0e21c..43294ad9f63 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -218,7 +218,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) * 3. calc smoothed OWD (SOWD). * Most ideas come from the original TCP-LP implementation. */ -static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) { struct lp *lp = inet_csk_ca(sk); s64 mowd = tcp_lp_owd_calculator(sk); @@ -261,11 +261,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) * newReno in increase case. 
* We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); @@ -312,11 +314,11 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) } static struct tcp_congestion_ops tcp_lp = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_lp_rtt_sample, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6b5c64f3c92..a12b08fca5a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -149,7 +149,7 @@ kill_with_rst: tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent_stamp = get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -208,7 +208,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent_stamp = get_seconds(); } inet_twsk_put(tw); @@ -246,7 +246,7 @@ kill: if (paws_reject) NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); - if(!th->rst) { + if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate @@ -324,7 +324,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tcp_alloc_md5sig_pool() == NULL) BUG(); } - } while(0); + } while (0); #endif /* Linkage updates. 
*/ @@ -387,8 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; + newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; tcp_prequeue_init(newtp); @@ -422,10 +422,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->rcv_wup = treq->rcv_isn + 1; newtp->write_seq = treq->snt_isn + 1; newtp->pushed_seq = newtp->write_seq; - newtp->copied_seq = treq->rcv_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -440,7 +438,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { + if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { if (sysctl_tcp_fack) newtp->rx_opt.sack_ok |= 2; } @@ -455,12 +453,13 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } - newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale; + newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << + newtp->rx_opt.snd_wscale); newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = xtime.tv_sec; + newtp->rx_opt.ts_recent_stamp = get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; @@ -490,7 +489,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, struct request_sock **prev) { - struct tcphdr *th = skb->h.th; + const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -506,7 +505,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -712,8 +711,8 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); - + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), + skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent, 0); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3c24881f2a6..0faacf9c419 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,14 +62,13 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. 
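
The reparenthesized snd_wnd line in tcp_create_openreq_child() is arithmetically unchanged; as a concrete check, a peer that negotiated snd_wscale = 7 and puts 0x01FF in the raw window field yields snd_wnd = 511 << 7 = 65408 bytes, while a peer without window scaling falls into the rx_opt.snd_wscale = 0 branch above and gets the raw 16-bit value.
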
*/ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -static void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct sk_buff *skb) { - sk->sk_send_head = skb->next; - if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) - sk->sk_send_head = NULL; + struct tcp_sock *tp = tcp_sk(sk); + + tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); + tcp_packets_out_inc(sk, skb); } /* SND.NXT, if window was not shrunk. @@ -78,8 +77,10 @@ static void update_send_head(struct sock *sk, struct tcp_sock *tp, * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ -static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp) +static inline __u32 tcp_acceptable_seq(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) return tp->snd_nxt; else @@ -238,7 +239,7 @@ static u16 tcp_select_window(struct sock *sk) u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ - if(new_win < cur_win) { + if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero @@ -289,10 +290,12 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); - for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + + for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } + if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks--; @@ -337,7 +340,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, */ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); if (ts) { - if(sack) + if (sack) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -349,7 +352,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); /* TSVAL */ *ptr++ = htonl(ts_recent); /* TSECR */ - } else if(sack) + } else if (sack) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | @@ -406,7 +409,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* If congestion control is doing timestamping, we must * take such a timestamp before we potentially clone/copy. */ - if (icsk->icsk_ca_ops->rtt_sample) + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) __net_timestamp(skb); if (likely(clone_it)) { @@ -430,7 +433,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, sysctl_flags = 0; if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { + if (sysctl_tcp_timestamps) { tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } @@ -465,11 +468,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; #endif - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); /* Build TCP header and checksum it. 
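The rewritten update_send_head() above leans on new write-queue helpers that live in include/net/tcp.h and so are not visible in this file's diff. A sketch of their probable shape, matching how the call sites here use them; tcp_skb_is_last() is the same test this patch removes from tcp_output.c further down:

	static inline struct sk_buff *tcp_send_head(struct sock *sk)
	{
		return sk->sk_send_head;
	}

	static inline struct sk_buff *tcp_write_queue_next(struct sock *sk,
							   struct sk_buff *skb)
	{
		return skb->next;
	}

	static inline int tcp_skb_is_last(const struct sock *sk,
					  const struct sk_buff *skb)
	{
		return skb->next == (struct sk_buff *)&sk->sk_write_queue;
	}

	static inline void tcp_advance_send_head(struct sock *sk,
						 struct sk_buff *skb)
	{
		if (tcp_skb_is_last(sk, skb))
			sk->sk_send_head = NULL;	/* nothing left to send */
		else
			sk->sk_send_head = tcp_write_queue_next(sk, skb);
	}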
*/ + th = tcp_hdr(skb); th->source = inet->sport; th->dest = inet->dport; th->seq = htonl(tcb->seq); @@ -515,7 +519,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, md5 ? &md5_hash_location : #endif NULL); - TCP_ECN_send(sk, tp, skb, tcp_header_size); + TCP_ECN_send(sk, skb, tcp_header_size); } #ifdef CONFIG_TCP_MD5SIG @@ -524,7 +528,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tp->af_specific->calc_md5_hash(md5_hash_location, md5, sk, NULL, NULL, - skb->h.th, + tcp_hdr(skb), sk->sk_protocol, skb->len); } @@ -545,7 +549,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(err <= 0)) return err; - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); return net_xmit_eval(err); @@ -567,12 +571,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; skb_header_release(skb); - __skb_queue_tail(&sk->sk_write_queue, skb); + tcp_add_write_queue_tail(sk, skb); sk_charge_skb(sk, skb); - - /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) - sk->sk_send_head = skb; } static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) @@ -705,7 +705,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff, &sk->sk_write_queue); + tcp_insert_write_queue_after(skb, buff, sk); return 0; } @@ -736,7 +736,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) } skb_shinfo(skb)->nr_frags = k; - skb->tail = skb->data; + skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; } @@ -930,8 +930,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out = tp->packets_out; if (packets_out >= tp->snd_cwnd) { @@ -1034,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1056,7 +1059,7 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, uns return !after(end_seq, tp->snd_una + tp->snd_wnd); } -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) +/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) * should be put on the wire right now. If so, it returns the number of * packets allowed by the congestion window. 
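On the skb->h.th removals above: the unions of raw header pointers are replaced throughout by accessor functions. A sketch of the accessors as they look on configurations where the header fields are plain pointers (64-bit builds store them as offsets from skb->head instead, but call sites are identical either way):

	static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
	{
		return skb->transport_header;
	}

	static inline void skb_reset_transport_header(struct sk_buff *skb)
	{
		skb->transport_header = skb->data;
	}

	static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
	{
		return (struct tcphdr *)skb_transport_header(skb);
	}

	/* so the old "skb->h.th = th = (struct tcphdr *)skb_push(...)" becomes:
	 *	skb_push(skb, tcp_header_size);
	 *	skb_reset_transport_header(skb);
	 *	th = tcp_hdr(skb);
	 */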
*/ @@ -1079,15 +1082,10 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return cwnd_quota; } -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) +int tcp_may_send_now(struct sock *sk) { - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_send_head(sk); return (skb && tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), @@ -1143,7 +1141,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff, &sk->sk_write_queue); + tcp_insert_write_queue_after(skb, buff, sk); return 0; } @@ -1153,8 +1151,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; @@ -1249,10 +1248,10 @@ static int tcp_mtu_probe(struct sock *sk) /* Have enough data in the send queue to probe? */ len = 0; - if ((skb = sk->sk_send_head) == NULL) + if ((skb = tcp_send_head(sk)) == NULL) return -1; while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) - skb = skb->next; + skb = tcp_write_queue_next(sk, skb); if (len < probe_size) return -1; @@ -1279,9 +1278,9 @@ static int tcp_mtu_probe(struct sock *sk) return -1; sk_charge_skb(sk, nskb); - skb = sk->sk_send_head; - __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); - sk->sk_send_head = nskb; + skb = tcp_send_head(sk); + tcp_insert_write_queue_before(nskb, skb, sk); + tcp_advance_send_head(sk, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; @@ -1292,7 +1291,7 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; while (len < probe_size) { - next = skb->next; + next = tcp_write_queue_next(sk, skb); copy = min_t(int, skb->len, probe_size - len); if (nskb->ip_summed) @@ -1305,7 +1304,7 @@ static int tcp_mtu_probe(struct sock *sk) /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & @@ -1333,7 +1332,7 @@ static int tcp_mtu_probe(struct sock *sk) /* Decrement cwnd here because we are sending * effectively two packets. 
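The queue-surgery helpers used above in tso_fragment() and tcp_mtu_probe() are, plausibly, thin wrappers over the same sk_buff list primitives the old code called directly. A sketch under that assumption:

	static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
							struct sk_buff *buff,
							struct sock *sk)
	{
		__skb_append(skb, buff, &sk->sk_write_queue);
	}

	static inline void tcp_insert_write_queue_before(struct sk_buff *new,
							 struct sk_buff *skb,
							 struct sock *sk)
	{
		__skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
	}

	static inline void tcp_unlink_write_queue(struct sk_buff *skb,
						  struct sock *sk)
	{
		__skb_unlink(skb, &sk->sk_write_queue);
	}

The point is encapsulation: once every site goes through these helpers, the write-queue representation can change without touching dozens of callers.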
*/ tp->snd_cwnd--; - update_send_head(sk, tp, nskb); + update_send_head(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; @@ -1377,7 +1376,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) sent_pkts = 1; } - while ((skb = sk->sk_send_head)) { + while ((skb = tcp_send_head(sk))) { unsigned int limit; tso_segs = tcp_init_tso_segs(sk, skb, mss_now); @@ -1396,7 +1395,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (tcp_tso_should_defer(sk, tp, skb)) + if (tcp_tso_should_defer(sk, skb)) break; } @@ -1425,31 +1424,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ - update_send_head(sk, tp, skb); + update_send_head(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; } if (likely(sent_pkts)) { - tcp_cwnd_validate(sk, tp); + tcp_cwnd_validate(sk); return 0; } - return !tp->packets_out && sk->sk_send_head; + return !tp->packets_out && tcp_send_head(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned int cur_mss, int nonagle) +void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) { - struct sk_buff *skb = sk->sk_send_head; + struct sk_buff *skb = tcp_send_head(sk); if (skb) { if (tcp_write_xmit(sk, cur_mss, nonagle)) - tcp_check_probe_timer(sk, tp); + tcp_check_probe_timer(sk); } } @@ -1459,7 +1458,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; + struct sk_buff *skb = tcp_send_head(sk); unsigned int tso_segs, cwnd_quota; BUG_ON(!skb || skb->len < mss_now); @@ -1493,8 +1492,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { - update_send_head(sk, tp, skb); - tcp_cwnd_validate(sk, tp); + update_send_head(sk, skb); + tcp_cwnd_validate(sk); return; } } @@ -1620,7 +1619,7 @@ u32 __tcp_select_window(struct sock *sk) static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = skb->next; + struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. @@ -1630,7 +1629,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m u16 flags = TCP_SKB_CB(skb)->flags; /* Also punt if next skb has been SACK'd. */ - if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) return; /* Next skb is out of window. */ @@ -1652,9 +1651,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m clear_all_retrans_hints(tp); /* Ok. We will be able to collapse the packet. 
*/ - __skb_unlink(next_skb, &sk->sk_write_queue); + tcp_unlink_write_queue(next_skb, sk); - memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb_copy_from_linear_data(next_skb, + skb_put(skb, next_skb_size), + next_skb_size); if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -1706,7 +1707,9 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk, 0); int lost = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (skb->len > mss && !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { @@ -1788,13 +1791,13 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } /* Collapse two adjacent packets if worthwhile and we can. */ - if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (cur_mss >> 1)) && - (skb->next != sk->sk_send_head) && - (skb->next != (struct sk_buff *)&sk->sk_write_queue) && - (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && - (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && - (sysctl_tcp_retrans_collapse != 0)) + if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && + (!tcp_skb_is_last(sk, skb)) && + (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && + (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) @@ -1804,9 +1807,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * retransmit when old data is attached. So strip it off * since it is cheap to do so and saves bytes on the network. */ - if(skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; skb_shinfo(skb)->gso_segs = 1; @@ -1872,15 +1875,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) skb = tp->retransmit_skb_hint; packet_cnt = tp->retransmit_cnt_hint; }else{ - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); packet_cnt = 0; } /* First pass: retransmit lost packets. */ if (tp->lost_out) { - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + if (skb == tcp_send_head(sk)) + break; /* we could do better than to assign each time */ tp->retransmit_skb_hint = skb; tp->retransmit_cnt_hint = packet_cnt; @@ -1906,8 +1911,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - if (skb == - skb_peek(&sk->sk_write_queue)) + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); @@ -1937,18 +1941,20 @@ void tcp_xmit_retransmit_queue(struct sock *sk) * segments to send. 
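tcp_simple_retransmit() above shows the new iteration idiom: sk_stream_for_retrans_queue() stopped at sk_send_head implicitly, while tcp_for_write_queue() walks the entire queue, so every converted loop gains an explicit break at tcp_send_head(sk). Roughly (the macro body is a sketch):

	#define tcp_for_write_queue(skb, sk)					\
		for (skb = (sk)->sk_write_queue.next;				\
		     skb != (struct sk_buff *)&(sk)->sk_write_queue;		\
		     skb = skb->next)

	static void example_walk(struct sock *sk)
	{
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;	/* from here on: queued, never sent */
			/* ... operate on a transmitted-but-unacked skb ... */
		}
	}

(skb_copy_from_linear_data(), introduced in the collapse path above, is essentially memcpy(to, skb->data, len) behind a name that makes direct skb->data accesses easier to find.)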
*/ - if (tcp_may_send_now(sk, tp)) + if (tcp_may_send_now(sk)) return; if (tp->forward_skb_hint) { skb = tp->forward_skb_hint; packet_cnt = tp->forward_cnt_hint; } else{ - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); packet_cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; tp->forward_cnt_hint = packet_cnt; tp->forward_skb_hint = skb; @@ -1973,7 +1979,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; } - if (skb == skb_peek(&sk->sk_write_queue)) + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); @@ -1989,7 +1995,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) void tcp_send_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); + struct sk_buff *skb = tcp_write_queue_tail(sk); int mss_now; /* Optimization, tack on the FIN if we have a queue of @@ -1998,7 +2004,7 @@ void tcp_send_fin(struct sock *sk) */ mss_now = tcp_current_mss(sk, 1); - if (sk->sk_send_head != NULL) { + if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; @@ -2025,17 +2031,16 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ @@ -2055,7 +2060,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_shinfo(skb)->gso_type = 0; /* Send it off. 
*/ - TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) @@ -2071,7 +2076,7 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff* skb; - skb = skb_peek(&sk->sk_write_queue); + skb = tcp_write_queue_head(sk); if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; @@ -2081,9 +2086,9 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); - __skb_queue_head(&sk->sk_write_queue, nskb); + __tcp_add_write_queue_head(sk, nskb); sk_stream_free_skb(sk, skb); sk_charge_skb(sk, nskb); skb = nskb; @@ -2133,8 +2138,10 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (md5) tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; #endif - skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + th = tcp_hdr(skb); memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; @@ -2188,7 +2195,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, tp->af_specific->calc_md5_hash(md5_hash_location, md5, NULL, dst, req, - skb->h.th, sk->sk_protocol, + tcp_hdr(skb), sk->sk_protocol, skb->len); } #endif @@ -2271,7 +2278,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_ECN_send_syn(sk, tp, buff); + TCP_ECN_send_syn(sk, buff); TCP_SKB_CB(buff)->sacked = 0; skb_shinfo(buff)->gso_segs = 1; skb_shinfo(buff)->gso_size = 0; @@ -2285,7 +2292,7 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); - __skb_queue_tail(&sk->sk_write_queue, buff); + __tcp_add_write_queue_tail(sk, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); @@ -2363,7 +2370,6 @@ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; /* We are not putting this on the write queue, so @@ -2389,7 +2395,7 @@ void tcp_send_ack(struct sock *sk) skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. 
*/ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk); TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } @@ -2441,7 +2447,7 @@ int tcp_write_wakeup(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if ((skb = sk->sk_send_head) != NULL && + if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; unsigned int mss = tcp_current_mss(sk, 0); @@ -2467,7 +2473,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { - update_send_head(sk, tp, skb); + update_send_head(sk, skb); } return err; } else { @@ -2491,7 +2497,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk); - if (tp->packets_out || !sk->sk_send_head) { + if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 61f406f2729..3938d5dbdf2 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -26,6 +26,8 @@ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/kfifo.h> +#include <linux/ktime.h> +#include <linux/time.h> #include <linux/vmalloc.h> #include <net/tcp.h> @@ -34,43 +36,45 @@ MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>"); MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); -static int port = 0; +static int port __read_mostly = 0; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); -static int bufsize = 64*1024; +static int bufsize __read_mostly = 64*1024; MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); module_param(bufsize, int, 0); +static int full __read_mostly; +MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); +module_param(full, int, 0); + static const char procname[] = "tcpprobe"; struct { - struct kfifo *fifo; - spinlock_t lock; + struct kfifo *fifo; + spinlock_t lock; wait_queue_head_t wait; - struct timeval tstart; + ktime_t start; + u32 lastcwnd; } tcpw; +/* + * Print to log with timestamps. + * FIXME: causes an extra copy + */ static void printl(const char *fmt, ...) { va_list args; int len; - struct timeval now; + struct timespec tv; char tbuf[256]; va_start(args, fmt); - do_gettimeofday(&now); + /* want monotonic time since start of tcp_probe */ + tv = ktime_to_timespec(ktime_sub(ktime_get(), tcpw.start)); - now.tv_sec -= tcpw.tstart.tv_sec; - now.tv_usec -= tcpw.tstart.tv_usec; - if (now.tv_usec < 0) { - --now.tv_sec; - now.tv_usec += 1000000; - } - - len = sprintf(tbuf, "%lu.%06lu ", - (unsigned long) now.tv_sec, - (unsigned long) now.tv_usec); + len = sprintf(tbuf, "%lu.%09lu ", + (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec); len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); @@ -78,38 +82,44 @@ static void printl(const char *fmt, ...) wake_up(&tcpw.wait); } -static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +/* + * Hook inserted to be called before each receive packet. + * Note: arguments must match tcp_rcv_established()! 
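On the printl() rewrite above (not part of the patch): do_gettimeofday() reads the wall clock, so the old manual tv_usec borrow arithmetic could even run backwards across a settimeofday() or NTP step. ktime_get() is monotonic, and the ktime API folds the borrow handling away. A sketch of the pattern:

	#include <linux/hrtimer.h>	/* ktime_get(), assumed header */
	#include <linux/ktime.h>
	#include <linux/time.h>

	static ktime_t start;		/* set once, e.g. at open time */

	static void stamp_example(void)
	{
		/* monotonic elapsed time since "start", ns resolution */
		struct timespec ts =
			ktime_to_timespec(ktime_sub(ktime_get(), start));

		printk(KERN_DEBUG "%lu.%09lu\n",
		       (unsigned long)ts.tv_sec, (unsigned long)ts.tv_nsec);
	}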
+ */ +static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); - if (port == 0 || ntohs(inet->dport) == port || - ntohs(inet->sport) == port) { + /* Only update if port matches */ + if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) + && (full || tp->snd_cwnd != tcpw.lastcwnd)) { printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", NIPQUAD(inet->saddr), ntohs(inet->sport), NIPQUAD(inet->daddr), ntohs(inet->dport), - size, tp->snd_nxt, tp->snd_una, + skb->len, tp->snd_nxt, tp->snd_una, tp->snd_cwnd, tcp_current_ssthresh(sk), - tp->snd_wnd); + tp->snd_wnd, tp->srtt >> 3); + tcpw.lastcwnd = tp->snd_cwnd; } jprobe_return(); return 0; } -static struct jprobe tcp_send_probe = { +static struct jprobe tcp_probe = { .kp = { - .symbol_name = "tcp_sendmsg", + .symbol_name = "tcp_rcv_established", }, - .entry = JPROBE_ENTRY(jtcp_sendmsg), + .entry = JPROBE_ENTRY(jtcp_rcv_established), }; static int tcpprobe_open(struct inode * inode, struct file * file) { kfifo_reset(tcpw.fifo); - do_gettimeofday(&tcpw.tstart); + tcpw.start = ktime_get(); return 0; } @@ -162,7 +172,7 @@ static __init int tcpprobe_init(void) if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) goto err0; - ret = register_jprobe(&tcp_send_probe); + ret = register_jprobe(&tcp_probe); if (ret) goto err1; @@ -180,7 +190,7 @@ static __exit void tcpprobe_exit(void) { kfifo_free(tcpw.fifo); proc_net_remove(procname); - unregister_jprobe(&tcp_send_probe); + unregister_jprobe(&tcp_probe); } module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a9243cfc1be..2ca97b20929 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -233,7 +233,7 @@ static void tcp_probe_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int max_probes; - if (tp->packets_out || !sk->sk_send_head) { + if (tp->packets_out || !tcp_send_head(sk)) { icsk->icsk_probes_out = 0; return; } @@ -284,7 +284,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); + BUG_TRAP(!tcp_write_queue_empty(sk)); if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { @@ -306,7 +306,7 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk, 0); - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); __sk_dst_reset(sk); goto out_reset_timer; } @@ -341,7 +341,7 @@ static void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk, 0); } - if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -482,7 +482,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || sk->sk_send_head) + if (tp->packets_out || tcp_send_head(sk)) goto resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 5c484dceb96..73e19cf7df2 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -38,6 +38,8 @@ #include <net/tcp.h> +#include "tcp_vegas.h" + /* Default values of the Vegas variables, in fixed-point representation * with V_PARAM_SHIFT bits to the right of the binary point. 
*/ @@ -54,17 +56,6 @@ module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); -/* Vegas variables */ -struct vegas { - u32 beg_snd_nxt; /* right edge during last RTT */ - u32 beg_snd_una; /* left edge during last RTT */ - u32 beg_snd_cwnd; /* saves the size of the cwnd */ - u8 doing_vegas_now;/* if true, do vegas for this RTT */ - u16 cntRTT; /* # of RTTs measured within last RTT */ - u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ -}; - /* There are several situations when we must "re-start" Vegas: * * o when a connection is established @@ -81,7 +72,7 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ -static inline void vegas_enable(struct sock *sk) +static void vegas_enable(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); @@ -104,13 +95,14 @@ static inline void vegas_disable(struct sock *sk) vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct sock *sk) +void tcp_vegas_init(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; vegas_enable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_init); /* Do RTT sampling needed for Vegas. * Basically we: @@ -120,10 +112,13 @@ static void tcp_vegas_init(struct sock *sk) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct vegas *vegas = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = ktime_to_us(net_timedelta(last)) + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) @@ -135,8 +130,9 @@ static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) vegas->minRTT = min(vegas->minRTT, vrtt); vegas->cntRTT++; } +EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); -static void tcp_vegas_state(struct sock *sk, u8 ca_state) +void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) @@ -144,6 +140,7 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) else vegas_disable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_state); /* * If the connection is idle and we are restarting, @@ -154,12 +151,13 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. */ -static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_vegas_init(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) @@ -336,30 +334,29 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, } /* Extract info for Tcp socket info provided via netlink. 
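The resulting tcp_vegas_pkts_acked() above maintains two min-filters. Stripped of the socket plumbing, the logic is just this (a sketch; all units are microseconds):

	static void vegas_rtt_filter(struct vegas *v, u32 rtt_us)
	{
		u32 vrtt = rtt_us + 1;	/* never allow a zero sample */

		/* min over the connection lifetime: propagation delay */
		if (vrtt < v->baseRTT)
			v->baseRTT = vrtt;

		/* min within the current RTT: least-congested sample */
		v->minRTT = min(v->minRTT, vrtt);
		v->cntRTT++;		/* samples seen this RTT */
	}

cong_avoid later treats (minRTT - baseRTT) as queueing delay; minRTT and cntRTT are reset every RTT, while baseRTT is only ever reset at init.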
*/ -static void tcp_vegas_get_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info *info; - - info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, - sizeof(*info))); - - info->tcpv_enabled = ca->doing_vegas_now; - info->tcpv_rttcnt = ca->cntRTT; - info->tcpv_rtt = ca->baseRTT; - info->tcpv_minrtt = ca->minRTT; - rtattr_failure: ; + struct tcpvegas_info info = { + .tcpv_enabled = ca->doing_vegas_now, + .tcpv_rttcnt = ca->cntRTT, + .tcpv_rtt = ca->baseRTT, + .tcpv_minrtt = ca->minRTT, + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } +EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_vegas_rtt_calc, + .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h new file mode 100644 index 00000000000..502fa818363 --- /dev/null +++ b/net/ipv4/tcp_vegas.h @@ -0,0 +1,24 @@ +/* + * TCP Vegas congestion control interface + */ +#ifndef __TCP_VEGAS_H +#define __TCP_VEGAS_H 1 + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +extern void tcp_vegas_init(struct sock *sk); +extern void tcp_vegas_state(struct sock *sk, u8 ca_state); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last); +extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); + +#endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ce57bf302f6..9edb340f2f9 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,10 +69,13 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. */ -static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct veno *veno = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = ktime_to_us(net_timedelta(last)) + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) @@ -199,10 +202,11 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .rtt_sample = tcp_veno_rtt_calc, + .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 4e1b61032a9..e61e09dd513 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -100,7 +100,7 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. 
* but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct westwood *w = inet_csk_ca(sk); if (cnt > 0) @@ -226,7 +226,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - switch(event) { + switch (event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); break; @@ -260,16 +260,13 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct rtattr *rta; - struct tcpvegas_info *info; - - rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); - info = RTA_DATA(rta); - info->tcpv_enabled = 1; - info->tcpv_rttcnt = 0; - info->tcpv_rtt = jiffies_to_usecs(ca->rtt); - info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); - rtattr_failure: ; + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rtt = jiffies_to_usecs(ca->rtt), + .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c new file mode 100644 index 00000000000..545ed237ab5 --- /dev/null +++ b/net/ipv4/tcp_yeah.c @@ -0,0 +1,268 @@ +/* + * + * YeAH TCP + * + * For further details look at: + * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * + */ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> + +#include <net/tcp.h> + +#include "tcp_vegas.h" + +#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck +#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt +#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss +#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion +#define TCP_YEAH_PHY 8 //lin maximum delta from base +#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss +#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count + +#define TCP_SCALABLE_AI_CNT 100U + +/* YeAH variables */ +struct yeah { + struct vegas vegas; /* must be first */ + + /* YeAH */ + u32 lastQ; + u32 doing_reno_now; + + u32 reno_count; + u32 fast_count; + + u32 pkts_acked; +}; + +static void tcp_yeah_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + tcp_vegas_init(sk); + + yeah->doing_reno_now = 0; + yeah->lastQ = 0; + + yeah->reno_count = 2; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. 
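About the "must be first" comment in struct yeah above: inet_csk_ca(sk) hands back a fixed ICSK_CA_PRIV_SIZE scratch area inside the socket with no type information. Placing struct vegas at offset zero lets YeAH pass the very same pointer to the exported tcp_vegas_* helpers. An illustrative sketch (the BUILD_BUG_ON is mine, not in the patch; the size check does appear in tcp_yeah_register() below):

	#include <linux/stddef.h>	/* offsetof() */

	static void yeah_private_area(struct sock *sk)
	{
		struct yeah *yeah = inet_csk_ca(sk);

		/* Safe only because struct vegas is the first member, so
		 * the Vegas helpers see their state at the same address: */
		BUILD_BUG_ON(offsetof(struct yeah, vegas) != 0);
		BUG_ON((void *)&yeah->vegas != inet_csk_ca(sk));

		tcp_vegas_init(sk);	/* initializes yeah->vegas in place */
	}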
:) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); + +} + + +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (icsk->icsk_ca_state == TCP_CA_Open) + yeah->pkts_acked = pkts_acked; + + tcp_vegas_pkts_acked(sk, pkts_acked, last); +} + +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else if (!yeah->doing_reno_now) { + /* Scalable */ + + tp->snd_cwnd_cnt+=yeah->pkts_acked; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + + yeah->pkts_acked = 1; + + } else { + /* Reno */ + + if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + + /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) + * + * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up yeahly with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, yeah->vegas.beg_snd_nxt)) { + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (yeah->vegas.cntRTT > 2) { + u32 rtt, queue; + u64 bw; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = yeah->vegas.minRTT; + + /* Compute excess number of packets above bandwidth + * Avoid doing full 64 bit divide. 
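	/*
	 * Worked example for the estimate below (not part of the patch):
	 * queue ~= cwnd * (rtt - baseRTT) / rtt, all times in microseconds.
	 *
	 * With snd_cwnd = 100 packets, baseRTT = 100 ms, minRTT = 110 ms:
	 *
	 *	bw    = 100 * (110000 - 100000) = 1000000
	 *	queue = 1000000 / 110000       ~= 9 packets at the bottleneck
	 *
	 * 9 does not exceed TCP_YEAH_ALPHA (80), and the 10 ms of extra
	 * delay is below baseRTT/TCP_YEAH_PHY (12.5 ms), so this round
	 * stays on the fast (Scalable) path; the Reno/decongestion branch
	 * is entered only once either threshold is crossed.
	 */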
+ */ + bw = tp->snd_cwnd; + bw *= rtt - yeah->vegas.baseRTT; + do_div(bw, rtt); + queue = bw; + + if (queue > TCP_YEAH_ALPHA || + rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { + if (queue > TCP_YEAH_ALPHA + && tp->snd_cwnd > yeah->reno_count) { + u32 reduction = min(queue / TCP_YEAH_GAMMA , + tp->snd_cwnd >> TCP_YEAH_EPSILON); + + tp->snd_cwnd -= reduction; + + tp->snd_cwnd = max(tp->snd_cwnd, + yeah->reno_count); + + tp->snd_ssthresh = tp->snd_cwnd; + } + + if (yeah->reno_count <= 2) + yeah->reno_count = max(tp->snd_cwnd>>1, 2U); + else + yeah->reno_count++; + + yeah->doing_reno_now = min(yeah->doing_reno_now + 1, + 0xffffffU); + } else { + yeah->fast_count++; + + if (yeah->fast_count > TCP_YEAH_ZETA) { + yeah->reno_count = 2; + yeah->fast_count = 0; + } + + yeah->doing_reno_now = 0; + } + + yeah->lastQ = queue; + + } + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; + yeah->vegas.beg_snd_nxt = tp->snd_nxt; + yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Wipe the slate clean for the next RTT. */ + yeah->vegas.cntRTT = 0; + yeah->vegas.minRTT = 0x7fffffff; + } +} + +static u32 tcp_yeah_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + u32 reduction; + + if (yeah->doing_reno_now < TCP_YEAH_RHO) { + reduction = yeah->lastQ; + + reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + + reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + } else + reduction = max(tp->snd_cwnd>>1,2U); + + yeah->fast_count = 0; + yeah->reno_count = max(yeah->reno_count>>1, 2U); + + return tp->snd_cwnd - reduction; +} + +static struct tcp_congestion_ops tcp_yeah = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_yeah_init, + .ssthresh = tcp_yeah_ssthresh, + .cong_avoid = tcp_yeah_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + .pkts_acked = tcp_yeah_pkts_acked, + + .owner = THIS_MODULE, + .name = "yeah", +}; + +static int __init tcp_yeah_register(void) +{ + BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_yeah); + return 0; +} + +static void __exit tcp_yeah_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_yeah); +} + +module_init(tcp_yeah_register); +module_exit(tcp_yeah_unregister); + +MODULE_AUTHOR("Angelo P. 
Castellani"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("YeAH TCP"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fc620a7c1db..113e0c4c8a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +/* + * Note about this hash function : + * Typical use is probably daddr = 0, only dport is going to vary hash + */ +static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) +{ + addr ^= addr >> 16; + addr ^= addr >> 8; + return port ^ addr; +} + +static inline int __udp_lib_port_inuse(unsigned int hash, int port, + __be32 daddr, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; + struct inet_sock *inet; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + if (sk->sk_hash != hash) + continue; + inet = inet_sk(sk); + if (inet->num != port) + continue; + if (inet->rcv_saddr == daddr) return 1; + } return 0; } @@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_node *node; struct hlist_head *head; struct sock *sk2; + unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -175,12 +197,23 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ; } result = best; - for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; + i++, result += UDP_HTABLE_SIZE) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) + hash = hash_port_and_addr(result, 0); + if (__udp_lib_port_inuse(hash, result, + 0, udptable)) + continue; + if (!inet_sk(sk)->rcv_saddr) + break; + + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + if (! 
__udp_lib_port_inuse(hash, result, + inet_sk(sk)->rcv_saddr, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -188,21 +221,41 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(snum, 0); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && - sk2 != sk && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (*saddr_comp)(sk, sk2) ) + if (sk2->sk_hash == hash && + sk2 != sk && + inet_sk(sk2)->num == snum && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) goto fail; + + if (inet_sk(sk)->rcv_saddr) { + hash = hash_port_and_addr(snum, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; + + sk_for_each(sk2, node, head) + if (sk2->sk_hash == hash && + sk2 != sk && + inet_sk(sk2)->num == snum && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == + sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) + goto fail; + } } inet_sk(sk)->num = snum; - sk->sk_hash = snum; + sk->sk_hash = hash; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -212,13 +265,13 @@ fail: return error; } -__inline__ int udp_get_port(struct sock *sk, unsigned short snum, +int udp_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)) { return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); } -inline int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -241,63 +294,77 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + unsigned int hash, hashwild; + int score, best = -1, hport = ntohs(dport); + + hash = hash_port_and_addr(hport, daddr); + hashwild = hash_port_and_addr(hport, 0); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + +lookup: + + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if(score == 9) { - result = sk; - break; - } else if(score > badness) { - result = sk; - badness = score; - } + if (sk->sk_hash != hash || ipv6_only_sock(sk) || + inet->num != hport) + continue; + + score = (sk->sk_family == PF_INET ? 
1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + goto found; + } else if (score > best) { + result = sk; + best = score; + } + } + + if (hash != hashwild) { + hash = hashwild; + goto lookup; } +found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, - __be16 loc_port, __be32 loc_addr, +static inline struct sock *udp_v4_mcast_next(struct sock *sk, unsigned int hnum, + int hport, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif) { struct hlist_node *node; struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || + inet->num != hport || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -329,8 +396,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) struct inet_sock *inet; struct iphdr *iph = (struct iphdr*)skb->data; struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; int harderr; int err; @@ -390,7 +457,7 @@ out: sock_put(sk); } -__inline__ void udp_err(struct sk_buff *skb, u32 info) +void udp_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, udp_hash); } @@ -419,13 +486,14 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, int len ) { unsigned int offset; - struct udphdr *uh = skb->h.uh; + struct udphdr *uh = udp_hdr(skb); __wsum csum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. */ + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { @@ -434,7 +502,7 @@ static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, * fragments on the socket so that all csums of sk_buffs * should be together */ - offset = skb->h.raw - skb->data; + offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); skb->ip_summed = CHECKSUM_NONE; @@ -469,7 +537,7 @@ static int udp_push_pending_frames(struct sock *sk) /* * Create a UDP header */ - uh = skb->h.uh; + uh = udp_hdr(skb); uh->source = fl->fl_ip_sport; uh->dest = fl->fl_ip_dport; uh->len = htons(up->len); @@ -765,38 +833,38 @@ out: int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - switch(cmd) + switch (cmd) { + case SIOCOUTQ: { - case SIOCOUTQ: - { - int amount = atomic_read(&sk->sk_wmem_alloc); - return put_user(amount, (int __user *)arg); - } + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } - case SIOCINQ: - { - struct sk_buff *skb; - unsigned long amount; - - amount = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount - * of this packet since that is all - * that will be read. 
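Back on the __udp4_lib_lookup() scoring above: the weights make a perfect match worth exactly 9, which is why the scan can stop early at that score. Not in the patch, just the arithmetic spelled out:

	/*
	 *	AF_INET family				+1
	 *	rcv_saddr matches daddr			+2
	 *	daddr matches saddr (connected)		+2
	 *	dport matches sport (connected)		+2
	 *	bound_dev_if matches dif		+2
	 *						--
	 *	fully specified socket			 9
	 *
	 * A wildcard field contributes 0 but does not disqualify, so an
	 * unconnected INADDR_ANY listener can still win with a lower
	 * score when nothing more specific exists.
	 */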
- */ - amount = skb->len - sizeof(struct udphdr); - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + case SIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount = skb->len - sizeof(struct udphdr); } + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } - default: - return -ENOIOCTLCMD; + default: + return -ENOIOCTLCMD; } - return(0); + + return 0; } /* @@ -810,7 +878,9 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - int copied, err, copy_only, is_udplite = IS_UDPLITE(sk); + unsigned int ulen, copied; + int err; + int is_udplite = IS_UDPLITE(sk); /* * Check any passed addresses @@ -826,28 +896,25 @@ try_again: if (!skb) goto out; - copied = skb->len - sizeof(struct udphdr); - if (copied > len) { - copied = len; + ulen = skb->len - sizeof(struct udphdr); + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; - } /* - * Decide whether to checksum and/or copy data. - * - * UDP: checksum may have been computed in HW, - * (re-)compute it if message is truncated. - * UDP-Lite: always needs to checksum, no HW support. + * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. */ - copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY); - if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) { - if (__udp_lib_checksum_complete(skb)) + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) goto csum_copy_err; - copy_only = 1; } - if (copy_only) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied ); else { @@ -866,8 +933,8 @@ try_again: if (sin) { sin->sin_family = AF_INET; - sin->sin_port = skb->h.uh->source; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = udp_hdr(skb)->source; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) @@ -875,7 +942,7 @@ try_again: err = copied; if (flags & MSG_TRUNC) - err = skb->len - sizeof(struct udphdr); + err = ulen; out_free: skb_free_datagram(sk, skb); @@ -949,7 +1016,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) return 1; /* Now we can get the pointers */ - uh = skb->h.uh; + uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; @@ -959,7 +1026,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return 0; - } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else @@ -990,7 +1057,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) return 0; /* Now we can update and verify the packet length... 
*/ - iph = skb->nh.iph; + iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { @@ -1002,7 +1069,8 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) * transport header to point to ESP. Keep UDP on the stack * for later. */ - skb->h.raw = skb_pull(skb, len); + __skb_pull(skb, len); + skb_reset_transport_header(skb); /* modify the protocol (it's ESP!) */ iph->protocol = IPPROTO_ESP; @@ -1095,10 +1163,9 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (__udp_lib_checksum_complete(skb)) + if (sk->sk_filter) { + if (udp_lib_checksum_complete(skb)) goto drop; - skb->ip_summed = CHECKSUM_UNNECESSARY; } if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { @@ -1128,33 +1195,49 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk; + struct sock *sk, *skw, *sknext; int dif; + int hport = ntohs(uh->dest); + unsigned int hash = hash_port_and_addr(hport, daddr); + unsigned int hashwild = hash_port_and_addr(hport, 0); - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + read_lock(&udp_hash_lock); + + sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); + skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); + + sk = udp_v4_mcast_next(sk, hash, hport, daddr, uh->source, saddr, dif); + if (!sk) { + hash = hashwild; + sk = udp_v4_mcast_next(skw, hash, hport, daddr, uh->source, + saddr, dif); + } + if (sk) { do { struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if(sknext) + sknext = udp_v4_mcast_next(sk_next(sk), hash, hport, + daddr, uh->source, saddr, dif); + if (!sknext && hash != hashwild) { + hash = hashwild; + sknext = udp_v4_mcast_next(skw, hash, hport, + daddr, uh->source, saddr, dif); + } + if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); - if(skb1) { + if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ + /* + * we should probably re-process + * instead of dropping packets here. + */ kfree_skb(skb1); } sk = sknext; - } while(sknext); + } while (sknext); } else kfree_skb(skb); read_unlock(&udp_hash_lock); @@ -1166,25 +1249,37 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. 
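Stepping back over the UDP hashing changes in this file: sockets are now hashed on (port, rcv_saddr) rather than port alone, so every receive-side lookup must probe two buckets, the exact-address hash and the wildcard (address 0) hash, as both __udp4_lib_lookup() and __udp4_lib_mcast_deliver() above do with their hash/hashwild pairs. A compressed sketch of the idea; the patch itself uses gotos rather than a loop:

	static struct sock *sketch_lookup(__be32 daddr, int hport,
					  struct hlist_head udptable[])
	{
		unsigned int hash[2];
		int i, passes;

		hash[0] = hash_port_and_addr(hport, daddr);	/* exact addr */
		hash[1] = hash_port_and_addr(hport, 0);		/* wildcard   */
		passes = (hash[0] == hash[1]) ? 1 : 2;		/* may collide */

		for (i = 0; i < passes; i++) {
			/* scan udptable[hash[i] & (UDP_HTABLE_SIZE - 1)],
			 * re-checking inet->num == hport since unrelated
			 * (port, addr) pairs can share a bucket; keep the
			 * best-scoring socket, return early on a perfect
			 * score of 9. */
		}
		return NULL;	/* placeholder: real code returns best match */
	}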
*/ -static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) +static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) { + const struct iphdr *iph; + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + iph = ip_hdr(skb); if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, - skb->len, IPPROTO_UDP, skb->csum )) + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb->len, IPPROTO_UDP, 0); + if (!skb_csum_unnecessary(skb)) + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len, proto, 0); /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - /* UDP = UDP-Lite with a non-partial checksum coverage */ - UDP_SKB_CB(skb)->partial_cov = 0; + return 0; } /* @@ -1192,14 +1287,14 @@ static inline void udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) */ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], - int is_udplite) + int proto) { struct sock *sk; - struct udphdr *uh = skb->h.uh; + struct udphdr *uh = udp_hdr(skb); unsigned short ulen; struct rtable *rt = (struct rtable*)skb->dst; - __be32 saddr = skb->nh.iph->saddr; - __be32 daddr = skb->nh.iph->daddr; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; /* * Validate the packet. @@ -1211,24 +1306,21 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (ulen > skb->len) goto short_packet; - if(! is_udplite ) { /* UDP validates ulen. */ - + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. */ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) goto short_packet; - uh = skb->h.uh; - - udp4_csum_init(skb, uh); - - } else { /* UDP-Lite validates cscov. */ - if (udplite4_csum_init(skb, uh)) - goto csum_error; + uh = udp_hdr(skb); } - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + if (udp4_csum_init(skb, uh, proto)) + goto csum_error; + + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest, - skb->dev->ifindex, udptable ); + skb->dev->ifindex, udptable); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); @@ -1250,7 +1342,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite); + UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1258,11 +1350,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], * don't wanna listen. Ignore it. */ kfree_skb(skb); - return(0); + return 0; short_packet: LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - is_udplite? "-Lite" : "", + proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), ulen, @@ -1277,21 +1369,21 @@ csum_error: * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. 
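
udp4_csum_init() above verifies a CHECKSUM_COMPLETE value with csum_tcpudp_magic(), that is, it folds the RFC 768 pseudo header (source address, destination address, protocol, UDP length) into the one's-complement sum. A small stand-alone sketch of that arithmetic over a hand-built packet:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* One's-complement sum over a buffer, words taken in network order. */
static uint32_t csum_partial(const void *buf, size_t len, uint32_t sum)
{
    const uint8_t *p = buf;

    while (len > 1) {
        sum += (p[0] << 8) | p[1];
        p += 2;
        len -= 2;
    }
    if (len)
        sum += p[0] << 8;
    return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    /* RFC 768: a computed 0 would be transmitted as 0xffff. */
    return (uint16_t)~sum;
}

int main(void)
{
    /* UDP header + payload "hi", length 10, checksum field zeroed. */
    uint8_t udp[10] = { 0x13, 0x88, 0x13, 0x89, 0x00, 0x0a, 0, 0, 'h', 'i' };
    struct {
        uint32_t saddr, daddr;
        uint8_t zero, proto;
        uint16_t len;
    } __attribute__((packed)) ph = {
        htonl(0xc0a80001), htonl(0xc0a80002),  /* 192.168.0.1 -> .2 */
        0, 17, htons(sizeof(udp))              /* 17 == IPPROTO_UDP */
    };
    uint32_t sum = csum_partial(&ph, sizeof(ph), 0);
    uint16_t check = csum_fold(csum_partial(udp, sizeof(udp), sum));

    printf("udp checksum: 0x%04x\n", check);
    return 0;
}
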
From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - is_udplite? "-Lite" : "", + proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), NIPQUAD(daddr), ntohs(uh->dest), ulen); drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); - return(0); + return 0; } -__inline__ int udp_rcv(struct sk_buff *skb) +int udp_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, udp_hash, 0); + return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); } int udp_destroy_sock(struct sock *sk) @@ -1313,13 +1405,13 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int val; int err = 0; - if(optlen<sizeof(int)) + if (optlen<sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; - switch(optname) { + switch (optname) { case UDP_CORK: if (val != 0) { up->corkflag = 1; @@ -1373,7 +1465,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, default: err = -ENOPROTOOPT; break; - }; + } return err; } @@ -1404,15 +1496,15 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, len; - if(get_user(len,optlen)) + if (get_user(len,optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); - if(len < 0) + if (len < 0) return -EINVAL; - switch(optname) { + switch (optname) { case UDP_CORK: val = up->corkflag; break; @@ -1433,11 +1525,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, default: return -ENOPROTOOPT; - }; + } - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval, &val,len)) + if (copy_to_user(optval, &val,len)) return -EFAULT; return 0; } @@ -1486,15 +1578,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sk_buff *skb; spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL) { - if (udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); - __skb_unlink(skb, rcvq); - kfree_skb(skb); - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } + while ((skb = skb_peek(rcvq)) != NULL && + udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); + __skb_unlink(skb, rcvq); + kfree_skb(skb); } spin_unlock_bh(&rcvq->lock); @@ -1573,7 +1661,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) struct sock *sk = udp_get_first(seq); if (sk) - while(pos && (sk = udp_get_next(seq, sk)) != NULL) + while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? 
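
The UDP_CORK case retouched above is the option that lets an application accumulate several send() calls into a single datagram. A minimal use of it from user space, assuming the UDP_CORK definition from <netinet/udp.h>:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst;
    int on = 1, off = 0;

    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(5000);
    dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    connect(fd, (struct sockaddr *)&dst, sizeof(dst));

    /* While corked, consecutive sends append to one pending datagram;
     * clearing the option flushes it as a single packet. */
    setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
    send(fd, "hello ", 6, 0);
    send(fd, "world", 5, 0);
    setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));

    close(fd);
    return 0;
}
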
NULL : sk; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index b28fe1edf98..f34fd686a8f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,7 @@ static int udplite_v4_get_port(struct sock *sk, unsigned short snum) static int udplite_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, udplite_hash, 1); + return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); } static void udplite_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 78e80deb7e8..5ceca951d73 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 switch (nexthdr) { case IPPROTO_IPIP: case IPPROTO_IPV6: - *spi = skb->nh.iph->saddr; + *spi = ip_hdr(skb)->saddr; *seq = 0; return 0; } @@ -39,9 +39,9 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 #ifdef CONFIG_NETFILTER static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; - if (skb->dst == NULL) { + const struct iphdr *iph = ip_hdr(skb); + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) goto drop; @@ -55,18 +55,18 @@ drop: int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) { - int err; __be32 spi, seq; struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; struct xfrm_state *x; int xfrm_nr = 0; int decaps = 0; + int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); - if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) + if (err != 0) goto drop; do { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); if (xfrm_nr == XFRM_MAX_DEPTH) goto drop; @@ -113,7 +113,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) break; } - if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) + err = xfrm_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); + if (err < 0) goto drop; } while (!err); @@ -146,15 +147,15 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) return 0; } else { #ifdef CONFIG_NETFILTER - __skb_push(skb, skb->data - skb->nh.raw); - skb->nh.iph->tot_len = htons(skb->len); - ip_send_check(skb->nh.iph); + __skb_push(skb, skb->data - skb_network_header(skb)); + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; #else - return -skb->nh.iph->protocol; + return -ip_hdr(skb)->protocol; #endif } diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index d419e15d980..a73e710740c 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -29,20 +29,21 @@ */ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph, *top_iph = NULL; + struct iphdr *iph, *top_iph; int hdrlen, optlen; - iph = skb->nh.iph; - skb->h.ipiph = iph; + iph = ip_hdr(skb); + skb->transport_header = skb->network_header; hdrlen = 0; optlen = iph->ihl * 4 - sizeof(*iph); if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - skb->nh.raw = skb_push(skb, x->props.header_len + hdrlen); - top_iph = skb->nh.iph; - skb->h.raw += sizeof(*iph) - hdrlen; + skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen); + skb_reset_network_header(skb); + top_iph = ip_hdr(skb); + skb->transport_header += sizeof(*iph) - hdrlen; memmove(top_iph, iph, sizeof(*iph)); if (unlikely(optlen)) { @@ -50,7 +51,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, 
struct sk_buff *skb) BUG_ON(optlen < 0); - ph = (struct ip_beet_phdr *)skb->h.raw; + ph = (struct ip_beet_phdr *)skb_transport_header(skb); ph->padlen = 4 - (optlen & 4); ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->protocol; @@ -69,20 +70,18 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); int phlen = 0; int optlen = 0; - __u8 ph_nexthdr = 0, protocol = 0; + u8 ph_nexthdr = 0; int err = -EINVAL; - protocol = iph->protocol; - if (unlikely(iph->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; - ph = (struct ip_beet_phdr *)(skb->h.ipiph + 1); + ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1); phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); @@ -96,22 +95,20 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) ph_nexthdr = ph->nexthdr; } - skb->nh.raw = skb->data + (phlen - sizeof(*iph)); - memmove(skb->nh.raw, iph, sizeof(*iph)); - skb->h.raw = skb->data + (phlen + optlen); - skb->data = skb->h.raw; + skb_set_network_header(skb, phlen - sizeof(*iph)); + memmove(skb_network_header(skb), iph, sizeof(*iph)); + skb_set_transport_header(skb, phlen + optlen); + skb->data = skb_transport_header(skb); - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->ihl = (sizeof(*iph) + optlen) / 4; iph->tot_len = htons(skb->len + iph->ihl * 4); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; if (ph_nexthdr) iph->protocol = ph_nexthdr; - else - iph->protocol = protocol; iph->check = 0; - iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; out: return err; diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 92676b7e403..601047161ea 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -23,16 +23,13 @@ */ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph; - int ihl; + struct iphdr *iph = ip_hdr(skb); + int ihl = iph->ihl * 4; - iph = skb->nh.iph; - skb->h.ipiph = iph; - - ihl = iph->ihl * 4; - skb->h.raw += ihl; - - skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + skb->transport_header = skb->network_header + ihl; + skb_push(skb, x->props.header_len); + skb_reset_network_header(skb); + memmove(skb_network_header(skb), iph, ihl); return 0; } @@ -46,12 +43,15 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { - int ihl = skb->data - skb->h.raw; + int ihl = skb->data - skb_transport_header(skb); - if (skb->h.raw != skb->nh.raw) - skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); - skb->nh.iph->tot_len = htons(skb->len + ihl); - skb->h.raw = skb->data; + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ip_hdr(skb)->tot_len = htons(skb->len + ihl); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ceb4376f572..a2f2e6a5ec5 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,8 +16,8 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { - struct iphdr *outer_iph = skb->nh.iph; 
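
Most hunks in this section are one mechanical conversion: the skb->nh.raw and skb->h.raw pointers become network_header and transport_header values reached through accessors such as skb_network_header() and skb_reset_transport_header(). A simplified user-space model of that scheme; the real sk_buff stores sk_buff_data_t, which may be an offset rather than a pointer:

#include <stdio.h>
#include <string.h>

/* Simplified model: headers are offsets from the start of the buffer,
 * so moving the data pointer never invalidates them. */
struct pkt {
    unsigned char buf[64];
    unsigned char *data;
    int network_header;         /* offset into buf */
    int transport_header;       /* offset into buf */
};

static unsigned char *pkt_network_header(struct pkt *p)
{
    return p->buf + p->network_header;
}

static unsigned char *pkt_transport_header(struct pkt *p)
{
    return p->buf + p->transport_header;
}

static void pkt_reset_network_header(struct pkt *p)
{
    p->network_header = p->data - p->buf;
}

static void pkt_set_transport_header(struct pkt *p, int offset)
{
    p->transport_header = (p->data - p->buf) + offset;
}

int main(void)
{
    struct pkt p;

    memcpy(p.buf, "IPHDRIPHDRIPHDRIPHDRUDPHDRUD", 28);
    p.data = p.buf;
    pkt_reset_network_header(&p);       /* L3 header at current data */
    pkt_set_transport_header(&p, 20);   /* L4 after a 20-byte IP header */

    p.data += 20;                       /* "pull" the network header */
    printf("l3 at +%d starts '%c', l4 at +%d starts '%c'\n",
           p.network_header, *pkt_network_header(&p),
           p.transport_header, *pkt_transport_header(&p));
    return 0;
}

Keeping offsets rather than raw pointers is what lets the converted code reposition skb->data freely and still derive every header pointer afterwards.
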
- struct iphdr *inner_iph = skb->h.ipiph; + struct iphdr *outer_iph = ip_hdr(skb); + struct iphdr *inner_iph = ipip_hdr(skb); if (INET_ECN_is_ce(outer_iph->tos)) IP_ECN_set_ce(inner_iph); @@ -26,7 +26,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(skb->nh.ipv6h); + IP6_ECN_set_ce(ipv6_hdr(skb)); } /* Add encapsulation header. @@ -46,11 +46,12 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph, *top_iph; int flags; - iph = skb->nh.iph; - skb->h.ipiph = iph; + iph = ip_hdr(skb); + skb->transport_header = skb->network_header; - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; + skb_push(skb, x->props.header_len); + skb_reset_network_header(skb); + top_iph = ip_hdr(skb); top_iph->ihl = 5; top_iph->version = 4; @@ -90,10 +91,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = skb->nh.iph; + struct iphdr *iph = ip_hdr(skb); + const unsigned char *old_mac; int err = -EINVAL; - switch(iph->protocol){ + switch (iph->protocol){ case IPPROTO_IPIP: break; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -111,10 +113,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); if (iph->protocol == IPPROTO_IPIP) { if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); + ipv4_copy_dscp(iph, ipip_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); } @@ -125,9 +127,10 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); } #endif - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; + old_mac = skb_mac_header(skb); + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_reset_network_header(skb); err = 0; out: diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 038ca160fe2..44ef208a75c 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -22,14 +22,13 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst; - struct iphdr *iph = skb->nh.iph; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; dst = skb->dst; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 5d51a2af34c..4ff8ed30024 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -119,7 +119,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int if (xfrm[i]->props.mode == XFRM_MODE_TUNNEL) { unsigned short encap_family = xfrm[i]->props.family; - switch(encap_family) { + switch (encap_family) { case AF_INET: fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; @@ -209,8 +209,8 @@ error: static void _decode_session4(struct sk_buff *skb, struct flowi *fl) { - struct iphdr *iph = skb->nh.iph; - u8 *xprth = skb->nh.raw + iph->ihl*4; + struct iphdr *iph = ip_hdr(skb); + u8 *xprth = skb_network_header(skb) + iph->ihl * 4; memset(fl, 
0, sizeof(struct flowi)); if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { @@ -263,7 +263,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) default: fl->fl_ipsec_spi = 0; break; - }; + } } fl->proto = iph->protocol; fl->fl4_dst = iph->daddr; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 3eef06454da..56851030455 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -12,9 +12,8 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph; + struct iphdr *iph = ip_hdr(skb); - iph = skb->nh.iph; iph->tot_len = htons(skb->len); ip_send_check(iph); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 79682efb14b..8e5d54f23b4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -57,6 +57,16 @@ config IPV6_ROUTE_INFO If unsure, say N. +config IPV6_OPTIMISTIC_DAD + bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" + depends on IPV6 && EXPERIMENTAL + ---help--- + This is experimental support for optimistic Duplicate + Address Detection. It allows for autoconfigured addresses + to be used more quickly. + + If unsure, say N. + config INET6_AH tristate "IPv6: AH transformation" depends on IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index d460017bb35..bb33309044c 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -7,14 +7,15 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ - exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o inet6_connection_sock.o + exthdrs.o sysctl_net_ipv6.o datagram.o \ + ip6_flowlabel.o inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_IPV6_MIP6) += mip6.o +ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 452a82ce479..d02685c6bc6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -81,6 +81,7 @@ #endif #include <asm/uaccess.h> +#include <asm/unaligned.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -208,9 +209,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -#if 0 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; -#endif const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; static void addrconf_del_timer(struct inet6_ifaddr *ifp) @@ -246,6 +245,37 @@ static void addrconf_mod_timer(struct inet6_ifaddr *ifp, add_timer(&ifp->timer); } +static int snmp6_alloc_dev(struct inet6_dev *idev) +{ + int err = -ENOMEM; + + if (!idev || !idev->dev) + return -EINVAL; + + if (snmp_mib_init((void **)idev->stats.ipv6, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip; + if (snmp_mib_init((void **)idev->stats.icmpv6, + sizeof(struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp; + + return 0; + +err_icmp: + snmp_mib_free((void **)idev->stats.ipv6); +err_ip: + return err; +} + +static int snmp6_free_dev(struct inet6_dev *idev) +{ + snmp_mib_free((void **)idev->stats.icmpv6); + snmp_mib_free((void **)idev->stats.ipv6); + return 0; +} + /* Nobody refers to this device, we may destroy it. 
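
snmp6_alloc_dev() above uses the kernel's usual goto-based unwind: each allocation that succeeds gains a label on the error path, so a later failure releases exactly the earlier resources and nothing else. The shape of the pattern, reduced to plain malloc():

#include <stdio.h>
#include <stdlib.h>

struct devstats {
    void *ipv6;
    void *icmpv6;
};

static int stats_alloc(struct devstats *s)
{
    s->ipv6 = malloc(256);
    if (!s->ipv6)
        goto err_ip;
    s->icmpv6 = malloc(256);
    if (!s->icmpv6)
        goto err_icmp;
    return 0;

err_icmp:                       /* undo only what already succeeded */
    free(s->ipv6);
    s->ipv6 = NULL;
err_ip:
    return -1;
}

int main(void)
{
    struct devstats s;

    if (stats_alloc(&s))
        return 1;
    printf("both MIB blocks allocated\n");
    free(s.icmpv6);
    free(s.ipv6);
    return 0;
}
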
*/ static void in6_dev_finish_destroy_rcu(struct rcu_head *head) @@ -271,6 +301,8 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); } +EXPORT_SYMBOL(in6_dev_finish_destroy); + static struct inet6_dev * ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; @@ -417,7 +449,7 @@ static void addrconf_forward_change(void) struct inet6_dev *idev; read_lock(&dev_base_lock); - for (dev=dev_base; dev; dev=dev->next) { + for_each_netdev(dev) { rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { @@ -528,6 +560,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, ifa->rt = rt; + /* + * part one of RFC 4429, section 3.3 + * We should not configure an address as + * optimistic if we do not yet know the link + * layer address of our nexhop router + */ + + if (rt->rt6i_nexthop == NULL) + ifa->flags &= ~IFA_F_OPTIMISTIC; + ifa->idev = idev; in6_dev_hold(idev); /* For caller */ @@ -704,6 +746,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i int tmp_plen; int ret = 0; int max_addresses; + u32 addr_flags; write_lock(&idev->lock); if (ift) { @@ -761,10 +804,17 @@ retry: spin_unlock_bh(&ifp->lock); write_unlock(&idev->lock); + + addr_flags = IFA_F_TEMPORARY; + /* set in addrconf_prefix_rcv() */ + if (ifp->flags & IFA_F_OPTIMISTIC) + addr_flags |= IFA_F_OPTIMISTIC; + ift = !max_addresses || ipv6_count_addresses(idev) < max_addresses ? ipv6_add_addr(idev, &addr, tmp_plen, - ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : NULL; + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, + addr_flags) : NULL; if (!ift || IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -861,7 +911,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, read_lock(&dev_base_lock); rcu_read_lock(); - for (dev = dev_base; dev; dev=dev->next) { + for_each_netdev(dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; @@ -896,13 +946,14 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, * - Tentative Address (RFC2462 section 5.4) * - A tentative address is not considered * "assigned to an interface" in the traditional - * sense. + * sense, unless it is also flagged as optimistic. * - Candidate Source Address (section 4) * - In any case, anycast addresses, multicast * addresses, and the unspecified address MUST * NOT be included in a candidate set. */ - if (ifa->flags & IFA_F_TENTATIVE) + if ((ifa->flags & IFA_F_TENTATIVE) && + (!(ifa->flags & IFA_F_OPTIMISTIC))) continue; if (unlikely(score.addr_type == IPV6_ADDR_ANY || score.addr_type & IPV6_ADDR_MULTICAST)) { @@ -961,15 +1012,17 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, } } - /* Rule 3: Avoid deprecated address */ + /* Rule 3: Avoid deprecated and optimistic addresses */ if (hiscore.rule < 3) { if (ipv6_saddr_preferred(hiscore.addr_type) || - !(ifa_result->flags & IFA_F_DEPRECATED)) + (((ifa_result->flags & + (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)) == 0))) hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED; hiscore.rule++; } if (ipv6_saddr_preferred(score.addr_type) || - !(ifa->flags & IFA_F_DEPRECATED)) { + (((ifa_result->flags & + (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)) == 0))) { score.attrs |= IPV6_SADDR_SCORE_PREFERRED; if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) { score.rule = 3; @@ -1107,8 +1160,10 @@ int ipv6_get_saddr(struct dst_entry *dst, return ipv6_dev_get_saddr(dst ? 
ip6_dst_idev(dst)->dev : NULL, daddr, saddr); } +EXPORT_SYMBOL(ipv6_get_saddr); -int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr) +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) { struct inet6_dev *idev; int err = -EADDRNOTAVAIL; @@ -1119,7 +1174,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr) read_lock_bh(&idev->lock); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + if (ifp->scope == IFA_LINK && !(ifp->flags & banned_flags)) { ipv6_addr_copy(addr, &ifp->addr); err = 0; break; @@ -1161,6 +1216,8 @@ int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict) return ifp != NULL; } +EXPORT_SYMBOL(ipv6_chk_addr); + static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev) { @@ -1669,6 +1726,13 @@ ok: if (ifp == NULL && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; + u32 addr_flags = 0; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !ipv6_devconf.forwarding) + addr_flags = IFA_F_OPTIMISTIC; +#endif /* Do not allow to create too much of autoconfigured * addresses; this would be too easy way to crash kernel. @@ -1676,7 +1740,8 @@ ok: if (!max_addresses || ipv6_count_addresses(in6_dev) < max_addresses) ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, - addr_type&IPV6_ADDR_SCOPE_MASK, 0); + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags); if (!ifp || IS_ERR(ifp)) { in6_dev_put(in6_dev); @@ -1884,6 +1949,11 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, jiffies_to_clock_t(valid_lft * HZ), flags); + /* + * Note that section 3.1 of RFC 4429 indicates + * that the Optimistic flag should not be set for + * manually configured addresses + */ addrconf_dad_start(ifp, 0); in6_ifa_put(ifp); addrconf_verify(0); @@ -1994,7 +2064,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) return; } - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { struct in_device * in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr * ifa; @@ -2060,8 +2130,16 @@ static void init_loopback(struct net_device *dev) static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) { struct inet6_ifaddr * ifp; + u32 addr_flags = IFA_F_PERMANENT; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (idev->cnf.optimistic_dad && + !ipv6_devconf.forwarding) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + - ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, IFA_F_PERMANENT); + ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp, 0); @@ -2129,7 +2207,7 @@ ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) { struct in6_addr lladdr; - if (!ipv6_get_lladdr(link_dev, &lladdr)) { + if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { addrconf_add_linklocal(idev, &lladdr); return 0; } @@ -2147,7 +2225,7 @@ static void ip6_tnl_add_linklocal(struct inet6_dev *idev) return; } /* then try to inherit it from any device */ - for (link_dev = dev_base; link_dev; link_dev = link_dev->next) { + for_each_netdev(link_dev) { if (!ipv6_inherit_linklocal(idev, link_dev)) return; } @@ -2240,7 +2318,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, default: addrconf_dev_config(dev); break; - 
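
ipv6_get_lladdr() above trades its hard-coded IFA_F_TENTATIVE test for a caller-supplied banned_flags mask, so each call site can decide which address states disqualify a link-local address. A sketch of that contract; the flag values mirror include/linux/if_addr.h, though only their distinctness matters here:

#include <stdio.h>

#define IFA_F_OPTIMISTIC 0x04
#define IFA_F_DEPRECATED 0x20
#define IFA_F_TENTATIVE  0x40

struct ifaddr {
    const char *addr;
    unsigned int flags;
};

/* Mirror of the new contract: skip any address carrying a banned flag. */
static const char *get_lladdr(const struct ifaddr *tab, int n,
                              unsigned int banned_flags)
{
    int i;

    for (i = 0; i < n; i++)
        if (!(tab[i].flags & banned_flags))
            return tab[i].addr;
    return NULL;
}

int main(void)
{
    const struct ifaddr tab[] = {
        { "fe80::1", IFA_F_TENTATIVE | IFA_F_OPTIMISTIC },
        { "fe80::2", IFA_F_DEPRECATED },
        { "fe80::3", 0 },
    };

    /* Old behaviour: only tentative addresses are banned. */
    printf("%s\n", get_lladdr(tab, 3, IFA_F_TENTATIVE));
    /* A pickier caller can ban deprecated ones as well. */
    printf("%s\n", get_lladdr(tab, 3, IFA_F_TENTATIVE | IFA_F_DEPRECATED));
    return 0;
}
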
}; + } if (idev) { if (run_pending) addrconf_dad_run(idev); @@ -2281,8 +2359,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGENAME: -#ifdef CONFIG_SYSCTL if (idev) { + snmp6_unregister_dev(idev); +#ifdef CONFIG_SYSCTL addrconf_sysctl_unregister(&idev->cnf); neigh_sysctl_unregister(idev->nd_parms); neigh_sysctl_register(dev, idev->nd_parms, @@ -2290,10 +2369,11 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, &ndisc_ifinfo_sysctl_change, NULL); addrconf_sysctl_register(idev, &idev->cnf); - } #endif + snmp6_register_dev(idev); + } break; - }; + } return NOTIFY_OK; } @@ -2474,7 +2554,11 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) unsigned long rand_num; struct inet6_dev *idev = ifp->idev; - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + if (ifp->flags & IFA_F_OPTIMISTIC) + rand_num = 0; + else + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + ifp->probes = idev->cnf.dad_transmits; addrconf_mod_timer(ifp, AC_DAD, rand_num); } @@ -2496,7 +2580,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { - ifp->flags &= ~IFA_F_TENTATIVE; + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC); spin_unlock_bh(&ifp->lock); read_unlock_bh(&idev->lock); @@ -2516,6 +2600,14 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) addrconf_dad_stop(ifp); return; } + + /* + * Optimistic nodes can start receiving + * Frames right away + */ + if(ifp->flags & IFA_F_OPTIMISTIC) + ip6_ins_rt(ifp->rt); + addrconf_dad_kick(ifp); spin_unlock_bh(&ifp->lock); out: @@ -2540,7 +2632,7 @@ static void addrconf_dad_timer(unsigned long data) * DAD was successful */ - ifp->flags &= ~IFA_F_TENTATIVE; + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC); spin_unlock_bh(&ifp->lock); read_unlock_bh(&idev->lock); @@ -3164,16 +3256,16 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; - read_lock(&dev_base_lock); - for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < s_idx) - continue; + goto cont; if (idx > s_idx) s_ip_idx = 0; ip_idx = 0; if ((idev = in6_dev_get(dev)) == NULL) - continue; + goto cont; read_lock_bh(&idev->lock); switch (type) { case UNICAST_ADDR: @@ -3220,13 +3312,14 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, } read_unlock_bh(&idev->lock); in6_dev_put(idev); +cont: + idx++; } done: if (err <= 0) { read_unlock_bh(&idev->lock); in6_dev_put(idev); } - read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; return skb->len; @@ -3359,6 +3452,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #endif array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; +#endif } static inline size_t inet6_if_nlmsg_size(void) @@ -3372,14 +3468,44 @@ static inline size_t inet6_if_nlmsg_size(void) nla_total_size(4) /* IFLA_INET6_FLAGS */ + nla_total_size(sizeof(struct ifla_cacheinfo)) + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ ); } +static inline void __snmp6_fill_stats(u64 *stats, void **mib, 
int items, + int bytes) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(snmp_fold_field(mib, i), &stats[i]); + + memset(&stats[items], 0, pad); +} + +static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, + int bytes) +{ + switch(attrtype) { + case IFLA_INET6_STATS: + __snmp6_fill_stats(stats, (void **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes); + break; + case IFLA_INET6_ICMP6STATS: + __snmp6_fill_stats(stats, (void **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes); + break; + } +} + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, u32 pid, u32 seq, int event, unsigned int flags) { struct net_device *dev = idev->dev; - struct nlattr *conf; + struct nlattr *nla; struct ifinfomsg *hdr; struct nlmsghdr *nlh; void *protoinfo; @@ -3419,12 +3545,22 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, ci.retrans_time = idev->nd_parms->retrans_time; NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); - conf = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); - if (conf == NULL) + nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); + if (nla == NULL) goto nla_put_failure; - ipv6_store_devconf(&idev->cnf, nla_data(conf), nla_len(conf)); + ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); - /* XXX - Statistics/MC not implemented */ + /* XXX - MC not implemented */ + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); nla_nest_end(skb, protoinfo); return nlmsg_end(skb, nlh); @@ -3442,16 +3578,19 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct inet6_dev *idev; read_lock(&dev_base_lock); - for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < s_idx) - continue; + goto cont; if ((idev = in6_dev_get(dev)) == NULL) - continue; + goto cont; err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); in6_dev_put(idev); if (err <= 0) break; +cont: + idx++; } read_unlock(&dev_base_lock); cb->args[0] = idx; @@ -3550,30 +3689,20 @@ errout: rtnl_set_sk_err(RTNLGRP_IPV6_PREFIX, err); } -static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { - [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, }, - [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, }, - [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, }, - [RTM_GETADDR - RTM_BASE] = { .doit = inet6_rtm_getaddr, - .dumpit = inet6_dump_ifaddr, }, - [RTM_GETMULTICAST - RTM_BASE] = { .dumpit = inet6_dump_ifmcaddr, }, - [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr, }, - [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute, }, - [RTM_DELROUTE - RTM_BASE] = { .doit = inet6_rtm_delroute, }, - [RTM_GETROUTE - RTM_BASE] = { .doit = inet6_rtm_getroute, - .dumpit = inet6_dump_fib, }, -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - [RTM_GETRULE - RTM_BASE] = { .dumpit = fib6_rules_dump, }, -#endif -}; - static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { 
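
__snmp6_fill_stats() above stores u64 counters with put_unaligned() because a netlink attribute payload carries no alignment guarantee, keeps the item count in slot 0, and zeroes the tail padding. The same shape in user space, with memcpy() as the portable spelling of an unaligned store:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void fill_stats(unsigned char *dst, const uint64_t *mib, int items,
                       int bytes)
{
    uint64_t v = items;
    int i, pad = bytes - (int)sizeof(uint64_t) * items;

    memcpy(dst, &v, sizeof(v));                 /* slot 0: item count */
    for (i = 1; i < items; i++)                 /* counters from slot 1 on */
        memcpy(dst + i * sizeof(v), &mib[i], sizeof(v));
    memset(dst + items * sizeof(v), 0, pad);    /* zero the tail pad */
}

int main(void)
{
    uint64_t mib[4] = { 0, 11, 22, 33 };
    unsigned char attr[5 * sizeof(uint64_t)];   /* deliberately larger */
    uint64_t out;

    fill_stats(attr, mib, 4, sizeof(attr));
    memcpy(&out, attr + 2 * sizeof(uint64_t), sizeof(out));
    printf("stats[2] = %llu\n", (unsigned long long)out);
    return 0;
}
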
inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); switch (event) { case RTM_NEWADDR: - ip6_ins_rt(ifp->rt); + /* + * If the address was optimistic + * we inserted the route at the start of + * our DAD process, so we don't need + * to do it again + */ + if (!(ifp->rt->rt6i_node)) + ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); break; @@ -3894,6 +4023,17 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + { + .ctl_name = CTL_UNNUMBERED, + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + + }, +#endif { .ctl_name = 0, /* sentinel */ } @@ -4021,11 +4161,15 @@ int register_inet6addr_notifier(struct notifier_block *nb) return atomic_notifier_chain_register(&inet6addr_chain, nb); } +EXPORT_SYMBOL(register_inet6addr_notifier); + int unregister_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&inet6addr_chain,nb); } +EXPORT_SYMBOL(unregister_inet6addr_notifier); + /* * Init / cleanup code */ @@ -4064,7 +4208,18 @@ int __init addrconf_init(void) register_netdevice_notifier(&ipv6_dev_notf); addrconf_verify(0); - rtnetlink_links[PF_INET6] = inet6_rtnetlink_table; + + err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo); + if (err < 0) + goto errout; + + /* Only the first call to __rtnl_register can fail */ + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL); + __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr); + __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr); + __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr); + #ifdef CONFIG_SYSCTL addrconf_sysctl.sysctl_header = register_sysctl_table(addrconf_sysctl.addrconf_root_dir); @@ -4072,6 +4227,10 @@ int __init addrconf_init(void) #endif return 0; +errout: + unregister_netdevice_notifier(&ipv6_dev_notf); + + return err; } void __exit addrconf_cleanup(void) @@ -4083,7 +4242,6 @@ void __exit addrconf_cleanup(void) unregister_netdevice_notifier(&ipv6_dev_notf); - rtnetlink_links[PF_INET6] = NULL; #ifdef CONFIG_SYSCTL addrconf_sysctl_unregister(&ipv6_devconf_dflt); addrconf_sysctl_unregister(&ipv6_devconf); @@ -4095,7 +4253,7 @@ void __exit addrconf_cleanup(void) * clean dev list. */ - for (dev=dev_base; dev; dev=dev->next) { + for_each_netdev(dev) { if ((idev = __in6_dev_get(dev)) == NULL) continue; addrconf_ifdown(dev, 1); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5cac14a5c77..18cb928c8d9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -98,6 +98,11 @@ static int inet6_create(struct socket *sock, int protocol) int try_loading_module = 0; int err; + if (sock->type != SOCK_RAW && + sock->type != SOCK_DGRAM && + !inet_ehash_secret) + build_ehash_secret(); + /* Look for the requested type/protocol pair. 
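
With the sysctl table entry above, the new knob surfaces under the per-device ipv6 conf tree. A tiny reader for it; the exact procfs path is an assumption based on the usual net.ipv6.conf layout:

#include <stdio.h>

int main(void)
{
    /* Path assumed from the standard per-device ipv6 conf layout. */
    const char *path = "/proc/sys/net/ipv6/conf/all/optimistic_dad";
    FILE *f = fopen(path, "r");
    int val;

    if (!f || fscanf(f, "%d", &val) != 1) {
        perror(path);
        return 1;
    }
    printf("optimistic_dad = %d\n", val);
    fclose(f);
    return 0;
}
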
*/ answer = NULL; lookup_protocol: @@ -349,6 +354,8 @@ out: return err; } +EXPORT_SYMBOL(inet6_bind); + int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -365,6 +372,8 @@ int inet6_release(struct socket *sock) return inet_release(sock); } +EXPORT_SYMBOL(inet6_release); + int inet6_destroy_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -428,6 +437,8 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, return(0); } +EXPORT_SYMBOL(inet6_getname); + int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -437,6 +448,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); + case SIOCADDRT: case SIOCDELRT: @@ -457,6 +471,8 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(0); } +EXPORT_SYMBOL(inet6_ioctl); + const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -603,6 +619,8 @@ out_illegal: goto out; } +EXPORT_SYMBOL(inet6_register_protosw); + void inet6_unregister_protosw(struct inet_protosw *p) { @@ -619,6 +637,8 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +EXPORT_SYMBOL(inet6_unregister_protosw); + int inet6_sk_rebuild_header(struct sock *sk) { int err; @@ -678,7 +698,8 @@ int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(__be32*)skb->nh.raw) && + ((IPV6_FLOWINFO_MASK & + *(__be32 *)skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || @@ -691,61 +712,28 @@ int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL_GPL(ipv6_opt_accepted); -int -snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) -{ - if (ptr == NULL) - return -EINVAL; - - ptr[0] = __alloc_percpu(mibsize); - if (!ptr[0]) - goto err0; - - ptr[1] = __alloc_percpu(mibsize); - if (!ptr[1]) - goto err1; - - return 0; - -err1: - free_percpu(ptr[0]); - ptr[0] = NULL; -err0: - return -ENOMEM; -} - -void -snmp6_mib_free(void *ptr[2]) -{ - if (ptr == NULL) - return; - free_percpu(ptr[0]); - free_percpu(ptr[1]); - ptr[0] = ptr[1] = NULL; -} - static int __init init_ipv6_mibs(void) { - if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + if (snmp_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) + if (snmp_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) goto err_icmp_mib; - if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), - __alignof__(struct udp_mib)) < 0) + if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), + __alignof__(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp6_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib), - __alignof__(struct udp_mib)) < 0) + if (snmp_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib), + __alignof__(struct udp_mib)) < 0) goto err_udplite_mib; return 0; err_udplite_mib: - snmp6_mib_free((void **)udp_stats_in6); + snmp_mib_free((void **)udp_stats_in6); err_udp_mib: 
- snmp6_mib_free((void **)icmpv6_statistics); + snmp_mib_free((void **)icmpv6_statistics); err_icmp_mib: - snmp6_mib_free((void **)ipv6_statistics); + snmp_mib_free((void **)ipv6_statistics); err_ip_mib: return -ENOMEM; @@ -753,10 +741,10 @@ err_ip_mib: static void cleanup_ipv6_mibs(void) { - snmp6_mib_free((void **)ipv6_statistics); - snmp6_mib_free((void **)icmpv6_statistics); - snmp6_mib_free((void **)udp_stats_in6); - snmp6_mib_free((void **)udplite_stats_in6); + snmp_mib_free((void **)ipv6_statistics); + snmp_mib_free((void **)icmpv6_statistics); + snmp_mib_free((void **)udp_stats_in6); + snmp_mib_free((void **)udplite_stats_in6); } static int __init inet6_init(void) @@ -929,6 +917,8 @@ static void __exit inet6_exit(void) { /* First of all disallow new sockets creation. */ sock_unregister(PF_INET6); + /* Disallow any further netlink messages */ + rtnl_unregister_all(PF_INET6); /* Cleanup code parts. */ ipv6_packet_cleanup(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index dc68b7269c3..b696c840120 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -238,8 +238,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) top_iph = (struct ipv6hdr *)skb->data; top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); - nexthdr = *skb->nh.raw; - *skb->nh.raw = IPPROTO_AH; + nexthdr = *skb_network_header(skb); + *skb_network_header(skb) = IPPROTO_AH; /* When there are no extension headers, we only need to save the first * 8 bytes of the base IP header. @@ -247,7 +247,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(tmp_base, top_iph, sizeof(tmp_base)); tmp_ext = NULL; - extlen = skb->h.raw - (unsigned char *)(top_iph + 1); + extlen = skb_transport_offset(skb) + sizeof(struct ipv6hdr); if (extlen) { extlen += sizeof(*tmp_ext); tmp_ext = kmalloc(extlen, GFP_ATOMIC); @@ -268,7 +268,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) goto error_free_iph; } - ah = (struct ip_auth_hdr *)skb->h.raw; + ah = (struct ip_auth_hdr *)skb_transport_header(skb); ah->nexthdr = nexthdr; top_iph->priority = 0; @@ -316,8 +316,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) * * To erase AH: * Keeping copy of cleared headers. After AH processing, - * Moving the pointer of skb->nh.raw by using skb_pull as long as AH - * header length. Then copy back the copy as long as hdr_len + * Moving the pointer of skb->network_header by using skb_pull as long + * as AH header length. Then copy back the copy as long as hdr_len * If destination header following AH exists, copy it into after [Ext2]. 
* * |<>|[IPv6][Ext1][Ext2][Dest][Payload] @@ -325,6 +325,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) */ struct ipv6_auth_hdr *ah; + struct ipv6hdr *ip6h; struct ah_data *ahp; unsigned char *tmp_hdr = NULL; u16 hdr_len; @@ -341,7 +342,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto out; - hdr_len = skb->data - skb->nh.raw; + hdr_len = skb->data - skb_network_header(skb); ah = (struct ipv6_auth_hdr*)skb->data; ahp = x->data; nexthdr = ah->nexthdr; @@ -354,16 +355,17 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, ah_hlen)) goto out; - tmp_hdr = kmemdup(skb->nh.raw, hdr_len, GFP_ATOMIC); + tmp_hdr = kmemdup(skb_network_header(skb), hdr_len, GFP_ATOMIC); if (!tmp_hdr) goto out; - if (ipv6_clear_mutable_options(skb->nh.ipv6h, hdr_len, XFRM_POLICY_IN)) + ip6h = ipv6_hdr(skb); + if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) goto free_out; - skb->nh.ipv6h->priority = 0; - skb->nh.ipv6h->flow_lbl[0] = 0; - skb->nh.ipv6h->flow_lbl[1] = 0; - skb->nh.ipv6h->flow_lbl[2] = 0; - skb->nh.ipv6h->hop_limit = 0; + ip6h->priority = 0; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->hop_limit = 0; { u8 auth_data[MAX_AH_AUTH_LEN]; @@ -382,7 +384,9 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) } } - skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), tmp_hdr, hdr_len); + skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + hdr_len); kfree(tmp_hdr); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 09117d63256..9b81264eb78 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -423,14 +423,18 @@ static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) */ int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr) { + int found = 0; + if (dev) return ipv6_chk_acast_dev(dev, addr); read_lock(&dev_base_lock); - for (dev=dev_base; dev; dev=dev->next) - if (ipv6_chk_acast_dev(dev, addr)) + for_each_netdev(dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = 1; break; + } read_unlock(&dev_base_lock); - return dev != 0; + return found; } @@ -447,9 +451,8 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) struct ifacaddr6 *im = NULL; struct ac6_iter_state *state = ac6_seq_private(seq); - for (state->dev = dev_base, state->idev = NULL; - state->dev; - state->dev = state->dev->next) { + state->idev = NULL; + for_each_netdev(state->dev) { struct inet6_dev *idev; idev = in6_dev_get(state->dev); if (!idev) @@ -476,7 +479,7 @@ static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im read_unlock_bh(&state->idev->lock); in6_dev_put(state->idev); } - state->dev = state->dev->next; + state->dev = next_net_device(state->dev); if (!state->dev) { state->idev = NULL; break; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 3b4e8dcf4c8..403eee66b9c 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -209,7 +209,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct ipv6_pinfo *np = inet6_sk(sk); - struct icmp6hdr *icmph = (struct icmp6hdr *)skb->h.raw; + struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!np->recverr) @@ -227,11 +227,12 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->ee.ee_pad = 0; 
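
ipv6_icmp_error() above parks the offending packet on the socket error queue, with addr_offset recording where the original destination address sits relative to the network header so user space can recover it. The consuming side, sketched for a UDPv6 socket with IPV6_RECVERR enabled (nothing arrives unless some send actually failed):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/errqueue.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET6, SOCK_DGRAM, 0);
    int on = 1;
    char data[256], cbuf[512];
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
    struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                          .msg_control = cbuf,
                          .msg_controllen = sizeof(cbuf) };
    struct cmsghdr *cm;

    setsockopt(fd, IPPROTO_IPV6, IPV6_RECVERR, &on, sizeof(on));

    /* ... send traffic; an ICMPv6 error lands on the error queue ... */
    if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) >= 0) {
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
            if (cm->cmsg_level == IPPROTO_IPV6 &&
                cm->cmsg_type == IPV6_RECVERR) {
                struct sock_extended_err *ee =
                    (struct sock_extended_err *)CMSG_DATA(cm);

                printf("err %d origin %d\n", ee->ee_errno, ee->ee_origin);
            }
    }
    close(fd);
    return 0;
}
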
serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&(((struct ipv6hdr*)(icmph+1))->daddr) - skb->nh.raw; + serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - + skb_network_header(skb); serr->port = port; - skb->h.raw = payload; __skb_pull(skb, payload - skb->data); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); @@ -251,8 +252,9 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) if (!skb) return; - iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr)); - skb->nh.ipv6h = iph; + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); serr = SKB_EXT_ERR(skb); @@ -263,11 +265,11 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; - serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl->fl_ip_dport; - skb->h.raw = skb->tail; - __skb_pull(skb, skb->tail - skb->data); + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); @@ -309,21 +311,24 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) sin = (struct sockaddr_in6 *)msg->msg_name; if (sin) { + const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; sin->sin6_scope_id = 0; if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { ipv6_addr_copy(&sin->sin6_addr, - (struct in6_addr *)(skb->nh.raw + serr->addr_offset)); + (struct in6_addr *)(nh + serr->addr_offset)); if (np->sndflow) - sin->sin6_flowinfo = *(__be32*)(skb->nh.raw + serr->addr_offset - 24) & IPV6_FLOWINFO_MASK; + sin->sin6_flowinfo = + (*(__be32 *)(nh + serr->addr_offset - 24) & + IPV6_FLOWINFO_MASK); if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin->sin6_scope_id = IP6CB(skb)->iif; } else { ipv6_addr_set(&sin->sin6_addr, 0, 0, htonl(0xffff), - *(__be32*)(skb->nh.raw + serr->addr_offset)); + *(__be32 *)(nh + serr->addr_offset)); } } @@ -335,7 +340,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { - ipv6_addr_copy(&sin->sin6_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr); if (np->rxopt.all) datagram_recv_ctl(sk, msg, skb); if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) @@ -344,8 +349,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) struct inet_sock *inet = inet_sk(sk); ipv6_addr_set(&sin->sin6_addr, 0, 0, - htonl(0xffff), - skb->nh.iph->saddr); + htonl(0xffff), ip_hdr(skb)->saddr); if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } @@ -381,33 +385,34 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { - int hlim = skb->nh.ipv6h->hop_limit; + int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, 
SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { - int tclass = (ntohl(*(__be32 *)skb->nh.ipv6h) >> 20) & 0xff; + int tclass = (ntohl(*(__be32 *)ipv6_hdr(skb)) >> 20) & 0xff; put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } - if (np->rxopt.bits.rxflow && (*(__be32*)skb->nh.raw & IPV6_FLOWINFO_MASK)) { - __be32 flowinfo = *(__be32*)skb->nh.raw & IPV6_FLOWINFO_MASK; + if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { + __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && opt->hop) { - u8 *ptr = skb->nh.raw + opt->hop; + u8 *ptr = nh + opt->hop; put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } @@ -423,11 +428,11 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); - u8 nexthdr = skb->nh.ipv6h->nexthdr; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned len; - u8 *ptr = skb->nh.raw + off; + u8 *ptr = nh + off; switch(nexthdr) { case IPPROTO_DSTOPTS: @@ -461,27 +466,27 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { - int hlim = skb->nh.ipv6h->hop_limit; + int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && opt->hop) { - u8 *ptr = skb->nh.raw + opt->hop; + u8 *ptr = nh + opt->hop; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { - u8 *ptr = skb->nh.raw + opt->dst0; + u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { - struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt); + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { - u8 *ptr = skb->nh.raw + opt->dst1; + u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } return 0; @@ -718,7 +723,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, cmsg->cmsg_type); err = -EINVAL; break; - }; + } } exit_f: diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 363e63ffecc..7107bb7e2e6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -42,21 +42,19 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - int hdr_len; struct ipv6hdr *top_iph; struct ipv6_esp_hdr *esph; struct crypto_blkcipher *tfm; struct blkcipher_desc desc; - struct esp_data *esp; struct sk_buff *trailer; int blksize; int clen; int alen; int nfrags; - - esp = x->data; - hdr_len = skb->h.raw - skb->data + - sizeof(*esph) + esp->conf.ivlen; + u8 *tail; + struct esp_data *esp = x->data; + int hdr_len = (skb_transport_offset(skb) + + sizeof(*esph) + esp->conf.ivlen); /* Strip IP+ESP header. */ __skb_pull(skb, hdr_len); @@ -81,19 +79,20 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) } /* Fill padding... 
*/ + tail = skb_tail_pointer(trailer); do { int i; for (i=0; i<clen-skb->len - 2; i++) - *(u8*)(trailer->tail + i) = i+1; + tail[i] = i + 1; } while (0); - *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + tail[clen-skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len); - esph = (struct ipv6_esp_hdr *)skb->h.raw; + esph = (struct ipv6_esp_hdr *)skb_transport_header(skb); top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph)); - *(u8*)(trailer->tail - 1) = *skb->nh.raw; - *skb->nh.raw = IPPROTO_ESP; + *(skb_tail_pointer(trailer) - 1) = *skb_network_header(skb); + *skb_network_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(++x->replay.oseq); @@ -150,8 +149,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4); int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen; - - int hdr_len = skb->h.raw - skb->nh.raw; + int hdr_len = skb_network_header_len(skb); int nfrags; int ret = 0; @@ -191,7 +189,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; esph = (struct ipv6_esp_hdr*)skb->data; - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -231,28 +229,30 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) ret = nexthdr[1]; } - skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; - + __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen); + skb_set_transport_header(skb, -hdr_len); out: return ret; } -static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) { struct esp_data *esp = x->data; u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4); + u32 align = max_t(u32, blksize, esp->conf.padlen); + u32 rem; + + mtu -= x->props.header_len + esp->auth.icv_trunc_len; + rem = mtu & (align - 1); + mtu &= ~(align - 1); - if (x->props.mode == XFRM_MODE_TUNNEL) { - mtu = ALIGN(mtu + 2, blksize); - } else { - /* The worst case. 
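
The rewritten padding loop above fills the ESP trailer with the self-describing sequence 1, 2, 3, ... required by RFC 2406, then appends the pad-length byte and the next-header byte. The trailer construction in isolation:

#include <stdio.h>

/* Build an ESP trailer: monotonically increasing pad bytes, the pad
 * length, then the next-header value (RFC 2406, section 2.4). */
static void esp_fill_trailer(unsigned char *tail, int padlen,
                             unsigned char nexthdr)
{
    int i;

    for (i = 0; i < padlen; i++)
        tail[i] = i + 1;        /* 1, 2, 3, ... lets receivers verify */
    tail[padlen] = padlen;
    tail[padlen + 1] = nexthdr;
}

int main(void)
{
    unsigned char trailer[8];
    int i;

    esp_fill_trailer(trailer, 6, 41);   /* 41 == IPPROTO_IPV6 */
    for (i = 0; i < 8; i++)
        printf("%02x ", trailer[i]);
    printf("\n");
    return 0;
}
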
*/ + if (x->props.mode != XFRM_MODE_TUNNEL) { u32 padsize = ((blksize - 1) & 7) + 1; - mtu = ALIGN(mtu + 2, padsize) + blksize - padsize; + mtu -= blksize - padsize; + mtu += min_t(u32, blksize - padsize, rem); } - if (esp->conf.padlen) - mtu = ALIGN(mtu, esp->conf.padlen); - return mtu + x->props.header_len + esp->auth.icv_trunc_len; + return mtu - 2; } static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -382,7 +382,7 @@ static struct xfrm_type esp6_type = .proto = IPPROTO_ESP, .init_state = esp6_init_state, .destructor = esp6_destroy, - .get_max_size = esp6_get_max_size, + .get_mtu = esp6_get_mtu, .input = esp6_input, .output = esp6_output, .hdr_offset = xfrm6_find_1stfragopt, diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index fb39604c3d0..6d8e4ac7bda 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -50,13 +50,14 @@ int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) { - int packet_len = skb->tail - skb->nh.raw; + const unsigned char *nh = skb_network_header(skb); + int packet_len = skb->tail - skb->network_header; struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; - hdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) @@ -66,7 +67,7 @@ int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) len -= 2; while (len > 0) { - int opttype = skb->nh.raw[offset]; + int opttype = nh[offset]; int optlen; if (opttype == type) @@ -77,7 +78,7 @@ int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) optlen = 1; break; default: - optlen = skb->nh.raw[offset + 1] + 2; + optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; @@ -113,7 +114,7 @@ static int ip6_tlvopt_unknown(struct sk_buff **skbp, int optoff) { struct sk_buff *skb = *skbp; - switch ((skb->nh.raw[optoff] & 0xC0) >> 6) { + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return 1; @@ -124,12 +125,12 @@ static int ip6_tlvopt_unknown(struct sk_buff **skbp, int optoff) /* Actually, it is redundant check. icmp_send will recheck in any case. */ - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return 0; - }; + } kfree_skb(skb); return 0; @@ -141,19 +142,20 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct tlvtype_proc *curr; - int off = skb->h.raw - skb->nh.raw; - int len = ((skb->h.raw[1]+1)<<3); + const unsigned char *nh = skb_network_header(skb); + int off = skb_network_header_len(skb); + int len = (skb_transport_header(skb)[1] + 1) << 3; - if ((skb->h.raw + len) - skb->data > skb_headlen(skb)) + if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; off += 2; len -= 2; while (len > 0) { - int optlen = skb->nh.raw[off+1]+2; + int optlen = nh[off + 1] + 2; - switch (skb->nh.raw[off]) { + switch (nh[off]) { case IPV6_TLV_PAD0: optlen = 1; break; @@ -165,7 +167,7 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff **skbp) if (optlen > len) goto bad; for (curr=procs; curr->type >= 0; curr++) { - if (curr->type == skb->nh.raw[off]) { + if (curr->type == nh[off]) { /* type specific length/alignment checks will be performed in the func(). 
*/ @@ -200,7 +202,7 @@ static int ipv6_dest_hao(struct sk_buff **skbp, int optoff) struct sk_buff *skb = *skbp; struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); - struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->nh.raw; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct in6_addr tmp_addr; int ret; @@ -211,7 +213,7 @@ static int ipv6_dest_hao(struct sk_buff **skbp, int optoff) opt->dsthao = opt->dst1; opt->dst1 = 0; - hao = (struct ipv6_destopt_hao *)(skb->nh.raw + optoff); + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { LIMIT_NETDEBUG( @@ -244,8 +246,9 @@ static int ipv6_dest_hao(struct sk_buff **skbp, int optoff) /* update all variable using below by copied skbuff */ *skbp = skb = skb2; - hao = (struct ipv6_destopt_hao *)(skb2->nh.raw + optoff); - ipv6h = (struct ipv6hdr *)skb2->nh.raw; + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb2) + + optoff); + ipv6h = ipv6_hdr(skb2); } if (skb->ip_summed == CHECKSUM_COMPLETE) @@ -255,7 +258,7 @@ static int ipv6_dest_hao(struct sk_buff **skbp, int optoff) ipv6_addr_copy(&ipv6h->saddr, &hao->addr); ipv6_addr_copy(&hao->addr, &tmp_addr); - if (skb->tstamp.off_sec == 0) + if (skb->tstamp.tv64 == 0) __net_timestamp(skb); return 1; @@ -285,16 +288,16 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp) #endif struct dst_entry *dst; - if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || - !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } - opt->lastopt = skb->h.raw - skb->nh.raw; - opt->dst1 = skb->h.raw - skb->nh.raw; + opt->lastopt = opt->dst1 = skb_network_header_len(skb); #ifdef CONFIG_IPV6_MIP6 dstbuf = opt->dst1; #endif @@ -303,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp) if (ip6_parse_tlv(tlvprocdestopt_lst, skbp)) { dst_release(dst); skb = *skbp; - skb->h.raw += ((skb->h.raw[1]+1)<<3); + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); #ifdef CONFIG_IPV6_MIP6 opt->nhoff = dstbuf; @@ -384,18 +387,20 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp) in6_dev_put(idev); - if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || - !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } - hdr = (struct ipv6_rt_hdr *) skb->h.raw; + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); switch (hdr->type) { #ifdef CONFIG_IPV6_MIP6 + case IPV6_SRCRT_TYPE_2: break; #endif case IPV6_SRCRT_TYPE_0: @@ -406,11 +411,12 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp) default: IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + (&hdr->type) - skb_network_header(skb)); return -1; } - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr) || + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS); @@ -438,12 +444,11 @@ looped_back: break; } - opt->lastopt = skb->h.raw - 
skb->nh.raw; - opt->srcrt = skb->h.raw - skb->nh.raw; - skb->h.raw += (hdr->hdrlen + 1) << 3; + opt->lastopt = opt->srcrt = skb_network_header_len(skb); + skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; - opt->nhoff = (&hdr->nexthdr) - skb->nh.raw; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } @@ -452,7 +457,9 @@ looped_back: if (hdr->hdrlen & 0x01) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->hdrlen) - + skb_network_header(skb))); return -1; } break; @@ -479,7 +486,9 @@ looped_back: if (hdr->segments_left > n) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->segments_left) - skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); return -1; } @@ -498,7 +507,7 @@ looped_back: kfree_skb(skb); *skbp = skb = skb2; opt = IP6CB(skb2); - hdr = (struct ipv6_rt_hdr *) skb2->h.raw; + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb2); } if (skb->ip_summed == CHECKSUM_COMPLETE) @@ -514,7 +523,7 @@ looped_back: #ifdef CONFIG_IPV6_MIP6 case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, - (xfrm_address_t *)&skb->nh.ipv6h->saddr, + (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS); @@ -541,19 +550,19 @@ looped_back: } ipv6_addr_copy(&daddr, addr); - ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr); - ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr); + ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); dst_release(xchg(&skb->dst, NULL)); ip6_route_input(skb); if (skb->dst->error) { - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } if (skb->dst->dev->flags&IFF_LOOPBACK) { - if (skb->nh.ipv6h->hop_limit <= 1) { + if (ipv6_hdr(skb)->hop_limit <= 1) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, @@ -561,11 +570,11 @@ looped_back: kfree_skb(skb); return -1; } - skb->nh.ipv6h->hop_limit--; + ipv6_hdr(skb)->hop_limit--; goto looped_back; } - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } @@ -656,13 +665,14 @@ EXPORT_SYMBOL_GPL(ipv6_invert_rthdr); static int ipv6_hop_ra(struct sk_buff **skbp, int optoff) { struct sk_buff *skb = *skbp; + const unsigned char *nh = skb_network_header(skb); - if (skb->nh.raw[optoff+1] == 2) { + if (nh[optoff + 1] == 2) { IP6CB(skb)->ra = optoff; return 1; } LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", - skb->nh.raw[optoff+1]); + nh[optoff + 1]); kfree_skb(skb); return 0; } @@ -672,23 +682,24 @@ static int ipv6_hop_ra(struct sk_buff **skbp, int optoff) static int ipv6_hop_jumbo(struct sk_buff **skbp, int optoff) { struct sk_buff *skb = *skbp; + const unsigned char *nh = skb_network_header(skb); u32 pkt_len; - if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { + if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", - skb->nh.raw[optoff+1]); + nh[optoff+1]); IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); goto drop; } - pkt_len = ntohl(*(__be32*)(skb->nh.raw+optoff+2)); 
+ pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return 0; } - if (skb->nh.ipv6h->payload_len) { + if (ipv6_hdr(skb)->payload_len) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return 0; @@ -727,13 +738,14 @@ int ipv6_parse_hopopts(struct sk_buff **skbp) struct inet6_skb_parm *opt = IP6CB(skb); /* - * skb->nh.raw is equal to skb->data, and - * skb->h.raw - skb->nh.raw is always equal to + * skb_network_header(skb) is equal to skb->data, and + * skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || - !pskb_may_pull(skb, sizeof(struct ipv6hdr) + ((skb->h.raw[1] + 1) << 3))) { + !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { kfree_skb(skb); return -1; } @@ -741,7 +753,7 @@ int ipv6_parse_hopopts(struct sk_buff **skbp) opt->hop = sizeof(struct ipv6hdr); if (ip6_parse_tlv(tlvprochopopt_lst, skbp)) { skb = *skbp; - skb->h.raw += (skb->h.raw[1]+1)<<3; + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; @@ -810,6 +822,8 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } +EXPORT_SYMBOL(ipv6_push_nfrag_opts); + void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ea3035b4e3e..fc3882c9060 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -17,6 +17,7 @@ #include <net/fib_rules.h> #include <net/ipv6.h> +#include <net/addrconf.h> #include <net/ip6_route.h> #include <net/netlink.h> @@ -95,8 +96,27 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, if (table) rt = lookup(table, flp, flags); - if (rt != &ip6_null_entry) + if (rt != &ip6_null_entry) { + struct fib6_rule *r = (struct fib6_rule *)rule; + + /* + * If we need to find a source address for this traffic, + * we check the result if it meets requirement of the rule. + */ + if ((rule->flags & FIB_RULE_FIND_SADDR) && + r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { + struct in6_addr saddr; + if (ipv6_get_saddr(&rt->u.dst, &flp->fl6_dst, + &saddr)) + goto again; + if (!ipv6_prefix_equal(&saddr, &r->src.addr, + r->src.plen)) + goto again; + ipv6_addr_copy(&flp->fl6_src, &saddr); + } goto out; + } +again: dst_release(&rt->u.dst); rt = NULL; goto out; @@ -117,9 +137,17 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) !ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen)) return 0; + /* + * If FIB_RULE_FIND_SADDR is set and we do not have a + * source address for the traffic, we defer check for + * source address. 
+ */ if (r->src.plen) { - if (!(flags & RT6_LOOKUP_F_HAS_SADDR) || - !ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, r->src.plen)) + if (flags & RT6_LOOKUP_F_HAS_SADDR) { + if (!ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, + r->src.plen)) + return 0; + } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } @@ -216,11 +244,6 @@ nla_put_failure: return -ENOBUFS; } -int fib6_rules_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - return fib_rules_dump(skb, cb, AF_INET6); -} - static u32 fib6_rule_default_pref(void) { return 0x3FFF; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index edfe98bf64c..e9bcce9e7bd 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -68,6 +68,7 @@ #include <asm/system.h> DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; +EXPORT_SYMBOL(icmpv6_statistics); /* * The ICMP socket(s). This is the most convenient way to flow control @@ -128,9 +129,9 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) static int is_ineligible(struct sk_buff *skb) { - int ptr = (u8*)(skb->nh.ipv6h+1) - skb->data; + int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; - __u8 nexthdr = skb->nh.ipv6h->nexthdr; + __u8 nexthdr = ipv6_hdr(skb)->nexthdr; if (len < 0) return 1; @@ -205,7 +206,7 @@ static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; - offset += skb->nh.raw - skb->data; + offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (op == NULL) return 1; @@ -221,7 +222,7 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) goto out; - icmp6h = (struct icmp6hdr*) skb->h.raw; + icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); icmp6h->icmp6_cksum = 0; @@ -274,7 +275,7 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st #ifdef CONFIG_IPV6_MIP6 static void mip6_addr_swap(struct sk_buff *skb) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6_destopt_hao *hao; struct in6_addr tmp; @@ -283,7 +284,8 @@ static void mip6_addr_swap(struct sk_buff *skb) if (opt->dsthao) { off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(off >= 0)) { - hao = (struct ipv6_destopt_hao *)(skb->nh.raw + off); + hao = (struct ipv6_destopt_hao *) + (skb_network_header(skb) + off); ipv6_addr_copy(&tmp, &iph->saddr); ipv6_addr_copy(&iph->saddr, &hao->addr); ipv6_addr_copy(&hao->addr, &tmp); @@ -301,7 +303,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct net_device *dev) { struct inet6_dev *idev = NULL; - struct ipv6hdr *hdr = skb->nh.ipv6h; + struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct ipv6_pinfo *np; struct in6_addr *saddr = NULL; @@ -315,7 +317,8 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, int hlimit, tclass; int err = 0; - if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail) + if ((u8 *)hdr < skb->head || + (skb->network_header + sizeof(*hdr)) > skb->tail) return; /* @@ -430,7 +433,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, tclass = 0; msg.skb = skb; - msg.offset = skb->nh.raw - skb->data; + msg.offset = skb_network_offset(skb); msg.type = type; len = skb->len - msg.offset; @@ -466,13 +469,15 @@ out: icmpv6_xmit_unlock(); } +EXPORT_SYMBOL(icmpv6_send); + static void icmpv6_echo_reply(struct sk_buff *skb) { struct sock *sk; struct 
inet6_dev *idev; struct ipv6_pinfo *np; struct in6_addr *saddr = NULL; - struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw; + struct icmp6hdr *icmph = icmp6_hdr(skb); struct icmp6hdr tmp_hdr; struct flowi fl; struct icmpv6_msg msg; @@ -481,7 +486,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) int hlimit; int tclass; - saddr = &skb->nh.ipv6h->daddr; + saddr = &ipv6_hdr(skb)->daddr; if (!ipv6_unicast_destination(skb)) saddr = NULL; @@ -491,7 +496,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); if (saddr) ipv6_addr_copy(&fl.fl6_src, saddr); fl.oif = skb->dev->ifindex; @@ -579,8 +584,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) if (!pskb_may_pull(skb, inner_offset+8)) return; - saddr = &skb->nh.ipv6h->saddr; - daddr = &skb->nh.ipv6h->daddr; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed @@ -624,8 +629,8 @@ static int icmpv6_rcv(struct sk_buff **pskb) ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS); - saddr = &skb->nh.ipv6h->saddr; - daddr = &skb->nh.ipv6h->daddr; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; /* Perform checksum. */ switch (skb->ip_summed) { @@ -647,7 +652,7 @@ static int icmpv6_rcv(struct sk_buff **pskb) if (!pskb_pull(skb, sizeof(struct icmp6hdr))) goto discard_it; - hdr = (struct icmp6hdr *) skb->h.raw; + hdr = icmp6_hdr(skb); type = hdr->icmp6_type; @@ -673,7 +678,7 @@ static int icmpv6_rcv(struct sk_buff **pskb) */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto discard_it; - hdr = (struct icmp6hdr *) skb->h.raw; + hdr = icmp6_hdr(skb); orig_hdr = (struct ipv6hdr *) (hdr + 1); rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, ntohl(hdr->icmp6_mtu)); @@ -727,7 +732,8 @@ static int icmpv6_rcv(struct sk_buff **pskb) */ icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); - }; + } + kfree_skb(skb); return 0; @@ -860,11 +866,13 @@ int icmpv6_err_convert(int type, int code, int *err) case ICMPV6_TIME_EXCEED: *err = EHOSTUNREACH; break; - }; + } return fatal; } +EXPORT_SYMBOL(icmpv6_err_convert); + #ifdef CONFIG_SYSCTL ctl_table ipv6_icmp_table[] = { { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 268f476ef3d..ca08ee88d07 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -359,7 +359,7 @@ end: return res; } -int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int h, s_h; unsigned int e = 0, s_e; @@ -1486,6 +1486,8 @@ void __init fib6_init(void) NULL, NULL); fib6_tables_init(); + + __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); } void fib6_gc_cleanup(void) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 61e7a6c8141..be0ee8a34f9 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -96,12 +96,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; - hdr = skb->nh.ipv6h; + hdr = ipv6_hdr(skb); if (hdr->version != 6) goto err; - skb->h.raw = (u8 *)(hdr + 1); + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); pkt_len = ntohs(hdr->payload_len); @@ -116,7 +116,7 @@ int 
ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS); goto drop; } - hdr = skb->nh.ipv6h; + hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP) { @@ -160,10 +160,10 @@ static inline int ip6_input_finish(struct sk_buff *skb) rcu_read_lock(); resubmit: idev = ip6_dst_idev(skb->dst); - if (!pskb_pull(skb, skb->h.raw - skb->data)) + if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nhoff = IP6CB(skb)->nhoff; - nexthdr = skb->nh.raw[nhoff]; + nexthdr = skb_network_header(skb)[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) @@ -181,9 +181,9 @@ resubmit: indefinitely. */ nf_reset(skb); - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); - hdr = skb->nh.ipv6h; + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + hdr = ipv6_hdr(skb); if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, &hdr->saddr) && @@ -234,7 +234,7 @@ int ip6_mc_input(struct sk_buff *skb) IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS); - hdr = skb->nh.ipv6h; + hdr = ipv6_hdr(skb); deliver = likely(!(skb->dev->flags & (IFF_PROMISC|IFF_ALLMULTI))) || ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 305516921aa..f508171bab7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -88,8 +88,8 @@ static inline int ip6_output_finish(struct sk_buff *skb) /* dev_loopback_xmit for use with netfilter. */ static int ip6_dev_loopback_xmit(struct sk_buff *newskb) { - newskb->mac.raw = newskb->data; - __skb_pull(newskb, newskb->nh.raw - newskb->data); + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); @@ -107,13 +107,13 @@ static int ip6_output2(struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; - if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) { + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL; struct inet6_dev *idev = ip6_dst_idev(skb->dst); if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && - ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr)) { + ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr)) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing @@ -124,7 +124,7 @@ static int ip6_output2(struct sk_buff *skb) newskb->dev, ip6_dev_loopback_xmit); - if (skb->nh.ipv6h->hop_limit == 0) { + if (ipv6_hdr(skb)->hop_limit == 0) { IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; @@ -137,9 +137,17 @@ static int ip6_output2(struct sk_buff *skb) return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); } +static inline int ip6_skb_dst_mtu(struct sk_buff *skb) +{ + struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + + return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? 
+ skb->dst->dev->mtu : dst_mtu(skb->dst); +} + int ip6_output(struct sk_buff *skb) { - if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) || + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else @@ -191,7 +199,9 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); } - hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); /* * Fill in the IPv6 header @@ -239,6 +249,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, return -EMSGSIZE; } +EXPORT_SYMBOL(ip6_xmit); + /* * To avoid extra problems ND packets are send through this * routine. It's code duplication but I really want to avoid @@ -259,8 +271,9 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, totlen = len + sizeof(struct ipv6hdr); - hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); - skb->nh.ipv6h = hdr; + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); *(__be32*)hdr = htonl(0x60000000); @@ -305,7 +318,7 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) static int ip6_forward_proxy_check(struct sk_buff *skb) { - struct ipv6hdr *hdr = skb->nh.ipv6h; + struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; int offset; @@ -319,10 +332,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) if (nexthdr == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6; - if (!pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) return 0; - icmp6 = (struct icmp6hdr *)(skb->nh.raw + offset); + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (icmp6->icmp6_type) { case NDISC_ROUTER_SOLICITATION: @@ -361,7 +375,7 @@ static inline int ip6_forward_finish(struct sk_buff *skb) int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; - struct ipv6hdr *hdr = skb->nh.ipv6h; + struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); if (ipv6_devconf.forwarding == 0) @@ -372,7 +386,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; } - skb->ip_summed = CHECKSUM_NONE; + skb_forward_csum(skb); /* * We DO NOT make any processing on @@ -388,7 +402,7 @@ int ip6_forward(struct sk_buff *skb) * that different fragments will go along one path. 
--ANK */ if (opt->ra) { - u8 *ptr = skb->nh.raw + opt->ra; + u8 *ptr = skb_network_header(skb) + opt->ra; if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) return 0; } @@ -470,7 +484,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; } - hdr = skb->nh.ipv6h; + hdr = ipv6_hdr(skb); /* Mangling hops number delayed to point after skb COW */ @@ -499,33 +513,18 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif -#ifdef CONFIG_NETFILTER - /* Connection association is same as pre-frag packet */ - nf_conntrack_put(to->nfct); - to->nfct = from->nfct; - nf_conntrack_get(to->nfct); - to->nfctinfo = from->nfctinfo; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_conntrack_put_reasm(to->nfct_reasm); - to->nfct_reasm = from->nfct_reasm; - nf_conntrack_get_reasm(to->nfct_reasm); -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(to->nf_bridge); - to->nf_bridge = from->nf_bridge; - nf_bridge_get(to->nf_bridge); -#endif -#endif + nf_copy(to, from); skb_copy_secmark(to, from); } int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); - unsigned int packet_len = skb->tail - skb->nh.raw; + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + unsigned int packet_len = skb->tail - skb->network_header; int found_rhdr = 0; - *nexthdr = &skb->nh.ipv6h->nexthdr; + *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset + 1 <= packet_len) { @@ -550,7 +549,8 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) offset += ipv6_optlen(exthdr); *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); } return offset; @@ -574,7 +574,20 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; - mtu = dst_mtu(&rt->u.dst); + mtu = ip6_skb_dst_mtu(skb); + + /* We must not fragment if the socket is set to force MTU discovery + * or if the skb it not generated by a local socket. (This last + * check should be redundant, but it's free.) 
+ */ + if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) { + skb->dev = skb->dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + if (np && np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; @@ -616,7 +629,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb->nh.raw, hlen, GFP_ATOMIC); + tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); return -ENOMEM; @@ -624,8 +637,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) __skb_pull(skb, hlen); fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); - skb->nh.raw = __skb_push(skb, hlen); - memcpy(skb->nh.raw, tmp_hdr, hlen); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), tmp_hdr, hlen); ipv6_select_ident(skb, fh); fh->nexthdr = nexthdr; @@ -636,7 +650,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; - skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + ipv6_hdr(skb)->payload_len = htons(first_len - + sizeof(struct ipv6hdr)); dst_hold(&rt->u.dst); @@ -645,10 +660,12 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; - frag->h.raw = frag->data; + skb_reset_transport_header(frag); fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); - frag->nh.raw = __skb_push(frag, hlen); - memcpy(frag->nh.raw, tmp_hdr, hlen); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), tmp_hdr, + hlen); offset += skb->len - hlen - sizeof(struct frag_hdr); fh->nexthdr = nexthdr; fh->reserved = 0; @@ -656,7 +673,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (frag->next != NULL) fh->frag_off |= htons(IP6_MF); fh->identification = frag_id; - frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ipv6_hdr(frag)->payload_len = + htons(frag->len - + sizeof(struct ipv6hdr)); ip6_copy_metadata(frag, skb); } @@ -733,9 +752,10 @@ slow_path: ip6_copy_metadata(frag, skb); skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - frag->nh.raw = frag->data; - fh = (struct frag_hdr*)(frag->data + hlen); - frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); + frag->transport_header = (frag->network_header + hlen + + sizeof(struct frag_hdr)); /* * Charge the memory for the fragment to any owner @@ -747,7 +767,7 @@ slow_path: /* * Copy the packet header into the new buffer. */ - memcpy(frag->nh.raw, skb->data, hlen); + skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); /* * Build fragment header. @@ -763,14 +783,15 @@ slow_path: /* * Copy a block of the IP datagram. 
*/ - if (skb_copy_bits(skb, ptr, frag->h.raw, len)) + if (skb_copy_bits(skb, ptr, skb_transport_header(skb), len)) BUG(); left -= len; fh->frag_off = htons(offset); if (left > 0) fh->frag_off |= htons(IP6_MF); - frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ipv6_hdr(frag)->payload_len = htons(frag->len - + sizeof(struct ipv6hdr)); ptr += len; offset += len; @@ -861,6 +882,41 @@ static int ip6_dst_lookup_tail(struct sock *sk, goto out_err_release; } +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * Here if the dst entry we've looked up + * has a neighbour entry that is in the INCOMPLETE + * state and the src address from the flow is + * marked as OPTIMISTIC, we release the found + * dst entry and replace it instead with the + * dst entry of the nexthop router + */ + if (!((*dst)->neighbour->nud_state & NUD_VALID)) { + struct inet6_ifaddr *ifp; + struct flowi fl_gw; + int redirect; + + ifp = ipv6_get_ifaddr(&fl->fl6_src, (*dst)->dev, 1); + + redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); + if (ifp) + in6_ifa_put(ifp); + + if (redirect) { + /* + * We need to get the dst entry for the + * default router instead + */ + dst_release(*dst); + memcpy(&fl_gw, fl, sizeof(struct flowi)); + memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(sk, &fl_gw); + if ((err = (*dst)->error)) + goto out_err_release; + } + } +#endif + return 0; out_err_release: @@ -939,10 +995,10 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_put(skb,fragheaderlen + transhdrlen); /* initialize network header pointer */ - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* initialize protocol header pointer */ - skb->h.raw = skb->data + fragheaderlen; + skb->transport_header = skb->network_header + fragheaderlen; skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -1015,7 +1071,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, inet->cork.fl = *fl; np->cork.hop_limit = hlimit; np->cork.tclass = tclass; - mtu = dst_mtu(rt->u.dst.path); + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? 
+ rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; @@ -1162,10 +1219,10 @@ alloc_new_skb: * Find where to start putting bytes */ data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; + skb_set_network_header(skb, exthdrlen); data += fragheaderlen; - skb->h.raw = data + exthdrlen; - + skb->transport_header = (skb->network_header + + fragheaderlen); if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, @@ -1288,10 +1345,10 @@ int ip6_push_pending_frames(struct sock *sk) tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ - if (skb->data < skb->nh.raw) - __skb_pull(skb, skb->nh.raw - skb->data); + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; @@ -1303,13 +1360,15 @@ int ip6_push_pending_frames(struct sock *sk) } ipv6_addr_copy(final_dst, &fl->fl6_dst); - __skb_pull(skb, skb->h.raw - skb->nh.raw); + __skb_pull(skb, skb_network_header_len(skb)); if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt && opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); - skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr)); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); *(__be32*)hdr = fl->fl6_flowlabel | htonl(0x60000000 | ((int)np->cork.tclass << 20)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 08d944223ec..a0902fbdb4e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1,14 +1,15 @@ /* - * IPv6 over IPv6 tunnel device + * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> + * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * $Id$ * * Based on: - * linux/net/ipv6/sit.c + * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 * @@ -24,6 +25,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> +#include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> @@ -41,6 +43,7 @@ #include <asm/uaccess.h> #include <asm/atomic.h> +#include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -51,7 +54,7 @@ #include <net/inet_ecn.h> MODULE_AUTHOR("Ville Nuorvala"); -MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel"); +MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); #define IPV6_TLV_TEL_DST_SIZE 8 @@ -63,6 +66,7 @@ MODULE_LICENSE("GPL"); #endif #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) +#define IPV6_TCLASS_SHIFT 20 #define HASH_SIZE 32 @@ -70,12 +74,12 @@ MODULE_LICENSE("GPL"); (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ (HASH_SIZE - 1)) -static int ip6ip6_fb_tnl_dev_init(struct net_device *dev); -static int ip6ip6_tnl_dev_init(struct net_device *dev); -static void ip6ip6_tnl_dev_setup(struct net_device *dev); +static int ip6_fb_tnl_dev_init(struct net_device *dev); +static int ip6_tnl_dev_init(struct net_device *dev); +static void ip6_tnl_dev_setup(struct net_device *dev); /* the IPv6 tunnel fallback device */ -static struct net_device *ip6ip6_fb_tnl_dev; +static struct net_device *ip6_fb_tnl_dev; /* lists for storing tunnels in use */ @@ -84,7 +88,7 @@ static struct ip6_tnl *tnls_wc[1]; 
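The lookup and bucket helpers renamed in this hunk key the tunnel table on the endpoint addresses via the HASH() macro shown above: XOR-fold the four 32-bit words of an address, then mask with HASH_SIZE - 1. Below is a minimal standalone sketch of that scheme; the struct and helper names are illustrative stand-ins, and combining the remote and local hashes with a further XOR is an assumption about how the lookup body (unchanged by the patch, hence elided from the hunks) picks its chain.

#include <stdint.h>
#include <stdio.h>

#define HASH_SIZE 32			/* mirrors the macro in the patch */

struct in6_addr_sketch {		/* illustrative stand-in for struct in6_addr */
	uint32_t s6_addr32[4];
};

/* XOR-fold the address words and mask to a bucket, as HASH() does. */
static unsigned int addr_hash(const struct in6_addr_sketch *a)
{
	return (a->s6_addr32[0] ^ a->s6_addr32[1] ^
		a->s6_addr32[2] ^ a->s6_addr32[3]) & (HASH_SIZE - 1);
}

/* Assumed combination of the two per-endpoint hashes (h0, h1 in
 * ip6_tnl_lookup()) into one index for the fully-specified list. */
static unsigned int tnl_bucket(const struct in6_addr_sketch *remote,
			       const struct in6_addr_sketch *local)
{
	return (addr_hash(remote) ^ addr_hash(local)) & (HASH_SIZE - 1);
}

int main(void)
{
	struct in6_addr_sketch r = { { 0x20010db8u, 0, 0, 1 } };
	struct in6_addr_sketch l = { { 0x20010db8u, 0, 0, 2 } };

	printf("fully-specified tunnels chain off bucket %u\n",
	       tnl_bucket(&r, &l));
	return 0;
}

The one-entry tnls_wc[] list above holds the wildcard (fallback) tunnel, consulted when no chain entry matches both endpoints exactly.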
static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l }; /* lock for the tunnel lists */ -static DEFINE_RWLOCK(ip6ip6_lock); +static DEFINE_RWLOCK(ip6_tnl_lock); static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) { @@ -115,7 +119,7 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) } /** - * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * @@ -126,7 +130,7 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) **/ static struct ip6_tnl * -ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) +ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) { unsigned h0 = HASH(remote); unsigned h1 = HASH(local); @@ -145,18 +149,18 @@ ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) } /** - * ip6ip6_bucket - get head of list matching given tunnel parameters + * ip6_tnl_bucket - get head of list matching given tunnel parameters * @p: parameters containing tunnel end-points * * Description: - * ip6ip6_bucket() returns the head of the list matching the + * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl ** -ip6ip6_bucket(struct ip6_tnl_parm *p) +ip6_tnl_bucket(struct ip6_tnl_parm *p) { struct in6_addr *remote = &p->raddr; struct in6_addr *local = &p->laddr; @@ -171,36 +175,36 @@ ip6ip6_bucket(struct ip6_tnl_parm *p) } /** - * ip6ip6_tnl_link - add tunnel to hash table + * ip6_tnl_link - add tunnel to hash table * @t: tunnel to be added **/ static void -ip6ip6_tnl_link(struct ip6_tnl *t) +ip6_tnl_link(struct ip6_tnl *t) { - struct ip6_tnl **tp = ip6ip6_bucket(&t->parms); + struct ip6_tnl **tp = ip6_tnl_bucket(&t->parms); t->next = *tp; - write_lock_bh(&ip6ip6_lock); + write_lock_bh(&ip6_tnl_lock); *tp = t; - write_unlock_bh(&ip6ip6_lock); + write_unlock_bh(&ip6_tnl_lock); } /** - * ip6ip6_tnl_unlink - remove tunnel from hash table + * ip6_tnl_unlink - remove tunnel from hash table * @t: tunnel to be removed **/ static void -ip6ip6_tnl_unlink(struct ip6_tnl *t) +ip6_tnl_unlink(struct ip6_tnl *t) { struct ip6_tnl **tp; - for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) { + for (tp = ip6_tnl_bucket(&t->parms); *tp; tp = &(*tp)->next) { if (t == *tp) { - write_lock_bh(&ip6ip6_lock); + write_lock_bh(&ip6_tnl_lock); *tp = t->next; - write_unlock_bh(&ip6ip6_lock); + write_unlock_bh(&ip6_tnl_lock); break; } } @@ -237,12 +241,12 @@ static struct ip6_tnl *ip6_tnl_create(struct ip6_tnl_parm *p) if (i == IP6_TNL_MAX) goto failed; } - dev = alloc_netdev(sizeof (*t), name, ip6ip6_tnl_dev_setup); + dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); if (dev == NULL) goto failed; t = netdev_priv(dev); - dev->init = ip6ip6_tnl_dev_init; + dev->init = ip6_tnl_dev_init; t->parms = *p; if ((err = register_netdevice(dev)) < 0) { @@ -250,19 +254,19 @@ static struct ip6_tnl *ip6_tnl_create(struct ip6_tnl_parm *p) goto failed; } dev_hold(dev); - ip6ip6_tnl_link(t); + ip6_tnl_link(t); return t; failed: return NULL; } /** - * ip6ip6_tnl_locate - find or create tunnel matching given parameters + * ip6_tnl_locate - find or create tunnel matching given parameters * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: - * ip6ip6_tnl_locate() first 
tries to locate an existing tunnel + * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * @@ -270,13 +274,13 @@ failed: * matching tunnel or NULL **/ -static struct ip6_tnl *ip6ip6_tnl_locate(struct ip6_tnl_parm *p, int create) +static struct ip6_tnl *ip6_tnl_locate(struct ip6_tnl_parm *p, int create) { struct in6_addr *remote = &p->raddr; struct in6_addr *local = &p->laddr; struct ip6_tnl *t; - for (t = *ip6ip6_bucket(p); t; t = t->next) { + for (t = *ip6_tnl_bucket(p); t; t = t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) return t; @@ -287,24 +291,24 @@ static struct ip6_tnl *ip6ip6_tnl_locate(struct ip6_tnl_parm *p, int create) } /** - * ip6ip6_tnl_dev_uninit - tunnel device uninitializer + * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: - * ip6ip6_tnl_dev_uninit() removes tunnel from its list + * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void -ip6ip6_tnl_dev_uninit(struct net_device *dev) +ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - if (dev == ip6ip6_fb_tnl_dev) { - write_lock_bh(&ip6ip6_lock); + if (dev == ip6_fb_tnl_dev) { + write_lock_bh(&ip6_tnl_lock); tnls_wc[0] = NULL; - write_unlock_bh(&ip6ip6_lock); + write_unlock_bh(&ip6_tnl_lock); } else { - ip6ip6_tnl_unlink(t); + ip6_tnl_unlink(t); } ip6_tnl_dst_reset(t); dev_put(dev); @@ -372,16 +376,16 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) } /** - * ip6ip6_err - tunnel error handler + * ip6_tnl_err - tunnel error handler * * Description: - * ip6ip6_err() should handle errors in the tunnel according + * ip6_tnl_err() should handle errors in the tunnel according * to the specifications in RFC 2473. **/ static int -ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) +ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, + int *type, int *code, int *msg, __be32 *info, int offset) { struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; struct ip6_tnl *t; @@ -396,13 +400,16 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, in trouble since we might need the source address for further processing of the error. 
*/ - read_lock(&ip6ip6_lock); - if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + read_lock(&ip6_tnl_lock); + if ((t = ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + goto out; + + if (t->parms.proto != ipproto && t->parms.proto != 0) goto out; err = 0; - switch (type) { + switch (*type) { __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; @@ -414,7 +421,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_msg = 1; break; case ICMPV6_TIME_EXCEED: - if (code == ICMPV6_EXC_HOPLIMIT) { + if ((*code) == ICMPV6_EXC_HOPLIMIT) { if (net_ratelimit()) printk(KERN_WARNING "%s: Too small hop limit or " @@ -425,10 +432,10 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, break; case ICMPV6_PARAMPROB: teli = 0; - if (code == ICMPV6_HDR_FIELD) + if ((*code) == ICMPV6_HDR_FIELD) teli = parse_tlv_tnl_enc_lim(skb, skb->data); - if (teli && teli == ntohl(info) - 2) { + if (teli && teli == ntohl(*info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { if (net_ratelimit()) @@ -445,7 +452,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = ntohl(info) - offset; + mtu = ntohl(*info) - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; @@ -458,20 +465,144 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; } - if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) { + + *type = rel_type; + *code = rel_code; + *info = rel_info; + *msg = rel_msg; + +out: + read_unlock(&ip6_tnl_lock); + return err; +} + +static int +ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + int rel_msg = 0; + int rel_type = type; + int rel_code = code; + __u32 rel_info = info; + int err; + struct sk_buff *skb2; + struct iphdr *eiph; + struct flowi fl; + struct rtable *rt; + + err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg == 0) + return 0; + + switch (rel_type) { + case ICMPV6_DEST_UNREACH: + if (rel_code != ICMPV6_ADDR_UNREACH) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + case ICMPV6_PKT_TOOBIG: + if (rel_code != 0) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_FRAG_NEEDED; + break; + default: + return 0; + } + + if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) + return 0; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + return 0; + + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, offset); + skb_reset_network_header(skb2); + eiph = ip_hdr(skb2); + + /* Try to guess incoming interface */ + memset(&fl, 0, sizeof(fl)); + fl.fl4_dst = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_IPIP; + if (ip_route_output_key(&rt, &fl)) + goto out; + + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + fl.fl4_dst = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_TUNNEL) { + ip_rt_put(rt); + goto out; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, + skb2->dev) || + skb2->dst->dev->type != ARPHRD_TUNNEL) + goto out; + } + + /* change mtu on this route */ + if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb2->dst)) + goto out; + + 
skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + rel_info = htonl(rel_info); + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + +out: + kfree_skb(skb2); + return 0; +} + +static int +ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + int rel_msg = 0; + int rel_type = type; + int rel_code = code; + __u32 rel_info = info; + int err; + + err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) - goto out; + return 0; dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, offset); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ - rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0); + rt = rt6_lookup(&ipv6_hdr(skb2)->saddr, NULL, 0, 0); if (rt && rt->rt6i_dev) skb2->dev = rt->rt6i_dev; @@ -483,19 +614,34 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, kfree_skb(skb2); } -out: - read_unlock(&ip6ip6_lock); - return err; + + return 0; } -static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph, - struct sk_buff *skb) +static void ip4ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, + struct ipv6hdr *ipv6h, + struct sk_buff *skb) { - struct ipv6hdr *inner_iph = skb->nh.ipv6h; + __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); + + if (INET_ECN_is_ce(dsfield)) + IP_ECN_set_ce(ip_hdr(skb)); +} + +static void ip6ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, + struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv6_copy_dscp(ipv6h, ipv6_hdr(skb)); + + if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) + IP6_ECN_set_ce(ipv6_hdr(skb)); } + static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) { struct ip6_tnl_parm *p = &t->parms; @@ -519,53 +665,61 @@ static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) } /** - * ip6ip6_rcv - decapsulate IPv6 packet and retransmit it locally + * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally * @skb: received socket buffer + * @protocol: ethernet protocol ID + * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN * * Return: 0 **/ -static int -ip6ip6_rcv(struct sk_buff *skb) +static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, + __u8 ipproto, + void (*dscp_ecn_decapsulate)(struct ip6_tnl *t, + struct ipv6hdr *ipv6h, + struct sk_buff *skb)) { - struct ipv6hdr *ipv6h; struct ip6_tnl *t; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); - ipv6h = skb->nh.ipv6h; + read_lock(&ip6_tnl_lock); - read_lock(&ip6ip6_lock); + if ((t = ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { + if (t->parms.proto != ipproto && t->parms.proto != 0) { + read_unlock(&ip6_tnl_lock); + goto discard; + } - if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - read_unlock(&ip6ip6_lock); + read_unlock(&ip6_tnl_lock); goto discard; } if (!ip6_tnl_rcv_ctl(t)) { t->stat.rx_dropped++; - read_unlock(&ip6ip6_lock); + read_unlock(&ip6_tnl_lock); goto discard; } secpath_reset(skb); - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; - skb->protocol = htons(ETH_P_IPV6); + skb->mac_header = 
skb->network_header; + skb_reset_network_header(skb); + skb->protocol = htons(protocol); skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); skb->dev = t->dev; dst_release(skb->dst); skb->dst = NULL; nf_reset(skb); - if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) - ipv6_copy_dscp(ipv6h, skb->nh.ipv6h); - ip6ip6_ecn_decapsulate(ipv6h, skb); + + dscp_ecn_decapsulate(t, ipv6h, skb); + t->stat.rx_packets++; t->stat.rx_bytes += skb->len; netif_rx(skb); - read_unlock(&ip6ip6_lock); + read_unlock(&ip6_tnl_lock); return 0; } - read_unlock(&ip6ip6_lock); + read_unlock(&ip6_tnl_lock); return 1; discard: @@ -573,6 +727,18 @@ discard: return 0; } +static int ip4ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP, + ip4ip6_dscp_ecn_decapsulate); +} + +static int ip6ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6, + ip6ip6_dscp_ecn_decapsulate); +} + struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; @@ -593,7 +759,7 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) } /** - * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * @@ -607,7 +773,7 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) **/ static inline int -ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } @@ -641,72 +807,49 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) return ret; } /** - * ip6ip6_tnl_xmit - encapsulate packet and send + * ip6_tnl_xmit2 - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device + * @dsfield: dscp code for outer header + * @fl: flow of tunneled packet + * @encap_limit: encapsulation limit + * @pmtu: Path MTU is stored if packet is too big * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: - * 0 + * 0 on success + * -1 fail + * %-EMSGSIZE message too big. return mtu in this case. 
**/ -static int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +static int ip6_tnl_xmit2(struct sk_buff *skb, + struct net_device *dev, + __u8 dsfield, + struct flowi *fl, + int encap_limit, + __u32 *pmtu) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->stat; - struct ipv6hdr *ipv6h = skb->nh.ipv6h; - int encap_limit = -1; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ipv6_tel_txoption opt; - __u16 offset; - struct flowi fl; struct dst_entry *dst; struct net_device *tdev; int mtu; int max_headroom = sizeof(struct ipv6hdr); u8 proto; - int err; + int err = -1; int pkt_len; - int dsfield; - - if (t->recursion++) { - stats->collisions++; - goto tx_err; - } - if (skb->protocol != htons(ETH_P_IPV6) || - !ip6_tnl_xmit_ctl(t) || ip6ip6_tnl_addr_conflict(t, ipv6h)) - goto tx_err; - - if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2, skb->dev); - goto tx_err; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl, &t->fl, sizeof (fl)); - proto = fl.proto; - - dsfield = ipv6_get_dsfield(ipv6h); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); if ((dst = ip6_tnl_dst_check(t)) != NULL) dst_hold(dst); else { - dst = ip6_route_output(NULL, &fl); + dst = ip6_route_output(NULL, fl); - if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0) + if (dst->error || xfrm_lookup(&dst, fl, NULL, 0) < 0) goto tx_err_link_failure; } @@ -730,7 +873,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->dst) skb->dst->ops->update_pmtu(skb->dst, mtu); if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + *pmtu = mtu; + err = -EMSGSIZE; goto tx_err_dst_release; } @@ -754,22 +898,24 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) dst_release(skb->dst); skb->dst = dst_clone(dst); - skb->h.raw = skb->nh.raw; + skb->transport_header = skb->network_header; + proto = fl->proto; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } - skb->nh.raw = skb_push(skb, sizeof(struct ipv6hdr)); - ipv6h = skb->nh.ipv6h; - *(__be32*)ipv6h = fl.fl6_flowlabel | htonl(0x60000000); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ipv6h = ipv6_hdr(skb); + *(__be32*)ipv6h = fl->fl6_flowlabel | htonl(0x60000000); dsfield = INET_ECN_encapsulate(0, dsfield); ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; - ipv6_addr_copy(&ipv6h->saddr, &fl.fl6_src); - ipv6_addr_copy(&ipv6h->daddr, &fl.fl6_dst); + ipv6_addr_copy(&ipv6h->saddr, &fl->fl6_src); + ipv6_addr_copy(&ipv6h->daddr, &fl->fl6_dst); nf_reset(skb); pkt_len = skb->len; err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, @@ -783,13 +929,131 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_aborted_errors++; } ip6_tnl_dst_store(t, dst); - t->recursion--; return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: dst_release(dst); 
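ip6_tnl_xmit2() above writes the whole first word of the outer header at once: fl->fl6_flowlabel | htonl(0x60000000) yields version 6 plus the flow label, after which ipv6_change_dsfield() folds the (possibly ECN-encapsulated) dsfield into the traffic-class bits. A small userspace sketch of that bit layout follows; the sample label and DSCP values are assumptions for illustration only, and the mask-and-shift stands in for ipv6_change_dsfield() rather than reproducing it.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	/* Sample inputs; both values are illustrative assumptions. */
	uint32_t flowlabel = htonl(0xabcdeu & 0xfffffu);	/* 20-bit label */
	uint8_t dsfield = 0xb8;		/* e.g. DSCP EF with ECN bits clear */
	uint32_t word, host;

	/* As in the patch: *(__be32 *)ipv6h = fl6_flowlabel | htonl(0x60000000); */
	word = flowlabel | htonl(0x60000000u);

	/* Equivalent of folding the dsfield into traffic-class bits 20..27. */
	host = (ntohl(word) & ~0x0ff00000u) | ((uint32_t)dsfield << 20);
	word = htonl(host);

	printf("outer word 0x%08x -> version %u, tclass 0x%02x, label 0x%05x\n",
	       host, host >> 28, (host >> 20) & 0xffu, host & 0xfffffu);
	return 0;
}

payload_len then counts only what follows the fixed 40-byte header, which is why the patch computes htons(skb->len - sizeof(struct ipv6hdr)) after the push.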
+ return err; +} + +static inline int +ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct iphdr *iph = ip_hdr(skb); + int encap_limit = -1; + struct flowi fl; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t)) + return -1; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl, &t->fl, sizeof (fl)); + fl.proto = IPPROTO_IPIP; + + dsfield = ipv4_get_dsfield(iph); + + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl.fl6_flowlabel |= ntohl(((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK); + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + return -1; + } + + return 0; +} + +static inline int +ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int encap_limit = -1; + __u16 offset; + struct flowi fl; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) + return -1; + + offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2, skb->dev); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl, &t->fl, sizeof (fl)); + fl.proto = IPPROTO_IPV6; + + dsfield = ipv6_get_dsfield(ipv6h); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + if (err != 0) { + if (err == -EMSGSIZE) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + return -1; + } + + return 0; +} + +static int +ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->stat; + int ret; + + if (t->recursion++) { + t->stat.collisions++; + goto tx_err; + } + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ret = ip4ip6_tnl_xmit(skb, dev); + break; + case __constant_htons(ETH_P_IPV6): + ret = ip6ip6_tnl_xmit(skb, dev); + break; + default: + goto tx_err; + } + + if (ret < 0) + goto tx_err; + + t->recursion--; + return 0; + tx_err: stats->tx_errors++; stats->tx_dropped++; @@ -817,7 +1081,7 @@ static void ip6_tnl_set_cap(struct ip6_tnl *t) } } -static void ip6ip6_tnl_link_config(struct ip6_tnl *t) +static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct ip6_tnl_parm *p = &t->parms; @@ -870,17 +1134,17 @@ static void ip6ip6_tnl_link_config(struct ip6_tnl *t) } /** - * ip6ip6_tnl_change - update the tunnel parameters + * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * @active: != 0 if tunnel is ready for use * * Description: - * ip6ip6_tnl_change() updates the tunnel 
parameters + * ip6_tnl_change() updates the tunnel parameters **/ static int -ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) { ipv6_addr_copy(&t->parms.laddr, &p->laddr); ipv6_addr_copy(&t->parms.raddr, &p->raddr); @@ -889,19 +1153,20 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; + t->parms.proto = p->proto; ip6_tnl_dst_reset(t); - ip6ip6_tnl_link_config(t); + ip6_tnl_link_config(t); return 0; } /** - * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * ip6_tnl_ioctl - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: parameters passed from userspace * @cmd: command to be performed * * Description: - * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels + * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: @@ -923,7 +1188,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) **/ static int -ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip6_tnl_parm p; @@ -931,12 +1196,12 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: - if (dev == ip6ip6_fb_tnl_dev) { + if (dev == ip6_fb_tnl_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { err = -EFAULT; break; } - t = ip6ip6_tnl_locate(&p, 0); + t = ip6_tnl_locate(&p, 0); } if (t == NULL) t = netdev_priv(dev); @@ -954,10 +1219,11 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) break; err = -EINVAL; - if (p.proto != IPPROTO_IPV6) + if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && + p.proto != 0) break; - t = ip6ip6_tnl_locate(&p, cmd == SIOCADDTUNNEL); - if (dev != ip6ip6_fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + t = ip6_tnl_locate(&p, cmd == SIOCADDTUNNEL); + if (dev != ip6_fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -966,9 +1232,9 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } else t = netdev_priv(dev); - ip6ip6_tnl_unlink(t); - err = ip6ip6_tnl_change(t, &p); - ip6ip6_tnl_link(t); + ip6_tnl_unlink(t); + err = ip6_tnl_change(t, &p); + ip6_tnl_link(t); netdev_state_change(dev); } if (t) { @@ -984,15 +1250,15 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) break; - if (dev == ip6ip6_fb_tnl_dev) { + if (dev == ip6_fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) break; err = -ENOENT; - if ((t = ip6ip6_tnl_locate(&p, 0)) == NULL) + if ((t = ip6_tnl_locate(&p, 0)) == NULL) break; err = -EPERM; - if (t->dev == ip6ip6_fb_tnl_dev) + if (t->dev == ip6_fb_tnl_dev) break; dev = t->dev; } @@ -1006,20 +1272,20 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } /** - * ip6ip6_tnl_get_stats - return the stats for tunnel device + * ip6_tnl_get_stats - return the stats for tunnel device * @dev: virtual device associated with tunnel * * Return: stats for device **/ static struct net_device_stats * -ip6ip6_tnl_get_stats(struct net_device *dev) +ip6_tnl_get_stats(struct net_device *dev) { return &(((struct ip6_tnl *)netdev_priv(dev))->stat); } /** - * ip6ip6_tnl_change_mtu - change mtu manually 
for tunnel device + * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * @@ -1029,7 +1295,7 @@ ip6ip6_tnl_get_stats(struct net_device *dev) **/ static int -ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < IPV6_MIN_MTU) { return -EINVAL; @@ -1039,22 +1305,22 @@ ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) } /** - * ip6ip6_tnl_dev_setup - setup virtual tunnel device + * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ -static void ip6ip6_tnl_dev_setup(struct net_device *dev) +static void ip6_tnl_dev_setup(struct net_device *dev) { SET_MODULE_OWNER(dev); - dev->uninit = ip6ip6_tnl_dev_uninit; + dev->uninit = ip6_tnl_dev_uninit; dev->destructor = free_netdev; - dev->hard_start_xmit = ip6ip6_tnl_xmit; - dev->get_stats = ip6ip6_tnl_get_stats; - dev->do_ioctl = ip6ip6_tnl_ioctl; - dev->change_mtu = ip6ip6_tnl_change_mtu; + dev->hard_start_xmit = ip6_tnl_xmit; + dev->get_stats = ip6_tnl_get_stats; + dev->do_ioctl = ip6_tnl_ioctl; + dev->change_mtu = ip6_tnl_change_mtu; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); @@ -1065,50 +1331,56 @@ static void ip6ip6_tnl_dev_setup(struct net_device *dev) /** - * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline void -ip6ip6_tnl_dev_init_gen(struct net_device *dev) +ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - t->fl.proto = IPPROTO_IPV6; t->dev = dev; strcpy(t->parms.name, dev->name); } /** - * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int -ip6ip6_tnl_dev_init(struct net_device *dev) +ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6ip6_tnl_dev_init_gen(dev); - ip6ip6_tnl_link_config(t); + ip6_tnl_dev_init_gen(dev); + ip6_tnl_link_config(t); return 0; } /** - * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int -ip6ip6_fb_tnl_dev_init(struct net_device *dev) +ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6ip6_tnl_dev_init_gen(dev); + ip6_tnl_dev_init_gen(dev); + t->parms.proto = IPPROTO_IPV6; dev_hold(dev); tnls_wc[0] = t; return 0; } +static struct xfrm6_tunnel ip4ip6_handler = { + .handler = ip4ip6_rcv, + .err_handler = ip4ip6_err, + .priority = 1, +}; + static struct xfrm6_tunnel ip6ip6_handler = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, @@ -1125,30 +1397,40 @@ static int __init ip6_tunnel_init(void) { int err; + if (xfrm6_tunnel_register(&ip4ip6_handler, AF_INET)) { + printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n"); + err = -EAGAIN; + goto out; + } + if (xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6)) { - printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); - return -EAGAIN; + printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n"); + err = -EAGAIN; + goto unreg_ip4ip6; } - ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), 
"ip6tnl0", - ip6ip6_tnl_dev_setup); + ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6_tnl_dev_setup); - if (!ip6ip6_fb_tnl_dev) { + if (!ip6_fb_tnl_dev) { err = -ENOMEM; goto fail; } - ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init; + ip6_fb_tnl_dev->init = ip6_fb_tnl_dev_init; - if ((err = register_netdev(ip6ip6_fb_tnl_dev))) { - free_netdev(ip6ip6_fb_tnl_dev); + if ((err = register_netdev(ip6_fb_tnl_dev))) { + free_netdev(ip6_fb_tnl_dev); goto fail; } return 0; fail: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); +unreg_ip4ip6: + xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); +out: return err; } -static void __exit ip6ip6_destroy_tunnels(void) +static void __exit ip6_tnl_destroy_tunnels(void) { int h; struct ip6_tnl *t; @@ -1168,11 +1450,14 @@ static void __exit ip6ip6_destroy_tunnels(void) static void __exit ip6_tunnel_cleanup(void) { + if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) + printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n"); + if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) - printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); + printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n"); rtnl_lock(); - ip6ip6_destroy_tunnels(); + ip6_tnl_destroy_tunnels(); rtnl_unlock(); } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 5724ba9f75d..1ee50b5782e 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -79,9 +79,9 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); ipch = (void *)skb->data; - skb->h.raw = skb->nh.raw + sizeof(*ipch); + skb->transport_header = skb->network_header + sizeof(*ipch); __skb_pull(skb, sizeof(*ipch)); /* decompression */ @@ -111,7 +111,7 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) skb->truesize += dlen - plen; __skb_put(skb, dlen - plen); - memcpy(skb->data, scratch, dlen); + skb_copy_to_linear_data(skb, scratch, dlen); err = ipch->nexthdr; out_put_cpu: @@ -124,15 +124,13 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ipv6hdr *top_iph; - int hdr_len; struct ipv6_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; int plen, dlen; u8 *start, *scratch; struct crypto_comp *tfm; int cpu; - - hdr_len = skb->h.raw - skb->data; + int hdr_len = skb_transport_offset(skb); /* check whether datagram len is larger than threshold */ if ((skb->len - hdr_len) < ipcd->threshold) { @@ -145,7 +143,7 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) /* compression */ plen = skb->len - hdr_len; dlen = IPCOMP_SCRATCH_SIZE; - start = skb->h.raw; + start = skb_transport_header(skb); cpu = get_cpu(); scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); @@ -166,10 +164,10 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ipch = (struct ipv6_comp_hdr *)start; - ipch->nexthdr = *skb->nh.raw; + ipch->nexthdr = *skb_network_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); - *skb->nh.raw = IPPROTO_COMP; + *skb_network_header(skb) = IPPROTO_COMP; out_ok: return 0; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index f5f9582a8d3..aa3d07c52a8 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -101,14 +101,14 @@ static int ipv6_gso_send_check(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, 
sizeof(*ipv6h)))) goto out; - ipv6h = skb->nh.ipv6h; + ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); err = -EPROTONOSUPPORT; rcu_read_lock(); ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (likely(ops && ops->gso_send_check)) { - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = ops->gso_send_check(skb); } rcu_read_unlock(); @@ -137,14 +137,14 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; - ipv6h = skb->nh.ipv6h; + ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); rcu_read_lock(); ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (likely(ops && ops->gso_segment)) { - skb->h.raw = skb->data; + skb_reset_transport_header(skb); segs = ops->gso_segment(skb, features); } rcu_read_unlock(); @@ -153,7 +153,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) goto out; for (skb = segs; skb; skb = skb->next) { - ipv6h = skb->nh.ipv6h; + ipv6h = ipv6_hdr(skb); ipv6h->payload_len = htons(skb->len - skb->mac_len - sizeof(*ipv6h)); } @@ -694,7 +694,7 @@ done: retv = ip6_ra_control(sk, val, NULL); break; case IPV6_MTU_DISCOVER: - if (val<0 || val>2) + if (val<0 || val>3) goto e_inval; np->pmtudisc = val; retv = 0; @@ -761,6 +761,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL(ipv6_setsockopt); #ifdef CONFIG_COMPAT int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, @@ -796,18 +797,37 @@ EXPORT_SYMBOL(compat_ipv6_setsockopt); #endif static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, - char __user *optval, int len) + int optname, char __user *optval, int len) { struct ipv6_opt_hdr *hdr; - if (!opt || !opt->hopopt) + if (!opt) + return 0; + + switch(optname) { + case IPV6_HOPOPTS: + hdr = opt->hopopt; + break; + case IPV6_RTHDRDSTOPTS: + hdr = opt->dst0opt; + break; + case IPV6_RTHDR: + hdr = (struct ipv6_opt_hdr *)opt->srcrt; + break; + case IPV6_DSTOPTS: + hdr = opt->dst1opt; + break; + default: + return -EINVAL; /* should not happen */ + } + + if (!hdr) return 0; - hdr = opt->hopopt; len = min_t(unsigned int, len, ipv6_optlen(hdr)); - if (copy_to_user(optval, hdr, ipv6_optlen(hdr))) + if (copy_to_user(optval, hdr, len)) return -EFAULT; - return len; + return ipv6_optlen(hdr); } static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, @@ -945,7 +965,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, lock_sock(sk); len = ipv6_getsockopt_sticky(sk, np->opt, - optval, len); + optname, optval, len); release_sock(sk); return put_user(len, optlen); } @@ -1066,6 +1086,8 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL(ipv6_getsockopt); + #ifdef CONFIG_COMPAT int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c deleted file mode 100644 index e12e3d4fcce..00000000000 --- a/net/ipv6/ipv6_syms.c +++ /dev/null @@ -1,36 +0,0 @@ - -#include <linux/module.h> -#include <net/protocol.h> -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_route.h> -#include <net/xfrm.h> - -EXPORT_SYMBOL(icmpv6_send); -EXPORT_SYMBOL(icmpv6_statistics); -EXPORT_SYMBOL(icmpv6_err_convert); -EXPORT_SYMBOL(ndisc_mc_map); -EXPORT_SYMBOL(register_inet6addr_notifier); -EXPORT_SYMBOL(unregister_inet6addr_notifier); -EXPORT_SYMBOL(ip6_route_output);
-EXPORT_SYMBOL(ipv6_setsockopt); -EXPORT_SYMBOL(ipv6_getsockopt); -EXPORT_SYMBOL(inet6_register_protosw); -EXPORT_SYMBOL(inet6_unregister_protosw); -EXPORT_SYMBOL(inet6_add_protocol); -EXPORT_SYMBOL(inet6_del_protocol); -EXPORT_SYMBOL(ip6_xmit); -EXPORT_SYMBOL(inet6_release); -EXPORT_SYMBOL(inet6_bind); -EXPORT_SYMBOL(inet6_getname); -EXPORT_SYMBOL(inet6_ioctl); -EXPORT_SYMBOL(ipv6_get_saddr); -EXPORT_SYMBOL(ipv6_chk_addr); -EXPORT_SYMBOL(in6_dev_finish_destroy); -#ifdef CONFIG_XFRM -EXPORT_SYMBOL(xfrm6_rcv); -EXPORT_SYMBOL(xfrm6_input_addr); -EXPORT_SYMBOL(xfrm6_find_1stfragopt); -#endif -EXPORT_SYMBOL(rt6_lookup); -EXPORT_SYMBOL(ipv6_push_nfrag_opts); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8d6625ec78..3e308fb41b4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -988,7 +988,7 @@ int ipv6_is_mld(struct sk_buff *skb, int nexthdr) if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) return 0; - pic = (struct icmp6hdr *)skb->h.raw; + pic = icmp6_hdr(skb); switch (pic->icmp6_type) { case ICMPV6_MGM_QUERY: @@ -1167,11 +1167,11 @@ int igmp6_event_query(struct sk_buff *skb) return -EINVAL; /* compute payload length excluding extension headers */ - len = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); - len -= (char *)skb->h.raw - (char *)skb->nh.ipv6h; + len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); + len -= skb_network_header_len(skb); /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) return -EINVAL; idev = in6_dev_get(skb->dev); @@ -1179,7 +1179,7 @@ int igmp6_event_query(struct sk_buff *skb) if (idev == NULL) return 0; - hdr = (struct icmp6hdr *) skb->h.raw; + hdr = icmp6_hdr(skb); group = (struct in6_addr *) (hdr + 1); group_type = ipv6_addr_type(group); @@ -1212,7 +1212,7 @@ int igmp6_event_query(struct sk_buff *skb) in6_dev_put(idev); return -EINVAL; } - mlh2 = (struct mld2_query *) skb->h.raw; + mlh2 = (struct mld2_query *)skb_transport_header(skb); max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; if (!max_delay) max_delay = 1; @@ -1235,7 +1235,7 @@ int igmp6_event_query(struct sk_buff *skb) in6_dev_put(idev); return -EINVAL; } - mlh2 = (struct mld2_query *) skb->h.raw; + mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { @@ -1300,10 +1300,10 @@ int igmp6_event_report(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; - hdr = (struct icmp6hdr*) skb->h.raw; + hdr = icmp6_hdr(skb); /* Drop reports with not link local source */ - addr_type = ipv6_addr_type(&skb->nh.ipv6h->saddr); + addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) return -EINVAL; @@ -1411,7 +1411,7 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) skb_reserve(skb, LL_RESERVED_SPACE(dev)); - if (ipv6_get_lladdr(dev, &addr_buf)) { + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. 
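
[Note: the mcast.c hunks above and below follow the sk_buff accessor conversion that runs through this whole series: the old union pointers skb->nh.ipv6h, skb->h.raw and skb->mac.raw give way to typed helpers such as ipv6_hdr(), icmp6_hdr(), skb_transport_header(), skb_mac_header() and the offset helpers. A minimal sketch of the new idiom, mirroring the igmp6_event_query() change above; only the helpers come from the patch, the function example_mld_len() itself is hypothetical:

    #include <linux/skbuff.h>
    #include <linux/ipv6.h>
    #include <net/ipv6.h>

    /* Payload length excluding extension headers.  Old form:
     *   len  = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr);
     *   len -= (char *)skb->h.raw - (char *)skb->nh.ipv6h;
     */
    static unsigned long example_mld_len(const struct sk_buff *skb)
    {
        unsigned long len;

        len  = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
        len -= skb_network_header_len(skb); /* transport minus network offset */
        return len;
    }

Going through the helpers keeps callers working when sk_buff stores these positions as offsets (sk_buff_data_t) rather than raw pointers, as the ip6_queue.c hunk further down already assumes.]
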
@@ -1423,8 +1423,9 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); - pmr =(struct mld2_report *)skb_put(skb, sizeof(*pmr)); - skb->h.raw = (unsigned char *)pmr; + skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); + skb_put(skb, sizeof(*pmr)); + pmr = (struct mld2_report *)skb_transport_header(skb); pmr->type = ICMPV6_MLD2_REPORT; pmr->resv1 = 0; pmr->csum = 0; @@ -1441,7 +1442,7 @@ static inline int mld_dev_queue_xmit2(struct sk_buff *skb) unsigned char ha[MAX_ADDR_LEN]; int err; - ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1); + ndisc_mc_map(&ipv6_hdr(skb)->daddr, ha, dev, 1); err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len); if (err < 0) { kfree_skb(skb); @@ -1459,20 +1460,21 @@ static inline int mld_dev_queue_xmit(struct sk_buff *skb) static void mld_sendpack(struct sk_buff *skb) { - struct ipv6hdr *pip6 = skb->nh.ipv6h; - struct mld2_report *pmr = (struct mld2_report *)skb->h.raw; + struct ipv6hdr *pip6 = ipv6_hdr(skb); + struct mld2_report *pmr = + (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; struct inet6_dev *idev = in6_dev_get(skb->dev); int err; IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - payload_len = skb->tail - (unsigned char *)skb->nh.ipv6h - - sizeof(struct ipv6hdr); - mldlen = skb->tail - skb->h.raw; + payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); + mldlen = skb->tail - skb->transport_header; pip6->payload_len = htons(payload_len); pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, - IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); + IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), + mldlen, 0)); err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, mld_dev_queue_xmit); if (!err) { @@ -1506,7 +1508,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ - pmr = (struct mld2_report *)skb->h.raw; + pmr = (struct mld2_report *)skb_transport_header(skb); pmr->ngrec = htons(ntohs(pmr->ngrec)+1); *ppgr = pgr; return skb; @@ -1539,7 +1541,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, if (!*psf_list) goto empty_source; - pmr = skb ? (struct mld2_report *)skb->h.raw : NULL; + pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { @@ -1791,7 +1793,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) skb_reserve(skb, LL_RESERVED_SPACE(dev)); - if (ipv6_get_lladdr(dev, &addr_buf)) { + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. 
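
[Note: the seq_file iterator hunks below switch mcast.c from walking the dev_base singly linked list by hand to the list-based iterators used throughout this tree. A minimal sketch of the 2.6.22-era idiom; the macros and next_net_device() appear in the hunks below, while example_count_devices() is hypothetical:

    #include <linux/netdevice.h>

    static int example_count_devices(void)
    {
        struct net_device *dev;
        int n = 0;

        /* Old form: for (dev = dev_base; dev; dev = dev->next) */
        read_lock(&dev_base_lock);
        for_each_netdev(dev)
            n++;
        read_unlock(&dev_base_lock);
        return n;
    }

These macros predate network namespaces, so they take no struct net argument here; stepping one device at a time, as the seq_file code below needs, is done with next_net_device().]
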
@@ -2329,9 +2331,8 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - for (state->dev = dev_base, state->idev = NULL; - state->dev; - state->dev = state->dev->next) { + state->idev = NULL; + for_each_netdev(state->dev) { struct inet6_dev *idev; idev = in6_dev_get(state->dev); if (!idev) @@ -2358,7 +2359,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr read_unlock_bh(&state->idev->lock); in6_dev_put(state->idev); } - state->dev = state->dev->next; + state->dev = next_net_device(state->dev); if (!state->dev) { state->idev = NULL; break; @@ -2473,9 +2474,9 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - for (state->dev = dev_base, state->idev = NULL, state->im = NULL; - state->dev; - state->dev = state->dev->next) { + state->idev = NULL; + state->im = NULL; + for_each_netdev(state->dev) { struct inet6_dev *idev; idev = in6_dev_get(state->dev); if (unlikely(idev == NULL)) @@ -2511,7 +2512,7 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s read_unlock_bh(&state->idev->lock); in6_dev_put(state->idev); } - state->dev = state->dev->next; + state->dev = next_net_device(state->dev); if (!state->dev) { state->idev = NULL; goto out; diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 0afcabdd8ed..13b7160fb89 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -90,23 +90,26 @@ int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { struct ip6_mh *mh; - if (!pskb_may_pull(skb, (skb->h.raw - skb->data) + 8) || - !pskb_may_pull(skb, (skb->h.raw - skb->data) + ((skb->h.raw[1] + 1) << 3))) + if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) return -1; - mh = (struct ip6_mh *)skb->h.raw; + mh = (struct ip6_mh *)skb_transport_header(skb); if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); - mip6_param_prob(skb, 0, (&mh->ip6mh_hdrlen) - skb->nh.raw); + mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - + skb_network_header(skb))); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); - mip6_param_prob(skb, 0, (&mh->ip6mh_proto) - skb->nh.raw); + mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - + skb_network_header(skb))); return -1; } @@ -122,12 +125,12 @@ struct mip6_report_rate_limiter { }; static struct mip6_report_rate_limiter mip6_report_rl = { - .lock = SPIN_LOCK_UNLOCKED + .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock) }; static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && @@ -152,10 +155,10 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) iph = (struct ipv6hdr *)skb->data; iph->payload_len = htons(skb->len - sizeof(*iph)); - nexthdr = *skb->nh.raw; - *skb->nh.raw = IPPROTO_DSTOPTS; + nexthdr = *skb_network_header(skb); + *skb_network_header(skb) = IPPROTO_DSTOPTS; - dstopt = (struct ipv6_destopt_hdr *)skb->h.raw; + 
dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb); dstopt->nexthdr = nexthdr; hao = mip6_padn((char *)(dstopt + 1), @@ -215,21 +218,22 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct if (likely(opt->dsthao)) { offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(offset >= 0)) - hao = (struct ipv6_destopt_hao *)(skb->nh.raw + offset); + hao = (struct ipv6_destopt_hao *) + (skb_network_header(skb) + offset); } skb_get_timestamp(skb, &stamp); - if (!mip6_report_rl_allow(&stamp, &skb->nh.ipv6h->daddr, - hao ? &hao->addr : &skb->nh.ipv6h->saddr, + if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr, + hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; memset(&sel, 0, sizeof(sel)); - memcpy(&sel.daddr, (xfrm_address_t *)&skb->nh.ipv6h->daddr, + memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, sizeof(sel.daddr)); sel.prefixlen_d = 128; - memcpy(&sel.saddr, (xfrm_address_t *)&skb->nh.ipv6h->saddr, + memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; @@ -253,11 +257,13 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); - unsigned int packet_len = skb->tail - skb->nh.raw; + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb->tail - skb->network_header; int found_rhdr = 0; - *nexthdr = &skb->nh.ipv6h->nexthdr; + *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset + 1 <= packet_len) { @@ -288,7 +294,7 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, offset += ipv6_optlen(exthdr); *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); } return offset; @@ -361,10 +367,10 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) iph = (struct ipv6hdr *)skb->data; iph->payload_len = htons(skb->len - sizeof(*iph)); - nexthdr = *skb->nh.raw; - *skb->nh.raw = IPPROTO_ROUTING; + nexthdr = *skb_network_header(skb); + *skb_network_header(skb) = IPPROTO_ROUTING; - rt2 = (struct rt2_hdr *)skb->h.raw; + rt2 = (struct rt2_hdr *)skb_transport_header(skb); rt2->rt_hdr.nexthdr = nexthdr; rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1; rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2; @@ -383,11 +389,13 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); - unsigned int packet_len = skb->tail - skb->nh.raw; + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb->tail - skb->network_header; int found_rhdr = 0; - *nexthdr = &skb->nh.ipv6h->nexthdr; + *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset + 1 <= packet_len) { @@ -397,7 +405,7 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, case NEXTHDR_ROUTING: if (offset + 3 <= packet_len) { struct ipv6_rt_hdr *rt; - rt = (struct ipv6_rt_hdr *)(skb->nh.raw + offset); + rt = (struct ipv6_rt_hdr *)(nh + offset); if (rt->type != 0) return offset; } @@ -417,7 +425,7 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, offset += 
ipv6_optlen(exthdr); *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); } return offset; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 121f31c283f..d8b36451bad 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -319,6 +319,8 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d return -EINVAL; } +EXPORT_SYMBOL(ndisc_mc_map); + static u32 ndisc_hash(const void *pkey, const struct net_device *dev) { const u32 *p32 = pkey; @@ -425,36 +427,23 @@ static inline void ndisc_flow_init(struct flowi *fl, u8 type, security_sk_classify_flow(ndisc_socket->sk, fl); } -static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - struct in6_addr *daddr, struct in6_addr *solicited_addr, - int router, int solicited, int override, int inc_opt) +static void __ndisc_send(struct net_device *dev, + struct neighbour *neigh, + struct in6_addr *daddr, struct in6_addr *saddr, + struct icmp6hdr *icmp6h, struct in6_addr *target, + int llinfo, int icmp6_mib_outnd) { - struct in6_addr tmpaddr; - struct inet6_ifaddr *ifp; - struct inet6_dev *idev; struct flowi fl; - struct dst_entry* dst; + struct dst_entry *dst; struct sock *sk = ndisc_socket->sk; - struct in6_addr *src_addr; - struct nd_msg *msg; - int len; struct sk_buff *skb; + struct icmp6hdr *hdr; + struct inet6_dev *idev; + int len; int err; + u8 *opt; - len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); - - /* for anycast or proxy, solicited_addr != src_addr */ - ifp = ipv6_get_ifaddr(solicited_addr, dev, 1); - if (ifp) { - src_addr = solicited_addr; - in6_ifa_put(ifp); - } else { - if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) - return; - src_addr = &tmpaddr; - } - - ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr, + ndisc_flow_init(&fl, icmp6h->icmp6_type, saddr, daddr, dev->ifindex); dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); @@ -465,60 +454,57 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, if (err < 0) return; - if (inc_opt) { - if (dev->addr_len) - len += ndisc_opt_addr_space(dev); - else - inc_opt = 0; - } + if (!dev->addr_len) + llinfo = 0; + + len = sizeof(struct icmp6hdr) + (target ? sizeof(*target) : 0); + if (llinfo) + len += ndisc_opt_addr_space(dev); skb = sock_alloc_send_skb(sk, (MAX_HEADER + sizeof(struct ipv6hdr) + len + LL_RESERVED_SPACE(dev)), 1, &err); - - if (skb == NULL) { + if (!skb) { ND_PRINTK0(KERN_ERR - "ICMPv6 NA: %s() failed to allocate an skb.\n", + "ICMPv6 ND: %s() failed to allocate an skb.\n", __FUNCTION__); dst_release(dst); return; } skb_reserve(skb, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len); - - msg = (struct nd_msg *)skb_put(skb, len); - skb->h.raw = (unsigned char*)msg; + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; - msg->icmph.icmp6_code = 0; - msg->icmph.icmp6_cksum = 0; + skb->transport_header = skb->tail; + skb_put(skb, len); - msg->icmph.icmp6_unused = 0; - msg->icmph.icmp6_router = router; - msg->icmph.icmp6_solicited = solicited; - msg->icmph.icmp6_override = override; + hdr = (struct icmp6hdr *)skb_transport_header(skb); + memcpy(hdr, icmp6h, sizeof(*hdr)); - /* Set the target address. 
*/ - ipv6_addr_copy(&msg->target, solicited_addr); + opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); + if (target) { + ipv6_addr_copy((struct in6_addr *)opt, target); + opt += sizeof(*target); + } - if (inc_opt) - ndisc_fill_addr_option(msg->opt, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, + if (llinfo) + ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, dev->addr_len, dev->type); - /* checksum */ - msg->icmph.icmp6_cksum = csum_ipv6_magic(src_addr, daddr, len, - IPPROTO_ICMPV6, - csum_partial((__u8 *) msg, - len, 0)); + hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, + len, 0)); skb->dst = dst; + idev = in6_dev_get(dst->dev); IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); if (!err) { - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS); + ICMP6_INC_STATS(idev, icmp6_mib_outnd); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); } @@ -526,165 +512,95 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, in6_dev_put(idev); } +static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + struct in6_addr *daddr, struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct in6_addr tmpaddr; + struct inet6_ifaddr *ifp; + struct in6_addr *src_addr; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + }; + + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(solicited_addr, dev, 1); + if (ifp) { + src_addr = solicited_addr; + if (ifp->flags & IFA_F_OPTIMISTIC) + override = 0; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) + return; + src_addr = &tmpaddr; + } + + icmp6h.icmp6_router = router; + icmp6h.icmp6_solicited = solicited; + icmp6h.icmp6_override = override; + + __ndisc_send(dev, neigh, daddr, src_addr, + &icmp6h, solicited_addr, + inc_opt ? 
ND_OPT_TARGET_LL_ADDR : 0, + ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS); +} + void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, struct in6_addr *solicit, struct in6_addr *daddr, struct in6_addr *saddr) { - struct flowi fl; - struct dst_entry* dst; - struct inet6_dev *idev; - struct sock *sk = ndisc_socket->sk; - struct sk_buff *skb; - struct nd_msg *msg; struct in6_addr addr_buf; - int len; - int err; - int send_llinfo; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }; if (saddr == NULL) { - if (ipv6_get_lladdr(dev, &addr_buf)) + if (ipv6_get_lladdr(dev, &addr_buf, + (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } - ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr, - dev->ifindex); - - dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); - if (!dst) - return; - - err = xfrm_lookup(&dst, &fl, NULL, 0); - if (err < 0) - return; - - len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); - send_llinfo = dev->addr_len && !ipv6_addr_any(saddr); - if (send_llinfo) - len += ndisc_opt_addr_space(dev); - - skb = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_RESERVED_SPACE(dev)), - 1, &err); - if (skb == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 NA: %s() failed to allocate an skb.\n", - __FUNCTION__); - dst_release(dst); - return; - } - - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - - msg = (struct nd_msg *)skb_put(skb, len); - skb->h.raw = (unsigned char*)msg; - msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION; - msg->icmph.icmp6_code = 0; - msg->icmph.icmp6_cksum = 0; - msg->icmph.icmp6_unused = 0; - - /* Set the target address. */ - ipv6_addr_copy(&msg->target, solicit); - - if (send_llinfo) - ndisc_fill_addr_option(msg->opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, - dev->addr_len, dev->type); - - /* checksum */ - msg->icmph.icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, - daddr, len, - IPPROTO_ICMPV6, - csum_partial((__u8 *) msg, - len, 0)); - /* send it! */ - skb->dst = dst; - idev = in6_dev_get(dst->dev); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); - if (!err) { - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS); - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); - } - - if (likely(idev != NULL)) - in6_dev_put(idev); + __ndisc_send(dev, neigh, daddr, saddr, + &icmp6h, solicit, + !ipv6_addr_any(saddr) ? 
ND_OPT_SOURCE_LL_ADDR : 0, + ICMP6_MIB_OUTNEIGHBORSOLICITS); } void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr) { - struct flowi fl; - struct dst_entry* dst; - struct inet6_dev *idev; - struct sock *sk = ndisc_socket->sk; - struct sk_buff *skb; - struct icmp6hdr *hdr; - __u8 * opt; - int len; - int err; - - ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr, - dev->ifindex); - - dst = ndisc_dst_alloc(dev, NULL, daddr, ip6_output); - if (!dst) - return; - - err = xfrm_lookup(&dst, &fl, NULL, 0); - if (err < 0) - return; - - len = sizeof(struct icmp6hdr); - if (dev->addr_len) - len += ndisc_opt_addr_space(dev); - - skb = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_RESERVED_SPACE(dev)), - 1, &err); - if (skb == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RS: %s() failed to allocate an skb.\n", - __FUNCTION__); - dst_release(dst); - return; - } - - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); - - hdr = (struct icmp6hdr *)skb_put(skb, len); - skb->h.raw = (unsigned char*)hdr; - hdr->icmp6_type = NDISC_ROUTER_SOLICITATION; - hdr->icmp6_code = 0; - hdr->icmp6_cksum = 0; - hdr->icmp6_unused = 0; - - opt = (u8*) (hdr + 1); - - if (dev->addr_len) - ndisc_fill_addr_option(opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, - dev->addr_len, dev->type); - - /* checksum */ - hdr->icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, daddr, len, - IPPROTO_ICMPV6, - csum_partial((__u8 *) hdr, len, 0)); + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }; + int send_sllao = dev->addr_len; - /* send it! */ - skb->dst = dst; - idev = in6_dev_get(dst->dev); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); - if (!err) { - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS); - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * According to section 2.2 of RFC 4429, we must not + * send router solicitations with a sllao from + * optimistic addresses, but we may send the solicitation + * if we don't include the sllao. So here we check + * if our address is optimistic, and if so, we + * suppress the inclusion of the sllao. + */ + if (send_sllao) { + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(saddr, dev, 1); + if (ifp) { + if (ifp->flags & IFA_F_OPTIMISTIC) { + send_sllao = 0; + } + in6_ifa_put(ifp); + } else { + send_sllao = 0; + } } - - if (likely(idev != NULL)) - in6_dev_put(idev); +#endif + __ndisc_send(dev, NULL, daddr, saddr, + &icmp6h, NULL, + send_sllao ?
ND_OPT_SOURCE_LL_ADDR : 0, + ICMP6_MIB_OUTROUTERSOLICITS); } @@ -708,8 +624,8 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); - if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 1)) - saddr = &skb->nh.ipv6h->saddr; + if (skb && ipv6_chk_addr(&ipv6_hdr(skb)->saddr, dev, 1)) + saddr = &ipv6_hdr(skb)->saddr; if ((probes -= neigh->parms->ucast_probes) < 0) { if (!(neigh->nud_state & NUD_VALID)) { @@ -732,11 +648,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) static void ndisc_recv_ns(struct sk_buff *skb) { - struct nd_msg *msg = (struct nd_msg *)skb->h.raw; - struct in6_addr *saddr = &skb->nh.ipv6h->saddr; - struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - msg->opt; + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; @@ -796,28 +713,40 @@ static void ndisc_recv_ns(struct sk_buff *skb) inc = ipv6_addr_is_multicast(daddr); if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1)) != NULL) { - if (ifp->flags & IFA_F_TENTATIVE) { - /* Address is tentative. If the source - is unspecified address, it is someone - does DAD, otherwise we ignore solicitations - until DAD timer expires. - */ - if (!dad) + + if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { + if (dad) { + if (dev->type == ARPHRD_IEEE802_TR) { + const unsigned char *sadr; + sadr = skb_mac_header(skb); + if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && + sadr[9] == dev->dev_addr[1] && + sadr[10] == dev->dev_addr[2] && + sadr[11] == dev->dev_addr[3] && + sadr[12] == dev->dev_addr[4] && + sadr[13] == dev->dev_addr[5]) { + /* looped-back to us */ + goto out; + } + } + + /* + * We are colliding with another node + * who is doing DAD + * so fail our DAD process + */ + addrconf_dad_failure(ifp); goto out; - if (dev->type == ARPHRD_IEEE802_TR) { - unsigned char *sadr = skb->mac.raw; - if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && - sadr[9] == dev->dev_addr[1] && - sadr[10] == dev->dev_addr[2] && - sadr[11] == dev->dev_addr[3] && - sadr[12] == dev->dev_addr[4] && - sadr[13] == dev->dev_addr[5]) { - /* looped-back to us */ + } else { + /* + * This is not a dad solicitation. + * If we are an optimistic node, + * we should respond. + * Otherwise, we should ignore it. 
+ */ + if (!(ifp->flags & IFA_F_OPTIMISTIC)) goto out; - } } - addrconf_dad_failure(ifp); - return; } idev = ifp->idev; @@ -898,11 +827,12 @@ out: static void ndisc_recv_na(struct sk_buff *skb) { - struct nd_msg *msg = (struct nd_msg *)skb->h.raw; - struct in6_addr *saddr = &skb->nh.ipv6h->saddr; - struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - msg->opt; + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; @@ -1000,11 +930,11 @@ out: static void ndisc_recv_rs(struct sk_buff *skb) { - struct rs_msg *rs_msg = (struct rs_msg *) skb->h.raw; + struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; - struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; @@ -1057,7 +987,7 @@ out: static void ndisc_router_discovery(struct sk_buff *skb) { - struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw; + struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct rt6_info *rt = NULL; @@ -1068,9 +998,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) __u8 * opt = (__u8 *)(ra_msg + 1); - optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg); + optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); - if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK2(KERN_WARNING "ICMPv6 RA: source address is not link-local.\n"); return; @@ -1136,7 +1066,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); + rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); if (rt) neigh = rt->rt6i_nexthop; @@ -1151,7 +1081,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK3(KERN_DEBUG "ICMPv6 RA: adding default router.\n"); - rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev, pref); + rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { ND_PRINTK0(KERN_ERR "ICMPv6 RA: %s() failed to add default route.\n", @@ -1223,7 +1153,7 @@ skip_defrtr: */ if (!neigh) - neigh = __neigh_lookup(&nd_tbl, &skb->nh.ipv6h->saddr, + neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, skb->dev, 1); if (neigh) { u8 *lladdr = NULL; @@ -1252,7 +1182,7 @@ skip_defrtr: if (((struct route_info *)p)->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, - &skb->nh.ipv6h->saddr); + &ipv6_hdr(skb)->saddr); } } #endif @@ -1311,13 +1241,13 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) int optlen; u8 *lladdr = NULL; - if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: source address is not link-local.\n"); return; } - optlen = skb->tail - skb->h.raw; + optlen = skb->tail - skb->transport_header; optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); 
if (optlen < 0) { @@ -1326,7 +1256,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - icmph = (struct icmp6hdr *) skb->h.raw; + icmph = icmp6_hdr(skb); target = (struct in6_addr *) (icmph + 1); dest = target + 1; @@ -1376,8 +1306,8 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); if (neigh) { - rt6_redirect(dest, &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr, neigh, lladdr, + rt6_redirect(dest, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr, neigh, lladdr, on_link); neigh_release(neigh); } @@ -1406,21 +1336,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, dev = skb->dev; - if (ipv6_get_lladdr(dev, &saddr_buf)) { + if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: no link-local address on %s\n", dev->name); return; } - if (!ipv6_addr_equal(&skb->nh.ipv6h->daddr, target) && + if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && !(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: target address is not link-local.\n"); return; } - ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr, + ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(NULL, &fl); @@ -1475,11 +1405,12 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, hlen = 0; skb_reserve(buff, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, IPPROTO_ICMPV6, len); - icmph = (struct icmp6hdr *)skb_put(buff, len); - buff->h.raw = (unsigned char*)icmph; + skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); + skb_put(buff, len); + icmph = icmp6_hdr(buff); memset(icmph, 0, sizeof(struct icmp6hdr)); icmph->icmp6_type = NDISC_REDIRECT; @@ -1491,7 +1422,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, addrp = (struct in6_addr *)(icmph + 1); ipv6_addr_copy(addrp, target); addrp++; - ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr); opt = (u8*) (addrp + 1); @@ -1512,9 +1443,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, *(opt++) = (rd_len >> 3); opt += 6; - memcpy(opt, skb->nh.ipv6h, rd_len - 8); + memcpy(opt, ipv6_hdr(skb), rd_len - 8); - icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &skb->nh.ipv6h->saddr, + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); @@ -1544,14 +1475,14 @@ int ndisc_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb->len)) return 0; - msg = (struct nd_msg *) skb->h.raw; + msg = (struct nd_msg *)skb_transport_header(skb); - __skb_push(skb, skb->data-skb->h.raw); + __skb_push(skb, skb->data - skb_transport_header(skb)); - if (skb->nh.ipv6h->hop_limit != 255) { + if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK2(KERN_WARNING "ICMPv6 NDISC: invalid hop-limit: %d\n", - skb->nh.ipv6h->hop_limit); + ipv6_hdr(skb)->hop_limit); return 0; } @@ -1584,7 +1515,7 @@ int ndisc_rcv(struct sk_buff *skb) case NDISC_REDIRECT: ndisc_redirect_rcv(skb); break; - }; + } return 0; } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1c405dd30c6..38b14961391 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -11,7 +11,7 @@ int ip6_route_me_harder(struct sk_buff *skb) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = 
ipv6_hdr(skb); struct dst_entry *dst; struct flowi fl = { .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, @@ -61,7 +61,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_info *info) struct ip6_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP6_LOCAL_OUT) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; @@ -73,7 +73,7 @@ static int nf_ip6_reroute(struct sk_buff **pskb, const struct nf_info *info) struct ip6_rt_info *rt_info = nf_info_reroute(info); if (info->hook == NF_IP6_LOCAL_OUT) { - struct ipv6hdr *iph = (*pskb)->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(*pskb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) return ip6_route_me_harder(*pskb); @@ -84,7 +84,7 @@ static int nf_ip6_reroute(struct sk_buff **pskb, const struct nf_info *info) __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { - struct ipv6hdr *ip6h = skb->nh.ipv6h; + struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index fdb30a5916e..0004db38af6 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -11,18 +11,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying - * to adapt it to IPv6 - * HEAVILY based in ipqueue.c by James Morris. It's just - * a little modified version of it, so he's nearly the - * real coder of this. - * Few changes needed, mainly the hard_routing code and - * the netlink socket protocol (we're NETLINK_IP6_FW). - * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c] - * 2005-02-04: Added /proc counter for dropped packets; fixed so - * packets aren't delivered to user space if they're going - * to be dropped. 
*/ #include <linux/module.h> #include <linux/skbuff.h> @@ -189,12 +177,13 @@ ipq_flush(int verdict) static struct sk_buff * ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) { - unsigned char *old_tail; + sk_buff_data_t old_tail; size_t size = 0; size_t data_len = 0; struct sk_buff *skb; struct ipq_packet_msg *pmsg; struct nlmsghdr *nlh; + struct timeval tv; read_lock_bh(&queue_lock); @@ -232,15 +221,16 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) if (!skb) goto nlmsg_failure; - old_tail= skb->tail; + old_tail = skb->tail; nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); pmsg = NLMSG_DATA(nlh); memset(pmsg, 0, sizeof(*pmsg)); pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->tstamp.off_sec; - pmsg->timestamp_usec = entry->skb->tstamp.off_usec; + tv = ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; @@ -376,7 +366,7 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; - memcpy(e->skb->data, v->payload, v->data_len); + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; @@ -485,7 +475,7 @@ ipq_rcv_skb(struct sk_buff *skb) if (skblen < sizeof(*nlh)) return; - nlh = (struct nlmsghdr *)skb->data; + nlh = nlmsg_hdr(skb); nlmsglen = nlh->nlmsg_len; if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) return; @@ -667,7 +657,7 @@ static int __init ip6_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7c512e13f95..9aa62402668 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -7,15 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
- * - * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> - * - increase module usage count as soon as we have rules inside - * a table - * 06 Jun 2002 Andras Kis-Szabo <kisza@sch.bme.hu> - * - new extension header parser code - * 15 Oct 2005 Harald Welte <laforge@netfilter.org> - * - Unification of {ip,ip6}_tables into x_tables - * - Removed tcp and udp code, since it's not ipv6 specific */ #include <linux/capability.h> @@ -115,7 +106,7 @@ ip6_packet_match(const struct sk_buff *skb, { size_t i; unsigned long ret; - const struct ipv6hdr *ipv6 = skb->nh.ipv6h; + const struct ipv6hdr *ipv6 = ipv6_hdr(skb); #define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) @@ -301,7 +292,7 @@ ip6t_do_table(struct sk_buff **pskb, goto no_match; ADD_COUNTER(e->counters, - ntohs((*pskb)->nh.ipv6h->payload_len) + ntohs(ipv6_hdr(*pskb)->payload_len) + IPV6_HDR_LEN, 1); @@ -1448,8 +1439,8 @@ static void __exit ip6_tables_fini(void) int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff) { - unsigned int start = (u8*)(skb->nh.ipv6h + 1) - skb->data; - u8 nexthdr = skb->nh.ipv6h->nexthdr; + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; unsigned int len = skb->len - start; if (fragoff) diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index ccbab66277e..4115a576ba2 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -32,7 +32,7 @@ static unsigned int ip6t_hl_target(struct sk_buff **pskb, if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; - ip6h = (*pskb)->nh.ipv6h; + ip6h = ipv6_hdr(*pskb); switch (info->mode) { case IP6T_HL_SET: diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index afaa039d0b7..5bb9cd34935 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -396,8 +396,8 @@ ip6t_log_packet(unsigned int pf, /* MAC logging for input chain only. 
*/ printk("MAC="); if (skb->dev && (len = skb->dev->hard_header_len) && - skb->mac.raw != skb->nh.raw) { - unsigned char *p = skb->mac.raw; + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); int i; if (skb->dev->type == ARPHRD_SIT && @@ -412,7 +412,8 @@ ip6t_log_packet(unsigned int pf, printk(" "); if (skb->dev->type == ARPHRD_SIT) { - struct iphdr *iph = (struct iphdr *)skb->mac.raw; + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ", NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); @@ -421,7 +422,7 @@ ip6t_log_packet(unsigned int pf, printk(" "); } - dump_packet(loginfo, skb, (u8*)skb->nh.ipv6h - skb->data, 1); + dump_packet(loginfo, skb, skb_network_offset(skb), 1); printk("\n"); spin_unlock_bh(&log_lock); } @@ -489,14 +490,10 @@ static int __init ip6t_log_init(void) ret = xt_register_target(&ip6t_log_reg); if (ret < 0) return ret; - if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { - printk(KERN_WARNING "ip6t_LOG: not logging via system console " - "since somebody else already registered for PF_INET6\n"); - /* we cannot make module load fail here, since otherwise - * ip6tables userspace would abort */ - } - - return 0; + ret = nf_log_register(PF_INET6, &ip6t_logger); + if (ret < 0 && ret != -EEXIST) + xt_unregister_target(&ip6t_log_reg); + return ret; } static void __exit ip6t_log_fini(void) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 6abee94c929..cb3d2415a06 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -47,7 +47,7 @@ static void send_reset(struct sk_buff *oldskb) struct tcphdr otcph, *tcph; unsigned int otcplen, hh_len; int tcphoff, needs_ack; - struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h; + struct ipv6hdr *oip6h = ipv6_hdr(oldskb), *ip6h; struct dst_entry *dst = NULL; u8 proto; struct flowi fl; @@ -120,8 +120,9 @@ static void send_reset(struct sk_buff *oldskb) skb_reserve(nskb, hh_len + dst->header_len); - ip6h = nskb->nh.ipv6h = (struct ipv6hdr *) - skb_put(nskb, sizeof(struct ipv6hdr)); + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); ip6h->version = 6; ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); ip6h->nexthdr = IPPROTO_TCP; @@ -155,8 +156,8 @@ static void send_reset(struct sk_buff *oldskb) tcph->check = 0; /* Adjust TCP checksum */ - tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr, - &nskb->nh.ipv6h->daddr, + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, sizeof(struct tcphdr), IPPROTO_TCP, csum_partial((char *)tcph, sizeof(struct tcphdr), 0)); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index 967bed71d4a..0f3dd932f0a 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -32,8 +32,8 @@ match(const struct sk_buff *skb, unsigned char eui64[8]; int i = 0; - if (!(skb->mac.raw >= skb->head && - (skb->mac.raw + ETH_HLEN) <= skb->data) && + if (!(skb_mac_header(skb) >= skb->head && + (skb_mac_header(skb) + ETH_HLEN) <= skb->data) && offset != 0) { *hotdrop = 1; return 0; @@ -42,7 +42,7 @@ match(const struct sk_buff *skb, memset(eui64, 0, sizeof(eui64)); if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) { - if (skb->nh.ipv6h->version == 0x6) { + if (ipv6_hdr(skb)->version == 0x6) { memcpy(eui64, eth_hdr(skb)->h_source, 3); memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3); eui64[3] = 0xff; @@ -50,7 +50,7 @@ match(const struct sk_buff *skb, eui64[0] |= 
0x02; i = 0; - while ((skb->nh.ipv6h->saddr.s6_addr[8+i] == eui64[i]) + while ((ipv6_hdr(skb)->saddr.s6_addr[8 + i] == eui64[i]) && (i < 8)) i++; diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c index 37c8a4d4ed7..d606c0e6d6f 100644 --- a/net/ipv6/netfilter/ip6t_hl.c +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -25,7 +25,7 @@ static int match(const struct sk_buff *skb, int offset, unsigned int protoff, int *hotdrop) { const struct ip6t_hl_info *info = matchinfo; - const struct ipv6hdr *ip6h = skb->nh.ipv6h; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); switch (info->mode) { case IP6T_HL_EQ: diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 700a11d25de..fd6a0869099 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -45,7 +45,7 @@ ipv6header_match(const struct sk_buff *skb, /* Make sure this isn't an evil packet */ /* type of the 1st exthdr */ - nexthdr = skb->nh.ipv6h->nexthdr; + nexthdr = ipv6_hdr(skb)->nexthdr; /* pointer to the 1st exthdr */ ptr = sizeof(struct ipv6hdr); /* available length */ diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 112a21d0c6d..76f0cf66f95 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -102,7 +102,7 @@ ip6t_local_out_hook(unsigned int hook, #if 0 /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ip6t_hook: happy cracking.\n"); return NF_ACCEPT; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 0c468d35a93..a9f10e32c16 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -7,8 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * Extended to all five netfilter hooks by Brad Chapman & Harald Welte */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> @@ -138,7 +136,7 @@ ip6t_local_hook(unsigned int hook, #if 0 /* root is playing with raw sockets. 
*/ if ((*pskb)->len < sizeof(struct iphdr) - || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + || ip_hdrlen(*pskb) < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ip6t_hook: happy cracking.\n"); return NF_ACCEPT; @@ -146,21 +144,21 @@ ip6t_local_hook(unsigned int hook, #endif /* save source/dest address, mark, hoplimit, flowlabel, priority, */ - memcpy(&saddr, &(*pskb)->nh.ipv6h->saddr, sizeof(saddr)); - memcpy(&daddr, &(*pskb)->nh.ipv6h->daddr, sizeof(daddr)); + memcpy(&saddr, &ipv6_hdr(*pskb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(*pskb)->daddr, sizeof(daddr)); mark = (*pskb)->mark; - hop_limit = (*pskb)->nh.ipv6h->hop_limit; + hop_limit = ipv6_hdr(*pskb)->hop_limit; /* flowlabel and prio (includes version, which shouldn't change either */ - flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h); + flowlabel = *((u_int32_t *)ipv6_hdr(*pskb)); ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler); if (ret != NF_DROP && ret != NF_STOLEN - && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr)) - || memcmp(&(*pskb)->nh.ipv6h->daddr, &daddr, sizeof(daddr)) + && (memcmp(&ipv6_hdr(*pskb)->saddr, &saddr, sizeof(saddr)) + || memcmp(&ipv6_hdr(*pskb)->daddr, &daddr, sizeof(daddr)) || (*pskb)->mark != mark - || (*pskb)->nh.ipv6h->hop_limit != hop_limit)) + || ipv6_hdr(*pskb)->hop_limit != hop_limit)) return ip6_route_me_harder(*pskb) == 0 ? ret : NF_DROP; return ret; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index d1102455668..6d2a0820511 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -7,17 +7,6 @@ * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - support Layer 3 protocol independent connection tracking. - * Based on the original ip_conntrack code which had the following - * copyright information: - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - add get_features() to support various size of conntrack - * structures. */ #include <linux/types.h> @@ -138,16 +127,10 @@ static int ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, u_int8_t *protonum) { - unsigned int extoff; - unsigned char pnum; - int protoff; - - extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; - pnum = (*pskb)->nh.ipv6h->nexthdr; - - protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, - (*pskb)->len - extoff); - + unsigned int extoff = (u8 *)(ipv6_hdr(*pskb) + 1) - (*pskb)->data; + unsigned char pnum = ipv6_hdr(*pskb)->nexthdr; + int protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); /* * (protoff == (*pskb)->len) mean that the packet doesn't have no data * except of IPv6 & ext headers. but it's tracked anyway. - YK @@ -179,9 +162,8 @@ static unsigned int ipv6_confirm(unsigned int hooknum, struct nf_conn_help *help; enum ip_conntrack_info ctinfo; unsigned int ret, protoff; - unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) - - (*pskb)->data; - unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; + unsigned int extoff = (u8 *)(ipv6_hdr(*pskb) + 1) - (*pskb)->data; + unsigned char pnum = ipv6_hdr(*pskb)->nexthdr; /* This is where we call the helper: as the packet goes out. 
*/ diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 075da4f287b..0be790d250f 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -7,13 +7,6 @@ * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - ICMPv6 tracking support. Derived from the original ip_conntrack code - * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following - * copyright information: - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 15ab1e3e8b5..347ab760823 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -82,7 +82,7 @@ struct nf_ct_frag6_queue struct sk_buff *fragments; int len; int meat; - struct timeval stamp; + ktime_t stamp; unsigned int csum; __u8 last_in; /* has first/last segment arrived? */ #define COMPLETE 4 @@ -353,9 +353,7 @@ nf_ct_frag6_create(unsigned int hash, __be32 id, struct in6_addr *src, str ipv6_addr_copy(&fq->saddr, src); ipv6_addr_copy(&fq->daddr, dst); - init_timer(&fq->timer); - fq->timer.function = nf_ct_frag6_expire; - fq->timer.data = (long) fq; + setup_timer(&fq->timer, nf_ct_frag6_expire, (unsigned long)fq); spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); @@ -400,19 +398,20 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, } offset = ntohs(fhdr->frag_off) & ~0x7; - end = offset + (ntohs(skb->nh.ipv6h->payload_len) - - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { DEBUGP("offset is too large.\n"); return -1; } - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, - csum_partial(skb->nh.raw, - (u8*)(fhdr + 1) - skb->nh.raw, + csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); + } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { @@ -542,7 +541,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->fragments = skb; skb->dev = NULL; - skb_get_timestamp(skb, &fq->stamp); + fq->stamp = skb->tstamp; fq->meat += skb->len; atomic_add(skb->truesize, &nf_ct_frag6_mem); @@ -583,7 +582,9 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); /* Unfragmented part is taken from the first segment. */ - payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + payload_len = ((head->data - skb_network_header(head)) - + sizeof(struct ipv6hdr) + fq->len - + sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) { DEBUGP("payload len is too large.\n"); goto out_oversize; @@ -624,15 +625,15 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. 
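The reassembly code above also collapses the three-line timer initialisation into a single call; both forms appear in the hunk and are equivalent:

    /* open-coded form being removed: */
    init_timer(&fq->timer);
    fq->timer.function = nf_ct_frag6_expire;
    fq->timer.data = (unsigned long)fq;

    /* helper introduced in its place: */
    setup_timer(&fq->timer, nf_ct_frag6_expire, (unsigned long)fq);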
*/ - head->nh.raw[fq->nhoffset] = head->h.raw[0]; + skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; memmove(head->head + sizeof(struct frag_hdr), head->head, (head->data - head->head) - sizeof(struct frag_hdr)); - head->mac.raw += sizeof(struct frag_hdr); - head->nh.raw += sizeof(struct frag_hdr); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); skb_shinfo(head)->frag_list = head->next; - head->h.raw = head->data; - skb_push(head, head->data - head->nh.raw); + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); atomic_sub(head->truesize, &nf_ct_frag6_mem); for (fp=head->next; fp; fp = fp->next) { @@ -648,12 +649,14 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) head->next = NULL; head->dev = dev; - skb_set_timestamp(head, &fq->stamp); - head->nh.ipv6h->payload_len = htons(payload_len); + head->tstamp = fq->stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); fq->fragments = NULL; @@ -701,9 +704,10 @@ out_fail: static int find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) { - u8 nexthdr = skb->nh.ipv6h->nexthdr; - u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data; - int start = (u8 *)(skb->nh.ipv6h+1) - skb->data; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + const int netoff = skb_network_offset(skb); + u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); + int start = netoff + sizeof(struct ipv6hdr); int len = skb->len - start; u8 prevhdr = NEXTHDR_IPV6; @@ -759,7 +763,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) struct sk_buff *ret_skb = NULL; /* Jumbo payload inhibits frag. 
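The checksum fold above now passes skb_network_header_len(head) where the old code subtracted raw pointers. Assumed shape of that helper (a sketch; the real definition lives in skbuff.h):

    /* distance from the network header to the transport header */
    static inline u32 skb_network_header_len(const struct sk_buff *skb)
    {
            return skb->transport_header - skb->network_header;
    }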
header */ - if (skb->nh.ipv6h->payload_len == 0) { + if (ipv6_hdr(skb)->payload_len == 0) { DEBUGP("payload len = 0\n"); return skb; } @@ -780,9 +784,9 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) goto ret_orig; } - clone->h.raw = clone->data + fhoff; - hdr = clone->nh.ipv6h; - fhdr = (struct frag_hdr *)clone->h.raw; + skb_set_transport_header(clone, fhoff); + hdr = ipv6_hdr(clone); + fhdr = (struct frag_hdr *)skb_transport_header(clone); if (!(fhdr->frag_off & htons(0xFFF9))) { DEBUGP("Invalid fragment offset\n"); @@ -864,8 +868,7 @@ int nf_ct_frag6_init(void) nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); - init_timer(&nf_ct_frag6_secret_timer); - nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild; + setup_timer(&nf_ct_frag6_secret_timer, nf_ct_frag6_secret_rebuild, 0); nf_ct_frag6_secret_timer.expires = jiffies + nf_ct_frag6_secret_interval; add_timer(&nf_ct_frag6_secret_timer); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index fa3fb509f18..920dc9cf6a8 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -23,12 +23,12 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> +#include <net/ip.h> #include <net/sock.h> #include <net/tcp.h> #include <net/transp_v6.h> #include <net/ipv6.h> -#ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_net_devsnmp6; static int fold_prot_inuse(struct proto *proto) @@ -142,26 +142,13 @@ static struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_SENTINEL }; -static unsigned long -fold_field(void *mib[], int offt) -{ - unsigned long res = 0; - int i; - - for_each_possible_cpu(i) { - res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt); - } - return res; -} - static inline void snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist) { int i; for (i=0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, - fold_field(mib, itemlist[i].entry)); + snmp_fold_field(mib, itemlist[i].entry)); } static int snmp6_seq_show(struct seq_file *seq, void *v) @@ -236,6 +223,7 @@ int snmp6_unregister_dev(struct inet6_dev *idev) return -EINVAL; remove_proc_entry(idev->stats.proc_dir_entry->name, proc_net_devsnmp6); + idev->stats.proc_dir_entry = NULL; return 0; } @@ -271,47 +259,3 @@ void ipv6_misc_proc_exit(void) proc_net_remove("snmp6"); } -#else /* CONFIG_PROC_FS */ - - -int snmp6_register_dev(struct inet6_dev *idev) -{ - return 0; -} - -int snmp6_unregister_dev(struct inet6_dev *idev) -{ - return 0; -} -#endif /* CONFIG_PROC_FS */ - -int snmp6_alloc_dev(struct inet6_dev *idev) -{ - int err = -ENOMEM; - - if (!idev || !idev->dev) - return -EINVAL; - - if (snmp6_mib_init((void **)idev->stats.ipv6, sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) - goto err_ip; - if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) - goto err_icmp; - - return 0; - -err_icmp: - snmp6_mib_free((void **)idev->stats.ipv6); -err_ip: - return err; -} - -int snmp6_free_dev(struct inet6_dev *idev) -{ - snmp6_mib_free((void **)idev->stats.icmpv6); - snmp6_mib_free((void **)idev->stats.ipv6); - return 0; -} - - diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index ef43bd57bae..f929f47b925 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -60,6 +60,8 @@ int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) return ret; } +EXPORT_SYMBOL(inet6_add_protocol); + /* * 
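proc.c above drops its file-local fold_field() in favour of the shared snmp_fold_field(), which is why <net/ip.h> joins the includes. The deleted helper documents the computation; the shared one is presumably equivalent:

    /* sum one counter slot across both per-CPU halves of the mib */
    static unsigned long fold_field_sketch(void *mib[], int offt)
    {
            unsigned long res = 0;
            int i;

            for_each_possible_cpu(i) {
                    res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt);
                    res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt);
            }
            return res;
    }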
Remove a protocol from the hash tables. */ @@ -83,3 +85,5 @@ int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol) return ret; } + +EXPORT_SYMBOL(inet6_del_protocol); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 203e069e7fe..009a1047fc3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -152,7 +152,7 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) int delivered = 0; __u8 hash; - saddr = &skb->nh.ipv6h->saddr; + saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = nexthdr & (MAX_INET_PROTOS - 1); @@ -361,17 +361,18 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; if (skb->ip_summed == CHECKSUM_COMPLETE) { - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); - if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, skb->len, inet->num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = ~csum_unfold(csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, 0)); + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, + inet->num, 0)); if (inet->hdrincl) { if (skb_checksum_complete(skb)) { @@ -420,7 +421,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, msg->msg_flags |= MSG_TRUNC; } - if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + if (skb_csum_unnecessary(skb)) { err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) @@ -438,7 +439,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; - ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) @@ -488,7 +489,8 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, goto out; offset = rp->offset; - total_len = inet_sk(sk)->cork.length - (skb->nh.raw - skb->data); + total_len = inet_sk(sk)->cork.length - (skb_network_header(skb) - + skb->data); if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); @@ -511,7 +513,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, if (csum_skb) continue; - len = skb->len - (skb->h.raw - skb->data); + len = skb->len - skb_transport_offset(skb); if (offset >= len) { offset -= len; continue; @@ -523,7 +525,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, skb = csum_skb; } - offset += skb->h.raw - skb->data; + offset += skb_transport_offset(skb); if (skb_copy_bits(skb, offset, &csum, 2)) BUG(); @@ -575,11 +577,13 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); - skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length); + skb_put(skb, length); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_NONE; - skb->h.raw = skb->nh.raw; + skb->transport_header = skb->network_header; err = memcpy_fromiovecend((void *)iph, from, 0, length); if (err) goto error_fault; @@ -878,7 +882,7 @@ static int rawv6_seticmpfilter(struct sock *sk, int level, 
int optname, return 0; default: return -ENOPROTOOPT; - }; + } return 0; } @@ -903,7 +907,7 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, return 0; default: return -ENOPROTOOPT; - }; + } return 0; } @@ -957,7 +961,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, default: return ipv6_setsockopt(sk, level, optname, optval, optlen); - }; + } + return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } @@ -978,7 +983,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, default: return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); - }; + } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } #endif @@ -1031,7 +1036,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, default: return ipv6_getsockopt(sk, level, optname, optval, optlen); - }; + } + return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } @@ -1052,7 +1058,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, default: return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); - }; + } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } #endif @@ -1073,7 +1079,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) - amount = skb->tail - skb->h.raw; + amount = skb->tail - skb->transport_header; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 7034c54e501..de795c04e34 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -88,7 +88,7 @@ struct frag_queue int len; int meat; int iif; - struct timeval stamp; + ktime_t stamp; unsigned int csum; __u8 last_in; /* has first/last segment arrived? */ #define COMPLETE 4 @@ -430,19 +430,24 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, goto err; offset = ntohs(fhdr->frag_off) & ~0x7; - end = offset + (ntohs(skb->nh.ipv6h->payload_len) - - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((u8 *)&fhdr->frag_off - + skb_network_header(skb))); return; } - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, - csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0)); + csum_partial(nh, (u8 *)(fhdr + 1) - nh, + 0)); + } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { @@ -562,7 +567,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (skb->dev) fq->iif = skb->dev->ifindex; skb->dev = NULL; - skb_get_timestamp(skb, &fq->stamp); + fq->stamp = skb->tstamp; fq->meat += skb->len; atomic_add(skb->truesize, &ip6_frag_mem); @@ -605,7 +610,9 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, BUG_TRAP(FRAG6_CB(head)->offset == 0); /* Unfragmented part is taken from the first segment. 
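raw.c above replaces direct ip_summed comparisons with skb_csum_unnecessary(). A plausible definition, assuming this series renumbers the ip_summed values (NONE=0, UNNECESSARY=1, COMPLETE=2, PARTIAL=3) so that a single mask accepts both hardware-verified and locally generated packets:

    static inline int skb_csum_unnecessary(const struct sk_buff *skb)
    {
            return skb->ip_summed & CHECKSUM_UNNECESSARY;
    }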
*/ - payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + payload_len = ((head->data - skb_network_header(head)) - + sizeof(struct ipv6hdr) + fq->len - + sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) goto out_oversize; @@ -639,15 +646,15 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; - head->nh.raw[nhoff] = head->h.raw[0]; + skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; memmove(head->head + sizeof(struct frag_hdr), head->head, (head->data - head->head) - sizeof(struct frag_hdr)); - head->mac.raw += sizeof(struct frag_hdr); - head->nh.raw += sizeof(struct frag_hdr); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); skb_shinfo(head)->frag_list = head->next; - head->h.raw = head->data; - skb_push(head, head->data - head->nh.raw); + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); atomic_sub(head->truesize, &ip6_frag_mem); for (fp=head->next; fp; fp = fp->next) { @@ -663,15 +670,17 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, head->next = NULL; head->dev = dev; - skb_set_timestamp(head, &fq->stamp); - head->nh.ipv6h->payload_len = htons(payload_len); + head->tstamp = fq->stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); IP6CB(head)->nhoff = nhoff; *skb_in = head; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); rcu_read_lock(); IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMOKS); @@ -699,33 +708,34 @@ static int ipv6_frag_rcv(struct sk_buff **skbp) struct net_device *dev = skb->dev; struct frag_hdr *fhdr; struct frag_queue *fq; - struct ipv6hdr *hdr; - - hdr = skb->nh.ipv6h; + struct ipv6hdr *hdr = ipv6_hdr(skb); IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. 
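ip6_frag_reasm() above deletes the 8-byte fragment header by sliding everything in front of it forward, then advancing the cached mac/network offsets by the same amount. The same move on a plain buffer (illustration only, hypothetical helper):

    #include <stdint.h>
    #include <string.h>

    /* Remove a 'shim'-byte header that sits after a 'prefix'-byte span
     * by sliding the prefix up over it; returns the new packet start.
     * Assumes prefix >= shim, as with mac + IPv6 headers vs. an
     * 8-byte fragment header. */
    static uint8_t *drop_shim(uint8_t *buf, size_t prefix, size_t shim)
    {
            memmove(buf + shim, buf, prefix);
            return buf + shim;
    }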
header */ if (hdr->payload_len==0) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + skb_network_header_len(skb)); return -1; } - if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+sizeof(struct frag_hdr))) { + if (!pskb_may_pull(skb, (skb_transport_offset(skb) + + sizeof(struct frag_hdr)))) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + skb_network_header_len(skb)); return -1; } - hdr = skb->nh.ipv6h; - fhdr = (struct frag_hdr *)skb->h.raw; + hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); if (!(fhdr->frag_off & htons(0xFFF9))) { /* It is not a fragmented frame */ - skb->h.raw += sizeof(struct frag_hdr); + skb->transport_header += sizeof(struct frag_hdr); IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS); - IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw; + IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); return 1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aebb4e2d5ae..b46ad53044b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -575,6 +575,8 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, return NULL; } +EXPORT_SYMBOL(rt6_lookup); + /* ip6_ins_rt is called with FREE table->tb6_lock. It takes new route entry, the addition fails by any reason the route is freed. In any case, if caller does not hold it, it may @@ -724,7 +726,7 @@ out2: void ip6_route_input(struct sk_buff *skb) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct flowi fl = { .iif = skb->dev->ifindex, @@ -829,6 +831,7 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) return fib6_rule_lookup(fl, flags, ip6_pol_route_output); } +EXPORT_SYMBOL(ip6_route_output); /* * Destination cache support functions @@ -1757,7 +1760,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) rtnl_unlock(); return err; - }; + } return -EINVAL; } @@ -1772,7 +1775,7 @@ static inline int ip6_pkt_drop(struct sk_buff *skb, int code, int type; switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: - type = ipv6_addr_type(&skb->nh.ipv6h->daddr); + type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS); break; @@ -2012,7 +2015,7 @@ errout: return err; } -int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib6_config cfg; int err; @@ -2024,7 +2027,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) return ip6_route_del(&cfg); } -int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct fib6_config cfg; int err; @@ -2161,7 +2164,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) prefix, NLM_F_MULTI); } -int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct nlattr *tb[RTA_MAX+1]; struct rt6_info *rt; @@ -2215,7 +2218,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) /* 
Reserve room for dummy headers, this skb can pass through good chunk of routing engine. */ - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); rt = (struct rt6_info*) ip6_route_output(NULL, &fl); @@ -2486,8 +2489,9 @@ ctl_table ipv6_route_table[] = { void __init ip6_route_init(void) { +#ifdef CONFIG_PROC_FS struct proc_dir_entry *p; - +#endif ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); @@ -2505,6 +2509,10 @@ void __init ip6_route_init(void) #ifdef CONFIG_IPV6_MULTIPLE_TABLES fib6_rules_init(); #endif + + __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL); + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL); + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL); } void ip6_route_cleanup(void) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 08d6ed3396e..1efa95a99f4 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -99,10 +99,10 @@ static struct ip_tunnel * ipip6_tunnel_lookup(__be32 remote, __be32 local) return NULL; } -static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t) +static struct ip_tunnel **__ipip6_bucket(struct ip_tunnel_parm *parms) { - __be32 remote = t->parms.iph.daddr; - __be32 local = t->parms.iph.saddr; + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; unsigned h = 0; int prio = 0; @@ -117,6 +117,11 @@ static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t) return &tunnels[prio][h]; } +static inline struct ip_tunnel **ipip6_bucket(struct ip_tunnel *t) +{ + return __ipip6_bucket(&t->parms); +} + static void ipip6_tunnel_unlink(struct ip_tunnel *t) { struct ip_tunnel **tp; @@ -147,19 +152,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int __be32 local = parms->iph.saddr; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; - unsigned h = 0; - int prio = 0; char name[IFNAMSIZ]; - if (remote) { - prio |= 2; - h ^= HASH(remote); - } - if (local) { - prio |= 1; - h ^= HASH(local); - } - for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip6_bucket(parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -224,8 +219,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info) ICMP in the real Internet is absolutely infeasible. */ struct iphdr *iph = (struct iphdr*)skb->data; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err; @@ -280,8 +275,8 @@ out: struct iphdr *iph = (struct iphdr*)dp; int hlen = iph->ihl<<2; struct ipv6hdr *iph6; - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; int rel_type = 0; int rel_code = 0; int rel_info = 0; @@ -296,14 +291,14 @@ out: default: return; case ICMP_PARAMETERPROB: - if (skb->h.icmph->un.gateway < hlen) + if (icmp_hdr(skb)->un.gateway < hlen) return; /* So... This guy found something strange INSIDE encapsulated packet. Well, he is fool, but what can we do ? 
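Making inet6_rtm_newroute() and friends static is possible because ip6_route_init() above now registers them in the rtnetlink dispatch table instead of exposing the symbols. The calls, with the dump handler left NULL because these entries only implement doit (signature inferred from the hunk itself):

    __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);

Dispatch then happens by (protocol family, message type) lookup, so the handlers need no external linkage.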
*/ rel_type = ICMPV6_PARAMPROB; - rel_info = skb->h.icmph->un.gateway - hlen; + rel_info = icmp_hdr(skb)->un.gateway - hlen; break; case ICMP_DEST_UNREACH: @@ -340,7 +335,7 @@ out: dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)iph6); - skb2->nh.raw = skb2->data; + skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); @@ -366,7 +361,7 @@ out: static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(skb->nh.ipv6h); + IP6_ECN_set_ce(ipv6_hdr(skb)); } static int ipip6_rcv(struct sk_buff *skb) @@ -377,13 +372,13 @@ static int ipip6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - iph = skb->nh.iph; + iph = ip_hdr(skb); read_lock(&ipip6_lock); if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { secpath_reset(skb); - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; @@ -430,7 +425,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; - struct ipv6hdr *iph6 = skb->nh.ipv6h; + struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ @@ -468,7 +463,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { - addr6 = &skb->nh.ipv6h->daddr; + addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } @@ -550,11 +545,12 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; - iph6 = skb->nh.ipv6h; + iph6 = ipv6_hdr(skb); } - skb->h.raw = skb->nh.raw; - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags = 0; dst_release(skb->dst); @@ -564,7 +560,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) * Push down and install the IPIP header. 
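ipip6_tunnel_xmit() above prepends the outer IPv4 header with the new offset helpers; annotated, the sequence is the canonical encapsulation pattern:

    skb->transport_header = skb->network_header; /* inner IPv6 header */
    skb_push(skb, sizeof(struct iphdr));         /* room for outer IPv4 */
    skb_reset_network_header(skb);               /* outer hdr at skb->data */
    iph = ip_hdr(skb);                           /* fill the outer header */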
*/ - iph = skb->nh.iph; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; if (mtu > IPV6_MIN_MTU) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 92f99927d12..e2f25ea43b6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -115,10 +115,10 @@ static __inline__ __sum16 tcp_v6_check(struct tcphdr *th, int len, static __u32 tcp_v6_init_sequence(struct sk_buff *skb) { - return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, - skb->nh.ipv6h->saddr.s6_addr32, - skb->h.th->dest, - skb->h.th->source); + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, @@ -486,7 +486,9 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct sk_buff *pktopts = treq->pktopts; struct inet6_skb_parm *rxopt = IP6CB(pktopts); if (rxopt->srcrt) - opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(skb_network_header(pktopts) + + rxopt->srcrt)); } if (opt && opt->srcrt) { @@ -507,7 +509,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, skb = tcp_make_synack(sk, dst, req); if (skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); th->check = tcp_v6_check(th, skb->len, &treq->loc_addr, &treq->rmt_addr, @@ -835,8 +837,8 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) { __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; - struct ipv6hdr *ip6h = skb->nh.ipv6h; - struct tcphdr *th = skb->h.th; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); int length = (th->doff << 2) - sizeof (*th); int genhash; u8 *ptr; @@ -944,10 +946,11 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, @@ -964,12 +967,13 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*th))) return -EINVAL; - ipv6h = skb->nh.ipv6h; - th = skb->h.th; + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); th->check = 0; th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, IPPROTO_TCP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; return 0; @@ -977,7 +981,7 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb) static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th, *t1; + struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; struct flowi fl; int tot_len = sizeof(*th); @@ -993,7 +997,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG if (sk) - key = tcp_v6_md5_do_lookup(sk, &skb->nh.ipv6h->daddr); + key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); else key = NULL; @@ -1037,20 +1041,18 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) (TCPOPT_NOP << 16) 
| (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - tcp_v6_do_calc_md5_hash((__u8*)&opt[1], - key, - &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr, - t1, IPPROTO_TCP, - tot_len); + tcp_v6_do_calc_md5_hash((__u8 *)&opt[1], key, + &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr, + t1, IPPROTO_TCP, tot_len); } #endif buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); - ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, sizeof(*t1), IPPROTO_TCP, @@ -1079,7 +1081,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) static void tcp_v6_send_ack(struct tcp_timewait_sock *tw, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) { - struct tcphdr *th = skb->h.th, *t1; + struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; struct flowi fl; int tot_len = sizeof(struct tcphdr); @@ -1091,7 +1093,7 @@ static void tcp_v6_send_ack(struct tcp_timewait_sock *tw, #ifdef CONFIG_TCP_MD5SIG if (!tw && skb->sk) { - key = tcp_v6_md5_do_lookup(skb->sk, &skb->nh.ipv6h->daddr); + key = tcp_v6_md5_do_lookup(skb->sk, &ipv6_hdr(skb)->daddr); } else if (tw && tw->tw_md5_keylen) { tw_key.key = tw->tw_md5_key; tw_key.keylen = tw->tw_md5_keylen; @@ -1140,20 +1142,18 @@ static void tcp_v6_send_ack(struct tcp_timewait_sock *tw, if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - tcp_v6_do_calc_md5_hash((__u8 *)topt, - key, - &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr, - t1, IPPROTO_TCP, - tot_len); + tcp_v6_do_calc_md5_hash((__u8 *)topt, key, + &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr, + t1, IPPROTO_TCP, tot_len); } #endif buff->csum = csum_partial((char *)t1, tot_len, 0); memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); - ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, tot_len, IPPROTO_TCP, @@ -1197,18 +1197,18 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { struct request_sock *req, **prev; - const struct tcphdr *th = skb->h.th; + const struct tcphdr *th = tcp_hdr(skb); struct sock *nsk; /* Find possible connection requests. 
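tcp_v6_send_check() and the GSO variant above gain an explicit csum_start, which this series makes head-relative. For CHECKSUM_PARTIAL the device finishes the sum from that point and stores the result at csum_offset; the pattern:

    /* seed check with the pseudo-header sum, then tell the device
     * where summing starts and where the result is written */
    th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len,
                                 IPPROTO_TCP, 0);
    skb->csum_start = skb_transport_header(skb) - skb->head;
    skb->csum_offset = offsetof(struct tcphdr, check);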
*/ req = inet6_csk_search_req(sk, &prev, th->source, - &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr, - th->source, &skb->nh.ipv6h->daddr, + nsk = __inet6_lookup_established(&tcp_hashinfo, &ipv6_hdr(skb)->saddr, + th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (nsk) { @@ -1275,9 +1275,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb); treq = inet6_rsk(req); - ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); - ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); - TCP_ECN_create_request(req, skb->h.th); + ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); + TCP_ECN_create_request(req, tcp_hdr(skb)); treq->pktopts = NULL; if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || @@ -1363,7 +1363,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -1389,7 +1389,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, opt == NULL && treq->pktopts) { struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts); if (rxopt->srcrt) - opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt)); + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(skb_network_header(treq->pktopts) + + rxopt->srcrt)); } if (dst == NULL) { @@ -1469,7 +1471,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; /* Clone native IPv6 options from listening socket (if any) @@ -1528,15 +1530,16 @@ out: static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb->csum)) { + if (!tcp_v6_check(tcp_hdr(skb), skb->len, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } } - skb->csum = ~csum_unfold(tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, 0)); + skb->csum = ~csum_unfold(tcp_v6_check(tcp_hdr(skb), skb->len, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, 0)); if (skb->len <= 76) { return __skb_checksum_complete(skb); @@ -1600,7 +1603,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) goto reset; TCP_CHECK_TIMER(sk); if (opt_skb) @@ -1608,7 +1611,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1631,7 +1634,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } 
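tcp_v6_checksum_init() above is the receive-side mirror: for CHECKSUM_COMPLETE, a zero result from the pseudo-header sum combined with the accumulated payload sum proves the segment checksum (tcp_v6_check() is a thin wrapper around csum_ipv6_magic()):

    if (skb->ip_summed == CHECKSUM_COMPLETE &&
        !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
                         skb->len, IPPROTO_TCP, skb->csum))
            skb->ip_summed = CHECKSUM_UNNECESSARY;  /* verified */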
TCP_CHECK_TIMER(sk); - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) goto reset; TCP_CHECK_TIMER(sk); if (opt_skb) @@ -1664,7 +1667,7 @@ ipv6_pktoptions: if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; + np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1697,28 +1700,27 @@ static int tcp_v6_rcv(struct sk_buff **pskb) if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; - th = skb->h.th; + th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr)/4) goto bad_packet; if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; - if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v6_checksum_init(skb))) + if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) goto bad_packet; - th = skb->h.th; + th = tcp_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); + TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb)); TCP_SKB_CB(skb)->sacked = 0; - sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source, - &skb->nh.ipv6h->daddr, ntohs(th->dest), + sk = __inet6_lookup(&tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (!sk) @@ -1798,7 +1800,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(&tcp_hashinfo, - &skb->nh.ipv6h->daddr, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { struct inet_timewait_sock *tw = inet_twsk(sk); @@ -1945,6 +1947,7 @@ static int tcp_v6_destroy_sock(struct sock *sk) return inet6_destroy_sock(sk); } +#ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, struct sock *sk, struct request_sock *req, int i, int uid) @@ -2061,7 +2064,6 @@ static void get_timewait6_sock(struct seq_file *seq, atomic_read(&tw->tw_refcnt), tw); } -#ifdef CONFIG_PROC_FS static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f590db57a7c..b083c09e3d2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -93,10 +93,10 @@ static struct sock *__udp6_lib_lookup(struct in6_addr *saddr, __be16 sport, continue; score++; } - if(score == 4) { + if (score == 4) { result = sk; break; - } else if(score > badness) { + } else if (score > badness) { result = sk; badness = score; } @@ -120,8 +120,9 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - size_t copied; - int err, copy_only, is_udplite = IS_UDPLITE(sk); + unsigned int ulen, copied; + int err; + int is_udplite = IS_UDPLITE(sk); if (addr_len) *addr_len=sizeof(struct sockaddr_in6); @@ -134,24 +135,25 @@ try_again: if (!skb) goto out; - copied = skb->len - sizeof(struct udphdr); - if (copied > len) { - copied = len; + ulen = skb->len - sizeof(struct udphdr); + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; - } /* - * Decide whether to checksum and/or copy data. 
+ * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. */ - copy_only = (skb->ip_summed==CHECKSUM_UNNECESSARY); - if (is_udplite || (!copy_only && msg->msg_flags&MSG_TRUNC)) { - if (__udp_lib_checksum_complete(skb)) + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) goto csum_copy_err; - copy_only = 1; } - if (copy_only) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied ); else { @@ -170,15 +172,16 @@ try_again: sin6 = (struct sockaddr_in6 *) msg->msg_name; sin6->sin6_family = AF_INET6; - sin6->sin6_port = skb->h.uh->source; + sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; if (skb->protocol == htons(ETH_P_IP)) ipv6_addr_set(&sin6->sin6_addr, 0, 0, - htonl(0xffff), skb->nh.iph->saddr); + htonl(0xffff), ip_hdr(skb)->saddr); else { - ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&sin6->sin6_addr, + &ipv6_hdr(skb)->saddr); if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6->sin6_scope_id = IP6CB(skb)->iif; } @@ -194,7 +197,7 @@ try_again: err = copied; if (flags & MSG_TRUNC) - err = skb->len - sizeof(struct udphdr); + err = ulen; out_free: skb_free_datagram(sk, skb); @@ -279,8 +282,10 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (udp_lib_checksum_complete(skb)) - goto drop; + if (sk->sk_filter) { + if (udp_lib_checksum_complete(skb)) + goto drop; + } if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ @@ -325,7 +330,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr)) continue; } - if(!inet6_mc_check(s, loc_addr, rmt_addr)) + if (!inet6_mc_check(s, loc_addr, rmt_addr)) continue; return s; } @@ -341,7 +346,7 @@ static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr, struct hlist_head udptable[]) { struct sock *sk, *sk2; - const struct udphdr *uh = skb->h.uh; + const struct udphdr *uh = udp_hdr(skb); int dif; read_lock(&udp_hash_lock); @@ -366,9 +371,20 @@ out: return 0; } -static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh) - +static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) { + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + if (uh->check == 0) { /* RFC 2460 section 8.1 says that we SHOULD log this error. Well, it is reasonable. 
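udp6_csum_init() above now takes the protocol number and absorbs the UDP-Lite coverage setup, so one receive path serves both flavours; the entry points differ only in the constant they pass, as the later udp.c and udplite.c hunks show:

    static __inline__ int udpv6_rcv(struct sk_buff **pskb)
    {
            return __udp6_lib_rcv(pskb, udp_hash, IPPROTO_UDP);
    }

    static int udplitev6_rcv(struct sk_buff **pskb)
    {
            return __udp6_lib_rcv(pskb, udplite_hash, IPPROTO_UDPLITE);
    }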
@@ -377,21 +393,20 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh) return 1; } if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, - skb->len, IPPROTO_UDP, skb->csum )) + !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->len, proto, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = ~csum_unfold(csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, IPPROTO_UDP, - 0)); + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, proto, 0)); - return (UDP_SKB_CB(skb)->partial_cov = 0); + return 0; } int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[], - int is_udplite) + int proto) { struct sk_buff *skb = *pskb; struct sock *sk; @@ -403,15 +418,16 @@ int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[], if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto short_packet; - saddr = &skb->nh.ipv6h->saddr; - daddr = &skb->nh.ipv6h->daddr; - uh = skb->h.uh; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); ulen = ntohs(uh->len); if (ulen > skb->len) goto short_packet; - if(! is_udplite ) { /* UDP validates ulen. */ + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. */ /* Check for jumbo payload */ if (ulen == 0) @@ -423,19 +439,15 @@ int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[], if (ulen < skb->len) { if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - saddr = &skb->nh.ipv6h->saddr; - daddr = &skb->nh.ipv6h->daddr; - uh = skb->h.uh; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); } - - if (udp6_csum_init(skb, uh)) - goto discard; - - } else { /* UDP-Lite validates cscov. */ - if (udplite6_csum_init(skb, uh)) - goto discard; } + if (udp6_csum_init(skb, uh, proto)) + goto discard; + /* * Multicast receive code */ @@ -457,33 +469,34 @@ int __udp6_lib_rcv(struct sk_buff **pskb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto discard; - UDP6_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite); + UDP6_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); kfree_skb(skb); - return(0); + return 0; } /* deliver */ udpv6_queue_rcv_skb(sk, skb); sock_put(sk); - return(0); + return 0; short_packet: LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: %d/%u\n", - is_udplite? "-Lite" : "", ulen, skb->len); + proto == IPPROTO_UDPLITE ? 
"-Lite" : "", + ulen, skb->len); discard: - UDP6_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); - return(0); + return 0; } static __inline__ int udpv6_rcv(struct sk_buff **pskb) { - return __udp6_lib_rcv(pskb, udp_hash, 0); + return __udp6_lib_rcv(pskb, udp_hash, IPPROTO_UDP); } /* @@ -521,7 +534,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) /* * Create a UDP header */ - uh = skb->h.uh; + uh = udp_hdr(skb); uh->source = fl->fl_ip_sport; uh->dest = fl->fl_ip_dport; uh->len = htons(up->len); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 629f97162fb..f54016a5500 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -19,7 +19,7 @@ DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6) __read_mostly; static int udplitev6_rcv(struct sk_buff **pskb) { - return __udp6_lib_rcv(pskb, udplite_hash, 1); + return __udp6_lib_rcv(pskb, udplite_hash, IPPROTO_UDPLITE); } static void udplitev6_err(struct sk_buff *skb, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 31f651f9509..d7ed8aa56ec 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -28,14 +28,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, __be32 spi) unsigned int nhoff; nhoff = IP6CB(skb)->nhoff; - nexthdr = skb->nh.raw[nhoff]; + nexthdr = skb_network_header(skb)[nhoff]; seq = 0; if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) goto drop; do { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); if (xfrm_nr == XFRM_MAX_DEPTH) goto drop; @@ -58,7 +58,7 @@ int xfrm6_rcv_spi(struct sk_buff *skb, __be32 spi) if (nexthdr <= 0) goto drop_unlock; - skb->nh.raw[nhoff] = nexthdr; + skb_network_header(skb)[nhoff] = nexthdr; if (x->props.replay_window) xfrm_replay_advance(x, seq); @@ -112,8 +112,8 @@ int xfrm6_rcv_spi(struct sk_buff *skb, __be32 spi) return -1; } else { #ifdef CONFIG_NETFILTER - skb->nh.ipv6h->payload_len = htons(skb->len); - __skb_push(skb, skb->data - skb->nh.raw); + ipv6_hdr(skb)->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb_network_header(skb)); NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, ip6_rcv_finish); @@ -140,19 +140,19 @@ int xfrm6_rcv(struct sk_buff **pskb) return xfrm6_rcv_spi(*pskb, 0); } +EXPORT_SYMBOL(xfrm6_rcv); + int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { struct xfrm_state *x = NULL; int wildcard = 0; - struct in6_addr any; xfrm_address_t *xany; struct xfrm_state *xfrm_vec_one = NULL; int nh = 0; int i = 0; - ipv6_addr_set(&any, 0, 0, 0, 0); - xany = (xfrm_address_t *)&any; + xany = (xfrm_address_t *)&in6addr_any; for (i = 0; i < 3; i++) { xfrm_address_t *dst, *src; @@ -247,3 +247,5 @@ drop: xfrm_state_put(xfrm_vec_one); return -1; } + +EXPORT_SYMBOL(xfrm6_input_addr); diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index edcfffa9e87..2e61d6ddece 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -38,17 +38,18 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; + skb_set_network_header(skb, + (prevhdr - x->props.header_len) - skb->data); + skb_set_transport_header(skb, hdr_len); memmove(skb->data, iph, hdr_len); - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - 
skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; + skb_reset_network_header(skb); + top_iph = ipv6_hdr(skb); + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + skb->network_header += offsetof(struct ipv6hdr, nexthdr); ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); @@ -59,6 +60,7 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; + const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err = -EINVAL; @@ -66,13 +68,14 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) goto out; skb_push(skb, size); - memmove(skb->data, skb->nh.raw, size); - skb->nh.raw = skb->data; + memmove(skb->data, skb_network_header(skb), size); + skb_reset_network_header(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); + old_mac = skb_mac_header(skb); + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); - ip6h = skb->nh.ipv6h; + ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 6031c16d46c..6ad6d7ac6bd 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -50,11 +50,12 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; + skb_set_network_header(skb, + (prevhdr - x->props.header_len) - skb->data); + skb_set_transport_header(skb, hdr_len); memmove(skb->data, iph, hdr_len); return 0; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 3a4b39b12ba..c026bfea820 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -32,11 +32,12 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; + skb_set_network_header(skb, + (prevhdr - x->props.header_len) - skb->data); + skb_set_transport_header(skb, hdr_len); memmove(skb->data, iph, hdr_len); return 0; } @@ -51,13 +52,16 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) */ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { - int ihl = skb->data - skb->h.raw; + int ihl = skb->data - skb_transport_header(skb); - if (skb->h.raw != skb->nh.raw) - skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); - skb->nh.ipv6h->payload_len = htons(skb->len + ihl - + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 0bc866c0d83..a6c0cdf46ad 100644 
--- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -18,8 +18,8 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { - struct ipv6hdr *outer_iph = skb->nh.ipv6h; - struct ipv6hdr *inner_iph = skb->h.ipv6h; + struct ipv6hdr *outer_iph = ipv6_hdr(skb); + struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) IP6_ECN_set_ce(inner_iph); @@ -27,8 +27,8 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) static inline void ip6ip_ecn_decapsulate(struct sk_buff *skb) { - if (INET_ECN_is_ce(ipv6_get_dsfield(skb->nh.ipv6h))) - IP_ECN_set_ce(skb->h.ipiph); + if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6_hdr(skb)))) + IP_ECN_set_ce(ipip_hdr(skb)); } /* Add encapsulation header. @@ -51,12 +51,12 @@ static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) int dsfield; skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; + iph = ipv6_hdr(skb); - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; + skb_reset_network_header(skb); + top_iph = ipv6_hdr(skb); + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + skb->network_header += offsetof(struct ipv6hdr, nexthdr); top_iph->version = 6; if (xdst->route->ops->family == AF_INET6) { @@ -86,9 +86,11 @@ static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; + const unsigned char *old_mac; + const unsigned char *nh = skb_network_header(skb); - if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6 - && skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPIP) + if (nh[IP6CB(skb)->nhoff] != IPPROTO_IPV6 && + nh[IP6CB(skb)->nhoff] != IPPROTO_IPIP) goto out; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; @@ -97,9 +99,10 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - if (skb->nh.raw[IP6CB(skb)->nhoff] == IPPROTO_IPV6) { + nh = skb_network_header(skb); + if (nh[IP6CB(skb)->nhoff] == IPPROTO_IPV6) { if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + ipv6_copy_dscp(ipv6_hdr(skb), ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); } else { @@ -107,9 +110,10 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) ip6ip_ecn_decapsulate(skb); skb->protocol = htons(ETH_P_IP); } - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; + old_mac = skb_mac_header(skb); + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_reset_network_header(skb); err = 0; out: diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index d6d786b89d2..56364a5f676 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -23,6 +23,8 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, return ip6_find_1stfragopt(skb, prevhdr); } +EXPORT_SYMBOL(xfrm6_find_1stfragopt); + static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -76,11 +78,11 @@ static int xfrm6_output_one(struct sk_buff *skb) x->curlft.bytes += skb->len; x->curlft.packets++; if (x->props.mode == XFRM_MODE_ROUTEOPTIMIZATION) - x->lastused = (u64)xtime.tv_sec; + x->lastused = get_seconds(); spin_unlock_bh(&x->lock); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); if (!(skb->dst = 
dst_pop(dst))) { err = -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index d8a585bd2cb..1faa2ea80af 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -240,7 +240,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int if (!afinfo) { dst = *dst_p; goto error; - }; + } + dst_prev->output = afinfo->output; xfrm_state_put_afinfo(afinfo); /* Sheit... I remember I did this right. Apparently, @@ -270,17 +271,19 @@ error: static inline void _decode_session6(struct sk_buff *skb, struct flowi *fl) { - u16 offset = skb->h.raw - skb->nh.raw; - struct ipv6hdr *hdr = skb->nh.ipv6h; + u16 offset = skb_network_header_len(skb); + struct ipv6hdr *hdr = ipv6_hdr(skb); struct ipv6_opt_hdr *exthdr; - u8 nexthdr = skb->nh.raw[IP6CB(skb)->nhoff]; + const unsigned char *nh = skb_network_header(skb); + u8 nexthdr = nh[IP6CB(skb)->nhoff]; memset(fl, 0, sizeof(struct flowi)); ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr); ipv6_addr_copy(&fl->fl6_src, &hdr->saddr); - while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) { - exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + while (pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + nh = skb_network_header(skb); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); switch (nexthdr) { case NEXTHDR_ROUTING: @@ -288,7 +291,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) case NEXTHDR_DEST: offset += ipv6_optlen(exthdr); nexthdr = exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); break; case IPPROTO_UDP: @@ -296,7 +299,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: - if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { + if (pskb_may_pull(skb, nh + offset + 4 - skb->data)) { __be16 *ports = (__be16 *)exthdr; fl->fl_ip_sport = ports[0]; @@ -306,7 +309,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) return; case IPPROTO_ICMPV6: - if (pskb_may_pull(skb, skb->nh.raw + offset + 2 - skb->data)) { + if (pskb_may_pull(skb, nh + offset + 2 - skb->data)) { u8 *icmp = (u8 *)exthdr; fl->fl_icmp_type = icmp[0]; @@ -317,7 +320,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) #ifdef CONFIG_IPV6_MIP6 case IPPROTO_MH: - if (pskb_may_pull(skb, skb->nh.raw + offset + 3 - skb->data)) { + if (pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; mh = (struct ip6_mh *)exthdr; @@ -335,7 +338,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) fl->fl_ipsec_spi = 0; fl->proto = nexthdr; return; - }; + } } } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 93c42232aa3..5502cc948df 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -257,11 +257,11 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_tunnel_rcv(struct sk_buff *skb) { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, spi); + return xfrm6_rcv_spi(skb, spi) > 0 ? 
: 0; } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index cac35a77f06..392f8bc9269 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -576,7 +576,9 @@ static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc, skb2 = alloc_skb(len, GFP_ATOMIC); if (skb2) { skb_reserve(skb2, out_offset); - skb2->nh.raw = skb2->h.raw = skb_put(skb2, skb->len); + skb_reset_network_header(skb2); + skb_reset_transport_header(skb2); + skb_put(skb2, skb->len); memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len); memcpy(skb2->cb, skb->cb, sizeof(skb->cb)); } @@ -1807,8 +1809,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, copied); if (rc) goto out_free; - if (skb->tstamp.off_sec) - skb_get_timestamp(skb, &sk->sk_stamp); + if (skb->tstamp.tv64) + sk->sk_stamp = skb->tstamp; msg->msg_namelen = sizeof(*sipx); diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index 8e1cad971f1..e16c1142352 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -203,7 +203,9 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, skb->sk = sk; /* Fill in IPX header */ - skb->h.raw = skb->nh.raw = skb_put(skb, sizeof(struct ipxhdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(struct ipxhdr)); ipx = ipx_hdr(skb); ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr)); IPX_SKB_CB(skb)->ipx_tctrl = 0; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 0eb7d596d47..06c97c60d54 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -89,7 +89,6 @@ static int irda_data_indication(void *instance, void *sap, struct sk_buff *skb) self = instance; sk = instance; - IRDA_ASSERT(sk != NULL, return -1;); err = sock_queue_rcv_skb(sk, skb); if (err) { @@ -131,14 +130,12 @@ static void irda_disconnect_indication(void *instance, void *sap, } /* Prevent race conditions with irda_release() and irda_shutdown() */ + bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD) && sk->sk_state != TCP_CLOSE) { - lock_sock(sk); sk->sk_state = TCP_CLOSE; - sk->sk_err = ECONNRESET; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); - release_sock(sk); /* Close our TSAP. * If we leave it open, IrLMP put it back into the list of @@ -158,6 +155,7 @@ static void irda_disconnect_indication(void *instance, void *sap, self->tsap = NULL; } } + bh_unlock_sock(sk); /* Note : once we are there, there is not much you want to do * with the socket anymore, apart from closing it. 
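
[annotation] The xfrm6 hunks above all apply the same conversion this series is about: stop caching raw header pointers in skb->nh/skb->h/skb->mac and go through the offset-based helpers instead, which stay valid when the skb head is reallocated (e.g. by the pskb_expand_head() call in xfrm6_tunnel_input()). A minimal sketch of the decapsulation idiom, assuming an skb whose mac header and mac_len were already set by the caller; this is an illustration of the pattern, not a copy of any one function here:

        #include <linux/skbuff.h>
        #include <linux/string.h>

        /* Sketch only: after an encapsulation header has been pulled,
         * move the saved link-layer header so it again sits immediately
         * in front of skb->data, then re-anchor the network header. */
        static void example_fix_headers_after_decap(struct sk_buff *skb)
        {
                const unsigned char *old_mac = skb_mac_header(skb);

                /* old style was:
                 * skb->mac.raw = memmove(skb->data - skb->mac_len,
                 *                        skb->mac.raw, skb->mac_len); */
                skb_set_mac_header(skb, -skb->mac_len);
                memmove(skb_mac_header(skb), old_mac, skb->mac_len);
                skb_reset_network_header(skb);  /* network header = skb->data */
        }
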
@@ -220,7 +218,7 @@ static void irda_connect_confirm(void *instance, void *sap, break; default: self->max_data_size = irttp_get_max_seg_size(self->tsap); - }; + } IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __FUNCTION__, self->max_data_size); @@ -283,7 +281,7 @@ static void irda_connect_indication(void *instance, void *sap, break; default: self->max_data_size = irttp_get_max_seg_size(self->tsap); - }; + } IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __FUNCTION__, self->max_data_size); @@ -306,8 +304,6 @@ static void irda_connect_response(struct irda_sock *self) IRDA_DEBUG(2, "%s()\n", __FUNCTION__); - IRDA_ASSERT(self != NULL, return;); - skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, GFP_ATOMIC); if (skb == NULL) { @@ -337,7 +333,7 @@ static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) self = instance; sk = instance; - IRDA_ASSERT(sk != NULL, return;); + BUG_ON(sk == NULL); switch (flow) { case FLOW_STOP: @@ -449,7 +445,7 @@ static void irda_discovery_timeout(u_long priv) IRDA_DEBUG(2, "%s()\n", __FUNCTION__); self = (struct irda_sock *) priv; - IRDA_ASSERT(self != NULL, return;); + BUG_ON(self == NULL); /* Nothing for the caller */ self->cachelog = NULL; @@ -546,8 +542,6 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name) { IRDA_DEBUG(2, "%s(%p, %s)\n", __FUNCTION__, self, name); - IRDA_ASSERT(self != NULL, return -1;); - if (self->iriap) { IRDA_WARNING("%s(): busy with a previous query\n", __FUNCTION__); @@ -635,8 +629,6 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) IRDA_DEBUG(2, "%s(), name=%s\n", __FUNCTION__, name); - IRDA_ASSERT(self != NULL, return -1;); - /* Ask lmp for the current discovery log * Note : we have to use irlmp_get_discoveries(), as opposed * to play with the cachelog directly, because while we are @@ -784,8 +776,6 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct irda_sock *self = irda_sk(sk); int err; - IRDA_ASSERT(self != NULL, return -1;); - IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); if (addr_len != sizeof(struct sockaddr_irda)) @@ -841,8 +831,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) IRDA_DEBUG(2, "%s()\n", __FUNCTION__); - IRDA_ASSERT(self != NULL, return -1;); - err = irda_create(newsock, sk->sk_protocol); if (err) return err; @@ -873,44 +861,28 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) * calling us, the data is waiting for us ;-) * Jean II */ - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) { - int ret = 0; - DECLARE_WAITQUEUE(waitq, current); + while (1) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; /* Non blocking operation */ if (flags & O_NONBLOCK) return -EWOULDBLOCK; - /* The following code is a cut'n'paste of the - * wait_event_interruptible() macro. 
- * We don't us the macro because the condition has - * side effects : we want to make sure that only one - * skb get dequeued - Jean II */ - add_wait_queue(sk->sk_sleep, &waitq); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb != NULL) - break; - if (!signal_pending(current)) { - schedule(); - continue; - } - ret = -ERESTARTSYS; - break; - } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &waitq); - if(ret) - return -ERESTARTSYS; + err = wait_event_interruptible(*(sk->sk_sleep), + skb_peek(&sk->sk_receive_queue)); + if (err) + return err; } newsk = newsock->sk; + if (newsk == NULL) + return -EIO; + newsk->sk_state = TCP_ESTABLISHED; new = irda_sk(newsk); - IRDA_ASSERT(new != NULL, return -1;); /* Now attach up the new socket */ new->tsap = irttp_dup(self->tsap, new); @@ -1061,7 +1033,8 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; - return sock_error(sk); /* Always set at this point */ + err = sock_error(sk); + return err? err : -ECONNRESET; } sock->state = SS_CONNECTED; @@ -1171,8 +1144,6 @@ static void irda_destroy_socket(struct irda_sock *self) { IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); - IRDA_ASSERT(self != NULL, return;); - /* Unregister with IrLMP */ irlmp_unregister_client(self->ckey); irlmp_unregister_service(self->skey); @@ -1274,7 +1245,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, struct sock *sk = sock->sk; struct irda_sock *self; struct sk_buff *skb; - unsigned char *asmptr; int err; IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); @@ -1292,7 +1262,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, return -ENOTCONN; self = irda_sk(sk); - IRDA_ASSERT(self != NULL, return -1;); /* Check if IrTTP is wants us to slow down */ @@ -1317,9 +1286,9 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, return -ENOBUFS; skb_reserve(skb, self->max_header_size + 16); - - asmptr = skb->h.raw = skb_put(skb, len); - err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + skb_reset_transport_header(skb); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); if (err) { kfree_skb(skb); return err; @@ -1355,16 +1324,16 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(4, "%s()\n", __FUNCTION__); - IRDA_ASSERT(self != NULL, return -1;); - IRDA_ASSERT(!sock_error(sk), return -1;); + if ((err = sock_error(sk)) < 0) + return err; skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &err); if (!skb) return err; - skb->h.raw = skb->data; - copied = skb->len; + skb_reset_transport_header(skb); + copied = skb->len; if (copied > size) { IRDA_DEBUG(2, "%s(), Received truncated frame (%zd < %zd)!\n", @@ -1403,13 +1372,13 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, struct irda_sock *self = irda_sk(sk); int noblock = flags & MSG_DONTWAIT; size_t copied = 0; - int target = 1; - DECLARE_WAITQUEUE(waitq, current); + int target, err; + long timeo; IRDA_DEBUG(3, "%s()\n", __FUNCTION__); - IRDA_ASSERT(self != NULL, return -1;); - IRDA_ASSERT(!sock_error(sk), return -1;); + if ((err = sock_error(sk)) < 0) + return err; if (sock->flags & __SO_ACCEPTCON) return(-EINVAL); @@ -1417,8 +1386,8 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, if (flags & MSG_OOB) return -EOPNOTSUPP; - if (flags & MSG_WAITALL) - target = size; + target = 
sock_rcvlowat(sk, flags & MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, noblock); msg->msg_namelen = 0; @@ -1426,19 +1395,14 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, int chunk; struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); - if (skb==NULL) { + if (skb == NULL) { + DEFINE_WAIT(wait); int ret = 0; if (copied >= target) break; - /* The following code is a cut'n'paste of the - * wait_event_interruptible() macro. - * We don't us the macro because the test condition - * is messy. - Jean II */ - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - add_wait_queue(sk->sk_sleep, &waitq); - set_current_state(TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); /* * POSIX 1003.1g mandates this order. @@ -1451,17 +1415,17 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, else if (noblock) ret = -EAGAIN; else if (signal_pending(current)) - ret = -ERESTARTSYS; + ret = sock_intr_errno(timeo); + else if (sk->sk_state != TCP_ESTABLISHED) + ret = -ENOTCONN; else if (skb_peek(&sk->sk_receive_queue) == NULL) /* Wait process until data arrives */ schedule(); - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &waitq); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk->sk_sleep, &wait); - if(ret) - return(ret); + if (ret) + return ret; if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -1530,7 +1494,6 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, struct sock *sk = sock->sk; struct irda_sock *self; struct sk_buff *skb; - unsigned char *asmptr; int err; IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); @@ -1547,7 +1510,6 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, return -ENOTCONN; self = irda_sk(sk); - IRDA_ASSERT(self != NULL, return -1;); /* * Check that we don't send out too big frames. This is an unreliable @@ -1566,10 +1528,11 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, return -ENOBUFS; skb_reserve(skb, self->max_header_size); + skb_reset_transport_header(skb); IRDA_DEBUG(4, "%s(), appending user data\n", __FUNCTION__); - asmptr = skb->h.raw = skb_put(skb, len); - err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); if (err) { kfree_skb(skb); return err; @@ -1602,7 +1565,6 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, __u8 pid = 0; int bound = 0; struct sk_buff *skb; - unsigned char *asmptr; int err; IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); @@ -1616,7 +1578,6 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, } self = irda_sk(sk); - IRDA_ASSERT(self != NULL, return -1;); /* Check if an address was specified with sendto. 
Jean II */ if (msg->msg_name) { @@ -1662,10 +1623,11 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, return -ENOBUFS; skb_reserve(skb, self->max_header_size); + skb_reset_transport_header(skb); IRDA_DEBUG(4, "%s(), appending user data\n", __FUNCTION__); - asmptr = skb->h.raw = skb_put(skb, len); - err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); if (err) { kfree_skb(skb); return err; @@ -1689,8 +1651,6 @@ static int irda_shutdown(struct socket *sock, int how) struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - IRDA_ASSERT(self != NULL, return -1;); - IRDA_DEBUG(1, "%s(%p)\n", __FUNCTION__, self); sk->sk_state = TCP_CLOSE; @@ -1863,8 +1823,6 @@ static int irda_setsockopt(struct socket *sock, int level, int optname, struct ias_attrib * ias_attr; /* Attribute in IAS object */ int opt; - IRDA_ASSERT(self != NULL, return -1;); - IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); if (level != SOL_IRLMP) diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c index 01d7c9c7b3b..e5e4792a031 100644 --- a/net/irda/ircomm/ircomm_param.c +++ b/net/irda/ircomm/ircomm_param.c @@ -133,8 +133,8 @@ int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush) * Inserting is a little bit tricky since we don't know how much * room we will need. But this should hopefully work OK */ - count = irda_param_insert(self, pi, skb->tail, skb_tailroom(skb), - &ircomm_param_info); + count = irda_param_insert(self, pi, skb_tail_pointer(skb), + skb_tailroom(skb), &ircomm_param_info); if (count < 0) { IRDA_WARNING("%s(), no room for parameter!\n", __FUNCTION__); spin_unlock_irqrestore(&self->spinlock, flags); diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index e717801b38f..7b5def1ea63 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -375,7 +375,7 @@ EXPORT_SYMBOL(alloc_irdadev); dongle_t *irda_device_dongle_init(struct net_device *dev, int type) { struct dongle_reg *reg; - dongle_t *dongle = NULL; + dongle_t *dongle = kzalloc(sizeof(dongle_t), GFP_KERNEL); might_sleep(); @@ -397,19 +397,14 @@ dongle_t *irda_device_dongle_init(struct net_device *dev, int type) if (!reg || !try_module_get(reg->owner) ) { IRDA_ERROR("IrDA: Unable to find requested dongle type %x\n", type); - goto out; + kfree(dongle); + dongle = NULL; + } + if (dongle) { + /* Bind the registration info to this particular instance */ + dongle->issue = reg; + dongle->dev = dev; } - - /* Allocate dongle info for this instance */ - dongle = kzalloc(sizeof(dongle_t), GFP_KERNEL); - if (!dongle) - goto out; - - /* Bind the registration info to this particular instance */ - dongle->issue = reg; - dongle->dev = dev; - - out: spin_unlock(&dongles->hb_spinlock); return dongle; } diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index fcf9d659962..ed69773b0f8 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -1039,7 +1039,7 @@ static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, } /* Insert at end of sk-buffer */ - frame = skb->tail; + frame = skb_tail_pointer(skb); /* Make space for data */ if (skb_tailroom(skb) < (param_len+value_len+3)) { diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 672ab3f6903..c421521c0a9 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -234,8 +234,7 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff 
*skb) * might have been previously set by the low level IrDA network * device driver */ - skb->dev = self->dev; - skb->protocol=eth_type_trans(skb, skb->dev); /* Remove eth header */ + skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */ self->stats.rx_packets++; self->stats.rx_bytes += skb->len; diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c index 7b6433fe1dc..0b02073ffdf 100644 --- a/net/irda/irlap_event.c +++ b/net/irda/irlap_event.c @@ -590,7 +590,7 @@ static int irlap_state_query(struct irlap_cb *self, IRLAP_EVENT event, if (!self->discovery_log) { IRDA_WARNING("%s: discovery log is gone! " "maybe the discovery timeout has been set" - " to short?\n", __FUNCTION__); + " too short?\n", __FUNCTION__); break; } hashbin_insert(self->discovery_log, diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 0b04603e9c4..3c5a68e3641 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -93,7 +93,9 @@ void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb) { /* Some common init stuff */ skb->dev = self->netdev; - skb->h.raw = skb->nh.raw = skb->mac.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_IRDA); skb->priority = TC_PRIO_BESTEFFORT; @@ -411,7 +413,7 @@ static void irlap_recv_discovery_xid_rsp(struct irlap_cb *self, IRDA_ASSERT(self->magic == LAP_MAGIC, return;); if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { - IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + IRDA_ERROR("%s: frame too short!\n", __FUNCTION__); return; } @@ -482,7 +484,7 @@ static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, char *text; if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { - IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + IRDA_ERROR("%s: frame too short!\n", __FUNCTION__); return; } @@ -526,7 +528,7 @@ static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, /* Check if things are sane at this point... 
*/ if((discovery_info == NULL) || !pskb_may_pull(skb, 3)) { - IRDA_ERROR("%s: discovery frame to short!\n", + IRDA_ERROR("%s: discovery frame too short!\n", __FUNCTION__); return; } @@ -1171,7 +1173,7 @@ static void irlap_recv_frmr_frame(struct irlap_cb *self, struct sk_buff *skb, IRDA_ASSERT(info != NULL, return;); if (!pskb_may_pull(skb, 4)) { - IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + IRDA_ERROR("%s: frame too short!\n", __FUNCTION__); return; } @@ -1260,7 +1262,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, IRDA_DEBUG(2, "%s()\n", __FUNCTION__); if (!pskb_may_pull(skb, sizeof(*frame))) { - IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + IRDA_ERROR("%s: frame too short!\n", __FUNCTION__); return; } frame = (struct test_frame *) skb->data; @@ -1268,7 +1270,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, /* Broadcast frames must carry saddr and daddr fields */ if (info->caddr == CBROADCAST) { if (skb->len < sizeof(struct test_frame)) { - IRDA_DEBUG(0, "%s() test frame to short!\n", + IRDA_DEBUG(0, "%s() test frame too short!\n", __FUNCTION__); return; } @@ -1334,7 +1336,7 @@ int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, /* Check if frame is large enough for parsing */ if (!pskb_may_pull(skb, 2)) { - IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + IRDA_ERROR("%s: frame too short!\n", __FUNCTION__); dev_kfree_skb(skb); return -1; } diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c index 92662330dbc..d058b467f9e 100644 --- a/net/irda/irqueue.c +++ b/net/irda/irqueue.c @@ -384,6 +384,9 @@ EXPORT_SYMBOL(hashbin_new); * for deallocating this structure if it's complex. If not the user can * just supply kfree, which should take care of the job. */ +#ifdef CONFIG_LOCKDEP +static int hashbin_lock_depth = 0; +#endif int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) { irda_queue_t* queue; @@ -395,7 +398,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) /* Synchronize */ if ( hashbin->hb_type & HB_LOCK ) { - spin_lock_irqsave(&hashbin->hb_spinlock, flags); + spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags, + hashbin_lock_depth++); } /* @@ -419,6 +423,9 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) /* Release lock */ if ( hashbin->hb_type & HB_LOCK) { spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); +#ifdef CONFIG_LOCKDEP + hashbin_lock_depth--; +#endif } /* diff --git a/net/irda/irttp.c b/net/irda/irttp.c index da3f2bc1b6f..7069e4a5825 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -256,7 +256,7 @@ static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self) * Copy all fragments to a new buffer */ while ((frag = skb_dequeue(&self->rx_fragments)) != NULL) { - memcpy(skb->data+n, frag->data, frag->len); + skb_copy_to_linear_data_offset(skb, n, frag->data, frag->len); n += frag->len; dev_kfree_skb(frag); @@ -314,8 +314,8 @@ static inline void irttp_fragment_skb(struct tsap_cb *self, skb_reserve(frag, self->max_header_size); /* Copy data from the original skb into this fragment. 
*/ - memcpy(skb_put(frag, self->max_seg_size), skb->data, - self->max_seg_size); + skb_copy_from_linear_data(skb, skb_put(frag, self->max_seg_size), + self->max_seg_size); /* Insert TTP header, with the more bit set */ frame = skb_push(frag, TTP_HEADER); @@ -551,7 +551,7 @@ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) } if (skb->len > self->max_seg_size) { - IRDA_DEBUG(1, "%s(), UData is to large for IrLAP!\n", + IRDA_DEBUG(1, "%s(), UData is too large for IrLAP!\n", __FUNCTION__); goto err; } @@ -598,7 +598,7 @@ int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb) * inside an IrLAP frame */ if ((self->tx_max_sdu_size == 0) && (skb->len > self->max_seg_size)) { - IRDA_ERROR("%s: SAR disabled, and data is to large for IrLAP!\n", + IRDA_ERROR("%s: SAR disabled, and data is too large for IrLAP!\n", __FUNCTION__); ret = -EMSGSIZE; goto err; diff --git a/net/irda/parameters.c b/net/irda/parameters.c index 75a72d203b0..2627dad7cd8 100644 --- a/net/irda/parameters.c +++ b/net/irda/parameters.c @@ -160,7 +160,7 @@ static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi, } /* Check if buffer is long enough for insertion */ if (len < (2+p.pl)) { - IRDA_WARNING("%s: buffer to short for insertion!\n", + IRDA_WARNING("%s: buffer too short for insertion!\n", __FUNCTION__); return -1; } @@ -216,7 +216,7 @@ static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi, /* Check if buffer is long enough for parsing */ if (len < (2+p.pl)) { - IRDA_WARNING("%s: buffer to short for parsing! " + IRDA_WARNING("%s: buffer too short for parsing! " "Need %d bytes, but len is only %d\n", __FUNCTION__, p.pl, len); return -1; @@ -304,7 +304,7 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, /* Check if buffer is long enough for parsing */ if (len < (2+p.pl)) { - IRDA_WARNING("%s: buffer to short for parsing! " + IRDA_WARNING("%s: buffer too short for parsing! " "Need %d bytes, but len is only %d\n", __FUNCTION__, p.pl, len); return -1; @@ -343,7 +343,7 @@ static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi, /* Check if buffer is long enough for parsing */ if (len < (2+p.pl)) { - IRDA_WARNING("%s: buffer to short for parsing! " + IRDA_WARNING("%s: buffer too short for parsing! 
" "Need %d bytes, but len is only %d\n", __FUNCTION__, p.pl, len); return -1; diff --git a/net/irda/qos.c b/net/irda/qos.c index 349012c926b..aeb18cf1dca 100644 --- a/net/irda/qos.c +++ b/net/irda/qos.c @@ -469,49 +469,49 @@ int irlap_insert_qos_negotiation_params(struct irlap_cb *self, int ret; /* Insert data rate */ - ret = irda_param_insert(self, PI_BAUD_RATE, skb->tail, + ret = irda_param_insert(self, PI_BAUD_RATE, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert max turnaround time */ - ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb->tail, + ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert data size */ - ret = irda_param_insert(self, PI_DATA_SIZE, skb->tail, + ret = irda_param_insert(self, PI_DATA_SIZE, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert window size */ - ret = irda_param_insert(self, PI_WINDOW_SIZE, skb->tail, + ret = irda_param_insert(self, PI_WINDOW_SIZE, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert additional BOFs */ - ret = irda_param_insert(self, PI_ADD_BOFS, skb->tail, + ret = irda_param_insert(self, PI_ADD_BOFS, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert minimum turnaround time */ - ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb->tail, + ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; skb_put(skb, ret); /* Insert link disconnect/threshold time */ - ret = irda_param_insert(self, PI_LINK_DISC, skb->tail, + ret = irda_param_insert(self, PI_LINK_DISC, skb_tail_pointer(skb), skb_tailroom(skb), &irlap_param_info); if (ret < 0) return ret; diff --git a/net/irda/wrapper.c b/net/irda/wrapper.c index 5abfb71aae8..a7a7f191f1a 100644 --- a/net/irda/wrapper.c +++ b/net/irda/wrapper.c @@ -239,7 +239,8 @@ async_bump(struct net_device *dev, if(docopy) { /* Copy data without CRC (lenght already checked) */ - memcpy(newskb->data, rx_buff->data, rx_buff->len - 2); + skb_copy_to_linear_data(newskb, rx_buff->data, + rx_buff->len - 2); /* Deliver this skb */ dataskb = newskb; } else { @@ -256,7 +257,7 @@ async_bump(struct net_device *dev, /* Feed it to IrLAP layer */ dataskb->dev = dev; - dataskb->mac.raw = dataskb->data; + skb_reset_mac_header(dataskb); dataskb->protocol = htons(ETH_P_IRDA); netif_rx(dataskb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index acc94214bde..2f1373855a8 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -45,7 +45,8 @@ static struct proto iucv_proto = { static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]); -static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8], u8 ipuser[16]); +static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8], + u8 ipuser[16]); static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]); static struct iucv_sock_list iucv_sk_list = { @@ -147,11 +148,12 @@ static void iucv_sock_close(struct sock *sk) unsigned char user_data[16]; struct iucv_sock *iucv = iucv_sk(sk); int err; + unsigned long timeo; iucv_sock_clear_timer(sk); lock_sock(sk); - 
switch(sk->sk_state) { + switch (sk->sk_state) { case IUCV_LISTEN: iucv_sock_cleanup_listen(sk); break; @@ -159,6 +161,21 @@ static void iucv_sock_close(struct sock *sk) case IUCV_CONNECTED: case IUCV_DISCONN: err = 0; + + sk->sk_state = IUCV_CLOSING; + sk->sk_state_change(sk); + + if (!skb_queue_empty(&iucv->send_skb_q)) { + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + timeo = sk->sk_lingertime; + else + timeo = IUCV_DISCONN_TIMEOUT; + err = iucv_sock_wait_state(sk, IUCV_CLOSED, 0, timeo); + } + + sk->sk_state = IUCV_CLOSED; + sk->sk_state_change(sk); + if (iucv->path) { low_nmcpy(user_data, iucv->src_name); high_nmcpy(user_data, iucv->dst_name); @@ -168,12 +185,11 @@ static void iucv_sock_close(struct sock *sk) iucv->path = NULL; } - sk->sk_state = IUCV_CLOSED; - sk->sk_state_change(sk); sk->sk_err = ECONNRESET; sk->sk_state_change(sk); skb_queue_purge(&iucv->send_skb_q); + skb_queue_purge(&iucv->backlog_skb_q); sock_set_flag(sk, SOCK_ZAPPED); break; @@ -181,7 +197,7 @@ static void iucv_sock_close(struct sock *sk) default: sock_set_flag(sk, SOCK_ZAPPED); break; - }; + } release_sock(sk); iucv_sock_kill(sk); @@ -204,6 +220,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) sock_init_data(sock, sk); INIT_LIST_HEAD(&iucv_sk(sk)->accept_q); skb_queue_head_init(&iucv_sk(sk)->send_skb_q); + skb_queue_head_init(&iucv_sk(sk)->backlog_skb_q); iucv_sk(sk)->send_tag = 0; sk->sk_destruct = iucv_sock_destruct; @@ -276,7 +293,7 @@ struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) struct iucv_sock *isk, *n; struct sock *sk; - list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q){ + list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) { sk = (struct sock *) isk; lock_sock(sk); @@ -510,7 +527,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, long timeo; int err = 0; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != IUCV_LISTEN) { err = -EBADFD; @@ -521,7 +538,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, /* Wait for an incoming connection */ add_wait_queue_exclusive(sk->sk_sleep, &wait); - while (!(nsk = iucv_accept_dequeue(sk, newsock))){ + while (!(nsk = iucv_accept_dequeue(sk, newsock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { err = -EAGAIN; @@ -530,7 +547,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, release_sock(sk); timeo = schedule_timeout(timeo); - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != IUCV_LISTEN) { err = -EBADFD; @@ -602,13 +619,13 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, goto out; } - if (sk->sk_state == IUCV_CONNECTED){ - if(!(skb = sock_alloc_send_skb(sk, len, - msg->msg_flags & MSG_DONTWAIT, - &err))) - return err; + if (sk->sk_state == IUCV_CONNECTED) { + if (!(skb = sock_alloc_send_skb(sk, len, + msg->msg_flags & MSG_DONTWAIT, + &err))) + goto out; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)){ + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { err = -EFAULT; goto fail; } @@ -647,10 +664,16 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); int target, copied = 0; - struct sk_buff *skb; + struct sk_buff *skb, *rskb, *cskb; int err = 0; + if ((sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED) && + 
skb_queue_empty(&iucv->backlog_skb_q) && + skb_queue_empty(&sk->sk_receive_queue)) + return 0; + if (flags & (MSG_OOB)) return -EOPNOTSUPP; @@ -665,10 +688,12 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, skb->len, len); - if (memcpy_toiovec(msg->msg_iov, skb->data, copied)) { + cskb = skb; + if (memcpy_toiovec(msg->msg_iov, cskb->data, copied)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) return -EFAULT; + goto done; } len -= copied; @@ -683,6 +708,18 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } kfree_skb(skb); + + /* Queue backlog skbs */ + rskb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); + while (rskb) { + if (sock_queue_rcv_skb(sk, rskb)) { + skb_queue_head(&iucv_sk(sk)->backlog_skb_q, + rskb); + break; + } else { + rskb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); + } + } } else skb_queue_head(&sk->sk_receive_queue, skb); @@ -695,7 +732,7 @@ static inline unsigned int iucv_accept_poll(struct sock *parent) struct iucv_sock *isk, *n; struct sock *sk; - list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q){ + list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) { sk = (struct sock *) isk; if (sk->sk_state == IUCV_CONNECTED) @@ -726,12 +763,15 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, mask |= POLLHUP; if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= POLLIN | POLLRDNORM; if (sk->sk_state == IUCV_CLOSED) mask |= POLLHUP; + if (sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED) + mask |= POLLIN; + if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else @@ -754,7 +794,7 @@ static int iucv_sock_shutdown(struct socket *sock, int how) return -EINVAL; lock_sock(sk); - switch(sk->sk_state) { + switch (sk->sk_state) { case IUCV_CLOSED: err = -ENOTCONN; goto fail; @@ -770,7 +810,7 @@ static int iucv_sock_shutdown(struct socket *sock, int how) err = iucv_message_send(iucv->path, &txmsg, IUCV_IPRMDATA, 0, (void *) prmmsg, 8); if (err) { - switch(err) { + switch (err) { case 1: err = -ENOTCONN; break; @@ -817,13 +857,6 @@ static int iucv_sock_release(struct socket *sock) iucv_sk(sk)->path = NULL; } - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime){ - lock_sock(sk); - err = iucv_sock_wait_state(sk, IUCV_CLOSED, 0, - sk->sk_lingertime); - release_sock(sk); - } - sock_orphan(sk); iucv_sock_kill(sk); return err; @@ -880,7 +913,7 @@ static int iucv_callback_connreq(struct iucv_path *path, /* Create the new socket */ nsk = iucv_sock_alloc(NULL, SOCK_STREAM, GFP_ATOMIC); - if (!nsk){ + if (!nsk) { err = iucv_path_sever(path, user_data); goto fail; } @@ -903,7 +936,7 @@ static int iucv_callback_connreq(struct iucv_path *path, path->msglim = IUCV_QUEUELEN_DEFAULT; err = iucv_path_accept(path, &af_iucv_handler, nuser_data, nsk); - if (err){ + if (err) { err = iucv_path_sever(path, user_data); goto fail; } @@ -927,18 +960,53 @@ static void iucv_callback_connack(struct iucv_path *path, u8 ipuser[16]) sk->sk_state_change(sk); } +static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len, + struct sk_buff_head fragmented_skb_q) +{ + int dataleft, size, copied = 0; + struct sk_buff *nskb; + + dataleft = len; + while (dataleft) { + if (dataleft >= sk->sk_rcvbuf / 4) + size = sk->sk_rcvbuf / 4; + else + size = dataleft; + + nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA); + if (!nskb) + return -ENOMEM; + + memcpy(nskb->data, skb->data 
+ copied, size); + copied += size; + dataleft -= size; + + nskb->h.raw = nskb->data; + nskb->nh.raw = nskb->data; + nskb->len = size; + + skb_queue_tail(fragmented_skb_q, nskb); + } + + return 0; +} + static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) { struct sock *sk = path->private; - struct sk_buff *skb; + struct iucv_sock *iucv = iucv_sk(sk); + struct sk_buff *skb, *fskb; + struct sk_buff_head fragmented_skb_q; int rc; + skb_queue_head_init(&fragmented_skb_q); + if (sk->sk_shutdown & RCV_SHUTDOWN) return; skb = alloc_skb(msg->length, GFP_ATOMIC | GFP_DMA); if (!skb) { - iucv_message_reject(path, msg); + iucv_path_sever(path, NULL); return; } @@ -952,14 +1020,39 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) kfree_skb(skb); return; } + if (skb->truesize >= sk->sk_rcvbuf / 4) { + rc = iucv_fragment_skb(sk, skb, msg->length, + &fragmented_skb_q); + kfree_skb(skb); + skb = NULL; + if (rc) { + iucv_path_sever(path, NULL); + return; + } + } else { + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb->len = msg->length; + } + } + /* Queue the fragmented skb */ + fskb = skb_dequeue(&fragmented_skb_q); + while (fskb) { + if (!skb_queue_empty(&iucv->backlog_skb_q)) + skb_queue_tail(&iucv->backlog_skb_q, fskb); + else if (sock_queue_rcv_skb(sk, fskb)) + skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, fskb); + fskb = skb_dequeue(&fragmented_skb_q); + } - skb->h.raw = skb->data; - skb->nh.raw = skb->data; - skb->len = msg->length; + /* Queue the original skb if it exists (was not fragmented) */ + if (skb) { + if (!skb_queue_empty(&iucv->backlog_skb_q)) + skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb); + else if (sock_queue_rcv_skb(sk, skb)) + skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb); } - if (sock_queue_rcv_skb(sk, skb)) - kfree_skb(skb); } static void iucv_callback_txdone(struct iucv_path *path, @@ -971,17 +1064,27 @@ static void iucv_callback_txdone(struct iucv_path *path, struct sk_buff *list_skb = list->next; unsigned long flags; - spin_lock_irqsave(&list->lock, flags); + if (list_skb) { + spin_lock_irqsave(&list->lock, flags); + + do { + this = list_skb; + list_skb = list_skb->next; + } while (memcmp(&msg->tag, this->cb, 4) && list_skb); + + spin_unlock_irqrestore(&list->lock, flags); - do { - this = list_skb; - list_skb = list_skb->next; - } while (memcmp(&msg->tag, this->cb, 4)); + skb_unlink(this, &iucv_sk(sk)->send_skb_q); + kfree_skb(this); + } - spin_unlock_irqrestore(&list->lock, flags); + if (sk->sk_state == IUCV_CLOSING) { + if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + sk->sk_state = IUCV_CLOSED; + sk->sk_state_change(sk); + } + } - skb_unlink(this, &iucv_sk(sk)->send_skb_q); - kfree_skb(this); } static void iucv_callback_connrej(struct iucv_path *path, u8 ipuser[16]) @@ -1022,7 +1125,7 @@ static struct net_proto_family iucv_sock_family_ops = { .create = iucv_sock_create, }; -static int afiucv_init(void) +static int __init afiucv_init(void) { int err; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 1b10d576f22..fb3faf72e85 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -32,7 +32,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> - #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -69,7 +68,7 @@ #define IUCV_IPNORPY 0x10 #define IUCV_IPALL 0x80 -static int iucv_bus_match (struct device *dev, struct device_driver *drv) +static int iucv_bus_match(struct device *dev, struct device_driver *drv) { return 0; } @@ -78,8 +77,11 @@ struct bus_type 
iucv_bus = { .name = "iucv", .match = iucv_bus_match, }; +EXPORT_SYMBOL(iucv_bus); struct device *iucv_root; +EXPORT_SYMBOL(iucv_root); + static int iucv_available; /* General IUCV interrupt structure */ @@ -90,20 +92,43 @@ struct iucv_irq_data { u32 res2[8]; }; -struct iucv_work { +struct iucv_irq_list { struct list_head list; struct iucv_irq_data data; }; -static LIST_HEAD(iucv_work_queue); -static DEFINE_SPINLOCK(iucv_work_lock); - static struct iucv_irq_data *iucv_irq_data; static cpumask_t iucv_buffer_cpumask = CPU_MASK_NONE; static cpumask_t iucv_irq_cpumask = CPU_MASK_NONE; -static void iucv_tasklet_handler(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_handler,0); +/* + * Queue of interrupt buffers lock for delivery via the tasklet + * (fast but can't call smp_call_function). + */ +static LIST_HEAD(iucv_task_queue); + +/* + * The tasklet for fast delivery of iucv interrupts. + */ +static void iucv_tasklet_fn(unsigned long); +static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); + +/* + * Queue of interrupt buffers for delivery via a work queue + * (slower but can call smp_call_function). + */ +static LIST_HEAD(iucv_work_queue); + +/* + * The work element to deliver path pending interrupts. + */ +static void iucv_work_fn(struct work_struct *work); +static DECLARE_WORK(iucv_work, iucv_work_fn); + +/* + * Spinlock protecting task and work queue. + */ +static DEFINE_SPINLOCK(iucv_queue_lock); enum iucv_command_codes { IUCV_QUERY = 0, @@ -147,10 +172,10 @@ static unsigned long iucv_max_pathid; static DEFINE_SPINLOCK(iucv_table_lock); /* - * iucv_tasklet_cpu: contains the number of the cpu executing the tasklet. - * Needed for iucv_path_sever called from tasklet. + * iucv_active_cpu: contains the number of the cpu executing the tasklet + * or the work handler. Needed for iucv_path_sever called from tasklet. */ -static int iucv_tasklet_cpu = -1; +static int iucv_active_cpu = -1; /* * Mutex and wait queue for iucv_register/iucv_unregister. @@ -382,7 +407,7 @@ static void iucv_declare_cpu(void *data) rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, parm); if (rc) { char *err = "Unknown"; - switch(rc) { + switch (rc) { case 0x03: err = "Directory error"; break; @@ -449,17 +474,19 @@ static void iucv_setmask_mp(void) { int cpu; + preempt_disable(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_on(iucv_allow_cpu, NULL, 0, 1, cpu); + preempt_enable(); } /** * iucv_setmask_up * - * Allow iucv interrupts on a single cpus. + * Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -493,8 +520,10 @@ static int iucv_enable(void) goto out; /* Declare per cpu buffers. */ rc = -EIO; + preempt_disable(); for_each_online_cpu(cpu) smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu); + preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. 
*/ goto out_path; @@ -519,7 +548,6 @@ static void iucv_disable(void) kfree(iucv_path_table); } -#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit iucv_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -562,10 +590,9 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block iucv_cpu_notifier = { +static struct notifier_block __cpuinitdata iucv_cpu_notifier = { .notifier_call = iucv_cpu_notify, }; -#endif /** * iucv_sever_pathid @@ -586,48 +613,49 @@ static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) return iucv_call_b2f0(IUCV_SEVER, parm); } +#ifdef CONFIG_SMP /** - * __iucv_cleanup_pathid + * __iucv_cleanup_queue * @dummy: unused dummy argument * * Nop function called via smp_call_function to force work items from * pending external iucv interrupts to the work queue. */ -static void __iucv_cleanup_pathid(void *dummy) +static void __iucv_cleanup_queue(void *dummy) { } +#endif /** - * iucv_cleanup_pathid - * @pathid: 16 bit pathid + * iucv_cleanup_queue * * Function called after a path has been severed to find all remaining * work items for the now stale pathid. The caller needs to hold the * iucv_table_lock. */ -static void iucv_cleanup_pathid(u16 pathid) +static void iucv_cleanup_queue(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; /* - * Path is severed, the pathid can be reused immediatly on - * a iucv connect or a connection pending interrupt. - * iucv_path_connect and connection pending interrupt will - * wait until the iucv_table_lock is released before the - * recycled pathid enters the system. - * Force remaining interrupts to the work queue, then - * scan the work queue for items of this path. + * When a path is severed, the pathid can be reused immediatly + * on a iucv connect or a connection pending interrupt. Remove + * all entries from the task queue that refer to a stale pathid + * (iucv_path_table[ix] == NULL). Only then do the iucv connect + * or deliver the connection pending interrupt. To get all the + * pending interrupts force them to the work queue by calling + * an empty function on all cpus. */ - smp_call_function(__iucv_cleanup_pathid, NULL, 0, 1); - spin_lock_irq(&iucv_work_lock); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - /* Remove work items for pathid except connection pending */ - if (p->data.ippathid == pathid && p->data.iptype != 0x01) { + smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) { + /* Remove stale work items from the task queue. 
*/ + if (iucv_path_table[p->data.ippathid] == NULL) { list_del(&p->list); kfree(p); } } - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); } /** @@ -665,6 +693,7 @@ out_mutex: mutex_unlock(&iucv_register_mutex); return rc; } +EXPORT_SYMBOL(iucv_register); /** * iucv_unregister @@ -686,7 +715,6 @@ void iucv_unregister(struct iucv_handler *handler, int smp) iucv_sever_pathid(p->pathid, NULL); iucv_path_table[p->pathid] = NULL; list_del(&p->list); - iucv_cleanup_pathid(p->pathid); iucv_path_free(p); } spin_unlock_bh(&iucv_table_lock); @@ -698,6 +726,7 @@ void iucv_unregister(struct iucv_handler *handler, int smp) iucv_setmask_mp(); mutex_unlock(&iucv_register_mutex); } +EXPORT_SYMBOL(iucv_unregister); /** * iucv_path_accept @@ -736,6 +765,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_path_accept); /** * iucv_path_connect @@ -759,9 +789,9 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, union iucv_param *parm; int rc; - preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) - spin_lock_bh(&iucv_table_lock); + BUG_ON(in_atomic()); + spin_lock_bh(&iucv_table_lock); + iucv_cleanup_queue(); parm = percpu_ptr(iucv_param, smp_processor_id()); memset(parm, 0, sizeof(union iucv_param)); parm->ctrl.ipmsglim = path->msglim; @@ -796,11 +826,10 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, rc = -EIO; } } - if (iucv_tasklet_cpu != smp_processor_id()) - spin_unlock_bh(&iucv_table_lock); - preempt_enable(); + spin_unlock_bh(&iucv_table_lock); return rc; } +EXPORT_SYMBOL(iucv_path_connect); /** * iucv_path_quiesce: @@ -827,6 +856,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16]) local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_path_quiesce); /** * iucv_path_resume: @@ -867,21 +897,20 @@ int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) { int rc; - preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_lock_bh(&iucv_table_lock); rc = iucv_sever_pathid(path->pathid, userdata); if (!rc) { iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); } - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_unlock_bh(&iucv_table_lock); preempt_enable(); return rc; } +EXPORT_SYMBOL(iucv_path_sever); /** * iucv_message_purge @@ -914,6 +943,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_message_purge); /** * iucv_message_receive @@ -984,6 +1014,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_message_receive); /** * iucv_message_reject @@ -1012,6 +1043,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg) local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_message_reject); /** * iucv_message_reply @@ -1055,6 +1087,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_message_reply); /** * iucv_message_send @@ -1103,6 +1136,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, local_bh_enable(); return rc; } +EXPORT_SYMBOL(iucv_message_send); /** * iucv_message_send2way @@ -1159,6 +1193,7 @@ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, local_bh_enable(); 
return rc; } +EXPORT_SYMBOL(iucv_message_send2way); /** * iucv_path_pending @@ -1246,8 +1281,7 @@ static void iucv_path_complete(struct iucv_irq_data *data) struct iucv_path_complete *ipc = (void *) data; struct iucv_path *path = iucv_path_table[ipc->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_complete) + if (path && path->handler && path->handler->path_complete) path->handler->path_complete(path, ipc->ipuser); } @@ -1275,14 +1309,14 @@ static void iucv_path_severed(struct iucv_irq_data *data) struct iucv_path_severed *ips = (void *) data; struct iucv_path *path = iucv_path_table[ips->ippathid]; - BUG_ON(!path || !path->handler); + if (!path || !path->handler) /* Already severed */ + return; if (path->handler->path_severed) path->handler->path_severed(path, ips->ipuser); else { iucv_sever_pathid(path->pathid, NULL); iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); iucv_path_free(path); } } @@ -1311,8 +1345,7 @@ static void iucv_path_quiesced(struct iucv_irq_data *data) struct iucv_path_quiesced *ipq = (void *) data; struct iucv_path *path = iucv_path_table[ipq->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_quiesced) + if (path && path->handler && path->handler->path_quiesced) path->handler->path_quiesced(path, ipq->ipuser); } @@ -1340,8 +1373,7 @@ static void iucv_path_resumed(struct iucv_irq_data *data) struct iucv_path_resumed *ipr = (void *) data; struct iucv_path *path = iucv_path_table[ipr->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_resumed) + if (path && path->handler && path->handler->path_resumed) path->handler->path_resumed(path, ipr->ipuser); } @@ -1373,8 +1405,7 @@ static void iucv_message_complete(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imc->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_complete) { + if (path && path->handler && path->handler->message_complete) { msg.flags = imc->ipflags1; msg.id = imc->ipmsgid; msg.audit = imc->ipaudit; @@ -1419,8 +1450,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imp->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_pending) { + if (path && path->handler && path->handler->message_pending) { msg.flags = imp->ipflags1; msg.id = imp->ipmsgid; msg.class = imp->iptrgcls; @@ -1435,17 +1465,16 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /** - * iucv_tasklet_handler: + * iucv_tasklet_fn: * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler * and then frees the buffer. */ -static void iucv_tasklet_handler(unsigned long ignored) +static void iucv_tasklet_fn(unsigned long ignored) { typedef void iucv_irq_fn(struct iucv_irq_data *); static iucv_irq_fn *irq_fn[] = { - [0x01] = iucv_path_pending, [0x02] = iucv_path_complete, [0x03] = iucv_path_severed, [0x04] = iucv_path_quiesced, @@ -1455,38 +1484,70 @@ static void iucv_tasklet_handler(unsigned long ignored) [0x08] = iucv_message_pending, [0x09] = iucv_message_pending, }; - struct iucv_work *p; + struct list_head task_queue = LIST_HEAD_INIT(task_queue); + struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. 
*/ spin_lock(&iucv_table_lock); - iucv_tasklet_cpu = smp_processor_id(); + iucv_active_cpu = smp_processor_id(); - spin_lock_irq(&iucv_work_lock); - while (!list_empty(&iucv_work_queue)) { - p = list_entry(iucv_work_queue.next, struct iucv_work, list); + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_task_queue, &task_queue); + spin_unlock_irq(&iucv_queue_lock); + + list_for_each_entry_safe(p, n, &task_queue, list) { list_del_init(&p->list); - spin_unlock_irq(&iucv_work_lock); irq_fn[p->data.iptype](&p->data); kfree(p); - spin_lock_irq(&iucv_work_lock); } - spin_unlock_irq(&iucv_work_lock); - iucv_tasklet_cpu = -1; + iucv_active_cpu = -1; spin_unlock(&iucv_table_lock); } /** + * iucv_work_fn: + * + * This work function loops over the queue of path pending irq blocks + * created by iucv_external_interrupt, calls the appropriate action + * handler and then frees the buffer. + */ +static void iucv_work_fn(struct work_struct *work) +{ + typedef void iucv_irq_fn(struct iucv_irq_data *); + struct list_head work_queue = LIST_HEAD_INIT(work_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ + spin_lock_bh(&iucv_table_lock); + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_work_queue, &work_queue); + spin_unlock_irq(&iucv_queue_lock); + + iucv_cleanup_queue(); + list_for_each_entry_safe(p, n, &work_queue, list) { + list_del_init(&p->list); + iucv_path_pending(&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock_bh(&iucv_table_lock); +} + +/** * iucv_external_interrupt * @code: irq code * * Handles external interrupts coming in from CP. - * Places the interrupt buffer on a queue and schedules iucv_tasklet_handler(). + * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ static void iucv_external_interrupt(u16 code) { struct iucv_irq_data *p; - struct iucv_work *work; + struct iucv_irq_list *work; p = percpu_ptr(iucv_irq_data, smp_processor_id()); if (p->ippathid >= iucv_max_pathid) { @@ -1500,16 +1561,23 @@ static void iucv_external_interrupt(u16 code) printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n"); return; } - work = kmalloc(sizeof(struct iucv_work), GFP_ATOMIC); + work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { printk(KERN_WARNING "iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); - spin_lock(&iucv_work_lock); - list_add_tail(&work->list, &iucv_work_queue); - spin_unlock(&iucv_work_lock); - tasklet_schedule(&iucv_tasklet); + spin_lock(&iucv_queue_lock); + if (p->iptype == 0x01) { + /* Path pending interrupt. */ + list_add_tail(&work->list, &iucv_work_queue); + schedule_work(&iucv_work); + } else { + /* The other interrupts. */ + list_add_tail(&work->list, &iucv_task_queue); + tasklet_schedule(&iucv_tasklet); + } + spin_unlock(&iucv_queue_lock); } /** @@ -1517,7 +1585,7 @@ static void iucv_external_interrupt(u16 code) * * Allocates and initializes various data structures. 
*/ -static int iucv_init(void) +static int __init iucv_init(void) { int rc; @@ -1528,7 +1596,7 @@ static int iucv_init(void) rc = iucv_query_maxconn(); if (rc) goto out; - rc = register_external_interrupt (0x4000, iucv_external_interrupt); + rc = register_external_interrupt(0x4000, iucv_external_interrupt); if (rc) goto out; rc = bus_register(&iucv_bus); @@ -1539,7 +1607,7 @@ static int iucv_init(void) rc = PTR_ERR(iucv_root); goto out_bus; } - /* Note: GFP_DMA used used to get memory below 2G */ + /* Note: GFP_DMA used to get memory below 2G */ iucv_irq_data = percpu_alloc(sizeof(struct iucv_irq_data), GFP_KERNEL|GFP_DMA); if (!iucv_irq_data) { @@ -1577,14 +1645,16 @@ out: * * Frees everything allocated from iucv_init. */ -static void iucv_exit(void) +static void __exit iucv_exit(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; - spin_lock_irq(&iucv_work_lock); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) + kfree(p); list_for_each_entry_safe(p, n, &iucv_work_queue, list) kfree(p); - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); unregister_hotcpu_notifier(&iucv_cpu_notifier); percpu_free(iucv_param); percpu_free(iucv_irq_data); @@ -1596,24 +1666,6 @@ static void iucv_exit(void) subsys_initcall(iucv_init); module_exit(iucv_exit); -/** - * Export all public stuff - */ -EXPORT_SYMBOL (iucv_bus); -EXPORT_SYMBOL (iucv_root); -EXPORT_SYMBOL (iucv_register); -EXPORT_SYMBOL (iucv_unregister); -EXPORT_SYMBOL (iucv_path_accept); -EXPORT_SYMBOL (iucv_path_connect); -EXPORT_SYMBOL (iucv_path_quiesce); -EXPORT_SYMBOL (iucv_path_sever); -EXPORT_SYMBOL (iucv_message_purge); -EXPORT_SYMBOL (iucv_message_receive); -EXPORT_SYMBOL (iucv_message_reject); -EXPORT_SYMBOL (iucv_message_reply); -EXPORT_SYMBOL (iucv_message_send); -EXPORT_SYMBOL (iucv_message_send2way); - MODULE_AUTHOR("(C) 2001 IBM Corp. 
by Fritz Elfert (felfert@millenux.com)"); MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver"); MODULE_LICENSE("GPL"); diff --git a/net/key/af_key.c b/net/key/af_key.c index 345019345f0..a99444142dc 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -379,7 +379,7 @@ static int verify_address_len(void *p) */ return -EINVAL; break; - }; + } return 0; } @@ -3667,7 +3667,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, copied = len; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (err) goto out_free; diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index d12413cff5b..d4b13a031fd 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -160,8 +160,14 @@ static struct packet_type llc_tr_packet_type = { static int __init llc_init(void) { - if (dev_base->next) - memcpy(llc_station_mac_sa, dev_base->next->dev_addr, ETH_ALEN); + struct net_device *dev; + + dev = first_net_device(); + if (dev != NULL) + dev = next_net_device(dev); + + if (dev != NULL) + memcpy(llc_station_mac_sa, dev->dev_addr, ETH_ALEN); else memset(llc_station_mac_sa, 0, ETH_ALEN); dev_add_pack(&llc_packet_type); diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index b3f65d1e80b..099ed8fec14 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -112,7 +112,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->h.raw += llc_len; + skb->transport_header += llc_len; skb_pull(skb, llc_len); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen = eth_hdr(skb)->h_proto; diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index f4291f349e9..754f4fedc85 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -41,7 +41,8 @@ int llc_mac_hdr_init(struct sk_buff *skb, struct net_device *dev = skb->dev; struct trh_hdr *trh; - skb->mac.raw = skb_push(skb, sizeof(*trh)); + skb_push(skb, sizeof(*trh)); + skb_reset_mac_header(skb); trh = tr_hdr(skb); trh->ac = AC; trh->fc = LLC_FRAME; @@ -52,7 +53,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, if (da) { memcpy(trh->daddr, da, dev->addr_len); tr_source_route(skb, trh, dev); - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); } break; } @@ -62,7 +63,8 @@ int llc_mac_hdr_init(struct sk_buff *skb, unsigned short len = skb->len; struct ethhdr *eth; - skb->mac.raw = skb_push(skb, sizeof(*eth)); + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); eth = eth_hdr(skb); eth->h_proto = htons(len); memcpy(eth->h_dest, da, ETH_ALEN); diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 2615dc81aa3..2525165e2e8 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -36,11 +36,12 @@ struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev) struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); if (skb) { + skb_reset_mac_header(skb); skb_reserve(skb, 50); - skb->nh.raw = skb->h.raw = skb->data; + skb_reset_network_header(skb); + skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_802_2); skb->dev = dev; - skb->mac.raw = skb->head; if (sk != NULL) skb_set_owner_w(skb, sk); } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 54698af6d0a..c558f321425 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -25,6 +25,7 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. 
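
[annotation] The llc_init() hunk above replaces a direct walk of the dev_base list with the new iterator helpers (first_net_device()/next_net_device(), with for_each_netdev() available for full traversals). A hedged sketch of the same "second registered device or all-zeroes" logic as a standalone helper; the RTNL locking here is an extra precaution added for illustration and is not something the hunk itself takes:

        #include <linux/netdevice.h>
        #include <linux/rtnetlink.h>
        #include <linux/if_ether.h>
        #include <linux/string.h>

        /* Sketch: copy the MAC of the second registered device (the first
         * is typically loopback), falling back to a zeroed address. */
        static void example_pick_station_addr(u8 addr[ETH_ALEN])
        {
                struct net_device *dev;

                rtnl_lock();            /* keep the device list stable */
                dev = first_net_device();
                if (dev)
                        dev = next_net_device(dev);
                if (dev)
                        memcpy(addr, dev->dev_addr, ETH_ALEN);
                else
                        memset(addr, 0, ETH_ALEN);
                rtnl_unlock();
        }
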
+# Rename this to NF_CONNTRACK in a 2.6.25 config NF_CONNTRACK_ENABLED tristate "Netfilter connection tracking support" help @@ -39,42 +40,9 @@ config NF_CONNTRACK_ENABLED To compile it as a module, choose M here. If unsure, say N. -choice - prompt "Netfilter connection tracking support" - depends on NF_CONNTRACK_ENABLED - -config NF_CONNTRACK_SUPPORT - bool "Layer 3 Independent Connection tracking" - help - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - This is required to do Masquerading or other kinds of Network - Address Translation (except for Fast NAT). It can also be used to - enhance packet filtering (see `Connection state match support' - below). - -config IP_NF_CONNTRACK_SUPPORT - bool "Layer 3 Dependent Connection tracking (OBSOLETE)" - help - The old, Layer 3 dependent ip_conntrack subsystem of netfilter. - - This is required to do Masquerading or other kinds of Network - Address Translation (except for Fast NAT). It can also be used to - enhance packet filtering (see `Connection state match support' - below). - -endchoice - config NF_CONNTRACK tristate - default m if NF_CONNTRACK_SUPPORT && NF_CONNTRACK_ENABLED=m - default y if NF_CONNTRACK_SUPPORT && NF_CONNTRACK_ENABLED=y - -config IP_NF_CONNTRACK - tristate - default m if IP_NF_CONNTRACK_SUPPORT && NF_CONNTRACK_ENABLED=m - default y if IP_NF_CONNTRACK_SUPPORT && NF_CONNTRACK_ENABLED=y + default NF_CONNTRACK_ENABLED config NF_CT_ACCT bool "Connection tracking flow accounting" @@ -303,9 +271,8 @@ config NETFILTER_XT_TARGET_CONNMARK tristate '"CONNMARK" target support' depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE - depends on IP_NF_CONNTRACK || NF_CONNTRACK - select IP_NF_CONNTRACK_MARK if IP_NF_CONNTRACK - select NF_CONNTRACK_MARK if NF_CONNTRACK + depends on NF_CONNTRACK + select NF_CONNTRACK_MARK help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -366,7 +333,7 @@ config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW - depends on IP_NF_CONNTRACK || NF_CONNTRACK + depends on NF_CONNTRACK help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT @@ -387,9 +354,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_CONNSECMARK tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && \ - ((NF_CONNTRACK && NF_CONNTRACK_SECMARK) || \ - (IP_NF_CONNTRACK && IP_NF_CONNTRACK_SECMARK)) + depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK help The CONNSECMARK target copies security markings from packets to connections, and restores security markings from connections @@ -437,9 +402,8 @@ config NETFILTER_XT_MATCH_COMMENT config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' depends on NETFILTER_XTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK - select IP_NF_CT_ACCT if IP_NF_CONNTRACK - select NF_CT_ACCT if NF_CONNTRACK + depends on NF_CONNTRACK + select NF_CT_ACCT help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. 
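With the IP_NF_CONNTRACK alternative removed from the Kconfig entries above, modules can depend on NF_CONNTRACK alone and use the generic API directly. A hypothetical match body (the function name and check are made up, not from this patch; ct->mark needs CONFIG_NF_CONNTRACK_MARK, exactly like the CONNMARK entries above) showing what replaces the nf_conntrack_compat.h indirection:

#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>

/* Hypothetical example: fetch the conntrack entry straight from
 * the skb instead of going through the compat layer. */
static int example_has_mark(struct sk_buff *skb)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

        return ct != NULL && ct->mark != 0;
}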
@@ -450,9 +414,8 @@ config NETFILTER_XT_MATCH_CONNBYTES config NETFILTER_XT_MATCH_CONNMARK tristate '"connmark" connection mark match support' depends on NETFILTER_XTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK - select IP_NF_CONNTRACK_MARK if IP_NF_CONNTRACK - select NF_CONNTRACK_MARK if NF_CONNTRACK + depends on NF_CONNTRACK + select NF_CONNTRACK_MARK help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -464,7 +427,7 @@ config NETFILTER_XT_MATCH_CONNMARK config NETFILTER_XT_MATCH_CONNTRACK tristate '"conntrack" connection tracking match support' depends on NETFILTER_XTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK + depends on NF_CONNTRACK help This is a general conntrack match module, a superset of the state match. @@ -508,7 +471,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' depends on NETFILTER_XTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK + depends on NF_CONNTRACK help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp @@ -632,7 +595,7 @@ config NETFILTER_XT_MATCH_SCTP config NETFILTER_XT_MATCH_STATE tristate '"state" match support' depends on NETFILTER_XTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK + depends on NF_CONNTRACK help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c3ebdbd917e..a84478ee2de 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -5,10 +5,6 @@ * way. * * Rusty Russell (C)2000 -- This code is GPL. - * - * February 2000: Modified by James Morris to have 1 queue per protocol. - * 15-Mar-2000: Added NF_REPEAT --RR. - * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. */ #include <linux/kernel.h> #include <linux/netfilter.h> @@ -244,6 +240,7 @@ void nf_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(nf_proto_csum_replace4); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ @@ -264,6 +261,22 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) } EXPORT_SYMBOL(nf_ct_attach); +void (*nf_ct_destroy)(struct nf_conntrack *); +EXPORT_SYMBOL(nf_ct_destroy); + +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + void (*destroy)(struct nf_conntrack *); + + rcu_read_lock(); + destroy = rcu_dereference(nf_ct_destroy); + BUG_ON(destroy == NULL); + destroy(nfct); + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_conntrack_destroy); +#endif /* CONFIG_NF_CONNTRACK */ + #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_net_netfilter; EXPORT_SYMBOL(proc_net_netfilter); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b3a70eb6d42..e132c8ae878 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -9,24 +9,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
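The nf_ct_destroy/nf_conntrack_destroy pair added to net/netfilter/core.c above is an instance of the RCU-published hook pattern: readers dereference the pointer under rcu_read_lock(), the owner publishes it with rcu_assign_pointer() and waits with synchronize_rcu() before tearing it down. A condensed sketch with generic names (the real code BUG()s on a NULL hook instead of tolerating it, since holding a conntrack reference implies the module is loaded):

#include <linux/rcupdate.h>

static void (*destroy_hook)(void *obj);

static void call_destroy_hook(void *obj)
{
        void (*fn)(void *);

        rcu_read_lock();
        fn = rcu_dereference(destroy_hook);     /* paired with rcu_assign_pointer() */
        if (fn)
                fn(obj);
        rcu_read_unlock();
}

static void publish_destroy_hook(void (*fn)(void *))
{
        rcu_assign_pointer(destroy_hook, fn);
}

static void unpublish_destroy_hook(void)
{
        rcu_assign_pointer(destroy_hook, NULL);
        synchronize_rcu();      /* no reader can still be calling the old fn */
}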
- * - * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> - * - new API and handling of conntrack/nat helpers - * - now capable of multiple expectations for one master - * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> - * - add usage/reference counts to ip_conntrack_expect - * - export ip_conntrack[_expect]_{find_get,put} functions - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - generalize L3 protocol denendent part. - * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - add support various size of conntrack structures. - * 26 Jan 2006: Harald Welte <laforge@netfilter.org> - * - restructure nf_conn (introduce nf_conn_help) - * - redesign 'features' how they were originally intended - * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net> - * - add support for L3 protocol module load on demand. - * - * Derived from net/ipv4/netfilter/ip_conntrack_core.c */ #include <linux/types.h> @@ -128,10 +110,11 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, unsigned int size, unsigned int rnd) { unsigned int a, b; - a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all), - ((tuple->src.l3num) << 16) | tuple->dst.protonum); - b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all), - (tuple->src.u.all << 16) | tuple->dst.u.all); + + a = jhash2(tuple->src.u3.all, ARRAY_SIZE(tuple->src.u3.all), + (tuple->src.l3num << 16) | tuple->dst.protonum); + b = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), + (tuple->src.u.all << 16) | tuple->dst.u.all); return jhash_2words(a, b, rnd) % size; } @@ -633,13 +616,11 @@ __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, memset(conntrack, 0, nf_ct_cache[features].size); conntrack->features = features; atomic_set(&conntrack->ct_general.use, 1); - conntrack->ct_general.destroy = destroy_conntrack; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ - init_timer(&conntrack->timeout); - conntrack->timeout.data = (unsigned long)conntrack; - conntrack->timeout.function = death_by_timeout; + setup_timer(&conntrack->timeout, death_by_timeout, + (unsigned long)conntrack); read_unlock_bh(&nf_ct_cache_lock); return conntrack; @@ -768,7 +749,7 @@ resolve_normal_ct(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data), + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, &tuple, l3proto, l4proto)) { DEBUGP("resolve_normal_ct: Can't get tuple\n"); @@ -960,7 +941,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (do_acct) { ct->counters[CTINFO2DIR(ctinfo)].packets++; ct->counters[CTINFO2DIR(ctinfo)].bytes += - skb->len - (unsigned int)(skb->nh.raw - skb->data); + skb->len - skb_network_offset(skb); if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) @@ -1140,6 +1121,8 @@ void nf_conntrack_cleanup(void) while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) schedule(); + rcu_assign_pointer(nf_ct_destroy, NULL); + for (i = 0; i < NF_CT_F_NUM; i++) { if (nf_ct_cache[i].use == 0) continue; @@ -1152,14 +1135,7 @@ void nf_conntrack_cleanup(void) free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, nf_conntrack_htable_size); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_generic); - - /* free l3proto protocol tables */ - for (i = 0; i < PF_MAX; i++) - if 
(nf_ct_protos[i]) { - kfree(nf_ct_protos[i]); - nf_ct_protos[i] = NULL; - } + nf_conntrack_proto_fini(); } static struct list_head *alloc_hashtable(int size, int *vmalloced) @@ -1237,7 +1213,6 @@ module_param_call(hashsize, set_hashsize, param_get_uint, int __init nf_conntrack_init(void) { - unsigned int i; int ret; /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB @@ -1279,18 +1254,13 @@ int __init nf_conntrack_init(void) goto err_free_conntrack_slab; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_generic); + ret = nf_conntrack_proto_init(); if (ret < 0) goto out_free_expect_slab; - /* Don't NEED lock here, but good form anyway. */ - write_lock_bh(&nf_conntrack_lock); - for (i = 0; i < AF_MAX; i++) - nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic; - write_unlock_bh(&nf_conntrack_lock); - /* For use by REJECT target */ rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach); + rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); /* Set up fake conntrack: - to never be deleted, not in any hashes */ diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 1a223e0c085..6bd421df2db 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -91,3 +91,26 @@ void nf_ct_event_cache_flush(void) } } +int nf_conntrack_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&nf_conntrack_chain, nb); +} +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); + +int nf_conntrack_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&nf_conntrack_chain, nb); +} +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); + +int nf_conntrack_expect_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&nf_conntrack_expect_chain, nb); +} +EXPORT_SYMBOL_GPL(nf_conntrack_expect_register_notifier); + +int nf_conntrack_expect_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&nf_conntrack_expect_chain, nb); +} +EXPORT_SYMBOL_GPL(nf_conntrack_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index ce70a6fc6bd..c31af29a443 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -290,9 +290,7 @@ static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting++; list_add(&exp->list, &nf_conntrack_expect_list); - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; - exp->timeout.function = expectation_timed_out; + setup_timer(&exp->timeout, expectation_timed_out, (unsigned long)exp); exp->timeout.expires = jiffies + master_help->helper->timeout * HZ; add_timer(&exp->timeout); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 3089dfc40c8..a186799f654 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -7,12 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with Layer 3 protocol independent connection tracking. - * - track EPRT and EPSV commands with IPv6 address. 
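setup_timer(), used in the __nf_conntrack_alloc and expectation-insert hunks above (and again in nfnetlink_log further down), is plain shorthand for the open-coded three-step init it replaces; roughly:

#include <linux/timer.h>

/* Sketch of what setup_timer() does for its caller. */
static inline void ex_setup_timer(struct timer_list *timer,
                                  void (*function)(unsigned long),
                                  unsigned long data)
{
        timer->function = function;
        timer->data = data;
        init_timer(timer);
}

The timer is still not armed: callers set .expires and add_timer() separately, which is why the "Don't set timer yet" comment above remains valid after the conversion.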
- * - * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c */ #include <linux/module.h> diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index bb26a658cc1..1093478cc00 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -46,7 +46,7 @@ static int help(struct sk_buff **pskb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nf_conntrack_expect *exp; - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); struct rtable *rt = (struct rtable *)(*pskb)->dst; struct in_device *in_dev; __be32 mask = 0; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 48f05314ebf..aa1a97ee514 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -6,9 +6,6 @@ * (C) 2003 by Patrick Mchardy <kaber@trash.net> * (C) 2005-2006 by Pablo Neira Ayuso <pablo@eurodev.net> * - * I've reworked this stuff to use attributes instead of conntrack - * structures. 5.44 am. I need more tea. --pablo 05/07/11. - * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) * @@ -16,8 +13,6 @@ * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. - * - * Derived from ip_conntrack_netlink.c: Port by Pablo Neira Ayuso (05/11/14) */ #include <linux/init.h> @@ -33,6 +28,7 @@ #include <linux/notifier.h> #include <linux/netfilter.h> +#include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> @@ -268,9 +264,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nfattr *nest_parms; - unsigned char *b; - - b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); event |= NFNL_SUBSYS_CTNETLINK << 8; nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); @@ -303,12 +297,12 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_use(skb, ct) < 0) goto nfattr_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: nfattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -322,7 +316,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, struct nf_conn *ct = (struct nf_conn *)ptr; struct sk_buff *skb; unsigned int type; - unsigned char *b; + sk_buff_data_t b; unsigned int flags = 0, group; /* ignore our fake conntrack entry */ @@ -662,7 +656,7 @@ static const size_t cta_min[CTA_MAX] = { static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; @@ -710,7 +704,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, static int ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; @@ -721,22 +715,12 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, int err = 0; if (nlh->nlmsg_flags & NLM_F_DUMP) { - u32 rlen; - #ifndef CONFIG_NF_CT_ACCT if (NFNL_MSG_TYPE(nlh->nlmsg_type) 
== IPCTNL_MSG_CT_GET_CTRZERO) return -ENOTSUPP; #endif - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; + return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, + ctnetlink_done); } if (nfattr_bad_size(cda, CTA_MAX, cta_min)) @@ -1010,7 +994,7 @@ err: static int ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; @@ -1152,9 +1136,7 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - unsigned char *b; - - b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); @@ -1168,12 +1150,12 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nfattr_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: nfattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1186,7 +1168,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr; struct sk_buff *skb; unsigned int type; - unsigned char *b; + sk_buff_data_t b; int flags = 0; if (events & IPEXP_NEW) { @@ -1263,7 +1245,7 @@ static const size_t cta_min_exp[CTA_EXPECT_MAX] = { static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; @@ -1276,17 +1258,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { - u32 rlen; - - if ((*errp = netlink_dump_start(ctnl, skb, nlh, - ctnetlink_exp_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); - return 0; + return netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done); } if (cda[CTA_EXPECT_MASTER-1]) @@ -1333,7 +1307,7 @@ out: static int ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_expect *exp, *tmp; struct nf_conntrack_tuple tuple; @@ -1467,7 +1441,7 @@ out: static int ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) + struct nlmsghdr *nlh, struct nfattr *cda[]) { struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 456155f05c7..6d947068c58 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -28,13 +28,13 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> -struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; +static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; 
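The ctnetlink fill functions converted above all share one shape: save the current tail with skb_tail_pointer(), emit the header and attributes, patch nlmsg_len from the new tail, and on failure trim back to the saved mark with nlmsg_trim(). A self-contained sketch; the message type, the attribute number, and the u32 payload are made up for illustration:

#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>

static int example_fill(struct sk_buff *skb, u32 pid, u32 seq,
                        u16 type, u32 value)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct nlmsghdr *nlh;

        nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(struct nfgenmsg));
        /* fill the struct nfgenmsg at NLMSG_DATA(nlh) here */
        NFA_PUT(skb, 1 /* made-up attribute type */, sizeof(value), &value);

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

nlmsg_failure:          /* NLMSG_PUT jumps here when out of room */
nfattr_failure:         /* NFA_PUT jumps here */
        nlmsg_trim(skb, b);
        return -1;
}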
EXPORT_SYMBOL_GPL(nf_ct_l3protos); -#ifdef CONFIG_SYSCTL -static DEFINE_MUTEX(nf_ct_proto_sysctl_mutex); +static DEFINE_MUTEX(nf_ct_proto_mutex); +#ifdef CONFIG_SYSCTL static int nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_table *path, struct ctl_table *table, unsigned int *users) @@ -164,13 +164,11 @@ static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) int err = 0; #ifdef CONFIG_SYSCTL - mutex_lock(&nf_ct_proto_sysctl_mutex); if (l3proto->ctl_table != NULL) { err = nf_ct_register_sysctl(&l3proto->ctl_table_header, l3proto->ctl_table_path, l3proto->ctl_table, NULL); } - mutex_unlock(&nf_ct_proto_sysctl_mutex); #endif return err; } @@ -178,11 +176,9 @@ static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) { #ifdef CONFIG_SYSCTL - mutex_lock(&nf_ct_proto_sysctl_mutex); if (l3proto->ctl_table_header != NULL) nf_ct_unregister_sysctl(&l3proto->ctl_table_header, l3proto->ctl_table, NULL); - mutex_unlock(&nf_ct_proto_sysctl_mutex); #endif } @@ -190,27 +186,23 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; - if (proto->l3proto >= AF_MAX) { - ret = -EBUSY; - goto out; - } + if (proto->l3proto >= AF_MAX) + return -EBUSY; - write_lock_bh(&nf_conntrack_lock); + mutex_lock(&nf_ct_proto_mutex); if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); - write_unlock_bh(&nf_conntrack_lock); ret = nf_ct_l3proto_register_sysctl(proto); if (ret < 0) - nf_conntrack_l3proto_unregister(proto); - return ret; + goto out_unlock; + + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); out_unlock: - write_unlock_bh(&nf_conntrack_lock); -out: + mutex_unlock(&nf_ct_proto_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); @@ -219,14 +211,14 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= AF_MAX); - write_lock_bh(&nf_conntrack_lock); + mutex_lock(&nf_ct_proto_mutex); BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); - write_unlock_bh(&nf_conntrack_lock); - synchronize_rcu(); - nf_ct_l3proto_unregister_sysctl(proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(kill_l3proto, proto); @@ -238,7 +230,6 @@ static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) int err = 0; #ifdef CONFIG_SYSCTL - mutex_lock(&nf_ct_proto_sysctl_mutex); if (l4proto->ctl_table != NULL) { err = nf_ct_register_sysctl(l4proto->ctl_table_header, nf_net_netfilter_sysctl_path, @@ -260,7 +251,6 @@ static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) } #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ out: - mutex_unlock(&nf_ct_proto_sysctl_mutex); #endif /* CONFIG_SYSCTL */ return err; } @@ -268,7 +258,6 @@ out: static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_SYSCTL - mutex_lock(&nf_ct_proto_sysctl_mutex); if (l4proto->ctl_table_header != NULL && *l4proto->ctl_table_header != NULL) nf_ct_unregister_sysctl(l4proto->ctl_table_header, @@ -279,7 +268,6 @@ static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, 
l4proto->ctl_compat_table, NULL); #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ - mutex_unlock(&nf_ct_proto_sysctl_mutex); #endif /* CONFIG_SYSCTL */ } @@ -289,68 +277,41 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) { int ret = 0; - if (l4proto->l3proto >= PF_MAX) { - ret = -EBUSY; - goto out; - } - - if (l4proto == &nf_conntrack_l4proto_generic) - return nf_ct_l4proto_register_sysctl(l4proto); + if (l4proto->l3proto >= PF_MAX) + return -EBUSY; -retry: - write_lock_bh(&nf_conntrack_lock); - if (nf_ct_protos[l4proto->l3proto]) { - if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] - != &nf_conntrack_l4proto_generic) { - ret = -EBUSY; - goto out_unlock; - } - } else { + mutex_lock(&nf_ct_proto_mutex); + if (!nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded latter. */ struct nf_conntrack_l4proto **proto_array; int i; - write_unlock_bh(&nf_conntrack_lock); - - proto_array = (struct nf_conntrack_l4proto **) - kmalloc(MAX_NF_CT_PROTO * - sizeof(struct nf_conntrack_l4proto *), - GFP_KERNEL); + proto_array = kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_l4proto *), + GFP_KERNEL); if (proto_array == NULL) { ret = -ENOMEM; - goto out; + goto out_unlock; } + for (i = 0; i < MAX_NF_CT_PROTO; i++) proto_array[i] = &nf_conntrack_l4proto_generic; - - write_lock_bh(&nf_conntrack_lock); - if (nf_ct_protos[l4proto->l3proto]) { - /* bad timing, but no problem */ - write_unlock_bh(&nf_conntrack_lock); - kfree(proto_array); - } else { - nf_ct_protos[l4proto->l3proto] = proto_array; - write_unlock_bh(&nf_conntrack_lock); - } - - /* - * Just once because array is never freed until unloading - * nf_conntrack.ko - */ - goto retry; + nf_ct_protos[l4proto->l3proto] = proto_array; + } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != + &nf_conntrack_l4proto_generic) { + ret = -EBUSY; + goto out_unlock; } - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); - write_unlock_bh(&nf_conntrack_lock); - ret = nf_ct_l4proto_register_sysctl(l4proto); if (ret < 0) - nf_conntrack_l4proto_unregister(l4proto); - return ret; + goto out_unlock; + + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + l4proto); out_unlock: - write_unlock_bh(&nf_conntrack_lock); -out: + mutex_unlock(&nf_ct_proto_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); @@ -359,21 +320,42 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) { BUG_ON(l4proto->l3proto >= PF_MAX); - if (l4proto == &nf_conntrack_l4proto_generic) { - nf_ct_l4proto_unregister_sysctl(l4proto); - return; - } - - write_lock_bh(&nf_conntrack_lock); + mutex_lock(&nf_ct_proto_mutex); BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); - write_unlock_bh(&nf_conntrack_lock); - synchronize_rcu(); - nf_ct_l4proto_unregister_sysctl(l4proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(kill_l4proto, l4proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); + +int nf_conntrack_proto_init(void) +{ + unsigned int i; + int err; + + err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + if (err < 0) + return err; + + for (i = 0; i < AF_MAX; i++) + rcu_assign_pointer(nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); + return 0; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + 
nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + kfree(nf_ct_protos[i]); +} diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 7c069939695..6faf1bed722 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -4,11 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with L3 protocol independent connection tracking. - * - * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c */ #include <linux/types.h> diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 3c80558716a..0d3254b974c 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -7,15 +7,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with L3 protocol independent connection tracking. - * - * Derived from net/ipv4/ip_conntrack_sctp.c - */ - -/* - * Added support for proc manipulation of timeouts. */ #include <linux/types.h> diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 153d6619993..ccdd5d231e0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -4,24 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: - * - Real stateful connection tracking - * - Modified state transitions table - * - Window scaling support added - * - SACK support added - * - * Willy Tarreau: - * - State table bugfixes - * - More robust state changes - * - Tuning timer parameters - * - * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - genelized Layer 3 protocol part. 
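The rewritten l3proto/l4proto registration above follows a publish-last discipline: everything that can fail (the proto_array allocation, the sysctl registration) happens under nf_ct_proto_mutex before rcu_assign_pointer() makes the protocol visible, and unregistration unpublishes first and only then calls synchronize_rcu(). Condensed with illustrative names (struct handler, table sizing and the sysctl helper are all hypothetical):

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

struct handler { unsigned int id; };

static DEFINE_MUTEX(table_mutex);
static struct handler *table[16];
static struct handler generic_handler;

static int handler_sysctl_setup(struct handler *h);     /* may fail */

int register_handler(struct handler *h)
{
        int err = 0;

        mutex_lock(&table_mutex);
        if (table[h->id] != &generic_handler) {
                err = -EBUSY;
                goto out;
        }
        err = handler_sysctl_setup(h);  /* nothing published yet, no unwind */
        if (err == 0)
                rcu_assign_pointer(table[h->id], h);
out:
        mutex_unlock(&table_mutex);
        return err;
}

void unregister_handler(struct handler *h)
{
        mutex_lock(&table_mutex);
        rcu_assign_pointer(table[h->id], &generic_handler);
        mutex_unlock(&table_mutex);
        synchronize_rcu();              /* readers are done with h */
}

This removes the old retry loop and the transient window in which a half-registered protocol had to be unregistered again when sysctl setup failed.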
- * - * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c - * - * version 2.2 */ #include <linux/types.h> @@ -470,11 +452,10 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED*4 - && *(__be32 *)ptr == - __constant_htonl((TCPOPT_NOP << 24) - | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) + && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) return; while (length > 0) { @@ -765,26 +746,18 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update); #define TH_ECE 0x40 #define TH_CWR 0x80 -/* table of valid flag combinations - ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +/* table of valid flag combinations - PUSH, ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, - [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_URG] = 1, - [TH_SYN|TH_PUSH|TH_URG] = 1, [TH_SYN|TH_ACK] = 1, - [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, - [TH_RST|TH_ACK|TH_PUSH] = 1, [TH_FIN|TH_ACK] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, [TH_ACK] = 1, - [TH_ACK|TH_PUSH] = 1, [TH_ACK|TH_URG] = 1, - [TH_ACK|TH_URG|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_PUSH] = 1, - [TH_FIN|TH_ACK|TH_URG] = 1, - [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ @@ -831,7 +804,7 @@ static int tcp_error(struct sk_buff *skb, } /* Check TCP flags. */ - tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, @@ -1110,11 +1083,26 @@ static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct nf_conn *ct) { struct nfattr *nest_parms; + struct nf_ct_tcp_flags tmp = {}; read_lock_bh(&tcp_lock); nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); + + NFA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, sizeof(u_int8_t), + &ct->proto.tcp.seen[0].td_scale); + + NFA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, sizeof(u_int8_t), + &ct->proto.tcp.seen[1].td_scale); + + tmp.flags = ct->proto.tcp.seen[0].flags; + NFA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, + sizeof(struct nf_ct_tcp_flags), &tmp); + + tmp.flags = ct->proto.tcp.seen[1].flags; + NFA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, + sizeof(struct nf_ct_tcp_flags), &tmp); read_unlock_bh(&tcp_lock); NFA_NEST_END(skb, nest_parms); @@ -1127,7 +1115,11 @@ nfattr_failure: } static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { - [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), + [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1] = sizeof(u_int8_t), + [CTA_PROTOINFO_TCP_WSCALE_REPLY-1] = sizeof(u_int8_t), + [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1] = sizeof(struct nf_ct_tcp_flags), + [CTA_PROTOINFO_TCP_FLAGS_REPLY-1] = sizeof(struct nf_ct_tcp_flags) }; static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct) @@ -1151,6 +1143,30 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct) write_lock_bh(&tcp_lock); ct->proto.tcp.state = *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); + + if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1]) { + struct nf_ct_tcp_flags *attr = + NFA_DATA(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL-1]); + 
ct->proto.tcp.seen[0].flags &= ~attr->mask; + ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; + } + + if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY-1]) { + struct nf_ct_tcp_flags *attr = + NFA_DATA(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY-1]); + ct->proto.tcp.seen[1].flags &= ~attr->mask; + ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; + } + + if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1] && + tb[CTA_PROTOINFO_TCP_WSCALE_REPLY-1] && + ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.seen[0].td_scale = *(u_int8_t *) + NFA_DATA(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL-1]); + ct->proto.tcp.seen[1].td_scale = *(u_int8_t *) + NFA_DATA(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY-1]); + } write_unlock_bh(&tcp_lock); return 0; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index a5e5726ec0c..3620ecc095f 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -4,11 +4,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - enable working with Layer 3 protocol independent connection tracking. - * - * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c */ #include <linux/types.h> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b8586360e51..45baeb0e30f 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,20 +1,9 @@ -/* This file contains all the functions required for the standalone - nf_conntrack module. - - These are not required by the compatibility layer. -*/ - /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - generalize L3 protocol dependent part. - * - * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c */ #include <linux/types.h> diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index bf23e489e4c..8797e6953ef 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -3,7 +3,7 @@ * * (C) 2001 by Jay Schulist <jschlst@samba.org>, * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> - * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> + * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) @@ -28,10 +28,9 @@ #include <asm/uaccess.h> #include <asm/system.h> #include <net/sock.h> +#include <net/netlink.h> #include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/netfilter.h> #include <linux/netlink.h> #include <linux/netfilter/nfnetlink.h> @@ -41,32 +40,34 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; -#if 0 -#define DEBUGP(format, args...) \ - printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \ - __LINE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) 
-#endif - static struct sock *nfnl = NULL; static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; -DECLARE_MUTEX(nfnl_sem); +static DEFINE_MUTEX(nfnl_mutex); -void nfnl_lock(void) +static void nfnl_lock(void) { - nfnl_shlock(); + mutex_lock(&nfnl_mutex); } -void nfnl_unlock(void) +static int nfnl_trylock(void) { - nfnl_shunlock(); + return !mutex_trylock(&nfnl_mutex); } -int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +static void __nfnl_unlock(void) { - DEBUGP("registering subsystem ID %u\n", n->subsys_id); + mutex_unlock(&nfnl_mutex); +} + +static void nfnl_unlock(void) +{ + mutex_unlock(&nfnl_mutex); + if (nfnl->sk_receive_queue.qlen) + nfnl->sk_data_ready(nfnl, 0); +} +int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +{ nfnl_lock(); if (subsys_table[n->subsys_id]) { nfnl_unlock(); @@ -77,24 +78,23 @@ int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) return 0; } +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n) { - DEBUGP("unregistering subsystem ID %u\n", n->subsys_id); - nfnl_lock(); subsys_table[n->subsys_id] = NULL; nfnl_unlock(); return 0; } +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) { u_int8_t subsys_id = NFNL_SUBSYS_ID(type); - if (subsys_id >= NFNL_SUBSYS_COUNT - || subsys_table[subsys_id] == NULL) + if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; return subsys_table[subsys_id]; @@ -105,10 +105,8 @@ nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) { u_int8_t cb_id = NFNL_MSG_TYPE(type); - if (cb_id >= ss->cb_count) { - DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count); + if (cb_id >= ss->cb_count) return NULL; - } return &ss->cb[cb_id]; } @@ -125,6 +123,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, memcpy(NFA_DATA(nfa), data, attrlen); memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); } +EXPORT_SYMBOL_GPL(__nfa_fill); void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) { @@ -137,6 +136,7 @@ void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) nfa = NFA_NEXT(nfa, len); } } +EXPORT_SYMBOL_GPL(nfattr_parse); /** * nfnetlink_check_attributes - check and parse nfnetlink attributes @@ -150,37 +150,15 @@ static int nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, struct nlmsghdr *nlh, struct nfattr *cda[]) { - int min_len; - u_int16_t attr_count; + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - - if (unlikely(cb_id >= subsys->cb_count)) { - DEBUGP("msgtype %u >= %u, returning\n", - cb_id, subsys->cb_count); - return -EINVAL; - } - - min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); - if (unlikely(nlh->nlmsg_len < min_len)) - return -EINVAL; - - attr_count = subsys->cb[cb_id].attr_count; - memset(cda, 0, sizeof(struct nfattr *) * attr_count); + u_int16_t attr_count = subsys->cb[cb_id].attr_count; /* check attribute lengths. 
*/ if (likely(nlh->nlmsg_len > min_len)) { struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - - while (NFA_OK(attr, attrlen)) { - unsigned flavor = NFA_TYPE(attr); - if (flavor) { - if (flavor > attr_count) - return -EINVAL; - cda[flavor - 1] = attr; - } - attr = NFA_NEXT(attr, attrlen); - } + nfattr_parse(cda, attr_count, attr, attrlen); } /* implicit: if nlmsg_len == min_len, we return 0, and an empty @@ -208,62 +186,46 @@ int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) return err; } +EXPORT_SYMBOL_GPL(nfnetlink_send); int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) { return netlink_unicast(nfnl, skb, pid, flags); } +EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. */ -static int nfnetlink_rcv_msg(struct sk_buff *skb, - struct nlmsghdr *nlh, int *errp) +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct nfnl_callback *nc; struct nfnetlink_subsystem *ss; - int type, err = 0; - - DEBUGP("entered; subsys=%u, msgtype=%u\n", - NFNL_SUBSYS_ID(nlh->nlmsg_type), - NFNL_MSG_TYPE(nlh->nlmsg_type)); - - if (security_netlink_recv(skb, CAP_NET_ADMIN)) { - DEBUGP("missing CAP_NET_ADMIN\n"); - *errp = -EPERM; - return -1; - } + int type, err; - /* Only requests are handled by kernel now. */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { - DEBUGP("received non-request message\n"); - return 0; - } + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; /* All the messages must at least contain nfgenmsg */ - if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) { - DEBUGP("received message was too short\n"); + if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) return 0; - } type = nlh->nlmsg_type; ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_KMOD - /* don't call nfnl_shunlock, since it would reenter + /* don't call nfnl_unlock, since it would reenter * with further packet processing */ - up(&nfnl_sem); + __nfnl_unlock(); request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); - nfnl_shlock(); + nfnl_lock(); ss = nfnetlink_get_subsys(type); if (!ss) #endif - goto err_inval; + return -EINVAL; } nc = nfnetlink_find_client(type, ss); - if (!nc) { - DEBUGP("unable to find client for type %d\n", type); - goto err_inval; - } + if (!nc) + return -EINVAL; { u_int16_t attr_count = @@ -274,73 +236,21 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, err = nfnetlink_check_attributes(ss, nlh, cda); if (err < 0) - goto err_inval; - - DEBUGP("calling handler\n"); - err = nc->call(nfnl, skb, nlh, cda, errp); - *errp = err; - return err; - } - -err_inval: - DEBUGP("returning -EINVAL\n"); - *errp = -EINVAL; - return -1; -} - -/* Process one packet of messages. 
*/ -static inline int nfnetlink_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(struct nlmsghdr) - || skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (nfnetlink_rcv_msg(skb, nlh, &err)) { - if (!err) - return -1; - netlink_ack(skb, nlh, err); - } else - if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); + return err; + return nc->call(nfnl, skb, nlh, cda); } - - return 0; } static void nfnetlink_rcv(struct sock *sk, int len) { - do { - struct sk_buff *skb; + unsigned int qlen = 0; - if (nfnl_shlock_nowait()) + do { + if (nfnl_trylock()) return; - - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - if (nfnetlink_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else - kfree_skb(skb); - break; - } - kfree_skb(skb); - } - - /* don't call nfnl_shunlock, since it would reenter - * with further packet processing */ - up(&nfnl_sem); - } while(nfnl && nfnl->sk_receive_queue.qlen); + netlink_run_queue(sk, &qlen, nfnetlink_rcv_msg); + __nfnl_unlock(); + } while (qlen); } static void __exit nfnetlink_exit(void) @@ -355,7 +265,7 @@ static int __init nfnetlink_init(void) printk("Netfilter messages via NETLINK v%s.\n", nfversion); nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, THIS_MODULE); + nfnetlink_rcv, NULL, THIS_MODULE); if (!nfnl) { printk(KERN_ERR "cannot initialize nfnetlink!\n"); return -1; @@ -366,10 +276,3 @@ static int __init nfnetlink_init(void) module_init(nfnetlink_init); module_exit(nfnetlink_exit); - -EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); -EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); -EXPORT_SYMBOL_GPL(nfnetlink_send); -EXPORT_SYMBOL_GPL(nfnetlink_unicast); -EXPORT_SYMBOL_GPL(nfattr_parse); -EXPORT_SYMBOL_GPL(__nfa_fill); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 5cb30ebba0f..e32e30e7a17 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -10,11 +10,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * 2006-01-26 Harald Welte <laforge@netfilter.org> - * - Add optional local and global sequence number to detect lost - * events from userspace - * */ #include <linux/module.h> #include <linux/skbuff.h> @@ -163,10 +158,7 @@ instance_create(u_int16_t group_num, int pid) /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); - init_timer(&inst->timer); - inst->timer.function = nfulnl_timer; - inst->timer.data = (unsigned long)inst; - /* don't start timer yet. 
(re)start it with every packet */ + setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); inst->peer_pid = pid; inst->group_num = group_num; @@ -200,20 +192,14 @@ out_unlock: static int __nfulnl_send(struct nfulnl_instance *inst); static void -_instance_destroy2(struct nfulnl_instance *inst, int lock) +__instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ - if (lock) - write_lock_bh(&instances_lock); - UDEBUG("removing instance %p (queuenum=%u) from hash\n", inst, inst->group_num); hlist_del(&inst->hlist); - if (lock) - write_unlock_bh(&instances_lock); - /* then flush all pending packets from skb */ spin_lock_bh(&inst->lock); @@ -235,15 +221,11 @@ _instance_destroy2(struct nfulnl_instance *inst, int lock) } static inline void -__instance_destroy(struct nfulnl_instance *inst) -{ - _instance_destroy2(inst, 0); -} - -static inline void instance_destroy(struct nfulnl_instance *inst) { - _instance_destroy2(inst, 1); + write_lock_bh(&instances_lock); + __instance_destroy(inst); + write_unlock_bh(&instances_lock); } static int @@ -365,9 +347,6 @@ __nfulnl_send(struct nfulnl_instance *inst) { int status; - if (!inst->skb) - return 0; - if (inst->qlen > 1) inst->lastnlh->nlmsg_type = NLMSG_DONE; @@ -391,7 +370,8 @@ static void nfulnl_timer(unsigned long data) UDEBUG("timer function called, flushing buffer\n"); spin_lock_bh(&inst->lock); - __nfulnl_send(inst); + if (inst->skb) + __nfulnl_send(inst); spin_unlock_bh(&inst->lock); instance_put(inst); } @@ -409,15 +389,14 @@ __build_packet_message(struct nfulnl_instance *inst, const struct nf_loginfo *li, const char *prefix, unsigned int plen) { - unsigned char *old_tail; struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; __be32 tmp_uint; + sk_buff_data_t old_tail = inst->skb->tail; UDEBUG("entered\n"); - old_tail = inst->skb->tail; nlh = NLMSG_PUT(inst->skb, 0, 0, NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, sizeof(struct nfgenmsg)); @@ -509,11 +488,11 @@ __build_packet_message(struct nfulnl_instance *inst, NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); } - if (skb->tstamp.off_sec) { + if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; - - ts.sec = cpu_to_be64(skb->tstamp.off_sec); - ts.usec = cpu_to_be64(skb->tstamp.off_usec); + struct timeval tv = ktime_to_timeval(skb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); } @@ -596,7 +575,6 @@ nfulnl_log_packet(unsigned int pf, struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; - unsigned int nlbufsiz; unsigned int plen; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) @@ -606,12 +584,7 @@ nfulnl_log_packet(unsigned int pf, inst = instance_lookup_get(li->u.ulog.group); if (!inst) - inst = instance_lookup_get(0); - if (!inst) { - PRINTR("nfnetlink_log: trying to log packet, " - "but no instance for group %u\n", li->u.ulog.group); return; - } plen = 0; if (prefix) @@ -667,24 +640,11 @@ nfulnl_log_packet(unsigned int pf, break; default: - spin_unlock_bh(&inst->lock); - instance_put(inst); - return; + goto unlock_and_release; } - if (size > inst->nlbufsiz) - nlbufsiz = size; - else - nlbufsiz = inst->nlbufsiz; - - if (!inst->skb) { - if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { - UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", - inst->nlbufsiz, size); - goto alloc_failure; - } - } else if (inst->qlen >= qthreshold || - size > skb_tailroom(inst->skb)) { + if (inst->qlen >= qthreshold || + 
(inst->skb && size > skb_tailroom(inst->skb))) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. */ UDEBUG("flushing old skb\n"); @@ -693,12 +653,12 @@ nfulnl_log_packet(unsigned int pf, if (del_timer(&inst->timer)) instance_put(inst); __nfulnl_send(inst); + } - if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { - UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", - inst->nlbufsiz, size); + if (!inst->skb) { + inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + if (!inst->skb) goto alloc_failure; - } } UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); @@ -760,7 +720,7 @@ static struct notifier_block nfulnl_rtnl_notifier = { static int nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) + struct nlmsghdr *nlh, struct nfattr *nfqa[]) { return -ENOTSUPP; } @@ -798,7 +758,7 @@ static const int nfula_cfg_min[NFULA_CFG_MAX] = { static int nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp) + struct nlmsghdr *nlh, struct nfattr *nfula[]) { struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t group_num = ntohs(nfmsg->res_id); @@ -830,13 +790,13 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, NETLINK_CB(skb).pid); if (!inst) { ret = -EINVAL; - goto out_put; + goto out; } break; case NFULNL_CFG_CMD_UNBIND: if (!inst) { ret = -ENODEV; - goto out_put; + goto out; } if (inst->peer_pid != NETLINK_CB(skb).pid) { @@ -845,7 +805,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, } instance_destroy(inst); - break; + goto out; case NFULNL_CFG_CMD_PF_BIND: UDEBUG("registering log handler for pf=%u\n", pf); ret = nf_log_register(pf, &nfulnl_logger); @@ -869,7 +829,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, "group=%u pid=%u =>ENOENT\n", group_num, NETLINK_CB(skb).pid); ret = -ENOENT; - goto out_put; + goto out; } if (inst->peer_pid != NETLINK_CB(skb).pid) { @@ -939,10 +899,8 @@ struct iter_state { unsigned int bucket; }; -static struct hlist_node *get_first(struct seq_file *seq) +static struct hlist_node *get_first(struct iter_state *st) { - struct iter_state *st = seq->private; - if (!st) return NULL; @@ -953,10 +911,8 @@ static struct hlist_node *get_first(struct seq_file *seq) return NULL; } -static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) { - struct iter_state *st = seq->private; - h = h->next; while (!h) { if (++st->bucket >= INSTANCE_BUCKETS) @@ -967,13 +923,13 @@ static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) return h; } -static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) { struct hlist_node *head; - head = get_first(seq); + head = get_first(st); if (head) - while (pos && (head = get_next(seq, head))) + while (pos && (head = get_next(st, head))) pos--; return pos ? 
NULL : head; } @@ -981,13 +937,13 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) static void *seq_start(struct seq_file *seq, loff_t *pos) { read_lock_bh(&instances_lock); - return get_idx(seq, *pos); + return get_idx(seq->private, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - return get_next(s, v); + return get_next(s->private, v); } static void seq_stop(struct seq_file *s, void *v) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d9ce4a71d0f..7a97bec6772 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -338,7 +338,7 @@ static struct sk_buff * nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nfqnl_queue_entry *entry, int *errp) { - unsigned char *old_tail; + sk_buff_data_t old_tail; size_t size; size_t data_len = 0; struct sk_buff *skb; @@ -404,7 +404,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (!skb) goto nlmsg_failure; - old_tail= skb->tail; + old_tail = skb->tail; nlh = NLMSG_PUT(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg)); @@ -495,11 +495,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); } - if (entskb->tstamp.off_sec) { + if (entskb->tstamp.tv64) { struct nfqnl_msg_packet_timestamp ts; - - ts.sec = cpu_to_be64(entskb->tstamp.off_sec); - ts.usec = cpu_to_be64(entskb->tstamp.off_usec); + struct timeval tv = ktime_to_timeval(entskb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); } @@ -648,7 +648,7 @@ nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) } if (!skb_make_writable(&e->skb, data_len)) return -ENOMEM; - memcpy(e->skb->data, data, data_len); + skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; } @@ -783,7 +783,7 @@ static const int nfqa_verdict_min[NFQA_MAX] = { static int nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) + struct nlmsghdr *nlh, struct nfattr *nfqa[]) { struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); @@ -848,7 +848,7 @@ err_out_put: static int nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) + struct nlmsghdr *nlh, struct nfattr *nfqa[]) { return -ENOTSUPP; } @@ -865,7 +865,7 @@ static struct nf_queue_handler nfqh = { static int nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) + struct nlmsghdr *nlh, struct nfattr *nfqa[]) { struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ec607a421a5..0eb2504b89b 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -56,8 +56,8 @@ enum { }; static const char *xt_prefix[NPROTO] = { - [AF_INET] = "ip", - [AF_INET6] = "ip6", + [AF_INET] = "ip", + [AF_INET6] = "ip6", [NF_ARP] = "arp", }; @@ -651,12 +651,6 @@ void *xt_unregister_table(struct xt_table *table) EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS -static char *xt_proto_prefix[NPROTO] = { - [AF_INET] = "ip", - [AF_INET6] = "ip6", - [NF_ARP] = "arp", -}; - static struct list_head *xt_get_idx(struct list_head *list, struct seq_file *seq, loff_t pos) { struct list_head *head = list->next; @@ -798,7 +792,7 
@@ int xt_proto_init(int af) #ifdef CONFIG_PROC_FS - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_net_fops_create(buf, 0440, &xt_file_ops); if (!proc) @@ -806,14 +800,14 @@ int xt_proto_init(int af) proc->data = (void *) ((unsigned long) af | (TABLE << 16)); - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc = proc_net_fops_create(buf, 0440, &xt_file_ops); if (!proc) goto out_remove_tables; proc->data = (void *) ((unsigned long) af | (MATCH << 16)); - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc = proc_net_fops_create(buf, 0440, &xt_file_ops); if (!proc) @@ -825,12 +819,12 @@ int xt_proto_init(int af) #ifdef CONFIG_PROC_FS out_remove_matches: - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc_net_remove(buf); out_remove_tables: - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc_net_remove(buf); out: @@ -844,15 +838,15 @@ void xt_proto_fini(int af) #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc_net_remove(buf); - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc_net_remove(buf); - strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc_net_remove(buf); #endif /*CONFIG_PROC_FS*/ diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 795c058b16a..b03ce009d0b 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -30,10 +30,7 @@ MODULE_ALIAS("ipt_CONNMARK"); #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CONNMARK.h> -#include <net/netfilter/nf_conntrack_compat.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack_ecache.h> -#endif static unsigned int target(struct sk_buff **pskb, @@ -44,40 +41,33 @@ target(struct sk_buff **pskb, const void *targinfo) { const struct xt_connmark_target_info *markinfo = targinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; u_int32_t diff; u_int32_t mark; u_int32_t newmark; - u_int32_t ctinfo; - u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); - if (ctmark) { + ct = nf_ct_get(*pskb, &ctinfo); + if (ct) { switch(markinfo->mode) { case XT_CONNMARK_SET: - newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; - if (newmark != *ctmark) { - *ctmark = newmark; -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) - ip_conntrack_event_cache(IPCT_MARK, *pskb); -#else + newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; + if (newmark != ct->mark) { + ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, *pskb); -#endif } break; case XT_CONNMARK_SAVE: - newmark = (*ctmark & ~markinfo->mask) | + newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->mark & markinfo->mask); - if (*ctmark != newmark) { - *ctmark = newmark; -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) - 
ip_conntrack_event_cache(IPCT_MARK, *pskb); -#else + if (ct->mark != newmark) { + ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, *pskb); -#endif } break; case XT_CONNMARK_RESTORE: mark = (*pskb)->mark; - diff = (*ctmark ^ mark) & markinfo->mask; + diff = (ct->mark ^ mark) & markinfo->mask; (*pskb)->mark = mark ^ diff; break; } diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 1ab0db641f9..81c0c58bab4 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -19,7 +19,7 @@ #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CONNSECMARK.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> #define PFX "CONNSECMARK: " @@ -36,12 +36,12 @@ MODULE_ALIAS("ip6t_CONNSECMARK"); static void secmark_save(struct sk_buff *skb) { if (skb->secmark) { - u32 *connsecmark; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; - connsecmark = nf_ct_get_secmark(skb, &ctinfo); - if (connsecmark && !*connsecmark) - *connsecmark = skb->secmark; + ct = nf_ct_get(skb, &ctinfo); + if (ct && !ct->secmark) + ct->secmark = skb->secmark; } } @@ -52,12 +52,12 @@ static void secmark_save(struct sk_buff *skb) static void secmark_restore(struct sk_buff *skb) { if (!skb->secmark) { - u32 *connsecmark; + struct nf_conn *ct; enum ip_conntrack_info ctinfo; - connsecmark = nf_ct_get_secmark(skb, &ctinfo); - if (connsecmark && *connsecmark) - skb->secmark = *connsecmark; + ct = nf_ct_get(skb, &ctinfo); + if (ct && ct->secmark) + skb->secmark = ct->secmark; } } diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index a7cc75aeb38..9f2f2201f6a 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -8,8 +8,6 @@ * published by the Free Software Foundation. * * See RFC2474 for a description of the DSCP field within the IP Header. 
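The xt_CONNMARK and xt_CONNSECMARK hunks above replace the nf_conntrack_compat accessors (nf_ct_get_mark, nf_ct_get_secmark) with a plain nf_ct_get() lookup plus direct access to the nf_conn fields. A minimal sketch of the new idiom, assuming <net/netfilter/nf_conntrack.h> and a kernel built with CONFIG_NF_CONNTRACK_MARK; the helper name example_get_mark is invented for illustration:

	static inline u_int32_t example_get_mark(const struct sk_buff *skb)
	{
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		/* no conntrack entry attached means there is no mark to read */
		return ct ? ct->mark : 0;
	}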
- * - * xt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp */ #include <linux/module.h> @@ -35,13 +33,13 @@ static unsigned int target(struct sk_buff **pskb, const void *targinfo) { const struct xt_DSCP_info *dinfo = targinfo; - u_int8_t dscp = ipv4_get_dsfield((*pskb)->nh.iph) >> XT_DSCP_SHIFT; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(*pskb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; - ipv4_change_dsfield((*pskb)->nh.iph, (__u8)(~XT_DSCP_MASK), + ipv4_change_dsfield(ip_hdr(*pskb), (__u8)(~XT_DSCP_MASK), dinfo->dscp << XT_DSCP_SHIFT); } @@ -56,13 +54,13 @@ static unsigned int target6(struct sk_buff **pskb, const void *targinfo) { const struct xt_DSCP_info *dinfo = targinfo; - u_int8_t dscp = ipv6_get_dsfield((*pskb)->nh.ipv6h) >> XT_DSCP_SHIFT; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(*pskb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (!skb_make_writable(pskb, sizeof(struct ipv6hdr))) return NF_DROP; - ipv6_change_dsfield((*pskb)->nh.ipv6h, (__u8)(~XT_DSCP_MASK), + ipv6_change_dsfield(ipv6_hdr(*pskb), (__u8)(~XT_DSCP_MASK), dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index b874a2008b2..5085fb3d1e2 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -5,7 +5,7 @@ #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NOTRACK"); @@ -26,7 +26,7 @@ target(struct sk_buff **pskb, If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - nf_ct_untrack(*pskb); + (*pskb)->nfct = &nf_conntrack_untracked.ct_general; (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index db7e38c08de..15fe8f64951 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -54,7 +54,7 @@ tcpmss_mangle_packet(struct sk_buff **pskb, return -1; tcplen = (*pskb)->len - tcphoff; - tcph = (struct tcphdr *)((*pskb)->nh.raw + tcphoff); + tcph = (struct tcphdr *)(skb_network_header(*pskb) + tcphoff); /* Since it passed flags test in tcp match, we know it is is not a fragment, and has data >= tcp header length. 
SYN @@ -113,7 +113,7 @@ tcpmss_mangle_packet(struct sk_buff **pskb, return -1; kfree_skb(*pskb); *pskb = newskb; - tcph = (struct tcphdr *)((*pskb)->nh.raw + tcphoff); + tcph = (struct tcphdr *)(skb_network_header(*pskb) + tcphoff); } skb_put((*pskb), TCPOLEN_MSS); @@ -145,7 +145,7 @@ xt_tcpmss_target4(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct iphdr *iph = (*pskb)->nh.iph; + struct iphdr *iph = ip_hdr(*pskb); __be16 newlen; int ret; @@ -154,7 +154,7 @@ xt_tcpmss_target4(struct sk_buff **pskb, if (ret < 0) return NF_DROP; if (ret > 0) { - iph = (*pskb)->nh.iph; + iph = ip_hdr(*pskb); newlen = htons(ntohs(iph->tot_len) + ret); nf_csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; @@ -171,7 +171,7 @@ xt_tcpmss_target6(struct sk_buff **pskb, const struct xt_target *target, const void *targinfo) { - struct ipv6hdr *ipv6h = (*pskb)->nh.ipv6h; + struct ipv6hdr *ipv6h = ipv6_hdr(*pskb); u8 nexthdr; int tcphoff; int ret; @@ -187,7 +187,7 @@ xt_tcpmss_target6(struct sk_buff **pskb, if (ret < 0) return NF_DROP; if (ret > 0) { - ipv6h = (*pskb)->nh.ipv6h; + ipv6h = ipv6_hdr(*pskb); ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret); } return XT_CONTINUE; diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 5e32dfa2668..804afe55e14 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -1,20 +1,11 @@ /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). - * - * 2004-07-20 Harald Welte <laforge@netfilter.org> - * - reimplemented to use per-connection accounting counters - * - add functionality to match number of packets - * - add functionality to match average packet size - * - add support to match directions seperately - * 2005-10-16 Harald Welte <laforge@netfilter.org> - * - Port to x_tables - * */ #include <linux/module.h> #include <linux/skbuff.h> -#include <net/netfilter/nf_conntrack_compat.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> +#include <net/netfilter/nf_conntrack.h> #include <asm/div64.h> #include <asm/bitops.h> @@ -24,22 +15,6 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); MODULE_ALIAS("ipt_connbytes"); -/* 64bit divisor, dividend and result. 
dynamic precision */ -static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) -{ - u_int32_t d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - do_div(dividend, d); - return dividend; -} - static int match(const struct sk_buff *skb, const struct net_device *in, @@ -51,13 +26,17 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct xt_connbytes_info *sinfo = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct ip_conntrack_counter *counters; - if (!(counters = nf_ct_get_counters(skb))) - return 0; /* no match */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + counters = ct->counters; switch (sinfo->what) { case XT_CONNBYTES_PKTS: diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 36c2defff23..e1803256c79 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -21,16 +21,15 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connmark.h> MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); MODULE_DESCRIPTION("IP tables connmark match module"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connmark"); -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_connmark.h> -#include <net/netfilter/nf_conntrack_compat.h> - static int match(const struct sk_buff *skb, const struct net_device *in, @@ -42,12 +41,14 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct xt_connmark_info *info = matchinfo; - u_int32_t ctinfo; - const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); - if (!ctmark) + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) return 0; - return (((*ctmark) & info->mask) == info->mark) ^ info->invert; + return (((ct->mark) & info->mask) == info->mark) ^ info->invert; } static int diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 2885c378288..f4ea8fe07a5 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -10,121 +10,15 @@ #include <linux/module.h> #include <linux/skbuff.h> - -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> -#else -#include <net/netfilter/nf_conntrack.h> -#endif - #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables connection tracking match module"); MODULE_ALIAS("ipt_conntrack"); -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct xt_conntrack_info *sinfo = matchinfo; - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - unsigned int statebit; - - ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - -#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & invflg)) - - if (ct == &ip_conntrack_untracked) - statebit = 
XT_CONNTRACK_STATE_UNTRACKED; - else if (ct) - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - else - statebit = XT_CONNTRACK_STATE_INVALID; - - if (sinfo->flags & XT_CONNTRACK_STATE) { - if (ct) { - if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) - statebit |= XT_CONNTRACK_STATE_SNAT; - if (test_bit(IPS_DST_NAT_BIT, &ct->status)) - statebit |= XT_CONNTRACK_STATE_DNAT; - } - if (FWINV((statebit & sinfo->statemask) == 0, - XT_CONNTRACK_STATE)) - return 0; - } - - if (ct == NULL) { - if (sinfo->flags & ~XT_CONNTRACK_STATE) - return 0; - return 1; - } - - if (sinfo->flags & XT_CONNTRACK_PROTO && - FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != - sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, - XT_CONNTRACK_PROTO)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_ORIGSRC && - FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip & - sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != - sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, - XT_CONNTRACK_ORIGSRC)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_ORIGDST && - FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip & - sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != - sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, - XT_CONNTRACK_ORIGDST)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_REPLSRC && - FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip & - sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != - sinfo->tuple[IP_CT_DIR_REPLY].src.ip, - XT_CONNTRACK_REPLSRC)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_REPLDST && - FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip & - sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != - sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, - XT_CONNTRACK_REPLDST)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_STATUS && - FWINV((ct->status & sinfo->statusmask) == 0, - XT_CONNTRACK_STATUS)) - return 0; - - if (sinfo->flags & XT_CONNTRACK_EXPIRES) { - unsigned long expires = timer_pending(&ct->timeout) ? 
- (ct->timeout.expires - jiffies)/HZ : 0; - - if (FWINV(!(expires >= sinfo->expires_min && - expires <= sinfo->expires_max), - XT_CONNTRACK_EXPIRES)) - return 0; - } - return 1; -} - -#else /* CONFIG_IP_NF_CONNTRACK */ static int match(const struct sk_buff *skb, const struct net_device *in, @@ -220,8 +114,6 @@ match(const struct sk_buff *skb, return 1; } -#endif /* CONFIG_NF_IP_CONNTRACK */ - static int checkentry(const char *tablename, const void *ip, diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 26c7f4ad102..56b247ecc28 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -1,7 +1,5 @@ /* IP tables module for matching the value of the IPv4/IPv6 DSCP field * - * xt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp - * * (C) 2002 by Harald Welte <laforge@netfilter.org> * * This program is free software; you can redistribute it and/or modify @@ -34,7 +32,7 @@ static int match(const struct sk_buff *skb, int *hotdrop) { const struct xt_dscp_info *info = matchinfo; - u_int8_t dscp = ipv4_get_dsfield(skb->nh.iph) >> XT_DSCP_SHIFT; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } @@ -49,7 +47,7 @@ static int match6(const struct sk_buff *skb, int *hotdrop) { const struct xt_dscp_info *info = matchinfo; - u_int8_t dscp = ipv6_get_dsfield(skb->nh.ipv6h) >> XT_DSCP_SHIFT; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9f37d593ca3..d3043fa32eb 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -216,10 +216,8 @@ static int htable_create(struct xt_hashlimit_info *minfo, int family) hinfo->pde->proc_fops = &dl_file_ops; hinfo->pde->data = hinfo; - init_timer(&hinfo->timer); + setup_timer(&hinfo->timer, htable_gc, (unsigned long )hinfo); hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); - hinfo->timer.data = (unsigned long )hinfo; - hinfo->timer.function = htable_gc; add_timer(&hinfo->timer); spin_lock_bh(&hashlimit_lock); @@ -380,22 +378,22 @@ hashlimit_init_dst(struct xt_hashlimit_htable *hinfo, struct dsthash_dst *dst, switch (hinfo->family) { case AF_INET: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) - dst->addr.ip.dst = skb->nh.iph->daddr; + dst->addr.ip.dst = ip_hdr(skb)->daddr; if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) - dst->addr.ip.src = skb->nh.iph->saddr; + dst->addr.ip.src = ip_hdr(skb)->saddr; if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; - nexthdr = skb->nh.iph->protocol; + nexthdr = ip_hdr(skb)->protocol; break; #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) case AF_INET6: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) - memcpy(&dst->addr.ip6.dst, &skb->nh.ipv6h->daddr, + memcpy(&dst->addr.ip6.dst, &ipv6_hdr(skb)->daddr, sizeof(dst->addr.ip6.dst)); if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) - memcpy(&dst->addr.ip6.src, &skb->nh.ipv6h->saddr, + memcpy(&dst->addr.ip6.src, &ipv6_hdr(skb)->saddr, sizeof(dst->addr.ip6.src)); if (!(hinfo->cfg.mode & diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 407d1d5da8a..c139b2f43a1 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -5,26 +5,16 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
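The xt_hashlimit hunk above folds the open-coded three-step timer initialisation into a single setup_timer() call. A before/after sketch with invented names (my_timer, my_gc, my_priv); my_gc keeps the usual void (*)(unsigned long) timer callback signature:

	/* before: init_timer() plus manual field assignment */
	init_timer(&my_timer);
	my_timer.data = (unsigned long)my_priv;
	my_timer.function = my_gc;

	/* after: one call with the same effect */
	setup_timer(&my_timer, my_gc, (unsigned long)my_priv);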
- * - * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>: - * - Port to newnat infrastructure */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) -#include <linux/netfilter_ipv4/ip_conntrack.h> -#include <linux/netfilter_ipv4/ip_conntrack_core.h> -#include <linux/netfilter_ipv4/ip_conntrack_helper.h> -#else #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> -#endif #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_helper.h> -#include <net/netfilter/nf_conntrack_compat.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); @@ -38,55 +28,6 @@ MODULE_ALIAS("ip6t_helper"); #define DEBUGP(format, args...) #endif -#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct xt_helper_info *info = matchinfo; - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - int ret = info->invert; - - ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - if (!ct) { - DEBUGP("xt_helper: Eek! invalid conntrack?\n"); - return ret; - } - - if (!ct->master) { - DEBUGP("xt_helper: conntrack %p has no master\n", ct); - return ret; - } - - read_lock_bh(&ip_conntrack_lock); - if (!ct->master->helper) { - DEBUGP("xt_helper: master ct %p has no helper\n", - exp->expectant); - goto out_unlock; - } - - DEBUGP("master's name = %s , info->name = %s\n", - ct->master->helper->name, info->name); - - if (info->name[0] == '\0') - ret ^= 1; - else - ret ^= !strncmp(ct->master->helper->name, info->name, - strlen(ct->master->helper->name)); -out_unlock: - read_unlock_bh(&ip_conntrack_lock); - return ret; -} - -#else /* CONFIG_IP_NF_CONNTRACK */ - static int match(const struct sk_buff *skb, const struct net_device *in, @@ -134,7 +75,6 @@ out_unlock: read_unlock_bh(&nf_conntrack_lock); return ret; } -#endif static int check(const char *tablename, const void *inf, diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 32fb998d9ba..77288c5ada7 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -31,7 +31,7 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct xt_length_info *info = matchinfo; - u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); + u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } @@ -47,7 +47,8 @@ match6(const struct sk_buff *skb, int *hotdrop) { const struct xt_length_info *info = matchinfo; - u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + const u_int16_t pktlen = (ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr)); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 6fd8347c005..571a72ab89a 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -1,10 +1,3 @@ -/* Kernel module to control the rate - * - * 2 September 1999: Changed from the target RATE to the match - * `limit', removed logging. Did I mention that - * Alexey is a fucking genius? - * Rusty Russell (rusty@rustcorp.com.au). 
*/ - /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> * diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index d430d90d7b2..1d3a1d98b88 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -37,8 +37,8 @@ match(const struct sk_buff *skb, const struct xt_mac_info *info = matchinfo; /* Is mac pointer valid? */ - return (skb->mac.raw >= skb->head - && (skb->mac.raw + ETH_HLEN) <= skb->data + return (skb_mac_header(skb) >= skb->head && + (skb_mac_header(skb) + ETH_HLEN) <= skb->data /* If so, compare... */ && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr)) ^ info->invert)); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 16e7b080428..e1409fc5c28 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -34,7 +34,7 @@ static int match(const struct sk_buff *skb, const struct xt_pkttype_info *info = matchinfo; if (skb->pkt_type == PACKET_LOOPBACK) - type = (MULTICAST(skb->nh.iph->daddr) + type = (MULTICAST(ip_hdr(skb)->daddr) ? PACKET_MULTICAST : PACKET_BROADCAST); else diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index 97ffc2fbc19..c2017f8af9c 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -1,7 +1,5 @@ /* IP tables module for matching the routing realm * - * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $ - * * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi> * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index df37b912163..149294f7df7 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <net/netfilter/nf_conntrack_compat.h> +#include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_state.h> @@ -36,7 +36,7 @@ match(const struct sk_buff *skb, if (nf_ct_is_untracked(skb)) statebit = XT_STATE_UNTRACKED; - else if (!nf_ct_get_ctinfo(skb, &ctinfo)) + else if (!nf_ct_get(skb, &ctinfo)) statebit = XT_STATE_INVALID; else statebit = XT_STATE_BIT(ctinfo); diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index e03a3282c55..f2535e7f286 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -263,9 +263,6 @@ int netlbl_socket_setattr(const struct socket *sock, int ret_val = -ENOENT; struct netlbl_dom_map *dom_entry; - if ((secattr->flags & NETLBL_SECATTR_DOMAIN) == 0) - return -ENOENT; - rcu_read_lock(); dom_entry = netlbl_domhsh_getentry(secattr->domain); if (dom_entry == NULL) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c48b0f49f00..507828d7d4a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -56,6 +56,7 @@ #include <linux/types.h> #include <linux/audit.h> #include <linux/selinux.h> +#include <linux/mutex.h> #include <net/sock.h> #include <net/scm.h> @@ -76,7 +77,8 @@ struct netlink_sock { unsigned long state; wait_queue_head_t wait; struct netlink_callback *cb; - spinlock_t cb_lock; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; void (*data_ready)(struct sock *sk, int bytes); struct module *module; }; @@ -108,6 +110,7 @@ struct netlink_table { unsigned long *listeners; unsigned int nl_nonroot; unsigned int groups; + struct mutex *cb_mutex; struct module *module; int registered; }; @@ -118,6 +121,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int 
netlink_dump(struct sock *sk); static void netlink_destroy_callback(struct netlink_callback *cb); +static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb); static DEFINE_RWLOCK(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); @@ -136,6 +140,14 @@ static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) static void netlink_sock_destruct(struct sock *sk) { + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->cb) { + if (nlk->cb->done) + nlk->cb->done(nlk->cb); + netlink_destroy_callback(nlk->cb); + } + skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { @@ -144,7 +156,6 @@ static void netlink_sock_destruct(struct sock *sk) } BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); - BUG_TRAP(!nlk_sk(sk)->cb); BUG_TRAP(!nlk_sk(sk)->groups); } @@ -370,7 +381,8 @@ static struct proto netlink_proto = { .obj_size = sizeof(struct netlink_sock), }; -static int __netlink_create(struct socket *sock, int protocol) +static int __netlink_create(struct socket *sock, struct mutex *cb_mutex, + int protocol) { struct sock *sk; struct netlink_sock *nlk; @@ -384,7 +396,12 @@ static int __netlink_create(struct socket *sock, int protocol) sock_init_data(sock, sk); nlk = nlk_sk(sk); - spin_lock_init(&nlk->cb_lock); + if (cb_mutex) + nlk->cb_mutex = cb_mutex; + else { + nlk->cb_mutex = &nlk->cb_def_mutex; + mutex_init(nlk->cb_mutex); + } init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; @@ -395,8 +412,8 @@ static int __netlink_create(struct socket *sock, int protocol) static int netlink_create(struct socket *sock, int protocol) { struct module *module = NULL; + struct mutex *cb_mutex; struct netlink_sock *nlk; - unsigned int groups; int err = 0; sock->state = SS_UNCONNECTED; @@ -418,10 +435,10 @@ static int netlink_create(struct socket *sock, int protocol) if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; - groups = nl_table[protocol].groups; + cb_mutex = nl_table[protocol].cb_mutex; netlink_unlock_table(); - if ((err = __netlink_create(sock, protocol)) < 0) + if ((err = __netlink_create(sock, cb_mutex, protocol)) < 0) goto out_module; nlk = nlk_sk(sock->sk); @@ -446,17 +463,10 @@ static int netlink_release(struct socket *sock) sock_orphan(sk); nlk = nlk_sk(sk); - spin_lock(&nlk->cb_lock); - if (nlk->cb) { - if (nlk->cb->done) - nlk->cb->done(nlk->cb); - netlink_destroy_callback(nlk->cb); - nlk->cb = NULL; - } - spin_unlock(&nlk->cb_lock); - - /* OK. Socket is unlinked, and, therefore, - no new packets will arrive */ + /* + * OK. Socket is unlinked, any packets that arrive now + * will be purged. + */ sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); @@ -1215,7 +1225,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, copied = len; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (msg->msg_name) { @@ -1235,13 +1245,14 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, siocb->scm = &scm; } siocb->scm->creds = *NETLINK_CREDS(skb); + if (flags & MSG_TRUNC) + copied = skb->len; skb_free_datagram(sk, skb); if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) netlink_dump(sk); scm_recv(sock, msg, siocb->scm, flags); - out: netlink_rcv_wake(sk); return err ? 
: copied; @@ -1265,7 +1276,7 @@ static void netlink_data_ready(struct sock *sk, int len) struct sock * netlink_kernel_create(int unit, unsigned int groups, void (*input)(struct sock *sk, int len), - struct module *module) + struct mutex *cb_mutex, struct module *module) { struct socket *sock; struct sock *sk; @@ -1280,7 +1291,7 @@ netlink_kernel_create(int unit, unsigned int groups, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (__netlink_create(sock, unit) < 0) + if (__netlink_create(sock, cb_mutex, unit) < 0) goto out_sock_release; if (groups < 32) @@ -1304,6 +1315,7 @@ netlink_kernel_create(int unit, unsigned int groups, netlink_table_grab(); nl_table[unit].groups = groups; nl_table[unit].listeners = listeners; + nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; nl_table[unit].registered = 1; netlink_table_ungrab(); @@ -1346,7 +1358,7 @@ static int netlink_dump(struct sock *sk) if (!skb) goto errout; - spin_lock(&nlk->cb_lock); + mutex_lock(nlk->cb_mutex); cb = nlk->cb; if (cb == NULL) { @@ -1357,7 +1369,7 @@ static int netlink_dump(struct sock *sk) len = cb->dump(skb, cb); if (len > 0) { - spin_unlock(&nlk->cb_lock); + mutex_unlock(nlk->cb_mutex); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); return 0; @@ -1375,13 +1387,13 @@ static int netlink_dump(struct sock *sk) if (cb->done) cb->done(cb); nlk->cb = NULL; - spin_unlock(&nlk->cb_lock); + mutex_unlock(nlk->cb_mutex); netlink_destroy_callback(cb); return 0; errout_skb: - spin_unlock(&nlk->cb_lock); + mutex_unlock(nlk->cb_mutex); kfree_skb(skb); errout: return err; @@ -1412,20 +1424,25 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return -ECONNREFUSED; } nlk = nlk_sk(sk); - /* A dump or destruction is in progress... */ - spin_lock(&nlk->cb_lock); - if (nlk->cb || sock_flag(sk, SOCK_DEAD)) { - spin_unlock(&nlk->cb_lock); + /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + if (nlk->cb) { + mutex_unlock(nlk->cb_mutex); netlink_destroy_callback(cb); sock_put(sk); return -EBUSY; } nlk->cb = cb; - spin_unlock(&nlk->cb_lock); + mutex_unlock(nlk->cb_mutex); netlink_dump(sk); sock_put(sk); - return 0; + + /* We successfully started a dump; by returning -EINTR we + * signal the queue management to interrupt processing of + * any netlink messages so userspace gets a chance to read + * the results. */ + return -EINTR; } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) @@ -1462,27 +1479,35 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) } static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, - struct nlmsghdr *, int *)) + struct nlmsghdr *)) { struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { - nlh = (struct nlmsghdr *) skb->data; + nlh = nlmsg_hdr(skb); + err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; - if (cb(skb, nlh, &err) < 0) { - /* Not an error, but we have to interrupt processing - * here. Note: that in this case we do not pull - * message from skb, it will be processed later.
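Under the convention introduced above, netlink_dump_start() reports success as -EINTR, and netlink_rcv_skb() treats -EINTR from a handler as a signal to stop working through the queue without requeueing the current message, so userspace gets a chance to read the dump results first. A sketch of how a subsystem handler propagates this, mirroring the genetlink change below (my_sock, my_dumpit and my_doit are invented names):

	static int my_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
	{
		if (nlh->nlmsg_flags & NLM_F_DUMP)
			/* -EINTR here means: dump started, interrupt the queue */
			return netlink_dump_start(my_sock, skb, nlh,
						  my_dumpit, NULL);

		return my_doit(skb, nlh);	/* ordinary request */
	}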
- */ - if (err == 0) - return -1; + /* Only requests are handled by the kernel */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto skip; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto skip; + + err = cb(skb, nlh); + if (err == -EINTR) { + /* Not an error, but we interrupt processing */ + netlink_queue_skip(nlh, skb); + return err; + } +skip: + if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); netlink_queue_skip(nlh, skb); } @@ -1504,9 +1529,14 @@ static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, * * qlen must be initialized to 0 before the initial entry, afterwards * the function may be called repeatedly until qlen reaches 0. + * + * The callback function may return -EINTR to signal that processing + * of netlink messages shall be interrupted. In this case the message + * currently being processed will NOT be requeued onto the receive + * queue. */ void netlink_run_queue(struct sock *sk, unsigned int *qlen, - int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) + int (*cb)(struct sk_buff *, struct nlmsghdr *)) { struct sk_buff *skb; @@ -1537,7 +1567,7 @@ void netlink_run_queue(struct sock *sk, unsigned int *qlen, * Pulls the given netlink message off the socket buffer so the next * call to netlink_queue_run() will not reconsider the message. */ -void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) +static void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) { int msglen = NLMSG_ALIGN(nlh->nlmsg_len); @@ -1820,12 +1850,10 @@ core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); EXPORT_SYMBOL(netlink_run_queue); -EXPORT_SYMBOL(netlink_queue_skip); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); EXPORT_SYMBOL(netlink_register_notifier); -EXPORT_SYMBOL(netlink_set_err); EXPORT_SYMBOL(netlink_set_nonroot); EXPORT_SYMBOL(netlink_unicast); EXPORT_SYMBOL(netlink_unregister_notifier); diff --git a/net/netlink/attr.c b/net/netlink/attr.c index 004139557e0..df5f820a4c3 100644 --- a/net/netlink/attr.c +++ b/net/netlink/attr.c @@ -67,6 +67,11 @@ static int validate_nla(struct nlattr *nla, int maxtype, } break; + case NLA_BINARY: + if (pt->len && attrlen > pt->len) + return -ERANGE; + break; + default: if (pt->len) minlen = pt->len; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c2996794eb2..6e31234a419 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -295,66 +295,46 @@ int genl_unregister_family(struct genl_family *family) return -ENOENT; } -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, - int *errp) +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct genl_ops *ops; struct genl_family *family; struct genl_info info; struct genlmsghdr *hdr = nlmsg_data(nlh); - int hdrlen, err = -EINVAL; - - if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) - goto ignore; - - if (nlh->nlmsg_type < NLMSG_MIN_TYPE) - goto ignore; + int hdrlen, err; family = genl_family_find_byid(nlh->nlmsg_type); - if (family == NULL) { - err = -ENOENT; - goto errout; - } + if (family == NULL) + return -ENOENT; hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - goto errout; + return -EINVAL; ops = genl_get_cmd(hdr->cmd, family); - if (ops == NULL) { - err = -EOPNOTSUPP; - goto errout; - } + if (ops == NULL) + return -EOPNOTSUPP; - if ((ops->flags & GENL_ADMIN_PERM) && 
security_netlink_recv(skb, CAP_NET_ADMIN)) { - err = -EPERM; - goto errout; - } + if ((ops->flags & GENL_ADMIN_PERM) && + security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { - if (ops->dumpit == NULL) { - err = -EOPNOTSUPP; - goto errout; - } + if (ops->dumpit == NULL) + return -EOPNOTSUPP; - *errp = err = netlink_dump_start(genl_sock, skb, nlh, - ops->dumpit, ops->done); - if (err == 0) - skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len), - skb->len)); - return -1; + return netlink_dump_start(genl_sock, skb, nlh, + ops->dumpit, ops->done); } - if (ops->doit == NULL) { - err = -EOPNOTSUPP; - goto errout; - } + if (ops->doit == NULL) + return -EOPNOTSUPP; if (family->attrbuf) { err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, ops->policy); if (err < 0) - goto errout; + return err; } info.snd_seq = nlh->nlmsg_seq; @@ -364,15 +344,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = family->attrbuf; - *errp = err = ops->doit(skb, &info); - return err; - -ignore: - return 0; - -errout: - *errp = err; - return -1; + return ops->doit(skb, &info); } static void genl_rcv(struct sock *sk, int len) @@ -586,7 +558,7 @@ static int __init genl_init(void) netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID, - genl_rcv, THIS_MODULE); + genl_rcv, NULL, THIS_MODULE); if (genl_sock == NULL) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index bf9837dd95c..5d4a26c2aa0 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -625,42 +625,42 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; + int err = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; - release_sock(sk); - return 0; /* Connect completed during a ERESTARTSYS event */ + goto out_release; /* Connect completed during a ERESTARTSYS event */ } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; - release_sock(sk); - return -ECONNREFUSED; + err = -ECONNREFUSED; + goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { - release_sock(sk); - return -EISCONN; /* No reconnect on a seqpacket socket */ + err = -EISCONN; /* No reconnect on a seqpacket socket */ + goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out_release; } if (addr->sax25_family != AF_NETROM) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ sock_reset_flag(sk, SOCK_ZAPPED); if ((dev = nr_dev_first()) == NULL) { - release_sock(sk); - return -ENETUNREACH; + err = -ENETUNREACH; + goto out_release; } source = (ax25_address *)dev->dev_addr; @@ -671,8 +671,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { dev_put(dev); - release_sock(sk); - return -EPERM; + err = -EPERM; + goto out_release; } nr->user_addr = *source; } @@ -707,8 +707,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, /* Now the loop */ if 
(sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { - release_sock(sk); - return -EINPROGRESS; + err = -EINPROGRESS; + goto out_release; } /* @@ -716,46 +716,46 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, * closed. */ if (sk->sk_state == TCP_SYN_SENT) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); - add_wait_queue(sk->sk_sleep, &wait); for (;;) { - set_current_state(TASK_INTERRUPTIBLE); + prepare_to_wait(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; - release_sock(sk); - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; + break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + if (err) + goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; - release_sock(sk); - return sock_error(sk); /* Always set at this point */ + err = sock_error(sk); /* Always set at this point */ + goto out_release; } sock->state = SS_CONNECTED; + +out_release: release_sock(sk); - return 0; + return err; } static int nr_accept(struct socket *sock, struct socket *newsock, int flags) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); struct sk_buff *skb; struct sock *newsk; + DEFINE_WAIT(wait); struct sock *sk; int err = 0; @@ -765,42 +765,40 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; - goto out; + goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; - goto out; + goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ - add_wait_queue(sk->sk_sleep, &wait); for (;;) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; - current->state = TASK_INTERRUPTIBLE; - release_sock(sk); if (flags & O_NONBLOCK) { - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -EWOULDBLOCK; + err = -EWOULDBLOCK; + break; } - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; + break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + if (err) + goto out_release; newsk = skb->sk; newsk->sk_socket = newsock; @@ -811,8 +809,9 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) sk_acceptq_removed(sk); newsock->sk = newsk; -out: +out_release: release_sock(sk); + return err; } @@ -878,7 +877,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); return nr_rx_ip(skb, dev); } @@ -904,7 +903,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) } if (sk != NULL) { - skb->h.raw = skb->data; + skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) nr_sk(sk)->bpqext = 1; @@ -1074,6 +1073,7 @@ static int nr_sendmsg(struct kiocb 
*iocb, struct socket *sock, goto out; skb_reserve(skb, size - len); + skb_reset_transport_header(skb); /* * Push down the NET/ROM header @@ -1094,14 +1094,12 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, /* * Put the data on the end */ + skb_put(skb, len); - skb->h.raw = skb_put(skb, len); - - asmptr = skb->h.raw; SOCK_DEBUG(sk, "NET/ROM: Appending user data\n"); /* User data follows immediately after the NET/ROM transport header */ - if (memcpy_fromiovec(asmptr, msg->msg_iov, len)) { + if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) { kfree_skb(skb); err = -EFAULT; goto out; @@ -1149,7 +1147,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, return er; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { @@ -1161,7 +1159,8 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, if (sax != NULL) { sax->sax25_family = AF_NETROM; - memcpy(sax->sax25_call.ax25_call, skb->data + 7, AX25_ADDR_LEN); + skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, + AX25_ADDR_LEN); } msg->msg_namelen = sizeof(*sax); @@ -1209,6 +1208,12 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) release_sock(sk); return ret; + case SIOCGSTAMPNS: + lock_sock(sk); + ret = sock_get_timestampns(sk, argp); + release_sock(sk); + return ret; + case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 9a97ed6e691..c7b5d930e73 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -56,8 +56,8 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) /* Spoof incoming device */ skb->dev = dev; - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); skb->pkt_type = PACKET_HOST; netif_rx(skb); diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 5560acbaaa9..68176483617 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -51,10 +51,12 @@ static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL) return 1; - skbn->h.raw = skbn->data; + skb_reset_transport_header(skbn); while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) { - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); kfree_skb(skbo); } diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index e856ae1b360..f324d5df418 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -34,8 +34,8 @@ int nr_loopback_queue(struct sk_buff *skb) struct sk_buff *skbn; if ((skbn = alloc_skb(skb->len, GFP_ATOMIC)) != NULL) { - memcpy(skb_put(skbn, skb->len), skb->data, skb->len); - skbn->h.raw = skbn->data; + skb_copy_from_linear_data(skb, skb_put(skbn, skb->len), skb->len); + skb_reset_transport_header(skbn); skb_queue_tail(&loopback_queue, skbn); diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 0cbfb611465..e3e6c44e189 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -40,7 +40,7 @@ void nr_output(struct sock *sk, struct sk_buff *skb) if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) { /* Save a copy of the Transport Header */ - memcpy(transport, skb->data, NR_TRANSPORT_LEN); + skb_copy_from_linear_data(skb, transport, NR_TRANSPORT_LEN); skb_pull(skb, NR_TRANSPORT_LEN); frontlen = skb_headroom(skb); @@ -54,13 +54,13 @@ void nr_output(struct sock *sk, struct sk_buff *skb) len 
= (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE; /* Copy the user data */ - memcpy(skb_put(skbn, len), skb->data, len); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); skb_pull(skb, len); /* Duplicate the Transport Header */ skb_push(skbn, NR_TRANSPORT_LEN); - memcpy(skbn->data, transport, NR_TRANSPORT_LEN); - + skb_copy_to_linear_data(skbn, transport, + NR_TRANSPORT_LEN); if (skb->len > 0) skbn->data[4] |= NR_MORE_FLAG; diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 8e6bd4e9d82..2f76e062609 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -598,7 +598,7 @@ struct net_device *nr_dev_first(void) struct net_device *dev, *first = NULL; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; @@ -618,12 +618,13 @@ struct net_device *nr_dev_get(ax25_address *addr) struct net_device *dev; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } + dev = NULL; out: read_unlock(&dev_base_lock); return dev; } diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 07b694d1887..04e7d0d2fd8 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -226,13 +226,13 @@ void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags) dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN); - memcpy(dptr, skb->data + 7, AX25_ADDR_LEN); + skb_copy_from_linear_data_offset(skb, 7, dptr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] &= ~AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; - memcpy(dptr, skb->data + 0, AX25_ADDR_LEN); + skb_copy_from_linear_data(skb, dptr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] |= AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 28d47e8f287..02e401cd683 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -114,22 +114,22 @@ On receive: ----------- Incoming, dev->hard_header!=NULL - mac.raw -> ll header - data -> data + mac_header -> ll header + data -> data Outgoing, dev->hard_header!=NULL - mac.raw -> ll header - data -> ll header + mac_header -> ll header + data -> ll header Incoming, dev->hard_header==NULL - mac.raw -> UNKNOWN position. It is very likely, that it points to ll header. - PPP makes it, that is wrong, because introduce assymetry - between rx and tx paths. - data -> data + mac_header -> UNKNOWN position. It is very likely, that it points to ll + header. PPP makes it, that is wrong, because introduce + asymmetry between rx and tx paths. + data -> data Outgoing, dev->hard_header==NULL - mac.raw -> data. ll header is still not built! - data -> data + mac_header -> data. ll header is still not built! + data -> data Resume If dev->hard_header==NULL we are unlikely to restore sensible ll header. @@ -139,12 +139,12 @@ On transmit: ------------ dev->hard_header != NULL - mac.raw -> ll header - data -> ll header + mac_header -> ll header + data -> ll header dev->hard_header == NULL (ll header is added by device, we cannot control it) - mac.raw -> data - data -> data + mac_header -> data + data -> data We should set nh.raw on output to correct posistion, packet classifier depends on it.
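The comment rewrite above is part of this patch's tree-wide move from the skb->mac.raw / skb->nh.raw / skb->h.raw union members to the mac_header, network_header and transport_header fields behind accessor helpers. Rough equivalences as used throughout these hunks, gathered into one invented function for illustration:

	static inline void skb_helper_equivalences(struct sk_buff *skb)
	{
		skb_reset_mac_header(skb);		/* was: skb->mac.raw = skb->data */
		skb_reset_network_header(skb);		/* was: skb->nh.raw = skb->data */
		skb_reset_transport_header(skb);	/* was: skb->h.raw = skb->data */

		(void)skb_mac_header(skb);		/* was: skb->mac.raw */
		(void)ip_hdr(skb);			/* was: skb->nh.iph */
		(void)ipv6_hdr(skb);			/* was: skb->nh.ipv6h */
		(void)skb_network_offset(skb);		/* was: skb->nh.raw - skb->data */
	}

The same series also hides direct skb->data copies, e.g. skb_copy_from_linear_data(skb, buf, len) in place of memcpy(buf, skb->data, len), as in the NET/ROM and ROSE hunks here.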
@@ -201,7 +201,8 @@ struct packet_sock { struct packet_type prot_hook; spinlock_t bind_lock; unsigned int running:1, /* prot_hook is attached*/ - auxdata:1; + auxdata:1, + origdev:1; int ifindex; /* bound device */ __be16 num; #ifdef CONFIG_PACKET_MULTICAST @@ -284,7 +285,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct * Incoming packets have ll header pulled, * push it back. * - * For outgoing ones skb->data == skb->mac.raw + * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ @@ -303,7 +304,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct spkt = &PACKET_SKB_CB(skb)->sa.pkt; - skb_push(skb, skb->data-skb->mac.raw); + skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. @@ -401,14 +402,14 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, * notable one here. This should really be fixed at the driver level. */ skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* Try to align data part correctly */ if (dev->hard_header) { skb->data -= dev->hard_header_len; skb->tail -= dev->hard_header_len; if (len < dev->hard_header_len) - skb->nh.raw = skb->data; + skb_reset_network_header(skb); } /* Returns -EFAULT on error */ @@ -488,10 +489,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) - skb_push(skb, skb->data - skb->mac.raw); + skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ - skb_pull(skb, skb->nh.raw - skb->data); + skb_pull(skb, skb_network_offset(skb)); } } @@ -528,7 +529,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; - sll->sll_ifindex = dev->ifindex; + if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) + sll->sll_ifindex = orig_dev->ifindex; + else + sll->sll_ifindex = dev->ifindex; sll->sll_halen = 0; if (dev->hard_header_parse) @@ -582,6 +586,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; unsigned short macoff, netoff; struct sk_buff *copy_skb = NULL; + struct timeval tv; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -591,10 +596,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe if (dev->hard_header) { if (sk->sk_type != SOCK_DGRAM) - skb_push(skb, skb->data - skb->mac.raw); + skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ - skb_pull(skb, skb->nh.raw - skb->data); + skb_pull(skb, skb_network_offset(skb)); } } @@ -612,7 +617,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; } else { - unsigned maclen = skb->nh.raw - skb->data; + unsigned maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 
16 : maclen)); macoff = netoff - maclen; } @@ -656,12 +661,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe h->tp_snaplen = snaplen; h->tp_mac = macoff; h->tp_net = netoff; - if (skb->tstamp.off_sec == 0) { + if (skb->tstamp.tv64 == 0) { __net_timestamp(skb); sock_enable_timestamp(sk); } - h->tp_sec = skb->tstamp.off_sec; - h->tp_usec = skb->tstamp.off_usec; + tv = ktime_to_timeval(skb->tstamp); + h->tp_sec = tv.tv_sec; + h->tp_usec = tv.tv_usec; sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); sll->sll_halen = 0; @@ -671,7 +677,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; - sll->sll_ifindex = dev->ifindex; + if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) + sll->sll_ifindex = orig_dev->ifindex; + else + sll->sll_ifindex = dev->ifindex; h->tp_status = status; smp_mb(); @@ -766,14 +775,14 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb->nh.raw = skb->data; + skb_reset_network_header(skb); if (dev->hard_header) { int res; err = -EINVAL; res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (sock->type != SOCK_DGRAM) { - skb->tail = skb->data; + skb_reset_tail_pointer(skb); skb->len = 0; } else if (res < 0) goto out_free; @@ -1143,7 +1152,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_len = PACKET_SKB_CB(skb)->origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; - aux.tp_net = skb->nh.raw - skb->data; + aux.tp_net = skb_network_offset(skb); put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } @@ -1411,6 +1420,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->auxdata = !!val; return 0; } + case PACKET_ORIGDEV: + { + int val; + + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->origdev = !!val; + return 0; + } default: return -ENOPROTOOPT; } @@ -1454,6 +1475,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, data = &val; break; + case PACKET_ORIGDEV: + if (len > sizeof(int)) + len = sizeof(int); + val = po->origdev; + + data = &val; + break; default: return -ENOPROTOOPT; } @@ -1543,6 +1571,8 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, } case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); #ifdef CONFIG_INET case SIOCADDRT: diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index f92d5310847..d476c43d521 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -812,26 +812,26 @@ rose_try_next_neigh: * closed. 
*/ if (sk->sk_state == TCP_SYN_SENT) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); - add_wait_queue(sk->sk_sleep, &wait); for (;;) { - set_current_state(TASK_INTERRUPTIBLE); + prepare_to_wait(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; - release_sock(sk); - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; + break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + + if (err) + goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { @@ -856,10 +856,9 @@ out_release: static int rose_accept(struct socket *sock, struct socket *newsock, int flags) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); struct sk_buff *skb; struct sock *newsk; + DEFINE_WAIT(wait); struct sock *sk; int err = 0; @@ -869,42 +868,41 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; - goto out; + goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; - goto out; + goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ - add_wait_queue(sk->sk_sleep, &wait); for (;;) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; - current->state = TASK_INTERRUPTIBLE; - release_sock(sk); if (flags & O_NONBLOCK) { - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -EWOULDBLOCK; + err = -EWOULDBLOCK; + break; } - if (!signal_pending(tsk)) { + if (!signal_pending(current)) { + release_sock(sk); schedule(); lock_sock(sk); continue; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); - return -ERESTARTSYS; + err = -ERESTARTSYS; + break; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sk_sleep, &wait); + finish_wait(sk->sk_sleep, &wait); + if (err) + goto out_release; newsk = skb->sk; newsk->sk_socket = newsock; @@ -916,7 +914,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags) sk->sk_ack_backlog--; newsock->sk = newsk; -out: +out_release: release_sock(sk); return err; @@ -1105,9 +1103,10 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, */ SOCK_DEBUG(sk, "ROSE: Appending user data\n"); - asmptr = skb->h.raw = skb_put(skb, len); + skb_reset_transport_header(skb); + skb_put(skb, len); - err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); if (err) { kfree_skb(skb); return err; @@ -1155,7 +1154,7 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, int lg; /* Save a copy of the Header */ - memcpy(header, skb->data, ROSE_MIN_LEN); + skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN); skb_pull(skb, ROSE_MIN_LEN); frontlen = skb_headroom(skb); @@ -1175,12 +1174,12 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, lg = (ROSE_PACLEN > skb->len) ? 
skb->len : ROSE_PACLEN; /* Copy the user data */ - memcpy(skb_put(skbn, lg), skb->data, lg); + skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg); skb_pull(skb, lg); /* Duplicate the Header */ skb_push(skbn, ROSE_MIN_LEN); - memcpy(skbn->data, header, ROSE_MIN_LEN); + skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN); if (skb->len > 0) skbn->data[2] |= M_BIT; @@ -1234,7 +1233,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, *asmptr = qbit; } - skb->h.raw = skb->data; + skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { @@ -1296,6 +1295,9 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *) argp); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *) argp); + case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 3e41bd93ab9..cd01642f049 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -77,7 +77,7 @@ static void rose_loopback_timer(unsigned long param) dest = (rose_address *)(skb->data + 4); lci_o = 0xFFF - lci_i; - skb->h.raw = skb->data; + skb_reset_transport_header(skb); sk = rose_find_socket(lci_o, &rose_loopback_neigh); if (sk) { diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index a1233e1b1ab..929a784a86d 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -596,7 +596,7 @@ struct net_device *rose_dev_first(void) struct net_device *dev, *first = NULL; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; @@ -614,12 +614,13 @@ struct net_device *rose_dev_get(rose_address *addr) struct net_device *dev; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } + dev = NULL; out: read_unlock(&dev_base_lock); return dev; @@ -630,10 +631,11 @@ static int rose_dev_exists(rose_address *addr) struct net_device *dev; read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { + for_each_netdev(dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) goto out; } + dev = NULL; out: read_unlock(&dev_base_lock); return dev != NULL; @@ -906,7 +908,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) } } else { - skb->h.raw = skb->data; + skb_reset_transport_header(skb); res = rose_process_rx_frame(sk, skb); goto out; } diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig new file mode 100644 index 00000000000..91b3d52f6f1 --- /dev/null +++ b/net/rxrpc/Kconfig @@ -0,0 +1,43 @@ +# +# RxRPC session sockets +# + +config AF_RXRPC + tristate "RxRPC session sockets" + depends on EXPERIMENTAL + select KEYS + help + Say Y or M here to include support for RxRPC session sockets (just + the transport part, not the presentation part: (un)marshalling is + left to the application). + + These are used for AFS kernel filesystem and userspace utilities. + + This module at the moment only supports client operations and is + currently incomplete. + + See Documentation/networking/rxrpc.txt. 
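The help text above describes client-only transport support; a minimal sketch of the userspace call sequence it implies, based on the sockaddr_rxrpc layout visible in af_rxrpc.c below. A userspace copy of that structure definition is assumed, the service ID and port are invented example values, and error handling is omitted:

	#include <string.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>

	static int example_rxrpc_client(void)
	{
		struct sockaddr_rxrpc srx;	/* definition assumed from the new kernel headers */
		int fd;

		memset(&srx, 0, sizeof(srx));
		srx.srx_family = AF_RXRPC;
		srx.srx_service = 52;				/* invented service ID */
		srx.transport_type = SOCK_DGRAM;		/* UDP is the only transport */
		srx.transport_len = sizeof(srx.transport.sin);
		srx.transport.sin.sin_family = AF_INET;
		srx.transport.sin.sin_port = htons(7000);	/* invented port */
		srx.transport.sin.sin_addr.s_addr = htonl(0x7f000001);	/* 127.0.0.1 */

		fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);	/* PF_INET names the transport */
		return connect(fd, (struct sockaddr *)&srx, sizeof(srx));
	}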
+ + +config AF_RXRPC_DEBUG + bool "RxRPC dynamic debugging" + depends on AF_RXRPC + help + Say Y here to make runtime controllable debugging messages appear. + + See Documentation/networking/rxrpc.txt. + + +config RXKAD + tristate "RxRPC Kerberos security" + depends on AF_RXRPC + select CRYPTO + select CRYPTO_MANAGER + select CRYPTO_BLKCIPHER + select CRYPTO_PCBC + select CRYPTO_FCRYPT + help + Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC + through the use of the key retention service. + + See Documentation/networking/rxrpc.txt. diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 6efcb6f162a..c46867c61c9 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -1,25 +1,29 @@ # -# Makefile for Linux kernel Rx RPC +# Makefile for Linux kernel RxRPC # -#CFLAGS += -finstrument-functions - -rxrpc-objs := \ - call.o \ - connection.o \ - krxiod.o \ - krxsecd.o \ - krxtimod.o \ - main.o \ - peer.o \ - rxrpc_syms.o \ - transport.o +af-rxrpc-objs := \ + af_rxrpc.o \ + ar-accept.o \ + ar-ack.o \ + ar-call.o \ + ar-connection.o \ + ar-connevent.o \ + ar-error.o \ + ar-input.o \ + ar-key.o \ + ar-local.o \ + ar-output.o \ + ar-peer.o \ + ar-recvmsg.o \ + ar-security.o \ + ar-skbuff.o \ + ar-transport.o ifeq ($(CONFIG_PROC_FS),y) -rxrpc-objs += proc.o -endif -ifeq ($(CONFIG_SYSCTL),y) -rxrpc-objs += sysctl.o +af-rxrpc-objs += ar-proc.o endif -obj-$(CONFIG_RXRPC) := rxrpc.o +obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o + +obj-$(CONFIG_RXKAD) += rxkad.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c new file mode 100644 index 00000000000..2c57df9c131 --- /dev/null +++ b/net/rxrpc/af_rxrpc.c @@ -0,0 +1,879 @@ +/* AF_RXRPC implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +MODULE_DESCRIPTION("RxRPC network protocol"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_RXRPC); + +unsigned rxrpc_debug; // = RXRPC_DEBUG_KPROTO; +module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(rxrpc_debug, "RxRPC debugging mask"); + +static int sysctl_rxrpc_max_qlen __read_mostly = 10; + +static struct proto rxrpc_proto; +static const struct proto_ops rxrpc_rpc_ops; + +/* local epoch for detecting local-end reset */ +__be32 rxrpc_epoch; + +/* current debugging ID */ +atomic_t rxrpc_debug_id; + +/* count of skbs currently in use */ +atomic_t rxrpc_n_skbs; + +struct workqueue_struct *rxrpc_workqueue; + +static void rxrpc_sock_destructor(struct sock *); + +/* + * see if an RxRPC socket is currently writable + */ +static inline int rxrpc_writable(struct sock *sk) +{ + return atomic_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf; +} + +/* + * wait for write bufferage to become available + */ +static void rxrpc_write_space(struct sock *sk) +{ + _enter("%p", sk); + read_lock(&sk->sk_callback_lock); + if (rxrpc_writable(sk)) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk, 2, POLL_OUT); + } + read_unlock(&sk->sk_callback_lock); +} + +/* + * validate an RxRPC address + */ +static int rxrpc_validate_address(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, + int len) +{ + if (len < sizeof(struct sockaddr_rxrpc)) + return -EINVAL; + + if (srx->srx_family != AF_RXRPC) + return -EAFNOSUPPORT; + + if (srx->transport_type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + len -= offsetof(struct sockaddr_rxrpc, transport); + if (srx->transport_len < sizeof(sa_family_t) || + srx->transport_len > len) + return -EINVAL; + + if (srx->transport.family != rx->proto) + return -EAFNOSUPPORT; + + switch (srx->transport.family) { + case AF_INET: + _debug("INET: %x @ %u.%u.%u.%u", + ntohs(srx->transport.sin.sin_port), + NIPQUAD(srx->transport.sin.sin_addr)); + if (srx->transport_len > 8) + memset((void *)&srx->transport + 8, 0, + srx->transport_len - 8); + break; + + case AF_INET6: + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +/* + * bind a local address to an RxRPC socket + */ +static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr; + struct sock *sk = sock->sk; + struct rxrpc_local *local; + struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; + __be16 service_id; + int ret; + + _enter("%p,%p,%d", rx, saddr, len); + + ret = rxrpc_validate_address(rx, srx, len); + if (ret < 0) + goto error; + + lock_sock(&rx->sk); + + if (rx->sk.sk_state != RXRPC_UNCONNECTED) { + ret = -EINVAL; + goto error_unlock; + } + + memcpy(&rx->srx, srx, sizeof(rx->srx)); + + /* find a local transport endpoint if we don't have one already */ + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + ret = PTR_ERR(local); + goto error_unlock; + } + + rx->local = local; + if (srx->srx_service) { + service_id = htons(srx->srx_service); + write_lock_bh(&local->services_lock); + list_for_each_entry(prx, &local->services, listen_link) { + if (prx->service_id == service_id) + goto service_in_use; + } + + rx->service_id = service_id; + list_add_tail(&rx->listen_link, &local->services); + 
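+ /* the socket is now reachable by incoming calls: the accept
+  * path in ar-accept.c matches the packet's serviceId against
+  * this list under the same services_lock */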
write_unlock_bh(&local->services_lock); + + rx->sk.sk_state = RXRPC_SERVER_BOUND; + } else { + rx->sk.sk_state = RXRPC_CLIENT_BOUND; + } + + release_sock(&rx->sk); + _leave(" = 0"); + return 0; + +service_in_use: + ret = -EADDRINUSE; + write_unlock_bh(&local->services_lock); +error_unlock: + release_sock(&rx->sk); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * set the number of pending calls permitted on a listening socket + */ +static int rxrpc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + _enter("%p,%d", rx, backlog); + + lock_sock(&rx->sk); + + switch (rx->sk.sk_state) { + case RXRPC_UNCONNECTED: + ret = -EADDRNOTAVAIL; + break; + case RXRPC_CLIENT_BOUND: + case RXRPC_CLIENT_CONNECTED: + default: + ret = -EBUSY; + break; + case RXRPC_SERVER_BOUND: + ASSERT(rx->local != NULL); + sk->sk_max_ack_backlog = backlog; + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + ret = 0; + break; + } + + release_sock(&rx->sk); + _leave(" = %d", ret); + return ret; +} + +/* + * find a transport by address + */ +static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock, + struct sockaddr *addr, + int addr_len, int flags, + gfp_t gfp) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; + struct rxrpc_transport *trans; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_peer *peer; + + _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); + + ASSERT(rx->local != NULL); + ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED); + + if (rx->srx.transport_type != srx->transport_type) + return ERR_PTR(-ESOCKTNOSUPPORT); + if (rx->srx.transport.family != srx->transport.family) + return ERR_PTR(-EAFNOSUPPORT); + + /* find a remote transport endpoint from the local one */ + peer = rxrpc_get_peer(srx, gfp); + if (IS_ERR(peer)) + return ERR_PTR(PTR_ERR(peer)); + + /* find a transport */ + trans = rxrpc_get_transport(rx->local, peer, gfp); + rxrpc_put_peer(peer); + _leave(" = %p", trans); + return trans; +} + +/** + * rxrpc_kernel_begin_call - Allow a kernel service to begin a call + * @sock: The socket on which to make the call + * @srx: The address of the peer to contact (defaults to socket setting) + * @key: The security context to use (defaults to socket setting) + * @user_call_ID: The ID to use + * + * Allow a kernel service to begin a call on the nominated socket. This just + * sets up all the internal tracking structures and allocates connection and + * call IDs as appropriate. The call to be used is returned. + * + * The default socket destination address and security may be overridden by + * supplying @srx and @key. 
+ */ +struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, + struct sockaddr_rxrpc *srx, + struct key *key, + unsigned long user_call_ID, + gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle; + struct rxrpc_transport *trans; + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + __be16 service_id; + + _enter(",,%x,%lx", key_serial(key), user_call_ID); + + lock_sock(&rx->sk); + + if (srx) { + trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx, + sizeof(*srx), 0, gfp); + if (IS_ERR(trans)) { + call = ERR_PTR(PTR_ERR(trans)); + trans = NULL; + goto out; + } + } else { + trans = rx->trans; + if (!trans) { + call = ERR_PTR(-ENOTCONN); + goto out; + } + atomic_inc(&trans->usage); + } + + service_id = rx->service_id; + if (srx) + service_id = htons(srx->srx_service); + + if (!key) + key = rx->key; + if (key && !key->payload.data) + key = NULL; /* a no-security key */ + + bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp); + if (IS_ERR(bundle)) { + call = ERR_PTR(PTR_ERR(bundle)); + goto out; + } + + call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true, + gfp); + rxrpc_put_bundle(trans, bundle); +out: + rxrpc_put_transport(trans); + release_sock(&rx->sk); + _leave(" = %p", call); + return call; +} + +EXPORT_SYMBOL(rxrpc_kernel_begin_call); + +/** + * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * @call: The call to end + * + * Allow a kernel service to end a call it was using. The call must be + * complete before this is called (the call should be aborted if necessary). + */ +void rxrpc_kernel_end_call(struct rxrpc_call *call) +{ + _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + rxrpc_remove_user_ID(call->socket, call); + rxrpc_put_call(call); +} + +EXPORT_SYMBOL(rxrpc_kernel_end_call); + +/** + * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages + * @sock: The socket to intercept received messages on + * @interceptor: The function to pass the messages to + * + * Allow a kernel service to intercept messages heading for the Rx queue on an + * RxRPC socket. They get passed to the specified function instead. + * @interceptor should free the socket buffers it is given. @interceptor is + * called with the socket receive queue spinlock held and softirqs disabled - + * this ensures that the messages will be delivered in the right order. 
+ */ +void rxrpc_kernel_intercept_rx_messages(struct socket *sock, + rxrpc_interceptor_t interceptor) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + + _enter(""); + rx->interceptor = interceptor; +} + +EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages); + +/* + * connect an RxRPC socket + * - this just targets it at a specific destination; no actual connection + * negotiation takes place + */ +static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; + struct sock *sk = sock->sk; + struct rxrpc_transport *trans; + struct rxrpc_local *local; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); + + ret = rxrpc_validate_address(rx, srx, addr_len); + if (ret < 0) { + _leave(" = %d [bad addr]", ret); + return ret; + } + + lock_sock(&rx->sk); + + switch (rx->sk.sk_state) { + case RXRPC_UNCONNECTED: + /* find a local transport endpoint if we don't have one already */ + ASSERTCMP(rx->local, ==, NULL); + rx->srx.srx_family = AF_RXRPC; + rx->srx.srx_service = 0; + rx->srx.transport_type = srx->transport_type; + rx->srx.transport_len = sizeof(sa_family_t); + rx->srx.transport.family = srx->transport.family; + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + release_sock(&rx->sk); + return PTR_ERR(local); + } + rx->local = local; + rx->sk.sk_state = RXRPC_CLIENT_BOUND; + case RXRPC_CLIENT_BOUND: + break; + case RXRPC_CLIENT_CONNECTED: + release_sock(&rx->sk); + return -EISCONN; + default: + release_sock(&rx->sk); + return -EBUSY; /* server sockets can't connect as well */ + } + + trans = rxrpc_name_to_transport(sock, addr, addr_len, flags, + GFP_KERNEL); + if (IS_ERR(trans)) { + release_sock(&rx->sk); + _leave(" = %ld", PTR_ERR(trans)); + return PTR_ERR(trans); + } + + rx->trans = trans; + rx->service_id = htons(srx->srx_service); + rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; + + release_sock(&rx->sk); + return 0; +} + +/* + * send a message through an RxRPC socket + * - in a client this does a number of things: + * - finds/sets up a connection for the security specified (if any) + * - initiates a call (ID in control data) + * - ends the request phase of a call (if MSG_MORE is not set) + * - sends a call data packet + * - may send an abort (abort code in control data) + */ +static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t len) +{ + struct rxrpc_transport *trans; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; + + _enter(",{%d},,%zu", rx->sk.sk_state, len); + + if (m->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (m->msg_name) { + ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen); + if (ret < 0) { + _leave(" = %d [bad addr]", ret); + return ret; + } + } + + trans = NULL; + lock_sock(&rx->sk); + + if (m->msg_name) { + ret = -EISCONN; + trans = rxrpc_name_to_transport(sock, m->msg_name, + m->msg_namelen, 0, GFP_KERNEL); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + } else { + trans = rx->trans; + if (trans) + atomic_inc(&trans->usage); + } + + switch (rx->sk.sk_state) { + case RXRPC_SERVER_LISTENING: + if (!m->msg_name) { + ret = rxrpc_server_sendmsg(iocb, rx, m, len); + break; + } + case RXRPC_SERVER_BOUND: + case RXRPC_CLIENT_BOUND: + if (!m->msg_name) { + ret = -ENOTCONN; + break; + } + case RXRPC_CLIENT_CONNECTED: + ret = rxrpc_client_sendmsg(iocb, rx, trans, m, len); + break; + default: + ret = -ENOTCONN; + break; + } + +out: + 
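+ /* common exit: drop the socket lock, then the transport ref
+  * taken above (if any) */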
release_sock(&rx->sk); + if (trans) + rxrpc_put_transport(trans); + _leave(" = %d", ret); + return ret; +} + +/* + * set RxRPC socket options + */ +static int rxrpc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + unsigned min_sec_level; + int ret; + + _enter(",%d,%d,,%d", level, optname, optlen); + + lock_sock(&rx->sk); + ret = -EOPNOTSUPP; + + if (level == SOL_RXRPC) { + switch (optname) { + case RXRPC_EXCLUSIVE_CONNECTION: + ret = -EINVAL; + if (optlen != 0) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + goto success; + + case RXRPC_SECURITY_KEY: + ret = -EINVAL; + if (rx->key) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = rxrpc_request_key(rx, optval, optlen); + goto error; + + case RXRPC_SECURITY_KEYRING: + ret = -EINVAL; + if (rx->key) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = rxrpc_server_keyring(rx, optval, optlen); + goto error; + + case RXRPC_MIN_SECURITY_LEVEL: + ret = -EINVAL; + if (optlen != sizeof(unsigned)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = get_user(min_sec_level, + (unsigned __user *) optval); + if (ret < 0) + goto error; + ret = -EINVAL; + if (min_sec_level > RXRPC_SECURITY_MAX) + goto error; + rx->min_sec_level = min_sec_level; + goto success; + + default: + break; + } + } + +success: + ret = 0; +error: + release_sock(&rx->sk); + return ret; +} + +/* + * permit an RxRPC socket to be polled + */ +static unsigned int rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* the socket is readable if there are any messages waiting on the Rx + * queue */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* the socket is writable if there is space to add new data to the + * socket; there is no guarantee that any particular call in progress + * on the socket may have space in the Tx ACK window */ + if (rxrpc_writable(sk)) + mask |= POLLOUT | POLLWRNORM; + + return mask; +} + +/* + * create an RxRPC socket + */ +static int rxrpc_create(struct socket *sock, int protocol) +{ + struct rxrpc_sock *rx; + struct sock *sk; + + _enter("%p,%d", sock, protocol); + + /* we support transport protocol UDP only */ + if (protocol != PF_INET) + return -EPROTONOSUPPORT; + + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->ops = &rxrpc_rpc_ops; + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(PF_RXRPC, GFP_KERNEL, &rxrpc_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + sk->sk_state = RXRPC_UNCONNECTED; + sk->sk_write_space = rxrpc_write_space; + sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen; + sk->sk_destruct = rxrpc_sock_destructor; + + rx = rxrpc_sk(sk); + rx->proto = protocol; + rx->calls = RB_ROOT; + + INIT_LIST_HEAD(&rx->listen_link); + INIT_LIST_HEAD(&rx->secureq); + INIT_LIST_HEAD(&rx->acceptq); + rwlock_init(&rx->call_lock); + memset(&rx->srx, 0, sizeof(rx->srx)); + + _leave(" = 0 [%p]", rx); + return 0; +} + +/* + * RxRPC socket destructor + */ +static void rxrpc_sock_destructor(struct sock *sk) +{ + _enter("%p", sk); + + rxrpc_purge_queue(&sk->sk_receive_queue); + + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + 
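+ /* like the check above, these traps flag a socket that is
+  * still hashed or still attached to its struct socket */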
BUG_TRAP(sk_unhashed(sk)); + BUG_TRAP(!sk->sk_socket); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive rxrpc socket: %p\n", sk); + return; + } +} + +/* + * release an RxRPC socket + */ +static int rxrpc_release_sock(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + + _enter("%p{%d,%d}", sk, sk->sk_state, atomic_read(&sk->sk_refcnt)); + + /* declare the socket closed for business */ + sock_orphan(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + spin_lock_bh(&sk->sk_receive_queue.lock); + sk->sk_state = RXRPC_CLOSE; + spin_unlock_bh(&sk->sk_receive_queue.lock); + + ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); + + if (!list_empty(&rx->listen_link)) { + write_lock_bh(&rx->local->services_lock); + list_del(&rx->listen_link); + write_unlock_bh(&rx->local->services_lock); + } + + /* try to flush out this socket */ + rxrpc_release_calls_on_socket(rx); + flush_workqueue(rxrpc_workqueue); + rxrpc_purge_queue(&sk->sk_receive_queue); + + if (rx->conn) { + rxrpc_put_connection(rx->conn); + rx->conn = NULL; + } + + if (rx->bundle) { + rxrpc_put_bundle(rx->trans, rx->bundle); + rx->bundle = NULL; + } + if (rx->trans) { + rxrpc_put_transport(rx->trans); + rx->trans = NULL; + } + if (rx->local) { + rxrpc_put_local(rx->local); + rx->local = NULL; + } + + key_put(rx->key); + rx->key = NULL; + key_put(rx->securities); + rx->securities = NULL; + sock_put(sk); + + _leave(" = 0"); + return 0; +} + +/* + * release an RxRPC BSD socket on close() or equivalent + */ +static int rxrpc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + _enter("%p{%p}", sock, sk); + + if (!sk) + return 0; + + sock->sk = NULL; + + return rxrpc_release_sock(sk); +} + +/* + * RxRPC network protocol + */ +static const struct proto_ops rxrpc_rpc_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = rxrpc_release, + .bind = rxrpc_bind, + .connect = rxrpc_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = rxrpc_poll, + .ioctl = sock_no_ioctl, + .listen = rxrpc_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rxrpc_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = rxrpc_sendmsg, + .recvmsg = rxrpc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto rxrpc_proto = { + .name = "RXRPC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rxrpc_sock), + .max_header = sizeof(struct rxrpc_header), +}; + +static struct net_proto_family rxrpc_family_ops = { + .family = PF_RXRPC, + .create = rxrpc_create, + .owner = THIS_MODULE, +}; + +/* + * initialise and register the RxRPC protocol + */ +static int __init af_rxrpc_init(void) +{ + struct sk_buff *dummy_skb; + int ret = -1; + + BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof(dummy_skb->cb)); + + rxrpc_epoch = htonl(xtime.tv_sec); + + ret = -ENOMEM; + rxrpc_call_jar = kmem_cache_create( + "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!rxrpc_call_jar) { + printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n"); + goto error_call_jar; + } + + rxrpc_workqueue = create_workqueue("krxrpcd"); + if (!rxrpc_workqueue) { + printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n"); + goto error_work_queue; + } + + ret = proto_register(&rxrpc_proto, 1); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register protocol\n"); + goto error_proto; + } + + ret = sock_register(&rxrpc_family_ops); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register socket family\n"); + goto 
error_sock; + } + + ret = register_key_type(&key_type_rxrpc); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register client key type\n"); + goto error_key_type; + } + + ret = register_key_type(&key_type_rxrpc_s); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register server key type\n"); + goto error_key_type_s; + } + +#ifdef CONFIG_PROC_FS + proc_net_fops_create("rxrpc_calls", 0, &rxrpc_call_seq_fops); + proc_net_fops_create("rxrpc_conns", 0, &rxrpc_connection_seq_fops); +#endif + return 0; + +error_key_type_s: + unregister_key_type(&key_type_rxrpc); +error_key_type: + sock_unregister(PF_RXRPC); +error_sock: + proto_unregister(&rxrpc_proto); +error_proto: + destroy_workqueue(rxrpc_workqueue); +error_work_queue: + kmem_cache_destroy(rxrpc_call_jar); +error_call_jar: + return ret; +} + +/* + * unregister the RxRPC protocol + */ +static void __exit af_rxrpc_exit(void) +{ + _enter(""); + unregister_key_type(&key_type_rxrpc_s); + unregister_key_type(&key_type_rxrpc); + sock_unregister(PF_RXRPC); + proto_unregister(&rxrpc_proto); + rxrpc_destroy_all_calls(); + rxrpc_destroy_all_connections(); + rxrpc_destroy_all_transports(); + rxrpc_destroy_all_peers(); + rxrpc_destroy_all_locals(); + + ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + + _debug("flush scheduled work"); + flush_workqueue(rxrpc_workqueue); + proc_net_remove("rxrpc_conns"); + proc_net_remove("rxrpc_calls"); + destroy_workqueue(rxrpc_workqueue); + kmem_cache_destroy(rxrpc_call_jar); + _leave(""); +} + +module_init(af_rxrpc_init); +module_exit(af_rxrpc_exit); diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c new file mode 100644 index 00000000000..92a87fde8bf --- /dev/null +++ b/net/rxrpc/ar-accept.c @@ -0,0 +1,504 @@ +/* incoming call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/errqueue.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <net/ip.h> +#include "ar-internal.h" + +/* + * generate a connection-level abort + */ +static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, + struct rxrpc_header *hdr) +{ + struct msghdr msg; + struct kvec iov[1]; + size_t len; + int ret; + + _enter("%d,,", local->debug_id); + + msg.msg_name = &srx->transport.sin; + msg.msg_namelen = sizeof(srx->transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->seq = 0; + hdr->type = RXRPC_PACKET_TYPE_BUSY; + hdr->flags = 0; + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + + len = iov[0].iov_len; + + hdr->serial = htonl(1); + _proto("Tx BUSY %%%u", ntohl(hdr->serial)); + + ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); + if (ret < 0) { + _leave(" = -EAGAIN [sendmsg failed: %d]", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * accept an incoming call that needs peer, transport and/or connection setting + * up + */ +static int rxrpc_accept_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_peer *peer; + struct rxrpc_call *call; + struct sk_buff *notification; + int ret; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* get a notification message to send to the server app */ + notification = alloc_skb(0, GFP_NOFS); + rxrpc_new_skb(notification); + notification->mark = RXRPC_SKB_MARK_NEW_CALL; + + peer = rxrpc_get_peer(srx, GFP_NOIO); + if (IS_ERR(peer)) { + _debug("no peer"); + ret = -EBUSY; + goto error; + } + + trans = rxrpc_get_transport(local, peer, GFP_NOIO); + rxrpc_put_peer(peer); + if (!trans) { + _debug("no trans"); + ret = -EBUSY; + goto error; + } + + conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO); + rxrpc_put_transport(trans); + if (IS_ERR(conn)) { + _debug("no conn"); + ret = PTR_ERR(conn); + goto error; + } + + call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO); + rxrpc_put_connection(conn); + if (IS_ERR(call)) { + _debug("no call"); + ret = PTR_ERR(call); + goto error; + } + + /* attach the call to the socket */ + read_lock_bh(&local->services_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) + goto invalid_service; + + write_lock(&rx->call_lock); + if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { + rxrpc_get_call(call); + + spin_lock(&call->conn->state_lock); + if (sp->hdr.securityIndex > 0 && + call->conn->state == RXRPC_CONN_SERVER_UNSECURED) { + _debug("await conn sec"); + list_add_tail(&call->accept_link, &rx->secureq); + call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; + atomic_inc(&call->conn->usage); + set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + } else { + _debug("conn ready"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_add_tail(&call->accept_link, &rx->acceptq); + rxrpc_get_call(call); + nsp = rxrpc_skb(notification); + nsp->call = call; + + ASSERTCMP(atomic_read(&call->usage), >=, 3); + + _debug("notify"); + spin_lock(&call->lock); + ret = rxrpc_queue_rcv_skb(call, notification, true, + false); + spin_unlock(&call->lock); + notification = NULL; + if 
(ret < 0) + BUG(); + } + spin_unlock(&call->conn->state_lock); + + _debug("queued"); + } + write_unlock(&rx->call_lock); + + _debug("process"); + rxrpc_fast_process_packet(call, skb); + + _debug("done"); + read_unlock_bh(&local->services_lock); + rxrpc_free_skb(notification); + rxrpc_put_call(call); + _leave(" = 0"); + return 0; + +invalid_service: + _debug("invalid"); + read_unlock_bh(&local->services_lock); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) { + rxrpc_get_call(call); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); + rxrpc_put_call(call); + ret = -ECONNREFUSED; +error: + rxrpc_free_skb(notification); + _leave(" = %d", ret); + return ret; +} + +/* + * accept incoming calls that need peer, transport and/or connection setting up + * - the packets we get are all incoming client DATA packets that have seq == 1 + */ +void rxrpc_accept_incoming_calls(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, acceptor); + struct rxrpc_skb_priv *sp; + struct sockaddr_rxrpc srx; + struct rxrpc_sock *rx; + struct sk_buff *skb; + __be16 service_id; + int ret; + + _enter("%d", local->debug_id); + + read_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + +process_next_packet: + skb = skb_dequeue(&local->accept_queue); + if (!skb) { + rxrpc_put_local(local); + _leave("\n"); + return; + } + + _net("incoming call skb %p", skb); + + sp = rxrpc_skb(skb); + + /* determine the remote address */ + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.transport.family = local->srx.transport.family; + srx.transport_type = local->srx.transport_type; + switch (srx.transport.family) { + case AF_INET: + srx.transport_len = sizeof(struct sockaddr_in); + srx.transport.sin.sin_port = udp_hdr(skb)->source; + srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + break; + default: + goto busy; + } + + /* get the socket providing the service */ + service_id = sp->hdr.serviceId; + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->service_id == service_id && + rx->sk.sk_state != RXRPC_CLOSE) + goto found_service; + } + read_unlock_bh(&local->services_lock); + goto invalid_service; + +found_service: + _debug("found service %hd", ntohs(rx->service_id)); + if (sk_acceptq_is_full(&rx->sk)) + goto backlog_full; + sk_acceptq_added(&rx->sk); + sock_hold(&rx->sk); + read_unlock_bh(&local->services_lock); + + ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); + if (ret < 0) + sk_acceptq_removed(&rx->sk); + sock_put(&rx->sk); + switch (ret) { + case -ECONNRESET: /* old calls are ignored */ + case -ECONNABORTED: /* aborted calls are reaborted or ignored */ + case 0: + goto process_next_packet; + case -ECONNREFUSED: + goto invalid_service; + case -EBUSY: + goto busy; + case -EKEYREJECTED: + goto security_mismatch; + default: + BUG(); + } + +backlog_full: + read_unlock_bh(&local->services_lock); +busy: + rxrpc_busy(local, &srx, &sp->hdr); + rxrpc_free_skb(skb); + goto process_next_packet; + +invalid_service: + skb->priority = RX_INVALID_OPERATION; + rxrpc_reject_packet(local, skb); + goto process_next_packet; + + /* can't change connection security type mid-flow */ +security_mismatch: + skb->priority = RX_PROTOCOL_ERROR; + 
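+ /* reported as a protocol error: the security index of a
+  * connection may not change mid-flow */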
rxrpc_reject_packet(local, skb); + goto process_next_packet; +} + +/* + * handle acceptance of a call by userspace + * - assign the user call ID to the call at the front of the queue + */ +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *parent, **pp; + int ret; + + _enter(",%lx", user_call_ID); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* check the user ID isn't already in use */ + ret = -EBADSLT; + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto out; + } + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + break; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* formalise the acceptance */ + call->user_call_ID = user_call_ID; + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + BUG(); + if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events)) + BUG(); + rxrpc_queue_call(call); + + rxrpc_get_call(call); + write_unlock_bh(&call->state_lock); + write_unlock(&rx->call_lock); + _leave(" = %p{%d}", call, call->debug_id); + return call; + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * handle rejectance of a call by userspace + * - reject the call at the front of the queue + */ +int rxrpc_reject_call(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_BUSY; + if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) + rxrpc_queue_call(call); + ret = 0; + goto out_release; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto 
out_discard; + default: + BUG(); + } + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call + * @sock: The socket on which the impending call is waiting + * @user_call_ID: The tag to attach to the call + * + * Allow a kernel service to accept an incoming call, assuming the incoming + * call is still valid. + */ +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + + _enter(",%lx", user_call_ID); + call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); + _leave(" = %p", call); + return call; +} + +EXPORT_SYMBOL(rxrpc_kernel_accept_call); + +/** + * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call + * @sock: The socket on which the impending call is waiting + * + * Allow a kernel service to reject an incoming call with a BUSY message, + * assuming the incoming call is still valid. + */ +int rxrpc_kernel_reject_call(struct socket *sock) +{ + int ret; + + _enter(""); + ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c new file mode 100644 index 00000000000..657ee69f213 --- /dev/null +++ b/net/rxrpc/ar-ack.c @@ -0,0 +1,1306 @@ +/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/circ_buf.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/udp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static unsigned rxrpc_ack_defer = 1; + +static const char *rxrpc_acks[] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", + "-?-" +}; + +static const s8 rxrpc_ack_priority[] = { + [0] = 0, + [RXRPC_ACK_DELAY] = 1, + [RXRPC_ACK_REQUESTED] = 2, + [RXRPC_ACK_IDLE] = 3, + [RXRPC_ACK_PING_RESPONSE] = 4, + [RXRPC_ACK_DUPLICATE] = 5, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, + [RXRPC_ACK_EXCEEDS_WINDOW] = 7, + [RXRPC_ACK_NOSPACE] = 8, +}; + +/* + * propose an ACK be sent + */ +void __rxrpc_propose_ACK(struct rxrpc_call *call, uint8_t ack_reason, + __be32 serial, bool immediate) +{ + unsigned long expiry; + s8 prior = rxrpc_ack_priority[ack_reason]; + + ASSERTCMP(prior, >, 0); + + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks[ack_reason], ntohl(serial), + immediate); + + if (prior < rxrpc_ack_priority[call->ackr_reason]) { + if (immediate) + goto cancel_timer; + return; + } + + /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers */ + if (prior == rxrpc_ack_priority[call->ackr_reason]) { + if (prior <= 4) + call->ackr_serial = serial; + if (immediate) + goto cancel_timer; + return; + } + + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + + switch (ack_reason) { + case RXRPC_ACK_DELAY: + _debug("run delay timer"); + call->ack_timer.expires = jiffies + rxrpc_ack_timeout * HZ; + add_timer(&call->ack_timer); + return; + + case RXRPC_ACK_IDLE: + if (!immediate) { + _debug("run defer timer"); + expiry = 1; + goto run_timer; + } + goto cancel_timer; + + case RXRPC_ACK_REQUESTED: + if (!rxrpc_ack_defer) + goto cancel_timer; + if (!immediate || serial == cpu_to_be32(1)) { + _debug("run defer timer"); + expiry = rxrpc_ack_defer; + goto run_timer; + } + + default: + _debug("immediate ACK"); + goto cancel_timer; + } + +run_timer: + expiry += jiffies; + if (!timer_pending(&call->ack_timer) || + time_after(call->ack_timer.expires, expiry)) + mod_timer(&call->ack_timer, expiry); + return; + +cancel_timer: + _debug("cancel timer %%%u", ntohl(serial)); + try_to_del_timer_sync(&call->ack_timer); + read_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * propose an ACK be sent, locking the call structure + */ +void rxrpc_propose_ACK(struct rxrpc_call *call, uint8_t ack_reason, + __be32 serial, bool immediate) +{ + s8 prior = rxrpc_ack_priority[ack_reason]; + + if (prior > rxrpc_ack_priority[call->ackr_reason]) { + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, serial, immediate); + spin_unlock_bh(&call->lock); + } +} + +/* + * set the resend timer + */ +static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, + unsigned long resend_at) +{ + read_lock_bh(&call->state_lock); + if (call->state >= RXRPC_CALL_COMPLETE) + resend = 0; + + if (resend & 1) { + _debug("SET RESEND"); + set_bit(RXRPC_CALL_RESEND, &call->events); + } + + if (resend & 2) { + _debug("MODIFY RESEND TIMER"); + set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + mod_timer(&call->resend_timer, resend_at); + } else { + _debug("KILL RESEND TIMER"); + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + 
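+ /* resend is a two-bit code: bit 0 queues an immediate resend
+  * pass, bit 1 keeps the resend timer armed for resend_at */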
read_unlock_bh(&call->state_lock); +} + +/* + * resend packets + */ +static void rxrpc_resend(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_header *hdr; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop, stop; + u8 resend; + + _enter("{%d,%d,%d,%d},", + call->acks_hard, call->acks_unacked, + atomic_read(&call->sequence), + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + stop = 0; + resend = 0; + resend_at = 0; + + for (loop = call->acks_tail; + loop != call->acks_head || stop; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + if (*p_txb & 1) + continue; + + txb = (struct sk_buff *) *p_txb; + sp = rxrpc_skb(txb); + + if (sp->need_resend) { + sp->need_resend = 0; + + /* each Tx packet has a new serial number */ + sp->hdr.serial = + htonl(atomic_inc_return(&call->conn->serial)); + + hdr = (struct rxrpc_header *) txb->head; + hdr->serial = sp->hdr.serial; + + _proto("Tx DATA %%%u { #%d }", + ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + if (rxrpc_send_packet(call->conn->trans, txb) < 0) { + stop = 0; + sp->resend_at = jiffies + 3; + } else { + sp->resend_at = + jiffies + rxrpc_resend_timeout * HZ; + } + } + + if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * handle resend timer expiry + */ +static void rxrpc_resend_timer(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 resend; + + _enter("%d,%d,%d", + call->acks_tail, call->acks_unacked, call->acks_head); + + resend = 0; + resend_at = 0; + + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + ASSERT(!(*p_txb & 1)); + + if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * process soft ACKs of our transmitted packets + * - these indicate packets the peer has or has not received, but hasn't yet + * given to the consumer, and so can still be discarded and re-requested + */ +static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, + struct rxrpc_ackpacket *ack, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 sacks[RXRPC_MAXACKS], resend; + + _enter("{%d,%d},{%d},", + call->acks_hard, + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), + ack->nAcks); + + if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) + goto protocol_error; + + resend = 0; + resend_at = 0; + for (loop = 0; loop < ack->nAcks; loop++) { + p_txb = call->acks_window; + p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + switch (sacks[loop]) { + case RXRPC_ACK_TYPE_ACK: + sp->need_resend = 0; + *p_txb |= 1; + 
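+ /* the bottom bit of a Tx window slot marks that packet
+  * as soft-ACK'd */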
break; + case RXRPC_ACK_TYPE_NACK: + sp->need_resend = 1; + *p_txb &= ~1; + resend = 1; + break; + default: + _debug("Unsupported ACK type %d", sacks[loop]); + goto protocol_error; + } + } + + smp_mb(); + call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); + + /* anything not explicitly ACK'd is implicitly NACK'd, but may just not + * have been received or processed yet by the far end */ + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + if (*p_txb & 1) { + /* packet must have been discarded */ + sp->need_resend = 1; + *p_txb &= ~1; + resend |= 1; + } else if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(" = 0"); + return 0; + +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * discard hard-ACK'd packets from the Tx window + */ +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) +{ + struct rxrpc_skb_priv *sp; + unsigned long _skb; + int tail = call->acks_tail, old_tail; + int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); + + _enter("{%u,%u},%u", call->acks_hard, win, hard); + + ASSERTCMP(hard - call->acks_hard, <=, win); + + while (call->acks_hard < hard) { + smp_read_barrier_depends(); + _skb = call->acks_window[tail] & ~1; + sp = rxrpc_skb((struct sk_buff *) _skb); + rxrpc_free_skb((struct sk_buff *) _skb); + old_tail = tail; + tail = (tail + 1) & (call->acks_winsz - 1); + call->acks_tail = tail; + if (call->acks_unacked == old_tail) + call->acks_unacked = tail; + call->acks_hard++; + } + + wake_up(&call->tx_waitq); +} + +/* + * clear the Tx window in the event of a failure + */ +static void rxrpc_clear_tx_window(struct rxrpc_call *call) +{ + rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); +} + +/* + * drain the out of sequence received packet queue into the packet Rx queue + */ +static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool terminal; + int ret; + + _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + + spin_lock_bh(&call->lock); + + ret = -ECONNRESET; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) + goto socket_unavailable; + + skb = skb_dequeue(&call->rx_oos_queue); + if (skb) { + sp = rxrpc_skb(skb); + + _debug("drain OOS packet %d [%d]", + ntohl(sp->hdr.seq), call->rx_first_oos); + + if (ntohl(sp->hdr.seq) != call->rx_first_oos) { + skb_queue_head(&call->rx_oos_queue, skb); + call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq); + _debug("requeue %p {%u}", skb, call->rx_first_oos); + } else { + skb->mark = RXRPC_SKB_MARK_DATA; + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); + BUG_ON(ret < 0); + _debug("drain #%u", call->rx_data_post); + call->rx_data_post++; + + /* find out what the next packet is */ + skb = skb_peek(&call->rx_oos_queue); + if (skb) + call->rx_first_oos = + ntohl(rxrpc_skb(skb)->hdr.seq); + else + call->rx_first_oos = 0; + _debug("peek %p {%u}", skb, call->rx_first_oos); + } + } + + ret = 0; +socket_unavailable: + 
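+ /* reached with ret == 0 on success, or -ECONNRESET if the
+  * call was released while the lock was held */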
spin_unlock_bh(&call->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * insert an out of sequence packet into the buffer + */ +static void rxrpc_insert_oos_packet(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp, *psp; + struct sk_buff *p; + u32 seq; + + sp = rxrpc_skb(skb); + seq = ntohl(sp->hdr.seq); + _enter(",,{%u}", seq); + + skb->destructor = rxrpc_packet_destructor; + ASSERTCMP(sp->call, ==, NULL); + sp->call = call; + rxrpc_get_call(call); + + /* insert into the buffer in sequence order */ + spin_lock_bh(&call->lock); + + skb_queue_walk(&call->rx_oos_queue, p) { + psp = rxrpc_skb(p); + if (ntohl(psp->hdr.seq) > seq) { + _debug("insert oos #%u before #%u", + seq, ntohl(psp->hdr.seq)); + skb_insert(p, skb, &call->rx_oos_queue); + goto inserted; + } + } + + _debug("append oos #%u", seq); + skb_queue_tail(&call->rx_oos_queue, skb); +inserted: + + /* we might now have a new front to the queue */ + if (call->rx_first_oos == 0 || seq < call->rx_first_oos) + call->rx_first_oos = seq; + + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + } + read_unlock(&call->state_lock); + + spin_unlock_bh(&call->lock); + _leave(" [stored #%u]", call->rx_first_oos); +} + +/* + * clear the Tx window on final ACK reception + */ +static void rxrpc_zap_tx_window(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + unsigned long _skb, *acks_window; + uint8_t winsz = call->acks_winsz; + int tail; + + acks_window = call->acks_window; + call->acks_window = NULL; + + while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { + tail = call->acks_tail; + smp_read_barrier_depends(); + _skb = acks_window[tail] & ~1; + smp_mb(); + call->acks_tail = (call->acks_tail + 1) & (winsz - 1); + + skb = (struct sk_buff *) _skb; + sp = rxrpc_skb(skb); + _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + rxrpc_free_skb(skb); + } + + kfree(acks_window); +} + +/* + * process the extra information that may be appended to an ACK packet + */ +static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + unsigned latest, int nAcks) +{ + struct rxrpc_ackinfo ackinfo; + struct rxrpc_peer *peer; + unsigned mtu; + + if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { + _leave(" [no ackinfo]"); + return; + } + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + latest, + ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), + ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); + + mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); + + peer = call->conn->trans->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} + +/* + * process packets in the reception queue + */ +static int rxrpc_process_rx_queue(struct rxrpc_call *call, + u32 *_abort_code) +{ + struct rxrpc_ackpacket ack; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool post_ACK; + int latest; + u32 hard, tx; + + _enter(""); + +process_further: + skb = skb_dequeue(&call->rx_queue); + if (!skb) + return -EAGAIN; + + _net("deferred skb %p", skb); + + sp = rxrpc_skb(skb); + + _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); + + post_ACK = false; + + switch (sp->hdr.type) { + /* data packets that wind up here have been received out of + * 
order, need security processing or are jumbo packets */ + case RXRPC_PACKET_TYPE_DATA: + _proto("OOSQ DATA %%%u { #%u }", + ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + + /* secured packets must be verified and possibly decrypted */ + if (rxrpc_verify_packet(call, skb, _abort_code) < 0) + goto protocol_error; + + rxrpc_insert_oos_packet(call, skb); + goto process_further; + + /* partial ACK to process */ + case RXRPC_PACKET_TYPE_ACK: + if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { + _debug("extraction failure"); + goto protocol_error; + } + if (!skb_pull(skb, sizeof(ack))) + BUG(); + + latest = ntohl(sp->hdr.serial); + hard = ntohl(ack.firstPacket); + tx = atomic_read(&call->sequence); + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + latest, + ntohs(ack.maxSkew), + hard, + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks[ack.reason], + ack.nAcks); + + rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); + + if (ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", latest); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + sp->hdr.serial, true); + } + + /* discard any out-of-order or duplicate ACKs */ + if (latest - call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + latest, call->acks_latest); + goto discard; + } + call->acks_latest = latest; + + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && + call->state != RXRPC_CALL_SERVER_SEND_REPLY && + call->state != RXRPC_CALL_SERVER_AWAIT_ACK) + goto discard; + + _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); + + if (hard > 0) { + if (hard - 1 > tx) { + _debug("hard-ACK'd packet %d not transmitted" + " (%d top)", + hard - 1, tx); + goto protocol_error; + } + + if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || + call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && + hard > tx) + goto all_acked; + + smp_rmb(); + rxrpc_rotate_tx_window(call, hard - 1); + } + + if (ack.nAcks > 0) { + if (hard - 1 + ack.nAcks > tx) { + _debug("soft-ACK'd packet %d+%d not" + " transmitted (%d top)", + hard - 1, ack.nAcks, tx); + goto protocol_error; + } + + if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) + goto protocol_error; + } + goto discard; + + /* complete ACK to process */ + case RXRPC_PACKET_TYPE_ACKALL: + goto all_acked; + + /* abort and busy are handled elsewhere */ + case RXRPC_PACKET_TYPE_BUSY: + case RXRPC_PACKET_TYPE_ABORT: + BUG(); + + /* connection level events - also handled elsewhere */ + case RXRPC_PACKET_TYPE_CHALLENGE: + case RXRPC_PACKET_TYPE_RESPONSE: + case RXRPC_PACKET_TYPE_DEBUG: + BUG(); + } + + /* if we've had a hard ACK that covers all the packets we've sent, then + * that ends that phase of the operation */ +all_acked: + write_lock_bh(&call->state_lock); + _debug("ack all %d", call->state); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + _debug("srv complete"); + call->state = RXRPC_CALL_COMPLETE; + post_ACK = true; + break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_SERVER_RECV_REQUEST: + goto protocol_error_unlock; /* can't occur yet */ + default: + write_unlock_bh(&call->state_lock); + goto discard; /* assume packet left over from earlier phase */ + } + + write_unlock_bh(&call->state_lock); + + /* if all the packets we sent are hard-ACK'd, then we can discard + * whatever we've got left */ + _debug("clear Tx %d", + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + 
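+ /* every packet we sent is now hard-ACK'd, so no resend can
+  * ever be needed again */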
del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + + if (call->acks_window) + rxrpc_zap_tx_window(call); + + if (post_ACK) { + /* post the final ACK message for userspace to pick up */ + _debug("post ACK"); + skb->mark = RXRPC_SKB_MARK_FINAL_ACK; + sp->call = call; + rxrpc_get_call(call); + spin_lock_bh(&call->lock); + if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) + BUG(); + spin_unlock_bh(&call->lock); + goto process_further; + } + +discard: + rxrpc_free_skb(skb); + goto process_further; + +protocol_error_unlock: + write_unlock_bh(&call->state_lock); +protocol_error: + rxrpc_free_skb(skb); + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * post a message to the socket Rx queue for recvmsg() to pick up + */ +static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, + bool fatal) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + _enter("{%d,%lx},%u,%u,%d", + call->debug_id, call->flags, mark, error, fatal); + + /* remove timers and things for fatal messages */ + if (fatal) { + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + + if (mark != RXRPC_SKB_MARK_NEW_CALL && + !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + _leave("[no userid]"); + return 0; + } + + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + skb = alloc_skb(0, GFP_NOFS); + if (!skb) + return -ENOMEM; + + rxrpc_new_skb(skb); + + skb->mark = mark; + + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + sp->error = error; + sp->call = call; + rxrpc_get_call(call); + + spin_lock_bh(&call->lock); + ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); + spin_unlock_bh(&call->lock); + if (ret < 0) + BUG(); + } + + return 0; +} + +/* + * handle background processing of incoming call packets and ACK / abort + * generation + */ +void rxrpc_process_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, processor); + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo ackinfo; + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[5]; + unsigned long bits; + __be32 data, pad; + size_t len; + int genbit, loop, nbit, ioc, ret, mtu; + u32 abort_code = RX_PROTOCOL_ERROR; + u8 *acks = NULL; + + //printk("\n--------------------\n"); + _enter("{%d,%s,%lx} [%lu]", + call->debug_id, rxrpc_call_states[call->state], call->events, + (jiffies - call->creation_jif) / (HZ / 10)); + + if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { + _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); + return; + } + + /* there's a good chance we're going to have to send a message, so set + * one up in advance */ + msg.msg_name = &call->conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(call->conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = call->conn->epoch; + hdr.cid = call->cid; + hdr.callNumber = call->call_id; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_ACK; + hdr.flags = call->conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = call->conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = call->conn->service_id; + + memset(iov, 0, sizeof(iov)); + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + + /* deal with events of a final nature */ + if (test_bit(RXRPC_CALL_RELEASE, &call->events)) { + rxrpc_release_call(call); + 
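+ /* the event bit is cleared only once the release itself is
+  * complete */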
clear_bit(RXRPC_CALL_RELEASE, &call->events); + } + + if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) { + int error; + + clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_ABORT, &call->events); + + error = call->conn->trans->peer->net_error; + _debug("post net error %d", error); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, + error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_ABORT, &call->events); + + _debug("post conn abort"); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + call->conn->error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) { + hdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_REJECT_BUSY; + goto send_message; + } + + if (test_bit(RXRPC_CALL_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ECONNABORTED, true) < 0) + goto no_mem; + hdr.type = RXRPC_PACKET_TYPE_ABORT; + data = htonl(call->abort_code); + iov[1].iov_base = &data; + iov[1].iov_len = sizeof(data); + genbit = RXRPC_CALL_ABORT; + goto send_message; + } + + if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_ACK_FINAL; + + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = RXRPC_ACK_IDLE; + ack.nAcks = 0; + call->ackr_reason = 0; + + spin_lock_bh(&call->lock); + ack.serial = call->ackr_serial; + ack.previousPacket = call->ackr_prev_seq; + ack.firstPacket = htonl(call->rx_data_eaten + 1); + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = &pad; + iov[2].iov_len = 3; + iov[3].iov_base = &ackinfo; + iov[3].iov_len = sizeof(ackinfo); + goto send_ACK; + } + + if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) | + (1 << RXRPC_CALL_RCVD_ABORT)) + ) { + u32 mark; + + if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events)) + mark = RXRPC_SKB_MARK_REMOTE_ABORT; + else + mark = RXRPC_SKB_MARK_BUSY; + + _debug("post abort/busy"); + rxrpc_clear_tx_window(call); + if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) { + _debug("do implicit ackall"); + rxrpc_clear_tx_window(call); + } + + if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) { + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_TIMEOUT; + set_bit(RXRPC_CALL_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); + + _debug("post timeout"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ETIME, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + goto kill_ACKs; + } + + /* deal with assorted inbound messages */ + if (!skb_queue_empty(&call->rx_queue)) { + switch (rxrpc_process_rx_queue(call, &abort_code)) { + case 0: + case -EAGAIN: + break; + case -ENOMEM: + goto no_mem; + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EPROTO: + 
rxrpc_abort_call(call, abort_code); + goto kill_ACKs; + } + } + + /* handle resending */ + if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + rxrpc_resend_timer(call); + if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events)) + rxrpc_resend(call); + + /* consider sending an ordinary ACK */ + if (test_bit(RXRPC_CALL_ACK, &call->events)) { + _debug("send ACK: window: %d - %d { %lx }", + call->rx_data_eaten, call->ackr_win_top, + call->ackr_window[0]); + + if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && + call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { + /* ACK by sending reply DATA packet in this state */ + clear_bit(RXRPC_CALL_ACK, &call->events); + goto maybe_reschedule; + } + + genbit = RXRPC_CALL_ACK; + + acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, + GFP_NOFS); + if (!acks) + goto no_mem; + + //hdr.flags = RXRPC_SLOW_START_OK; + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = 0; + + spin_lock_bh(&call->lock); + ack.reason = call->ackr_reason; + ack.serial = call->ackr_serial; + ack.previousPacket = call->ackr_prev_seq; + ack.firstPacket = htonl(call->rx_data_eaten + 1); + + ack.nAcks = 0; + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + nbit = loop * BITS_PER_LONG; + for (bits = call->ackr_window[loop]; bits; bits >>= 1 + ) { + _debug("- l=%d n=%d b=%lx", loop, nbit, bits); + if (bits & 1) { + acks[nbit] = RXRPC_ACK_TYPE_ACK; + ack.nAcks = nbit + 1; + } + nbit++; + } + } + call->ackr_reason = 0; + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = acks; + iov[2].iov_len = ack.nAcks; + iov[3].iov_base = &pad; + iov[3].iov_len = 3; + iov[4].iov_base = &ackinfo; + iov[4].iov_len = sizeof(ackinfo); + + switch (ack.reason) { + case RXRPC_ACK_REQUESTED: + case RXRPC_ACK_DUPLICATE: + case RXRPC_ACK_OUT_OF_SEQUENCE: + case RXRPC_ACK_EXCEEDS_WINDOW: + case RXRPC_ACK_NOSPACE: + case RXRPC_ACK_PING: + case RXRPC_ACK_PING_RESPONSE: + goto send_ACK_with_skew; + case RXRPC_ACK_DELAY: + case RXRPC_ACK_IDLE: + goto send_ACK; + } + } + + /* handle completion of security negotiations on an incoming + * connection */ + if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) { + _debug("secured"); + spin_lock_bh(&call->lock); + + if (call->state == RXRPC_CALL_SERVER_SECURING) { + _debug("securing"); + write_lock(&call->conn->lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_bit(RXRPC_CALL_RELEASE, &call->events)) { + _debug("not released"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_move_tail(&call->accept_link, + &call->socket->acceptq); + } + write_unlock(&call->conn->lock); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + read_unlock(&call->state_lock); + } + + spin_unlock_bh(&call->lock); + if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) + goto maybe_reschedule; + } + + /* post a notification of an acceptable connection to the app */ + if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) { + _debug("post accept"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, + 0, false) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + goto maybe_reschedule; + } + + /* handle incoming call acceptance */ + if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) { + _debug("accepted"); + ASSERTCMP(call->rx_data_post, ==, 0); + call->rx_data_post = 1; + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + 
set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + read_unlock_bh(&call->state_lock); + } + + /* drain the out of sequence received packet queue into the packet Rx + * queue */ + if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) { + while (call->rx_data_post == call->rx_first_oos) + if (rxrpc_drain_rx_oos_queue(call) < 0) + break; + goto maybe_reschedule; + } + + /* other events may have been raised since we started checking */ + goto maybe_reschedule; + +send_ACK_with_skew: + ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - + ntohl(ack.serial)); +send_ACK: + mtu = call->conn->trans->peer->if_mtu; + mtu -= call->conn->trans->peer->hdrsize; + ackinfo.maxMTU = htonl(mtu); + ackinfo.rwind = htonl(32); + + /* permit the peer to send us jumbo packets if it wants to */ + ackinfo.rxMTU = htonl(5692); + ackinfo.jumbo_max = htonl(4); + + hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + ntohl(hdr.serial), + ntohs(ack.maxSkew), + ntohl(ack.firstPacket), + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks[ack.reason], + ack.nAcks); + + del_timer_sync(&call->ack_timer); + if (ack.nAcks > 0) + set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); + goto send_message_2; + +send_message: + _debug("send message"); + + hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial)); +send_message_2: + + len = iov[0].iov_len; + ioc = 1; + if (iov[4].iov_len) { + ioc = 5; + len += iov[4].iov_len; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[3].iov_len) { + ioc = 4; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[2].iov_len) { + ioc = 3; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[1].iov_len) { + ioc = 2; + len += iov[1].iov_len; + } + + ret = kernel_sendmsg(call->conn->trans->local->socket, + &msg, iov, ioc, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + goto error; + } + + switch (genbit) { + case RXRPC_CALL_ABORT: + clear_bit(genbit, &call->events); + clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + goto kill_ACKs; + + case RXRPC_CALL_ACK_FINAL: + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) + call->state = RXRPC_CALL_COMPLETE; + write_unlock_bh(&call->state_lock); + goto kill_ACKs; + + default: + clear_bit(genbit, &call->events); + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + _debug("start ACK timer"); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, + call->ackr_serial, false); + default: + break; + } + goto maybe_reschedule; + } + +kill_ACKs: + del_timer_sync(&call->ack_timer); + if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events)) + rxrpc_put_call(call); + clear_bit(RXRPC_CALL_ACK, &call->events); + +maybe_reschedule: + if (call->events || !skb_queue_empty(&call->rx_queue)) { + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + + /* don't leave aborted connections on the accept queue */ + if (call->state >= RXRPC_CALL_COMPLETE && + !list_empty(&call->accept_link)) { + _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x 
}", + call, call->events, call->flags, + ntohl(call->conn->cid)); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + +error: + clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); + kfree(acks); + + /* because we don't want two CPUs both processing the work item for one + * call at the same time, we use a flag to note when it's busy; however + * this means there's a race between clearing the flag and setting the + * work pending bit and the work item being processed again */ + if (call->events && !work_pending(&call->processor)) { + _debug("jumpstart %x", ntohl(call->conn->cid)); + rxrpc_queue_call(call); + } + + _leave(""); + return; + +no_mem: + _debug("out of memory"); + goto maybe_reschedule; +} diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c new file mode 100644 index 00000000000..4d92d88ff1f --- /dev/null +++ b/net/rxrpc/ar-call.c @@ -0,0 +1,804 @@ +/* RxRPC individual remote procedure call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/circ_buf.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +struct kmem_cache *rxrpc_call_jar; +LIST_HEAD(rxrpc_calls); +DEFINE_RWLOCK(rxrpc_call_lock); +static unsigned rxrpc_call_max_lifetime = 60; +static unsigned rxrpc_dead_call_timeout = 2; + +static void rxrpc_destroy_call(struct work_struct *work); +static void rxrpc_call_life_expired(unsigned long _call); +static void rxrpc_dead_call_expired(unsigned long _call); +static void rxrpc_ack_time_expired(unsigned long _call); +static void rxrpc_resend_time_expired(unsigned long _call); + +/* + * allocate a new call + */ +static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +{ + struct rxrpc_call *call; + + call = kmem_cache_zalloc(rxrpc_call_jar, gfp); + if (!call) + return NULL; + + call->acks_winsz = 16; + call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + gfp); + if (!call->acks_window) { + kmem_cache_free(rxrpc_call_jar, call); + return NULL; + } + + setup_timer(&call->lifetimer, &rxrpc_call_life_expired, + (unsigned long) call); + setup_timer(&call->deadspan, &rxrpc_dead_call_expired, + (unsigned long) call); + setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, + (unsigned long) call); + setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, + (unsigned long) call); + INIT_WORK(&call->destroyer, &rxrpc_destroy_call); + INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_LIST_HEAD(&call->accept_link); + skb_queue_head_init(&call->rx_queue); + skb_queue_head_init(&call->rx_oos_queue); + init_waitqueue_head(&call->tx_waitq); + spin_lock_init(&call->lock); + rwlock_init(&call->state_lock); + atomic_set(&call->usage, 1); + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + + memset(&call->sock_node, 0xed, sizeof(call->sock_node)); + + call->rx_data_expect = 1; + call->rx_data_eaten = 0; + call->rx_first_oos = 0; + call->ackr_win_top = call->rx_data_eaten + 1 + RXRPC_MAXACKS; + call->creation_jif = jiffies; + return call; +} + +/* + * 
allocate a new client call and attempt to get a connection slot for it
+ */
+static struct rxrpc_call *rxrpc_alloc_client_call(
+	struct rxrpc_sock *rx,
+	struct rxrpc_transport *trans,
+	struct rxrpc_conn_bundle *bundle,
+	gfp_t gfp)
+{
+	struct rxrpc_call *call;
+	int ret;
+
+	_enter("");
+
+	ASSERT(rx != NULL);
+	ASSERT(trans != NULL);
+	ASSERT(bundle != NULL);
+
+	call = rxrpc_alloc_call(gfp);
+	if (!call)
+		return ERR_PTR(-ENOMEM);
+
+	sock_hold(&rx->sk);
+	call->socket = rx;
+	call->rx_data_post = 1;
+
+	ret = rxrpc_connect_call(rx, trans, bundle, call, gfp);
+	if (ret < 0) {
+		kmem_cache_free(rxrpc_call_jar, call);
+		return ERR_PTR(ret);
+	}
+
+	spin_lock(&call->conn->trans->peer->lock);
+	list_add(&call->error_link, &call->conn->trans->peer->error_targets);
+	spin_unlock(&call->conn->trans->peer->lock);
+
+	call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ;
+	add_timer(&call->lifetimer);
+
+	_leave(" = %p", call);
+	return call;
+}
+
+/*
+ * set up a call for the given data
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx,
+					 struct rxrpc_transport *trans,
+					 struct rxrpc_conn_bundle *bundle,
+					 unsigned long user_call_ID,
+					 int create,
+					 gfp_t gfp)
+{
+	struct rxrpc_call *call, *candidate;
+	struct rb_node *p, *parent, **pp;
+
+	_enter("%p,%d,%d,%lx,%d",
+	       rx, trans ? trans->debug_id : -1, bundle ? bundle->debug_id : -1,
+	       user_call_ID, create);
+
+	/* search the extant calls first for one that matches the specified
+	 * user ID */
+	read_lock(&rx->call_lock);
+
+	p = rx->calls.rb_node;
+	while (p) {
+		call = rb_entry(p, struct rxrpc_call, sock_node);
+
+		if (user_call_ID < call->user_call_ID)
+			p = p->rb_left;
+		else if (user_call_ID > call->user_call_ID)
+			p = p->rb_right;
+		else
+			goto found_extant_call;
+	}
+
+	read_unlock(&rx->call_lock);
+
+	if (!create || !trans)
+		return ERR_PTR(-EBADSLT);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_client_call(rx, trans, bundle, gfp);
+	if (IS_ERR(candidate)) {
+		_leave(" = %ld", PTR_ERR(candidate));
+		return candidate;
+	}
+
+	candidate->user_call_ID = user_call_ID;
+	__set_bit(RXRPC_CALL_HAS_USERID, &candidate->flags);
+
+	write_lock(&rx->call_lock);
+
+	pp = &rx->calls.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+		if (user_call_ID < call->user_call_ID)
+			pp = &(*pp)->rb_left;
+		else if (user_call_ID > call->user_call_ID)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_extant_second;
+	}
+
+	/* second search also failed; add the new call */
+	call = candidate;
+	candidate = NULL;
+	rxrpc_get_call(call);
+
+	rb_link_node(&call->sock_node, parent, pp);
+	rb_insert_color(&call->sock_node, &rx->calls);
+	write_unlock(&rx->call_lock);
+
+	write_lock_bh(&rxrpc_call_lock);
+	list_add_tail(&call->link, &rxrpc_calls);
+	write_unlock_bh(&rxrpc_call_lock);
+
+	_net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+	_leave(" = %p [new]", call);
+	return call;
+
+	/* we found the call in the list immediately */
+found_extant_call:
+	rxrpc_get_call(call);
+	read_unlock(&rx->call_lock);
+	_leave(" = %p [extant %d]", call, atomic_read(&call->usage));
+	return call;
+
+	/* we found the call on the second time through the list */
+found_extant_second:
+	rxrpc_get_call(call);
+	write_unlock(&rx->call_lock);
+	rxrpc_put_call(candidate);
+	_leave(" = %p [second %d]", call, atomic_read(&call->usage));
+	return call;
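rxrpc_get_client_call() above cannot allocate while holding rx->call_lock, so it searches the calls tree twice: optimistically under the read lock, then again under the write lock after allocating a candidate, and the loser of any race simply discards its candidate. Schematically, with every example_*() name and the obj/tree types being hypothetical stand-ins:

	/* find, or create and insert, the object keyed by id */
	static struct obj *example_get(struct tree *t, unsigned long id, gfp_t gfp)
	{
		struct obj *found, *candidate;

		read_lock(&t->lock);
		found = example_find(t, id);	/* fast path */
		read_unlock(&t->lock);
		if (found)
			return found;

		candidate = example_alloc(gfp);	/* may sleep; no locks held */
		if (!candidate)
			return NULL;

		write_lock(&t->lock);
		found = example_find(t, id);	/* re-check: we may have raced */
		if (!found) {
			example_insert(t, candidate);
			found = candidate;
			candidate = NULL;
		}
		write_unlock(&t->lock);

		kfree(candidate);		/* NULL, and so a no-op, if we won */
		return found;
	}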
+} + +/* + * set up an incoming call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, + struct rxrpc_header *hdr, + gfp_t gfp) +{ + struct rxrpc_call *call, *candidate; + struct rb_node **p, *parent; + __be32 call_id; + + _enter(",%d,,%x", conn->debug_id, gfp); + + ASSERT(rx != NULL); + + candidate = rxrpc_alloc_call(gfp); + if (!candidate) + return ERR_PTR(-EBUSY); + + candidate->socket = rx; + candidate->conn = conn; + candidate->cid = hdr->cid; + candidate->call_id = hdr->callNumber; + candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK; + candidate->rx_data_post = 0; + candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + if (conn->security_ix > 0) + candidate->state = RXRPC_CALL_SERVER_SECURING; + + write_lock_bh(&conn->lock); + + /* set the channel for this call */ + call = conn->channels[candidate->channel]; + _debug("channel[%u] is %p", candidate->channel, call); + if (call && call->call_id == hdr->callNumber) { + /* already set; must've been a duplicate packet */ + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + read_unlock(&call->state_lock); + goto aborted_call; + default: + rxrpc_get_call(call); + read_unlock(&call->state_lock); + goto extant_call; + } + } + + if (call) { + /* it seems the channel is still in use from the previous call + * - ditch the old binding if its call is now complete */ + _debug("CALL: %u { %s }", + call->debug_id, rxrpc_call_states[call->state]); + + if (call->state >= RXRPC_CALL_COMPLETE) { + conn->channels[call->channel] = NULL; + } else { + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -EBUSY"); + return ERR_PTR(-EBUSY); + } + } + + /* check the call number isn't duplicate */ + _debug("check dup"); + call_id = hdr->callNumber; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + call = rb_entry(parent, struct rxrpc_call, conn_node); + + if (call_id < call->call_id) + p = &(*p)->rb_left; + else if (call_id > call->call_id) + p = &(*p)->rb_right; + else + goto old_call; + } + + /* make the call available */ + _debug("new call"); + call = candidate; + candidate = NULL; + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + conn->channels[call->channel] = call; + sock_hold(&rx->sk); + atomic_inc(&conn->usage); + write_unlock_bh(&conn->lock); + + spin_lock(&conn->trans->peer->lock); + list_add(&call->error_link, &conn->trans->peer->error_targets); + spin_unlock(&conn->trans->peer->lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); + + call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + add_timer(&call->lifetimer); + _leave(" = %p {%d} [new]", call, call->debug_id); + return call; + +extant_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = %p {%d} [extant]", call, call ? 
call->debug_id : -1); + return call; + +aborted_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNABORTED"); + return ERR_PTR(-ECONNABORTED); + +old_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNRESET [old]"); + return ERR_PTR(-ECONNRESET); +} + +/* + * find an extant server call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p,%lx", rx, user_call_ID); + + /* search the extant calls for one that matches the specified user + * ID */ + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + _leave(" = NULL"); + return NULL; + + /* we found the call in the list immediately */ +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [%d]", call, atomic_read(&call->usage)); + return call; +} + +/* + * detach a call from a socket and set up for release + */ +void rxrpc_release_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_sock *rx = call->socket; + + _enter("{%d,%d,%d,%d}", + call->debug_id, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + call->rx_first_oos); + + spin_lock_bh(&call->lock); + if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) + BUG(); + spin_unlock_bh(&call->lock); + + /* dissociate from the socket + * - the socket's ref on the call is passed to the death timer + */ + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + write_lock_bh(&rx->call_lock); + if (!list_empty(&call->accept_link)) { + _debug("unlinking once-pending call %p { e=%lx f=%lx }", + call, call->events, call->flags); + ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + rb_erase(&call->sock_node, &rx->calls); + memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + write_unlock_bh(&rx->call_lock); + + /* free up the channel for reuse */ + spin_lock(&conn->trans->client_lock); + write_lock_bh(&conn->lock); + write_lock(&call->state_lock); + + if (conn->channels[call->channel] == call) + conn->channels[call->channel] = NULL; + + if (conn->out_clientflag && conn->bundle) { + conn->avail_calls++; + switch (conn->avail_calls) { + case 1: + list_move_tail(&conn->bundle_link, + &conn->bundle->avail_conns); + case 2 ... 
RXRPC_MAXCALLS - 1: + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + break; + case RXRPC_MAXCALLS: + list_move_tail(&conn->bundle_link, + &conn->bundle->unused_conns); + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + break; + default: + printk(KERN_ERR "RxRPC: conn->avail_calls=%d\n", + conn->avail_calls); + BUG(); + } + } + + spin_unlock(&conn->trans->client_lock); + + if (call->state < RXRPC_CALL_COMPLETE && + call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { + _debug("+++ ABORTING STATE %d +++\n", call->state); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_DEAD; + set_bit(RXRPC_CALL_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + write_unlock_bh(&conn->lock); + + /* clean up the Rx queue */ + if (!skb_queue_empty(&call->rx_queue) || + !skb_queue_empty(&call->rx_oos_queue)) { + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + + _debug("purge Rx queues"); + + spin_lock_bh(&call->lock); + while ((skb = skb_dequeue(&call->rx_queue)) || + (skb = skb_dequeue(&call->rx_oos_queue))) { + sp = rxrpc_skb(skb); + if (sp->call) { + ASSERTCMP(sp->call, ==, call); + rxrpc_put_call(call); + sp->call = NULL; + } + skb->destructor = NULL; + spin_unlock_bh(&call->lock); + + _debug("- zap %s %%%u #%u", + rxrpc_pkts[sp->hdr.type], + ntohl(sp->hdr.serial), + ntohl(sp->hdr.seq)); + rxrpc_free_skb(skb); + spin_lock_bh(&call->lock); + } + spin_unlock_bh(&call->lock); + + ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); + } + + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->lifetimer); + call->deadspan.expires = jiffies + rxrpc_dead_call_timeout * HZ; + add_timer(&call->deadspan); + + _leave(""); +} + +/* + * handle a dead call being ready for reaping + */ +static void rxrpc_dead_call_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_DEAD; + write_unlock_bh(&call->state_lock); + rxrpc_put_call(call); +} + +/* + * mark a call as to be released, aborting it if it's still in progress + * - called with softirqs disabled + */ +static void rxrpc_mark_call_released(struct rxrpc_call *call) +{ + bool sched; + + write_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + sched = false; + if (call->state < RXRPC_CALL_COMPLETE) { + _debug("abort call %p", call); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_DEAD; + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + sched = true; + } + if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + sched = true; + if (sched) + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); +} + +/* + * release all the calls associated with a socket + */ +void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p", rx); + + read_lock_bh(&rx->call_lock); + + /* mark all the calls as no longer wanting incoming packets */ + for (p = rb_first(&rx->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, sock_node); + rxrpc_mark_call_released(call); + } + + /* kill the not-yet-accepted incoming calls */ + list_for_each_entry(call, &rx->secureq, accept_link) { + rxrpc_mark_call_released(call); + } + + list_for_each_entry(call, &rx->acceptq, 
accept_link) { + rxrpc_mark_call_released(call); + } + + read_unlock_bh(&rx->call_lock); + _leave(""); +} + +/* + * release a call + */ +void __rxrpc_put_call(struct rxrpc_call *call) +{ + ASSERT(call != NULL); + + _enter("%p{u=%d}", call, atomic_read(&call->usage)); + + ASSERTCMP(atomic_read(&call->usage), >, 0); + + if (atomic_dec_and_test(&call->usage)) { + _debug("call %d dead", call->debug_id); + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + rxrpc_queue_work(&call->destroyer); + } + _leave(""); +} + +/* + * clean up a call + */ +static void rxrpc_cleanup_call(struct rxrpc_call *call) +{ + _net("DESTROY CALL %d", call->debug_id); + + ASSERT(call->socket); + + memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); + + del_timer_sync(&call->lifetimer); + del_timer_sync(&call->deadspan); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->resend_timer); + + ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); + ASSERTCMP(call->events, ==, 0); + if (work_pending(&call->processor)) { + _debug("defer destroy"); + rxrpc_queue_work(&call->destroyer); + return; + } + + if (call->conn) { + spin_lock(&call->conn->trans->peer->lock); + list_del(&call->error_link); + spin_unlock(&call->conn->trans->peer->lock); + + write_lock_bh(&call->conn->lock); + rb_erase(&call->conn_node, &call->conn->calls); + write_unlock_bh(&call->conn->lock); + rxrpc_put_connection(call->conn); + } + + if (call->acks_window) { + _debug("kill Tx window %d", + CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz)); + smp_mb(); + while (CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) { + struct rxrpc_skb_priv *sp; + unsigned long _skb; + + _skb = call->acks_window[call->acks_tail] & ~1; + sp = rxrpc_skb((struct sk_buff *) _skb); + _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + rxrpc_free_skb((struct sk_buff *) _skb); + call->acks_tail = + (call->acks_tail + 1) & (call->acks_winsz - 1); + } + + kfree(call->acks_window); + } + + rxrpc_free_skb(call->tx_pending); + + rxrpc_purge_queue(&call->rx_queue); + ASSERT(skb_queue_empty(&call->rx_oos_queue)); + sock_put(&call->socket->sk); + kmem_cache_free(rxrpc_call_jar, call); +} + +/* + * destroy a call + */ +static void rxrpc_destroy_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, destroyer); + + _enter("%p{%d,%d,%p}", + call, atomic_read(&call->usage), call->channel, call->conn); + + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + rxrpc_cleanup_call(call); + _leave(""); +} + +/* + * preemptively destroy all the call records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_calls(void) +{ + struct rxrpc_call *call; + + _enter(""); + write_lock_bh(&rxrpc_call_lock); + + while (!list_empty(&rxrpc_calls)) { + call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); + _debug("Zapping call %p", call); + + list_del_init(&call->link); + + switch (atomic_read(&call->usage)) { + case 0: + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + break; + case 1: + if (del_timer_sync(&call->deadspan) != 0 && + call->state != RXRPC_CALL_DEAD) + rxrpc_dead_call_expired((unsigned long) call); + if (call->state != RXRPC_CALL_DEAD) + break; + default: + printk(KERN_ERR "RXRPC:" + " Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + rxrpc_call_states[call->state], + call->flags, 
call->events); + if (!skb_queue_empty(&call->rx_queue)) + printk(KERN_ERR"RXRPC: Rx queue occupied\n"); + if (!skb_queue_empty(&call->rx_oos_queue)) + printk(KERN_ERR"RXRPC: OOS queue occupied\n"); + break; + } + + write_unlock_bh(&rxrpc_call_lock); + cond_resched(); + write_lock_bh(&rxrpc_call_lock); + } + + write_unlock_bh(&rxrpc_call_lock); + _leave(""); +} + +/* + * handle call lifetime being exceeded + */ +static void rxrpc_call_life_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + _enter("{%d}", call->debug_id); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + set_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * handle resend timer expiry + */ +static void rxrpc_resend_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + read_lock_bh(&call->state_lock); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * handle ACK timer expiry + */ +static void rxrpc_ack_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c new file mode 100644 index 00000000000..43cb3e051ec --- /dev/null +++ b/net/rxrpc/ar-connection.c @@ -0,0 +1,911 @@ +/* RxRPC virtual connection handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static void rxrpc_connection_reaper(struct work_struct *work);
+
+LIST_HEAD(rxrpc_connections);
+DEFINE_RWLOCK(rxrpc_connection_lock);
+static unsigned long rxrpc_connection_timeout = 10 * 60;
+static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+
+/*
+ * allocate a new client connection bundle
+ */
+static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
+{
+	struct rxrpc_conn_bundle *bundle;
+
+	_enter("");
+
+	bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp);
+	if (bundle) {
+		INIT_LIST_HEAD(&bundle->unused_conns);
+		INIT_LIST_HEAD(&bundle->avail_conns);
+		INIT_LIST_HEAD(&bundle->busy_conns);
+		init_waitqueue_head(&bundle->chanwait);
+		atomic_set(&bundle->usage, 1);
+	}
+
+	_leave(" = %p", bundle);
+	return bundle;
+}
+
+/*
+ * compare bundle parameters with what we're looking for
+ * - return -ve, 0 or +ve
+ */
+static inline
+int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
+		     struct key *key, __be16 service_id)
+{
+	return (bundle->service_id - service_id) ?:
+		((unsigned long) bundle->key - (unsigned long) key);
+}
+
+/*
+ * get bundle of client connections that a client socket can make use of
+ */
+struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
+					   struct rxrpc_transport *trans,
+					   struct key *key,
+					   __be16 service_id,
+					   gfp_t gfp)
+{
+	struct rxrpc_conn_bundle *bundle, *candidate;
+	struct rb_node *p, *parent, **pp;
+
+	_enter("%p{%x},%x,%hx,",
+	       rx, key_serial(key), trans->debug_id, ntohs(service_id));
+
+	if (rx->trans == trans && rx->bundle) {
+		atomic_inc(&rx->bundle->usage);
+		return rx->bundle;
+	}
+
+	/* search the extant bundles first for one that matches the specified
+	 * user ID */
+	spin_lock(&trans->client_lock);
+
+	p = trans->bundles.rb_node;
+	while (p) {
+		bundle = rb_entry(p, struct rxrpc_conn_bundle, node);
+
+		if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
+			p = p->rb_left;
+		else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
+			p = p->rb_right;
+		else
+			goto found_extant_bundle;
+	}
+
+	spin_unlock(&trans->client_lock);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_bundle(gfp);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	candidate->key = key_get(key);
+	candidate->service_id = service_id;
+
+	spin_lock(&trans->client_lock);
+
+	pp = &trans->bundles.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		bundle = rb_entry(parent, struct rxrpc_conn_bundle, node);
+
+		if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
+			pp = &(*pp)->rb_left;
+		else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_extant_second;
+	}
+
+	/* second search also failed; add the new bundle */
+	bundle = candidate;
+	candidate = NULL;
+
+	rb_link_node(&bundle->node, parent, pp);
+	rb_insert_color(&bundle->node, &trans->bundles);
+	spin_unlock(&trans->client_lock);
+	_net("BUNDLE new on trans %d", trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [new]", bundle);
+	return bundle;
+
+	/* we found the bundle in the list immediately */
+found_extant_bundle:
+	atomic_inc(&bundle->usage);
+	spin_unlock(&trans->client_lock);
+	_net("BUNDLE old on trans %d",
+	     trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage));
+	return bundle;
+
+	/* we found the bundle on the second time through the list */
+found_extant_second:
+	atomic_inc(&bundle->usage);
+	spin_unlock(&trans->client_lock);
+	kfree(candidate);
+	_net("BUNDLE old2 on trans %d", trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage));
+	return bundle;
+}
+
+/*
+ * release a bundle
+ */
+void rxrpc_put_bundle(struct rxrpc_transport *trans,
+		      struct rxrpc_conn_bundle *bundle)
+{
+	_enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage));
+
+	if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) {
+		_debug("Destroy bundle");
+		rb_erase(&bundle->node, &trans->bundles);
+		spin_unlock(&trans->client_lock);
+		ASSERT(list_empty(&bundle->unused_conns));
+		ASSERT(list_empty(&bundle->avail_conns));
+		ASSERT(list_empty(&bundle->busy_conns));
+		ASSERTCMP(bundle->num_conns, ==, 0);
+		key_put(bundle->key);
+		kfree(bundle);
+	}
+
+	_leave("");
+}
+
+/*
+ * allocate a new connection
+ */
+static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+{
+	struct rxrpc_connection *conn;
+
+	_enter("");
+
+	conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
+	if (conn) {
+		INIT_WORK(&conn->processor, &rxrpc_process_connection);
+		INIT_LIST_HEAD(&conn->bundle_link);
+		conn->calls = RB_ROOT;
+		skb_queue_head_init(&conn->rx_queue);
+		rwlock_init(&conn->lock);
+		spin_lock_init(&conn->state_lock);
+		atomic_set(&conn->usage, 1);
+		conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
+		conn->avail_calls = RXRPC_MAXCALLS;
+		conn->size_align = 4;
+		conn->header_size = sizeof(struct rxrpc_header);
+	}
+
+	_leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
+	return conn;
+}
+
+/*
+ * assign a connection ID to a connection and add it to the transport's
+ * connection lookup tree
+ * - called with transport client lock held
+ */
+static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
+{
+	struct rxrpc_connection *xconn;
+	struct rb_node *parent, **p;
+	__be32 epoch;
+	u32 real_conn_id;
+
+	_enter("");
+
+	epoch = conn->epoch;
+
+	write_lock_bh(&conn->trans->conn_lock);
+
+	conn->trans->conn_idcounter += RXRPC_CID_INC;
+	if (conn->trans->conn_idcounter < RXRPC_CID_INC)
+		conn->trans->conn_idcounter = RXRPC_CID_INC;
+	real_conn_id = conn->trans->conn_idcounter;
+
+attempt_insertion:
+	parent = NULL;
+	p = &conn->trans->client_conns.rb_node;
+
+	while (*p) {
+		parent = *p;
+		xconn = rb_entry(parent, struct rxrpc_connection, node);
+
+		if (epoch < xconn->epoch)
+			p = &(*p)->rb_left;
+		else if (epoch > xconn->epoch)
+			p = &(*p)->rb_right;
+		else if (real_conn_id < xconn->real_conn_id)
+			p = &(*p)->rb_left;
+		else if (real_conn_id > xconn->real_conn_id)
+			p = &(*p)->rb_right;
+		else
+			goto id_exists;
+	}
+
+	/* we've found a suitable hole - arrange for this connection to occupy
+	 * it */
+	rb_link_node(&conn->node, parent, p);
+	rb_insert_color(&conn->node, &conn->trans->client_conns);
+
+	conn->real_conn_id = real_conn_id;
+	conn->cid = htonl(real_conn_id);
+	write_unlock_bh(&conn->trans->conn_lock);
+	_leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+	return;
+
+	/* we found a connection with the proposed ID - walk the tree from that
+	 * point looking for the next unused ID */
+id_exists: + for (;;) { + real_conn_id += RXRPC_CID_INC; + if (real_conn_id < RXRPC_CID_INC) { + real_conn_id = RXRPC_CID_INC; + conn->trans->conn_idcounter = real_conn_id; + goto attempt_insertion; + } + + parent = rb_next(parent); + if (!parent) + goto attempt_insertion; + + xconn = rb_entry(parent, struct rxrpc_connection, node); + if (epoch < xconn->epoch || + real_conn_id < xconn->real_conn_id) + goto attempt_insertion; + } +} + +/* + * add a call to a connection's call-by-ID tree + */ +static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, + struct rxrpc_call *call) +{ + struct rxrpc_call *xcall; + struct rb_node *parent, **p; + __be32 call_id; + + write_lock_bh(&conn->lock); + + call_id = call->call_id; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xcall = rb_entry(parent, struct rxrpc_call, conn_node); + + if (call_id < xcall->call_id) + p = &(*p)->rb_left; + else if (call_id > xcall->call_id) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + + write_unlock_bh(&conn->lock); +} + +/* + * connect a call on an exclusive connection + */ +static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + __be16 service_id, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn; + int chan, ret; + + _enter(""); + + conn = rx->conn; + if (!conn) { + /* not yet present - create a candidate for a new connection + * and then redo the check */ + conn = rxrpc_alloc_connection(gfp); + if (IS_ERR(conn)) { + _leave(" = %ld", PTR_ERR(conn)); + return PTR_ERR(conn); + } + + conn->trans = trans; + conn->bundle = NULL; + conn->service_id = service_id; + conn->epoch = rxrpc_epoch; + conn->in_clientflag = 0; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->cid = 0; + conn->state = RXRPC_CONN_CLIENT; + conn->avail_calls = RXRPC_MAXCALLS - 1; + conn->security_level = rx->min_sec_level; + conn->key = key_get(rx->key); + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) { + key_put(conn->key); + kfree(conn); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + atomic_inc(&trans->usage); + + _net("CONNECT EXCL new %d on TRANS %d", + conn->debug_id, conn->trans->debug_id); + + rxrpc_assign_connection_id(conn); + rx->conn = conn; + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + goto no_free_channels; + +found_channel: + atomic_inc(&conn->usage); + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | htonl(chan); + call->call_id = htonl(++conn->call_counter); + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, ntohl(call->call_id)); + + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + _leave(" = 0"); + return 0; + +no_free_channels: + spin_unlock(&trans->client_lock); + _leave(" = -ENOSR"); + return -ENOSR; +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct 
rxrpc_conn_bundle *bundle, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate; + int chan, ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%p,%lx,", rx, call->user_call_ID); + + if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) + return rxrpc_connect_exclusive(rx, trans, bundle->service_id, + call, gfp); + + spin_lock(&trans->client_lock); + for (;;) { + /* see if the bundle has a call slot available */ + if (!list_empty(&bundle->avail_conns)) { + _debug("avail"); + conn = list_entry(bundle->avail_conns.next, + struct rxrpc_connection, + bundle_link); + if (--conn->avail_calls == 0) + list_move(&conn->bundle_link, + &bundle->busy_conns); + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + break; + } + + if (!list_empty(&bundle->unused_conns)) { + _debug("unused"); + conn = list_entry(bundle->unused_conns.next, + struct rxrpc_connection, + bundle_link); + ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS); + conn->avail_calls = RXRPC_MAXCALLS - 1; + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + list_move(&conn->bundle_link, &bundle->avail_conns); + break; + } + + /* need to allocate a new connection */ + _debug("get new conn [%d]", bundle->num_conns); + + spin_unlock(&trans->client_lock); + + if (signal_pending(current)) + goto interrupted; + + if (bundle->num_conns >= 20) { + _debug("too many conns"); + + if (!(gfp & __GFP_WAIT)) { + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + add_wait_queue(&bundle->chanwait, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (bundle->num_conns < 20 || + !list_empty(&bundle->unused_conns) || + !list_empty(&bundle->avail_conns)) + break; + if (signal_pending(current)) + goto interrupted_dequeue; + schedule(); + } + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); + spin_lock(&trans->client_lock); + continue; + } + + /* not yet present - create a candidate for a new connection and then + * redo the check */ + candidate = rxrpc_alloc_connection(gfp); + if (IS_ERR(candidate)) { + _leave(" = %ld", PTR_ERR(candidate)); + return PTR_ERR(candidate); + } + + candidate->trans = trans; + candidate->bundle = bundle; + candidate->service_id = bundle->service_id; + candidate->epoch = rxrpc_epoch; + candidate->in_clientflag = 0; + candidate->out_clientflag = RXRPC_CLIENT_INITIATED; + candidate->cid = 0; + candidate->state = RXRPC_CONN_CLIENT; + candidate->avail_calls = RXRPC_MAXCALLS; + candidate->security_level = rx->min_sec_level; + candidate->key = key_get(bundle->key); + + ret = rxrpc_init_client_conn_security(candidate); + if (ret < 0) { + key_put(candidate->key); + kfree(candidate); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&candidate->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + + list_add(&candidate->bundle_link, &bundle->unused_conns); + bundle->num_conns++; + atomic_inc(&bundle->usage); + atomic_inc(&trans->usage); + + _net("CONNECT new %d on TRANS %d", + candidate->debug_id, candidate->trans->debug_id); + + rxrpc_assign_connection_id(candidate); + if (candidate->security) + candidate->security->prime_packet_security(candidate); + + /* leave the candidate lurking in 
zombie mode attached to the + * bundle until we're ready for it */ + rxrpc_put_connection(candidate); + candidate = NULL; + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + * - we're holding a reference on the bundle + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + BUG(); + +found_channel: + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | htonl(chan); + call->call_id = htonl(++conn->call_counter); + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, ntohl(call->call_id)); + + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + + _leave(" = 0"); + return 0; + +interrupted_dequeue: + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); +interrupted: + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; +} + +/* + * get a record of an incoming connection + */ +struct rxrpc_connection * +rxrpc_incoming_connection(struct rxrpc_transport *trans, + struct rxrpc_header *hdr, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rb_node *p, **pp; + const char *new = "old"; + __be32 epoch; + u32 conn_id; + + _enter(""); + + ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); + + epoch = hdr->epoch; + conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + + /* search the connection list first */ + read_lock_bh(&trans->conn_lock); + + p = trans->server_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->real_conn_id); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (conn_id < conn->real_conn_id) + p = p->rb_left; + else if (conn_id > conn->real_conn_id) + p = p->rb_right; + else + goto found_extant_connection; + } + read_unlock_bh(&trans->conn_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_connection(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->trans = trans; + candidate->epoch = hdr->epoch; + candidate->cid = hdr->cid & __constant_cpu_to_be32(RXRPC_CIDMASK); + candidate->service_id = hdr->serviceId; + candidate->security_ix = hdr->securityIndex; + candidate->in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->out_clientflag = 0; + candidate->real_conn_id = conn_id; + candidate->state = RXRPC_CONN_SERVER; + if (candidate->service_id) + candidate->state = RXRPC_CONN_SERVER_UNSECURED; + + write_lock_bh(&trans->conn_lock); + + pp = &trans->server_conns.rb_node; + p = NULL; + while (*pp) { + p = *pp; + conn = rb_entry(p, struct rxrpc_connection, node); + + if (epoch < conn->epoch) + pp = &(*pp)->rb_left; + else if (epoch > conn->epoch) + pp = &(*pp)->rb_right; + else if (conn_id < conn->real_conn_id) + pp = &(*pp)->rb_left; + else if (conn_id > conn->real_conn_id) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + conn = candidate; + candidate = NULL; + rb_link_node(&conn->node, p, pp); + rb_insert_color(&conn->node, &trans->server_conns); + atomic_inc(&conn->trans->usage); + + 
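The CID handling above relies on the Rx wire format packing the channel number into the low bits of the connection ID: four calls share a connection, conn_idcounter advances by RXRPC_CID_INC so the channel bits stay clear, and incoming CIDs are split with the RXRPC_CIDMASK and RXRPC_CHANNELMASK masks already used in this file. A small sketch of that decomposition, assuming those constants; example_split_cid() is an illustrative name:

	/* split a network-order CID into its connection and channel parts */
	static inline void example_split_cid(__be32 wire_cid, u32 *conn_id,
					     u8 *channel)
	{
		u32 cid = ntohl(wire_cid);

		*conn_id = cid & RXRPC_CIDMASK;		/* connection portion */
		*channel = cid & RXRPC_CHANNELMASK;	/* call slot */
	}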
write_unlock_bh(&trans->conn_lock); + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + new = "new"; + +success: + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id); + + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return conn; + + /* we found the connection in the list immediately */ +found_extant_connection: + if (hdr->securityIndex != conn->security_ix) { + read_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + goto success; + + /* we found the connection on the second time through the list */ +found_extant_second: + if (hdr->securityIndex != conn->security_ix) { + write_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + write_unlock_bh(&trans->conn_lock); + kfree(candidate); + goto success; + +security_mismatch: + kfree(candidate); + _leave(" = -EKEYREJECTED"); + return ERR_PTR(-EKEYREJECTED); +} + +/* + * find a connection based on transport and RxRPC connection ID for an incoming + * packet + */ +struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, + struct rxrpc_header *hdr) +{ + struct rxrpc_connection *conn; + struct rb_node *p; + __be32 epoch; + u32 conn_id; + + _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags); + + read_lock_bh(&trans->conn_lock); + + conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + epoch = hdr->epoch; + + if (hdr->flags & RXRPC_CLIENT_INITIATED) + p = trans->server_conns.rb_node; + else + p = trans->client_conns.rb_node; + + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->real_conn_id); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (conn_id < conn->real_conn_id) + p = p->rb_left; + else if (conn_id > conn->real_conn_id) + p = p->rb_right; + else + goto found; + } + + read_unlock_bh(&trans->conn_lock); + _leave(" = NULL"); + return NULL; + +found: + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + _leave(" = %p", conn); + return conn; +} + +/* + * release a virtual connection + */ +void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + _enter("%p{u=%d,d=%d}", + conn, atomic_read(&conn->usage), conn->debug_id); + + ASSERTCMP(atomic_read(&conn->usage), >, 0); + + conn->put_time = xtime.tv_sec; + if (atomic_dec_and_test(&conn->usage)) { + _debug("zombie"); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + } + + _leave(""); +} + +/* + * destroy a virtual connection + */ +static void rxrpc_destroy_connection(struct rxrpc_connection *conn) +{ + _enter("%p{%d}", conn, atomic_read(&conn->usage)); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + + _net("DESTROY CONN %d", conn->debug_id); + + if (conn->bundle) + rxrpc_put_bundle(conn->trans, conn->bundle); + + ASSERT(RB_EMPTY_ROOT(&conn->calls)); + rxrpc_purge_queue(&conn->rx_queue); + + rxrpc_clear_conn_security(conn); + rxrpc_put_transport(conn->trans); + kfree(conn); + _leave(""); +} + +/* + * reap dead connections + */ +void rxrpc_connection_reaper(struct work_struct *work) +{ + struct rxrpc_connection *conn, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = xtime.tv_sec; + earliest = ULONG_MAX; + + write_lock_bh(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, 
atomic_read(&conn->usage), + (long) now - (long) conn->put_time); + + if (likely(atomic_read(&conn->usage) > 0)) + continue; + + spin_lock(&conn->trans->client_lock); + write_lock(&conn->trans->conn_lock); + reap_time = conn->put_time + rxrpc_connection_timeout; + + if (atomic_read(&conn->usage) > 0) { + ; + } else if (reap_time <= now) { + list_move_tail(&conn->link, &graveyard); + if (conn->out_clientflag) + rb_erase(&conn->node, + &conn->trans->client_conns); + else + rb_erase(&conn->node, + &conn->trans->server_conns); + if (conn->bundle) { + list_del_init(&conn->bundle_link); + conn->bundle->num_conns--; + } + + } else if (reap_time < earliest) { + earliest = reap_time; + } + + write_unlock(&conn->trans->conn_lock); + spin_unlock(&conn->trans->client_lock); + } + write_unlock_bh(&rxrpc_connection_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + conn = list_entry(graveyard.next, struct rxrpc_connection, + link); + list_del_init(&conn->link); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + rxrpc_destroy_connection(conn); + } + + _leave(""); +} + +/* + * preemptively destroy all the connection records rather than waiting for them + * to time out + */ +void __exit rxrpc_destroy_all_connections(void) +{ + _enter(""); + + rxrpc_connection_timeout = 0; + cancel_delayed_work(&rxrpc_connection_reap); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + + _leave(""); +} diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c new file mode 100644 index 00000000000..1ada43d5116 --- /dev/null +++ b/net/rxrpc/ar-connevent.c @@ -0,0 +1,403 @@ +/* connection-level event handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/errqueue.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <net/ip.h> +#include "ar-internal.h" + +/* + * pass a connection-level abort onto all calls on that connection + */ +static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, + u32 abort_code) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("{%d},%x", conn->debug_id, abort_code); + + read_lock_bh(&conn->lock); + + for (p = rb_first(&conn->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, conn_node); + write_lock(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = state; + call->abort_code = abort_code; + if (state == RXRPC_CALL_LOCALLY_ABORTED) + set_bit(RXRPC_CALL_CONN_ABORT, &call->events); + else + set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + } + + read_unlock_bh(&conn->lock); + _leave(""); +} + +/* + * generate a connection-level abort + */ +static int rxrpc_abort_connection(struct rxrpc_connection *conn, + u32 error, u32 abort_code) +{ + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + int ret; + + _enter("%d,,%u,%u", conn->debug_id, error, abort_code); + + /* generate a connection-level abort */ + spin_lock_bh(&conn->state_lock); + if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { + conn->state = RXRPC_CONN_LOCALLY_ABORTED; + conn->error = error; + spin_unlock_bh(&conn->state_lock); + } else { + spin_unlock_bh(&conn->state_lock); + _leave(" = 0 [already dead]"); + return 0; + } + + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = conn->epoch; + hdr.cid = conn->cid; + hdr.callNumber = 0; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_ABORT; + hdr.flags = conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = conn->service_id; + + word = htonl(abort_code); + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + hdr.serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * mark a call as being on a now-secured channel + * - must be called with softirqs disabled + */ +void rxrpc_call_is_secure(struct rxrpc_call *call) +{ + _enter("%p", call); + if (call) { + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_SECURED, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } +} + +/* + * connection-level Rx packet processor + */ +static int rxrpc_process_event(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 tmp; + u32 serial; + int loop, ret; + + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) + return -ECONNABORTED; + + serial = 
ntohl(sp->hdr.serial); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0) + return -EPROTO; + _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp)); + + conn->state = RXRPC_CONN_REMOTELY_ABORTED; + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + ntohl(tmp)); + return -ECONNABORTED; + + case RXRPC_PACKET_TYPE_CHALLENGE: + if (conn->security) + return conn->security->respond_to_challenge( + conn, skb, _abort_code); + return -EPROTO; + + case RXRPC_PACKET_TYPE_RESPONSE: + if (!conn->security) + return -EPROTO; + + ret = conn->security->verify_response(conn, skb, _abort_code); + if (ret < 0) + return ret; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) + return ret; + + conn->security->prime_packet_security(conn); + read_lock_bh(&conn->lock); + spin_lock(&conn->state_lock); + + if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) { + conn->state = RXRPC_CONN_SERVER; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop]); + } + + spin_unlock(&conn->state_lock); + read_unlock_bh(&conn->lock); + return 0; + + default: + return -EPROTO; + } +} + +/* + * set up security and issue a challenge + */ +static void rxrpc_secure_connection(struct rxrpc_connection *conn) +{ + u32 abort_code; + int ret; + + _enter("{%d}", conn->debug_id); + + ASSERT(conn->security_ix != 0); + + if (!conn->key) { + _debug("set up security"); + ret = rxrpc_init_server_conn_security(conn); + switch (ret) { + case 0: + break; + case -ENOENT: + abort_code = RX_CALL_DEAD; + goto abort; + default: + abort_code = RXKADNOAUTH; + goto abort; + } + } + + ASSERT(conn->security != NULL); + + if (conn->security->issue_challenge(conn) < 0) { + abort_code = RX_CALL_DEAD; + ret = -ENOMEM; + goto abort; + } + + _leave(""); + return; + +abort: + _debug("abort %d, %d", ret, abort_code); + rxrpc_abort_connection(conn, -ret, abort_code); + _leave(" [aborted]"); +} + +/* + * connection-level event processor + */ +void rxrpc_process_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, processor); + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + u32 abort_code = RX_PROTOCOL_ERROR; + int ret; + + _enter("{%d}", conn->debug_id); + + atomic_inc(&conn->usage); + + if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { + rxrpc_secure_connection(conn); + rxrpc_put_connection(conn); + } + + /* go through the conn-level event packets, releasing the ref on this + * connection that each one has when we've finished with it */ + while ((skb = skb_dequeue(&conn->rx_queue))) { + sp = rxrpc_skb(skb); + + ret = rxrpc_process_event(conn, skb, &abort_code); + switch (ret) { + case -EPROTO: + case -EKEYEXPIRED: + case -EKEYREJECTED: + goto protocol_error; + case -EAGAIN: + goto requeue_and_leave; + case -ECONNABORTED: + default: + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + break; + } + } + +out: + rxrpc_put_connection(conn); + _leave(""); + return; + +requeue_and_leave: + skb_queue_head(&conn->rx_queue, skb); + goto out; + +protocol_error: + if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) + goto requeue_and_leave; + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + _leave(" [EPROTO]"); + goto out; +} + +/* + * put a packet up for transport-level abort + */ +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + if (!atomic_inc_not_zero(&local->usage)) { + printk("resurrected on 
reject\n"); + BUG(); + } + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_work(&local->rejecter); +} + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct work_struct *work) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_header hdr; + struct rxrpc_local *local; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + local = container_of(work, struct rxrpc_local, rejecter); + rxrpc_get_local(local); + + _enter("%d", local->debug_id); + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(hdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + hdr.epoch = sp->hdr.epoch; + hdr.cid = sp->hdr.cid; + hdr.callNumber = sp->hdr.callNumber; + hdr.serviceId = sp->hdr.serviceId; + hdr.flags = sp->hdr.flags; + hdr.flags ^= RXRPC_CLIENT_INITIATED; + hdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + rxrpc_put_local(local); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c new file mode 100644 index 00000000000..6cb3e8890e7 --- /dev/null +++ b/net/rxrpc/ar-error.c @@ -0,0 +1,255 @@ +/* Error message handling (ICMP) + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/errqueue.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <net/ip.h> +#include "ar-internal.h" + +/* + * handle an error received on the local endpoint + */ +void rxrpc_UDP_error_report(struct sock *sk) +{ + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans; + struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_peer *peer; + struct sk_buff *skb; + __be32 addr; + __be16 port; + + _enter("%p{%d}", sk, local->debug_id); + + skb = skb_dequeue(&sk->sk_error_queue); + if (!skb) { + _leave("UDP socket errqueue empty"); + return; + } + + rxrpc_new_skb(skb); + + serr = SKB_EXT_ERR(skb); + addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); + port = serr->port; + + _net("Rx UDP Error from "NIPQUAD_FMT":%hu", + NIPQUAD(addr), ntohs(port)); + _debug("Msg l:%d d:%d", skb->len, skb->data_len); + + peer = rxrpc_find_peer(local, addr, port); + if (IS_ERR(peer)) { + rxrpc_free_skb(skb); + _leave(" [no peer]"); + return; + } + + trans = rxrpc_find_transport(local, peer); + if (!trans) { + rxrpc_put_peer(peer); + rxrpc_free_skb(skb); + _leave(" [no trans]"); + return; + } + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED + ) { + u32 mtu = serr->ee.ee_info; + + _net("Rx Received ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + peer->if_mtu = mtu; + _net("I/F MTU %u", mtu); + } + + /* ip_rt_frag_needed() may have eaten the info */ + if (mtu == 0) + mtu = ntohs(icmp_hdr(skb)->un.frag.mtu); + + if (mtu == 0) { + /* they didn't give us a size, estimate one */ + if (mtu > 1500) { + mtu >>= 1; + if (mtu < 1500) + mtu = 1500; + } else { + mtu -= 100; + if (mtu < peer->hdrsize) + mtu = peer->hdrsize + 4; + } + } + + if (mtu < peer->mtu) { + spin_lock_bh(&peer->lock); + peer->mtu = mtu; + peer->maxdata = peer->mtu - peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", + peer->mtu, peer->maxdata); + } + } + + rxrpc_put_peer(peer); + + /* pass the transport ref to error_handler to release */ + skb_queue_tail(&trans->error_queue, skb); + rxrpc_queue_work(&trans->error_handler); + + /* reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + skb = skb_peek(&sk->sk_error_queue); + if (skb) { + sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + spin_unlock_bh(&sk->sk_error_queue.lock); + } + + _leave(""); +} + +/* + * deal with UDP error messages + */ +void rxrpc_UDP_error_handler(struct work_struct *work) +{ + struct sock_extended_err *ee; + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans = + container_of(work, struct rxrpc_transport, error_handler); + struct sk_buff *skb; + int local, err; + + _enter(""); + + skb = skb_dequeue(&trans->error_queue); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + ee = &serr->ee; + + _net("Rx Error o=%d t=%d c=%d e=%d", + ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno); + + err = ee->ee_errno; + + switch (ee->ee_origin) { + case SO_EE_ORIGIN_ICMP: + local = 0; + switch (ee->ee_type) { + case ICMP_DEST_UNREACH: + switch (ee->ee_code) { + case ICMP_NET_UNREACH: + _net("Rx Received ICMP Network 
Unreachable"); + err = ENETUNREACH; + break; + case ICMP_HOST_UNREACH: + _net("Rx Received ICMP Host Unreachable"); + err = EHOSTUNREACH; + break; + case ICMP_PORT_UNREACH: + _net("Rx Received ICMP Port Unreachable"); + err = ECONNREFUSED; + break; + case ICMP_FRAG_NEEDED: + _net("Rx Received ICMP Fragmentation Needed (%d)", + ee->ee_info); + err = 0; /* dealt with elsewhere */ + break; + case ICMP_NET_UNKNOWN: + _net("Rx Received ICMP Unknown Network"); + err = ENETUNREACH; + break; + case ICMP_HOST_UNKNOWN: + _net("Rx Received ICMP Unknown Host"); + err = EHOSTUNREACH; + break; + default: + _net("Rx Received ICMP DestUnreach code=%u", + ee->ee_code); + break; + } + break; + + case ICMP_TIME_EXCEEDED: + _net("Rx Received ICMP TTL Exceeded"); + break; + + default: + _proto("Rx Received ICMP error { type=%u code=%u }", + ee->ee_type, ee->ee_code); + break; + } + break; + + case SO_EE_ORIGIN_LOCAL: + _proto("Rx Received local error { error=%d }", + ee->ee_errno); + local = 1; + break; + + case SO_EE_ORIGIN_NONE: + case SO_EE_ORIGIN_ICMP6: + default: + _proto("Rx Received error report { orig=%u }", + ee->ee_origin); + local = 0; + break; + } + + /* terminate all the affected calls if there's an unrecoverable + * error */ + if (err) { + struct rxrpc_call *call, *_n; + + _debug("ISSUE ERROR %d", err); + + spin_lock_bh(&trans->peer->lock); + trans->peer->net_error = err; + + list_for_each_entry_safe(call, _n, &trans->peer->error_targets, + error_link) { + write_lock(&call->state_lock); + if (call->state != RXRPC_CALL_COMPLETE && + call->state < RXRPC_CALL_NETWORK_ERROR) { + call->state = RXRPC_CALL_NETWORK_ERROR; + set_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + list_del_init(&call->error_link); + } + + spin_unlock_bh(&trans->peer->lock); + } + + if (!skb_queue_empty(&trans->error_queue)) + rxrpc_queue_work(&trans->error_handler); + + rxrpc_free_skb(skb); + rxrpc_put_transport(trans); + _leave(""); +} diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c new file mode 100644 index 00000000000..91b5bbb003e --- /dev/null +++ b/net/rxrpc/ar-input.c @@ -0,0 +1,797 @@ +/* RxRPC packet reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/errqueue.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <net/ip.h> +#include "ar-internal.h" + +unsigned long rxrpc_ack_timeout = 1; + +const char *rxrpc_pkts[] = { + "?00", + "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", + "?09", "?10", "?11", "?12", "?13", "?14", "?15" +}; + +/* + * queue a packet for recvmsg to pass to userspace + * - the caller must hold a lock on call->lock + * - must not be called with interrupts disabled (sk_filter() disables BH's) + * - eats the packet whether successful or not + * - there must be just one reference to the packet, which the caller passes to + * this function + */ +int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, + bool force, bool terminal) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = call->socket; + struct sock *sk; + int skb_len, ret; + + _enter(",,%d,%d", force, terminal); + + ASSERT(!irqs_disabled()); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, call); + + /* if we've already posted the terminal message for a call, then we + * don't post any more */ + if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + _debug("already terminated"); + ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + return 0; + } + + sk = &rx->sk; + + if (!force) { + /* cast skb->rcvbuf to unsigned... It's pointless, but + * reduces number of warnings when compiling with -W + * --ANK */ +// ret = -ENOBUFS; +// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= +// (unsigned) sk->sk_rcvbuf) +// goto out; + + ret = sk_filter(sk, skb); + if (ret < 0) + goto out; + } + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && + !test_bit(RXRPC_CALL_RELEASED, &call->flags) && + call->socket->sk.sk_state != RXRPC_CLOSE) { + skb->destructor = rxrpc_packet_destructor; + skb->dev = NULL; + skb->sk = sk; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + if (terminal) { + _debug("<<<< TERMINAL MESSAGE >>>>"); + set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); + } + + /* allow interception by a kernel service */ + if (rx->interceptor) { + rx->interceptor(sk, call->user_call_ID, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + } else { + + /* Cache the SKB length before we tack it onto the + * receive queue. 
Once it is added it no longer + * belongs to us and may be freed by other threads of + * control pulling packets from the queue */ + skb_len = skb->len; + + _net("post skb %p", skb); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + } + skb = NULL; + } else { + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + ret = 0; + +out: + /* release the socket buffer */ + if (skb) { + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * process a DATA packet, posting the packet to the appropriate queue + * - eats the packet if successful + */ +static int rxrpc_fast_process_data(struct rxrpc_call *call, + struct sk_buff *skb, u32 seq) +{ + struct rxrpc_skb_priv *sp; + bool terminal; + int ret, ackbit, ack; + + _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, NULL); + + spin_lock(&call->lock); + + if (call->state > RXRPC_CALL_COMPLETE) + goto discard; + + ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); + ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); + ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + + if (seq < call->rx_data_post) { + _debug("dup #%u [-%u]", seq, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + ret = -ENOBUFS; + goto discard_and_ack; + } + + /* we may already have the packet in the out of sequence queue */ + ackbit = seq - (call->rx_data_eaten + 1); + ASSERTCMP(ackbit, >=, 0); + if (__test_and_set_bit(ackbit, call->ackr_window)) { + _debug("dup oos #%u [%u,%u]", + seq, call->rx_data_eaten, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + goto discard_and_ack; + } + + if (seq >= call->ackr_win_top) { + _debug("exceed #%u [%u]", seq, call->ackr_win_top); + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_EXCEEDS_WINDOW; + goto discard_and_ack; + } + + if (seq == call->rx_data_expect) { + clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); + call->rx_data_expect++; + } else if (seq > call->rx_data_expect) { + _debug("oos #%u [%u]", seq, call->rx_data_expect); + call->rx_data_expect = seq + 1; + if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + goto enqueue_and_ack; + } + goto enqueue_packet; + } + + if (seq != call->rx_data_post) { + _debug("ahead #%u [%u]", seq, call->rx_data_post); + goto enqueue_packet; + } + + if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) + goto protocol_error; + + /* if the packet need security things doing to it, then it goes down + * the slow path */ + if (call->conn->security) + goto enqueue_packet; + + sp->call = call; + rxrpc_get_call(call); + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOBUFS) { + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_NOSPACE; + goto discard_and_ack; + } + goto out; + } + + skb = NULL; + + _debug("post #%u", seq); + ASSERTCMP(call->rx_data_post, ==, seq); + call->rx_data_post++; + + if (sp->hdr.flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); + + /* if we've reached an out of sequence packet then we need to drain + * that queue into the socket Rx queue now */ + if (call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + read_lock(&call->state_lock); + if 
(call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } + + spin_unlock(&call->lock); + atomic_inc(&call->ackr_not_idle); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false); + _leave(" = 0 [posted]"); + return 0; + +protocol_error: + ret = -EBADMSG; +out: + spin_unlock(&call->lock); + _leave(" = %d", ret); + return ret; + +discard_and_ack: + _debug("discard and ACK packet %p", skb); + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +discard: + spin_unlock(&call->lock); + rxrpc_free_skb(skb); + _leave(" = 0 [discarded]"); + return 0; + +enqueue_and_ack: + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +enqueue_packet: + _net("defer skb %p", skb); + spin_unlock(&call->lock); + skb_queue_tail(&call->rx_queue, skb); + atomic_inc(&call->ackr_not_idle); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + _leave(" = 0 [queued]"); + return 0; +} + +/* + * assume an implicit ACKALL of the transmission phase of a client socket upon + * reception of the first reply packet + */ +static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +{ + write_lock_bh(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + call->acks_latest = serial; + + _debug("implicit ACKALL %%%u", call->acks_latest); + set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events); + write_unlock_bh(&call->state_lock); + + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RESEND, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + break; + + default: + write_unlock_bh(&call->state_lock); + break; + } +} + +/* + * post an incoming packet to the nominated call to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 _abort_code; + u32 serial, hi_serial, seq, abort_code; + + _enter("%p,%p", call, skb); + + ASSERT(!irqs_disabled()); + +#if 0 // INJECT RX ERROR + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + static int skip = 0; + if (++skip == 3) { + printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); + skip = 0; + goto free_packet; + } + } +#endif + + /* track the latest serial number on this connection for ACK packet + * information */ + serial = ntohl(sp->hdr.serial); + hi_serial = atomic_read(&call->conn->hi_serial); + while (serial > hi_serial) + hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, + serial); + + /* request ACK generation for any ACK or DATA packet that requests + * it */ + if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + _proto("ACK Requested on %%%u", serial); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, + !(sp->hdr.flags & RXRPC_MORE_PACKETS)); + } + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + _debug("abort"); + + if (skb_copy_bits(skb, 0, &_abort_code, + sizeof(_abort_code)) < 0) + goto protocol_error; + + abort_code = ntohl(_abort_code); + _proto("Rx ABORT %%%u { %x }", serial, abort_code); + + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_REMOTELY_ABORTED; + call->abort_code = abort_code; + set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + 
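/* leave the abort for the call processor to handle in process context */ +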
rxrpc_queue_call(call); + } + goto free_packet_unlock; + + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", serial); + + if (call->conn->out_clientflag) + goto protocol_error; + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_SERVER_BUSY; + set_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + rxrpc_queue_call(call); + case RXRPC_CALL_SERVER_BUSY: + goto free_packet_unlock; + default: + goto protocol_error_locked; + } + + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial); + goto protocol_error; + + case RXRPC_PACKET_TYPE_DATA: + seq = ntohl(sp->hdr.seq); + + _proto("Rx DATA %%%u { #%u }", serial, seq); + + if (seq == 0) + goto protocol_error; + + call->ackr_prev_seq = sp->hdr.seq; + + /* received data implicitly ACKs all of the request packets we + * sent when we're acting as a client */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + rxrpc_assume_implicit_ackall(call, serial); + + switch (rxrpc_fast_process_data(call, skb, seq)) { + case 0: + skb = NULL; + goto done; + + default: + BUG(); + + /* data packet received beyond the last packet */ + case -EBADMSG: + goto protocol_error; + } + + case RXRPC_PACKET_TYPE_ACK: + /* ACK processing is done in process context */ + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + skb_queue_tail(&call->rx_queue, skb); + rxrpc_queue_call(call); + skb = NULL; + } + read_unlock_bh(&call->state_lock); + goto free_packet; + } + +protocol_error: + _debug("protocol error"); + write_lock_bh(&call->state_lock); +protocol_error_locked: + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_ABORT, &call->events); + rxrpc_queue_call(call); + } +free_packet_unlock: + write_unlock_bh(&call->state_lock); +free_packet: + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * split up a jumbo data packet + */ +static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, + struct sk_buff *jumbo) +{ + struct rxrpc_jumbo_header jhdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *part; + + _enter(",{%u,%u}", jumbo->data_len, jumbo->len); + + sp = rxrpc_skb(jumbo); + + do { + sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; + + /* make a clone to represent the first subpacket in what's left + * of the jumbo packet */ + part = skb_clone(jumbo, GFP_ATOMIC); + if (!part) { + /* simply ditch the tail in the event of ENOMEM */ + pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); + break; + } + rxrpc_new_skb(part); + + pskb_trim(part, RXRPC_JUMBO_DATALEN); + + if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) + goto protocol_error; + + if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) + goto protocol_error; + if (!pskb_pull(jumbo, sizeof(jhdr))) + BUG(); + + sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1); + sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1); + sp->hdr.flags = jhdr.flags; + sp->hdr._rsvd = jhdr._rsvd; + + _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1); + + rxrpc_fast_process_packet(call, part); + part = NULL; + + } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + + rxrpc_fast_process_packet(call, jumbo); + _leave(""); + return; + +protocol_error: + _debug("protocol error"); + rxrpc_free_skb(part); + rxrpc_free_skb(jumbo); + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_ABORT, &call->events); + rxrpc_queue_call(call); + } + 
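/* the ABORT event set above is left for the queued call processor to transmit */ +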
write_unlock_bh(&call->state_lock); + _leave(""); +} + +/* + * post an incoming packet to the appropriate call/socket to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call; + struct rb_node *p; + __be32 call_id; + + _enter("%p,%p", conn, skb); + + read_lock_bh(&conn->lock); + + sp = rxrpc_skb(skb); + + /* look at extant calls by channel number first */ + call = conn->channels[ntohl(sp->hdr.cid) & RXRPC_CHANNELMASK]; + if (!call || call->call_id != sp->hdr.callNumber) + goto call_not_extant; + + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_NETWORK_ERROR: + case RXRPC_CALL_DEAD: + goto free_unlock; + default: + break; + } + + read_unlock(&call->state_lock); + rxrpc_get_call(call); + read_unlock_bh(&conn->lock); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_process_jumbo_packet(call, skb); + else + rxrpc_fast_process_packet(call, skb); + + rxrpc_put_call(call); + goto done; + +call_not_extant: + /* search the completed calls in case what we're dealing with is + * there */ + _debug("call not extant"); + + call_id = sp->hdr.callNumber; + p = conn->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, conn_node); + + if (call_id < call->call_id) + p = p->rb_left; + else if (call_id > call->call_id) + p = p->rb_right; + else + goto found_completed_call; + } + +dead_call: + /* it's a either a really old call that we no longer remember or its a + * new incoming call */ + read_unlock_bh(&conn->lock); + + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.seq == __constant_cpu_to_be32(1)) { + _debug("incoming call"); + skb_queue_tail(&conn->trans->local->accept_queue, skb); + rxrpc_queue_work(&conn->trans->local->acceptor); + goto done; + } + + _debug("dead call"); + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(conn->trans->local, skb); + goto done; + + /* resend last packet of a completed call + * - client calls may have been aborted or ACK'd + * - server calls may have been aborted + */ +found_completed_call: + _debug("completed call"); + + if (atomic_read(&call->usage) == 0) + goto dead_call; + + /* synchronise any state changes */ + read_lock(&call->state_lock); + ASSERTIFCMP(call->state != RXRPC_CALL_CLIENT_FINAL_ACK, + call->state, >=, RXRPC_CALL_COMPLETE); + + if (call->state == RXRPC_CALL_LOCALLY_ABORTED || + call->state == RXRPC_CALL_REMOTELY_ABORTED || + call->state == RXRPC_CALL_DEAD) { + read_unlock(&call->state_lock); + goto dead_call; + } + + if (call->conn->in_clientflag) { + read_unlock(&call->state_lock); + goto dead_call; /* complete server call */ + } + + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + rxrpc_queue_call(call); + +free_unlock: + read_unlock(&call->state_lock); + read_unlock_bh(&conn->lock); + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * post connection-level events to the connection + * - this includes challenges, responses and some aborts + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); + + 
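/* the ref taken here is released by rxrpc_process_connection() once it has finished with the packet */ +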
atomic_inc(&conn->usage); + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn); +} + +/* + * handle data received on the local endpoint + * - may be called in interrupt context + */ +void rxrpc_data_ready(struct sock *sk, int count) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + struct rxrpc_skb_priv *sp; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + struct sk_buff *skb; + int ret; + + _enter("%p, %d", sk, count); + + ASSERT(!irqs_disabled()); + + read_lock_bh(&rxrpc_local_lock); + local = sk->sk_user_data; + if (local && atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + + skb = skb_recv_datagram(sk, 0, 1, &ret); + if (!skb) { + rxrpc_put_local(local); + if (ret == -EAGAIN) + return; + _debug("UDP socket error %d", ret); + return; + } + + rxrpc_new_skb(skb); + + _net("recv skb %p", skb); + + /* we'll probably need to checksum it (didn't call sock_recvmsg) */ + if (skb_checksum_complete(skb)) { + rxrpc_free_skb(skb); + rxrpc_put_local(local); + _leave(" [CSUM failed]"); + return; + } + + /* the socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own */ + skb_orphan(skb); + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + + _net("Rx UDP packet from %08x:%04hu", + ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr, + sizeof(sp->hdr)) < 0) + goto bad_message; + if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr))) + BUG(); + + _net("Rx RxRPC %s ep=%x call=%x:%x", + sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", + ntohl(sp->hdr.epoch), + ntohl(sp->hdr.cid), + ntohl(sp->hdr.callNumber)); + + if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) { + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) + goto bad_message; + + peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, udp_hdr(skb)->source); + if (IS_ERR(peer)) + goto cant_route_call; + + trans = rxrpc_find_transport(local, peer); + rxrpc_put_peer(peer); + if (!trans) + goto cant_route_call; + + conn = rxrpc_find_connection(trans, &sp->hdr); + rxrpc_put_transport(trans); + if (!conn) + goto cant_route_call; + + _debug("CONN %p {%d}", conn, conn->debug_id); + + if (sp->hdr.callNumber == 0) + rxrpc_post_packet_to_conn(conn, skb); + else + rxrpc_post_packet_to_call(conn, skb); + rxrpc_put_connection(conn); + rxrpc_put_local(local); + return; + +cant_route_call: + _debug("can't route call"); + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + if (sp->hdr.seq == __constant_cpu_to_be32(1)) { + _debug("first packet"); + skb_queue_tail(&local->accept_queue, skb); + rxrpc_queue_work(&local->acceptor); + rxrpc_put_local(local); + _leave(" [incoming]"); + return; + } + skb->priority = RX_INVALID_OPERATION; + } else { + skb->priority = RX_CALL_DEAD; + } + + _debug("reject"); + rxrpc_reject_packet(local, skb); + rxrpc_put_local(local); + _leave(" [no call]"); + return; + +bad_message: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + rxrpc_put_local(local); + _leave(" [badmsg]"); +} diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h new file mode 100644 index 00000000000..58aaf892238 --- /dev/null +++ 
b/net/rxrpc/ar-internal.h @@ -0,0 +1,808 @@ +/* AF_RXRPC internal definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <rxrpc/packet.h> + +#if 0 +#define CHECK_SLAB_OKAY(X) \ + BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \ + (POISON_FREE << 8 | POISON_FREE)) +#else +#define CHECK_SLAB_OKAY(X) do {} while(0) +#endif + +#define FCRYPT_BSIZE 8 +struct rxrpc_crypt { + union { + u8 x[FCRYPT_BSIZE]; + u32 n[2]; + }; +} __attribute__((aligned(8))); + +#define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS)) +#define rxrpc_queue_delayed_work(WS,D) \ + queue_delayed_work(rxrpc_workqueue, (WS), (D)) + +#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) +#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor) + +/* + * sk_state for RxRPC sockets + */ +enum { + RXRPC_UNCONNECTED = 0, + RXRPC_CLIENT_BOUND, /* client local address bound */ + RXRPC_CLIENT_CONNECTED, /* client is connected */ + RXRPC_SERVER_BOUND, /* server local address bound */ + RXRPC_SERVER_LISTENING, /* server listening for connections */ + RXRPC_CLOSE, /* socket is being closed */ +}; + +/* + * RxRPC socket definition + */ +struct rxrpc_sock { + /* WARNING: sk has to be the first member */ + struct sock sk; + rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ + struct rxrpc_local *local; /* local endpoint */ + struct rxrpc_transport *trans; /* transport handler */ + struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */ + struct rxrpc_connection *conn; /* exclusive virtual connection */ + struct list_head listen_link; /* link in the local endpoint's listen list */ + struct list_head secureq; /* calls awaiting connection security clearance */ + struct list_head acceptq; /* calls awaiting acceptance */ + struct key *key; /* security for this socket */ + struct key *securities; /* list of server security descriptors */ + struct rb_root calls; /* outstanding calls on this socket */ + unsigned long flags; +#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */ + rwlock_t call_lock; /* lock for calls */ + u32 min_sec_level; /* minimum security level */ +#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT + struct sockaddr_rxrpc srx; /* local address */ + sa_family_t proto; /* protocol created with */ + __be16 service_id; /* service ID of local/remote service */ +}; + +#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) + +/* + * RxRPC socket buffer private variables + * - max 48 bytes (struct sk_buff::cb) + */ +struct rxrpc_skb_priv { + struct rxrpc_call *call; /* call with which associated */ + unsigned long resend_at; /* time in jiffies at which to resend */ + union { + unsigned offset; /* offset into buffer of next read */ + int remain; /* amount of space remaining for next write */ + u32 error; /* network error code */ + bool need_resend; /* T if needs resending */ + }; + + struct rxrpc_header hdr; /* RxRPC packet header from this packet */ +}; + +#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) + +enum rxrpc_command { + RXRPC_CMD_SEND_DATA, /* send data message */ + RXRPC_CMD_SEND_ABORT, /* request abort generation */ + RXRPC_CMD_ACCEPT, /* [server] 
accept incoming call */ + RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ +}; + +/* + * RxRPC security module interface + */ +struct rxrpc_security { + struct module *owner; /* providing module */ + struct list_head link; /* link in master list */ + const char *name; /* name of this service */ + u8 security_index; /* security type provided */ + + /* initialise a connection's security */ + int (*init_connection_security)(struct rxrpc_connection *); + + /* prime a connection's packet security */ + void (*prime_packet_security)(struct rxrpc_connection *); + + /* impose security on a packet */ + int (*secure_packet)(const struct rxrpc_call *, + struct sk_buff *, + size_t, + void *); + + /* verify the security on a received packet */ + int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *, + u32 *); + + /* issue a challenge */ + int (*issue_challenge)(struct rxrpc_connection *); + + /* respond to a challenge */ + int (*respond_to_challenge)(struct rxrpc_connection *, + struct sk_buff *, + u32 *); + + /* verify a response */ + int (*verify_response)(struct rxrpc_connection *, + struct sk_buff *, + u32 *); + + /* clear connection security */ + void (*clear)(struct rxrpc_connection *); +}; + +/* + * RxRPC local transport endpoint definition + * - matched by local port, address and protocol type + */ +struct rxrpc_local { + struct socket *socket; /* my UDP socket */ + struct work_struct destroyer; /* endpoint destroyer */ + struct work_struct acceptor; /* incoming call processor */ + struct work_struct rejecter; /* packet reject writer */ + struct list_head services; /* services listening on this endpoint */ + struct list_head link; /* link in endpoint list */ + struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ + struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ + struct sk_buff_head reject_queue; /* packets awaiting rejection */ + spinlock_t lock; /* access lock */ + rwlock_t services_lock; /* lock for services list */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + volatile char error_rcvd; /* T if received ICMP error outstanding */ + struct sockaddr_rxrpc srx; /* local address */ +}; + +/* + * RxRPC remote transport endpoint definition + * - matched by remote port, address and protocol type + * - holds the connection ID counter for connections between the two endpoints + */ +struct rxrpc_peer { + struct work_struct destroyer; /* peer destroyer */ + struct list_head link; /* link in master peer list */ + struct list_head error_targets; /* targets for net error distribution */ + spinlock_t lock; /* access lock */ + atomic_t usage; + unsigned if_mtu; /* interface MTU for this peer */ + unsigned mtu; /* network MTU for this peer */ + unsigned maxdata; /* data size (MTU - hdrsize) */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + int debug_id; /* debug ID for printks */ + int net_error; /* network error distributed */ + struct sockaddr_rxrpc srx; /* remote address */ + + /* calculated RTT cache */ +#define RXRPC_RTT_CACHE_SIZE 32 + suseconds_t rtt; /* current RTT estimate (in uS) */ + unsigned rtt_point; /* next entry at which to insert */ + unsigned rtt_usage; /* amount of cache actually used */ + suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ +}; + +/* + * RxRPC point-to-point transport / connection manager definition + * - handles a bundle of connections between two endpoints + * - matched by { local, peer } + */ +struct rxrpc_transport { + struct rxrpc_local *local; /* 
local transport endpoint */ + struct rxrpc_peer *peer; /* remote transport endpoint */ + struct work_struct error_handler; /* network error distributor */ + struct rb_root bundles; /* client connection bundles on this transport */ + struct rb_root client_conns; /* client connections on this transport */ + struct rb_root server_conns; /* server connections on this transport */ + struct list_head link; /* link in master session list */ + struct sk_buff_head error_queue; /* error packets awaiting processing */ + time_t put_time; /* time at which to reap */ + spinlock_t client_lock; /* client connection allocation lock */ + rwlock_t conn_lock; /* lock for active/dead connections */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + unsigned int conn_idcounter; /* connection ID counter (client) */ +}; + +/* + * RxRPC client connection bundle + * - matched by { transport, service_id, key } + */ +struct rxrpc_conn_bundle { + struct rb_node node; /* node in transport's lookup tree */ + struct list_head unused_conns; /* unused connections in this bundle */ + struct list_head avail_conns; /* available connections in this bundle */ + struct list_head busy_conns; /* busy connections in this bundle */ + struct key *key; /* security for this bundle */ + wait_queue_head_t chanwait; /* wait for channel to become available */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + unsigned short num_conns; /* number of connections in this bundle */ + __be16 service_id; /* service ID */ + uint8_t security_ix; /* security type */ +}; + +/* + * RxRPC connection definition + * - matched by { transport, service_id, conn_id, direction, key } + * - each connection can only handle four simultaneous calls + */ +struct rxrpc_connection { + struct rxrpc_transport *trans; /* transport session */ + struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */ + struct work_struct processor; /* connection event processor */ + struct rb_node node; /* node in transport's lookup tree */ + struct list_head link; /* link in master connection list */ + struct list_head bundle_link; /* link in bundle */ + struct rb_root calls; /* calls on this connection */ + struct sk_buff_head rx_queue; /* received conn-level packets */ + struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ + struct rxrpc_security *security; /* applied security module */ + struct key *key; /* security for this connection (client) */ + struct key *server_key; /* security for this service */ + struct crypto_blkcipher *cipher; /* encryption handle */ + struct rxrpc_crypt csum_iv; /* packet checksum base */ + unsigned long events; +#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ + time_t put_time; /* time at which to reap */ + rwlock_t lock; /* access lock */ + spinlock_t state_lock; /* state-change lock */ + atomic_t usage; + u32 real_conn_id; /* connection ID (host-endian) */ + enum { /* current state of connection */ + RXRPC_CONN_UNUSED, /* - connection not yet attempted */ + RXRPC_CONN_CLIENT, /* - client connection */ + RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */ + RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */ + RXRPC_CONN_SERVER, /* - server secured connection */ + RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */ + RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */ + RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */ + } state; + int error; /* error code for local abort */ + int debug_id; /* debug ID for printks */ + 
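/* active calls occupy the channels[] slots; the channel index comes from the low bits of the packet cid (cid & RXRPC_CHANNELMASK) */ +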
unsigned call_counter; /* call ID counter */ + atomic_t serial; /* packet serial number counter */ + atomic_t hi_serial; /* highest serial number received */ + u8 avail_calls; /* number of calls available */ + u8 size_align; /* data size alignment (for security) */ + u8 header_size; /* rxrpc + security header size */ + u8 security_size; /* security header size */ + u32 security_level; /* security level negotiated */ + u32 security_nonce; /* response re-use preventer */ + + /* the following are all in net order */ + __be32 epoch; /* epoch of this connection */ + __be32 cid; /* connection ID */ + __be16 service_id; /* service ID */ + u8 security_ix; /* security type */ + u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ + u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ +}; + +/* + * RxRPC call definition + * - matched by { connection, call_id } + */ +struct rxrpc_call { + struct rxrpc_connection *conn; /* connection carrying call */ + struct rxrpc_sock *socket; /* socket responsible */ + struct timer_list lifetimer; /* lifetime remaining on call */ + struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ + struct timer_list ack_timer; /* ACK generation timer */ + struct timer_list resend_timer; /* Tx resend timer */ + struct work_struct destroyer; /* call destroyer */ + struct work_struct processor; /* packet processor and ACK generator */ + struct list_head link; /* link in master call list */ + struct list_head error_link; /* link in error distribution list */ + struct list_head accept_link; /* calls awaiting acceptance */ + struct rb_node sock_node; /* node in socket call tree */ + struct rb_node conn_node; /* node in connection call tree */ + struct sk_buff_head rx_queue; /* received packets */ + struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ + struct sk_buff *tx_pending; /* Tx socket buffer being filled */ + wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + unsigned long user_call_ID; /* user-defined call ID */ + unsigned long creation_jif; /* time of call creation */ + unsigned long flags; +#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */ +#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */ +#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */ +#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */ +#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */ +#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */ +#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */ +#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */ +#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */ + unsigned long events; +#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */ +#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */ +#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */ +#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */ +#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */ +#define RXRPC_CALL_ACK 5 /* need to generate ACK */ +#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */ +#define RXRPC_CALL_ABORT 7 /* need to generate abort */ +#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */ +#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */ +#define RXRPC_CALL_RESEND 10 /* Tx resend required */ +#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */ +#define 
RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */ +#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */ +#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */ +#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" message to the app */ +#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */ + + spinlock_t lock; + rwlock_t state_lock; /* lock for state transition */ + atomic_t usage; + atomic_t sequence; /* Tx data packet sequence counter */ + u32 abort_code; /* local/remote abort code */ + enum { /* current state of call */ + RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ + RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ + RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ + RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ + RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ + RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ + RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ + RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ + RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ + RXRPC_CALL_COMPLETE, /* - call completed */ + RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + RXRPC_CALL_DEAD, /* - call is dead */ + } state; + int debug_id; /* debug ID for printks */ + u8 channel; /* connection channel occupied by this call */ + + /* transmission-phase ACK management */ + uint8_t acks_head; /* offset into window of first entry */ + uint8_t acks_tail; /* offset into window of last entry */ + uint8_t acks_winsz; /* size of un-ACK'd window */ + uint8_t acks_unacked; /* lowest unacked packet in last ACK received */ + int acks_latest; /* serial number of latest ACK received */ + rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ + unsigned long *acks_window; /* sent packet window + * - elements are pointers with LSB set if ACK'd + */ + + /* receive-phase ACK management */ + rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ + rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ + rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ + rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ + rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ + rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ + rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */ + uint8_t ackr_reason; /* reason to ACK */ + __be32 ackr_serial; /* serial of packet being ACK'd */ + atomic_t ackr_not_idle; /* number of packets in Rx queue */ + + /* received packet records, 1 bit per record */ +#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) + unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + + /* the following should all be in net order */ + __be32 cid; /* connection ID + channel index */ + __be32 call_id; /* call ID on connection */ +}; + +/* + * RxRPC key for Kerberos (type-2 security) + */ +struct rxkad_key { + u16 security_index; /* RxRPC header security index */ + u16 ticket_len; /* length of ticket[] */ + u32 expiry; /* time at which expires */ + u32 
kvno; /* key version number */ + u8 session_key[8]; /* DES session key */ + u8 ticket[0]; /* the encrypted ticket */ +}; + +struct rxrpc_key_payload { + struct rxkad_key k; +}; + +/* + * locally abort an RxRPC call + */ +static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->abort_code = abort_code; + call->state = RXRPC_CALL_LOCALLY_ABORTED; + set_bit(RXRPC_CALL_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); +} + +/* + * af_rxrpc.c + */ +extern atomic_t rxrpc_n_skbs; +extern __be32 rxrpc_epoch; +extern atomic_t rxrpc_debug_id; +extern struct workqueue_struct *rxrpc_workqueue; + +/* + * ar-accept.c + */ +extern void rxrpc_accept_incoming_calls(struct work_struct *); +extern struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, + unsigned long); +extern int rxrpc_reject_call(struct rxrpc_sock *); + +/* + * ar-ack.c + */ +extern void __rxrpc_propose_ACK(struct rxrpc_call *, uint8_t, __be32, bool); +extern void rxrpc_propose_ACK(struct rxrpc_call *, uint8_t, __be32, bool); +extern void rxrpc_process_call(struct work_struct *); + +/* + * ar-call.c + */ +extern struct kmem_cache *rxrpc_call_jar; +extern struct list_head rxrpc_calls; +extern rwlock_t rxrpc_call_lock; + +extern struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, + struct rxrpc_transport *, + struct rxrpc_conn_bundle *, + unsigned long, int, gfp_t); +extern struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, + struct rxrpc_connection *, + struct rxrpc_header *, gfp_t); +extern struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, + unsigned long); +extern void rxrpc_release_call(struct rxrpc_call *); +extern void rxrpc_release_calls_on_socket(struct rxrpc_sock *); +extern void __rxrpc_put_call(struct rxrpc_call *); +extern void __exit rxrpc_destroy_all_calls(void); + +/* + * ar-connection.c + */ +extern struct list_head rxrpc_connections; +extern rwlock_t rxrpc_connection_lock; + +extern struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, + struct rxrpc_transport *, + struct key *, + __be16, gfp_t); +extern void rxrpc_put_bundle(struct rxrpc_transport *, + struct rxrpc_conn_bundle *); +extern int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, + struct rxrpc_conn_bundle *, struct rxrpc_call *, + gfp_t); +extern void rxrpc_put_connection(struct rxrpc_connection *); +extern void __exit rxrpc_destroy_all_connections(void); +extern struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, + struct rxrpc_header *); +extern struct rxrpc_connection * +rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *, + gfp_t); + +/* + * ar-connevent.c + */ +extern void rxrpc_process_connection(struct work_struct *); +extern void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); +extern void rxrpc_reject_packets(struct work_struct *); + +/* + * ar-error.c + */ +extern void rxrpc_UDP_error_report(struct sock *); +extern void rxrpc_UDP_error_handler(struct work_struct *); + +/* + * ar-input.c + */ +extern unsigned long rxrpc_ack_timeout; +extern const char *rxrpc_pkts[]; + +extern void rxrpc_data_ready(struct sock *, int); +extern int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, + bool); +extern void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); + +/* + * ar-local.c + */ +extern rwlock_t rxrpc_local_lock; +extern struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); 
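+/* the endpoint returned by rxrpc_lookup_local() carries a usage ref; drop it with rxrpc_put_local() */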
+extern void rxrpc_put_local(struct rxrpc_local *); +extern void __exit rxrpc_destroy_all_locals(void); + +/* + * ar-key.c + */ +extern struct key_type key_type_rxrpc; +extern struct key_type key_type_rxrpc_s; + +extern int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); +extern int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +extern int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, + time_t, u32); + +/* + * ar-output.c + */ +extern int rxrpc_resend_timeout; + +extern int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); +extern int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *, + struct rxrpc_transport *, struct msghdr *, + size_t); +extern int rxrpc_server_sendmsg(struct kiocb *, struct rxrpc_sock *, + struct msghdr *, size_t); + +/* + * ar-peer.c + */ +extern struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t); +extern void rxrpc_put_peer(struct rxrpc_peer *); +extern struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, + __be32, __be16); +extern void __exit rxrpc_destroy_all_peers(void); + +/* + * ar-proc.c + */ +extern const char *rxrpc_call_states[]; +extern struct file_operations rxrpc_call_seq_fops; +extern struct file_operations rxrpc_connection_seq_fops; + +/* + * ar-recvmsg.c + */ +extern void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); +extern int rxrpc_recvmsg(struct kiocb *, struct socket *, struct msghdr *, + size_t, int); + +/* + * ar-security.c + */ +extern int rxrpc_register_security(struct rxrpc_security *); +extern void rxrpc_unregister_security(struct rxrpc_security *); +extern int rxrpc_init_client_conn_security(struct rxrpc_connection *); +extern int rxrpc_init_server_conn_security(struct rxrpc_connection *); +extern int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *, + size_t, void *); +extern int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *, + u32 *); +extern void rxrpc_clear_conn_security(struct rxrpc_connection *); + +/* + * ar-skbuff.c + */ +extern void rxrpc_packet_destructor(struct sk_buff *); + +/* + * ar-transport.c + */ +extern struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, + struct rxrpc_peer *, + gfp_t); +extern void rxrpc_put_transport(struct rxrpc_transport *); +extern void __exit rxrpc_destroy_all_transports(void); +extern struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, + struct rxrpc_peer *); + +/* + * debug tracing + */ +extern unsigned rxrpc_debug; + +#define dbgprintk(FMT,...) \ + printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__) + +/* make sure we maintain the format strings, even when debugging is disabled */ +static inline __attribute__((format(printf,1,2))) +void _dbprintk(const char *fmt, ...) +{ +} + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) +#define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__) +#define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) +#define _proto(FMT,...) kproto(FMT,##__VA_ARGS__) +#define _net(FMT,...) 
knet(FMT,##__VA_ARGS__) + +#elif defined(CONFIG_AF_RXRPC_DEBUG) +#define RXRPC_DEBUG_KENTER 0x01 +#define RXRPC_DEBUG_KLEAVE 0x02 +#define RXRPC_DEBUG_KDEBUG 0x04 +#define RXRPC_DEBUG_KPROTO 0x08 +#define RXRPC_DEBUG_KNET 0x10 + +#define _enter(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \ + kenter(FMT,##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \ + kleave(FMT,##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \ + kdebug(FMT,##__VA_ARGS__); \ +} while (0) + +#define _proto(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \ + kproto(FMT,##__VA_ARGS__); \ +} while (0) + +#define _net(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \ + knet(FMT,##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) +#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) +#define _proto(FMT,...) _dbprintk("### "FMT ,##__VA_ARGS__) +#define _net(FMT,...) _dbprintk("@@@ "FMT ,##__VA_ARGS__) +#endif + +/* + * debug assertion checking + */ +#if 1 // defined(__KDEBUGALL) + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#else + +#define ASSERT(X) \ +do { \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ +} while(0) + +#endif /* __KDEBUGALL */ + +/* + * socket buffer accounting / leak finding + */ +static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn) +{ + //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_inc(&rxrpc_n_skbs); +} + +#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__) + +static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn) +{ + //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_dec(&rxrpc_n_skbs); +} + +#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__) + +static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn) +{ + if (skb) { + CHECK_SLAB_OKAY(&skb->users); + //_net("free skb %p %s [%d]", + // skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_dec(&rxrpc_n_skbs); + kfree_skb(skb); + } +} + +#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__) + +static inline void 
rxrpc_purge_queue(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) + rxrpc_free_skb(skb); +} + +static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f) +{ + CHECK_SLAB_OKAY(&local->usage); + if (atomic_inc_return(&local->usage) == 1) + printk("resurrected (%s)\n", f); +} + +#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__) + +#define rxrpc_get_call(CALL) \ +do { \ + CHECK_SLAB_OKAY(&(CALL)->usage); \ + if (atomic_inc_return(&(CALL)->usage) == 1) \ + BUG(); \ +} while(0) + +#define rxrpc_put_call(CALL) \ +do { \ + __rxrpc_put_call(CALL); \ +} while(0) diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c new file mode 100644 index 00000000000..7e049ff6ae6 --- /dev/null +++ b/net/rxrpc/ar-key.c @@ -0,0 +1,334 @@ +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * RxRPC keys should have a description describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM" + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/key.h> +#include <linux/crypto.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <keys/rxrpc-type.h> +#include <keys/user-type.h> +#include "ar-internal.h" + +static int rxrpc_instantiate(struct key *, const void *, size_t); +static int rxrpc_instantiate_s(struct key *, const void *, size_t); +static void rxrpc_destroy(struct key *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe(const struct key *, struct seq_file *); + +/* + * rxrpc defined keys take an arbitrary string as the description and an + * arbitrary blob of data as the payload + */ +struct key_type key_type_rxrpc = { + .name = "rxrpc", + .instantiate = rxrpc_instantiate, + .match = user_match, + .destroy = rxrpc_destroy, + .describe = rxrpc_describe, +}; + +EXPORT_SYMBOL(key_type_rxrpc); + +/* + * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the + * description and an 8-byte decryption key as the payload + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .instantiate = rxrpc_instantiate_s, + .match = user_match, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe, +}; + +/* + * instantiate an rxrpc defined key + * data should be of the form: + * OFFSET LEN CONTENT + * 0 4 key interface version number + * 4 2 security index (type) + * 6 2 ticket length + * 8 4 key expiry time (time_t) + * 12 4 kvno + * 16 8 session key + * 24 [len] ticket + * + * if no data is provided, then a no-security key is made + */ +static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) +{ + const struct rxkad_key *tsec; + struct rxrpc_key_payload *upayload; + size_t plen; + u32 kver; + int ret; + + _enter("{%x},,%zu", key_serial(key), datalen); + + /* handle a no-security key */ + if (!data && datalen == 0) + return 0; + + /* get the key interface version number */ + ret = -EINVAL; + if (datalen <= 4 || !data) + goto error; + memcpy(&kver, data, sizeof(kver)); + data += sizeof(kver); + datalen -= sizeof(kver); + + _debug("KEY I/F VERSION: %u", kver); + + ret = -EKEYREJECTED; + if (kver != 1) + goto error; + + /* deal with a version 1 key */ + ret = -EINVAL; + if
(datalen < sizeof(*tsec)) + goto error; + + tsec = data; + if (datalen != sizeof(*tsec) + tsec->ticket_len) + goto error; + + _debug("SCIX: %u", tsec->security_index); + _debug("TLEN: %u", tsec->ticket_len); + _debug("EXPY: %x", tsec->expiry); + _debug("KVNO: %u", tsec->kvno); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + tsec->session_key[0], tsec->session_key[1], + tsec->session_key[2], tsec->session_key[3], + tsec->session_key[4], tsec->session_key[5], + tsec->session_key[6], tsec->session_key[7]); + if (tsec->ticket_len >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + tsec->ticket[0], tsec->ticket[1], + tsec->ticket[2], tsec->ticket[3], + tsec->ticket[4], tsec->ticket[5], + tsec->ticket[6], tsec->ticket[7]); + + ret = -EPROTONOSUPPORT; + if (tsec->security_index != 2) + goto error; + + key->type_data.x[0] = tsec->security_index; + + plen = sizeof(*upayload) + tsec->ticket_len; + ret = key_payload_reserve(key, plen); + if (ret < 0) + goto error; + + ret = -ENOMEM; + upayload = kmalloc(plen, GFP_KERNEL); + if (!upayload) + goto error; + + /* attach the data */ + memcpy(&upayload->k, tsec, sizeof(*tsec)); + memcpy(&upayload->k.ticket, (void *)tsec + sizeof(*tsec), + tsec->ticket_len); + key->payload.data = upayload; + key->expiry = tsec->expiry; + ret = 0; + +error: + return ret; +} + +/* + * instantiate a server secret key + * data should be a pointer to the 8-byte secret key + */ +static int rxrpc_instantiate_s(struct key *key, const void *data, + size_t datalen) +{ + struct crypto_blkcipher *ci; + + _enter("{%x},,%zu", key_serial(key), datalen); + + if (datalen != 8) + return -EINVAL; + + memcpy(&key->type_data, data, 8); + + ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_blkcipher_setkey(ci, data, 8) < 0) + BUG(); + + key->payload.data = ci; + _leave(" = 0"); + return 0; +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy(struct key *key) +{ + kfree(key->payload.data); +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy_s(struct key *key) +{ + if (key->payload.data) { + crypto_free_blkcipher(key->payload.data); + key->payload.data = NULL; + } +} + +/* + * describe the rxrpc key + */ +static void rxrpc_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * grab the security key for a socket + */ +int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = kmalloc(optlen + 1, GFP_KERNEL); + if (!description) + return -ENOMEM; + + if (copy_from_user(description, optval, optlen)) { + kfree(description); + return -EFAULT; + } + description[optlen] = 0; + + key = request_key(&key_type_rxrpc, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->key = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, + int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = kmalloc(optlen + 1, GFP_KERNEL); + if (!description) + return -ENOMEM; + 
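/*
 * [Editor's note: hypothetical usage sketch, not part of the original
 * patch.] Both rxrpc_request_key() above and rxrpc_server_keyring() here
 * consume a NUL-terminated description copied in from a sockopt buffer of
 * at most PAGE_SIZE - 1 bytes. Assuming the option name is the one this
 * patch wires up in af_rxrpc.c, a server would hand over its keyring
 * something like this:
 *
 *	const char name[] = "rxrpc_server";	// hypothetical keyring name
 *	setsockopt(fd, SOL_RXRPC, RXRPC_SECURITY_KEYRING,
 *		   name, sizeof(name) - 1);
 */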
+ if (copy_from_user(description, optval, optlen)) { + kfree(description); + return -EFAULT; + } + description[optlen] = 0; + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * generate a server data key + */ +int rxrpc_get_server_data_key(struct rxrpc_connection *conn, + const void *session_key, + time_t expiry, + u32 kvno) +{ + struct key *key; + int ret; + + struct { + u32 kver; + struct rxkad_key tsec; + } data; + + _enter(""); + + key = key_alloc(&key_type_rxrpc, "x", 0, 0, current, 0, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + return -ENOMEM; + } + + _debug("key %d", key_serial(key)); + + data.kver = 1; + data.tsec.security_index = 2; + data.tsec.ticket_len = 0; + data.tsec.expiry = expiry; + data.tsec.kvno = kvno; + + memcpy(&data.tsec.session_key, session_key, + sizeof(data.tsec.session_key)); + + ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); + if (ret < 0) + goto error; + + conn->key = key; + _leave(" = 0 [%d]", key_serial(key)); + return 0; + +error: + key_revoke(key); + key_put(key); + _leave(" = -ENOMEM [ins %d]", ret); + return -ENOMEM; +} + +EXPORT_SYMBOL(rxrpc_get_server_data_key); diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c new file mode 100644 index 00000000000..fe03f71f17d --- /dev/null +++ b/net/rxrpc/ar-local.c @@ -0,0 +1,309 @@ +/* AF_RXRPC local endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_locals); +DEFINE_RWLOCK(rxrpc_local_lock); +static DECLARE_RWSEM(rxrpc_local_sem); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); + +static void rxrpc_destroy_local(struct work_struct *work); + +/* + * allocate a new local + */ +static +struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + + local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); + if (local) { + INIT_WORK(&local->destroyer, &rxrpc_destroy_local); + INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); + INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_LIST_HEAD(&local->services); + INIT_LIST_HEAD(&local->link); + init_rwsem(&local->defrag_sem); + skb_queue_head_init(&local->accept_queue); + skb_queue_head_init(&local->reject_queue); + spin_lock_init(&local->lock); + rwlock_init(&local->services_lock); + atomic_set(&local->usage, 1); + local->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&local->srx, srx, sizeof(*srx)); + } + + _leave(" = %p", local); + return local; +} + +/* + * create the local socket + * - must be called with rxrpc_local_sem writelocked + */ +static int rxrpc_create_local(struct rxrpc_local *local) +{ + struct sock *sock; + int ret, opt; + + _enter("%p{%d}", local, local->srx.transport_type); + + /* create a socket to represent the local endpoint */ + ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, + &local->socket); + if (ret < 0) { + _leave(" = %d [socket]", ret); + return ret; + } + + /* if a local address was supplied then bind it */ + if (local->srx.transport_len > sizeof(sa_family_t)) { + _debug("bind"); + ret = kernel_bind(local->socket, + (struct sockaddr *) &local->srx.transport, + local->srx.transport_len); + if (ret < 0) { + _debug("bind failed"); + goto error; + } + } + + /* we want to receive ICMP errors */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* we want to set the don't fragment bit */ + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + write_lock_bh(&rxrpc_local_lock); + list_add(&local->link, &rxrpc_locals); + write_unlock_bh(&rxrpc_local_lock); + + /* set the socket up */ + sock = local->socket->sk; + sock->sk_user_data = local; + sock->sk_data_ready = rxrpc_data_ready; + sock->sk_error_report = rxrpc_UDP_error_report; + _leave(" = 0"); + return 0; + +error: + local->socket->ops->shutdown(local->socket, 2); + local->socket->sk->sk_user_data = NULL; + sock_release(local->socket); + local->socket = NULL; + + _leave(" = %d", ret); + return ret; +} + +/* + * create a new local endpoint using the specified UDP address + */ +struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + int ret; + + _enter("{%d,%u,%u.%u.%u.%u+%hu}", + srx->transport_type, + srx->transport.family, + NIPQUAD(srx->transport.sin.sin_addr), + ntohs(srx->transport.sin.sin_port)); + + down_write(&rxrpc_local_sem); + + /* see if we have a suitable local endpoint already */ + read_lock_bh(&rxrpc_local_lock); + + list_for_each_entry(local, &rxrpc_locals, link) { + _debug("CMP {%d,%u,%u.%u.%u.%u+%hu}", + 
local->srx.transport_type, + local->srx.transport.family, + NIPQUAD(local->srx.transport.sin.sin_addr), + ntohs(local->srx.transport.sin.sin_port)); + + if (local->srx.transport_type != srx->transport_type || + local->srx.transport.family != srx->transport.family) + continue; + + switch (srx->transport.family) { + case AF_INET: + if (local->srx.transport.sin.sin_port != + srx->transport.sin.sin_port) + continue; + if (memcmp(&local->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)) != 0) + continue; + goto found_local; + + default: + BUG(); + } + } + + read_unlock_bh(&rxrpc_local_lock); + + /* we didn't find one, so we need to create one */ + local = rxrpc_alloc_local(srx); + if (!local) { + up_write(&rxrpc_local_sem); + return ERR_PTR(-ENOMEM); + } + + ret = rxrpc_create_local(local); + if (ret < 0) { + up_write(&rxrpc_local_sem); + kfree(local); + _leave(" = %d", ret); + return ERR_PTR(ret); + } + + up_write(&rxrpc_local_sem); + + _net("LOCAL new %d {%d,%u,%u.%u.%u.%u+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + NIPQUAD(local->srx.transport.sin.sin_addr), + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [new]", local); + return local; + +found_local: + rxrpc_get_local(local); + read_unlock_bh(&rxrpc_local_lock); + up_write(&rxrpc_local_sem); + + _net("LOCAL old %d {%d,%u,%u.%u.%u.%u+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + NIPQUAD(local->srx.transport.sin.sin_addr), + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [reuse]", local); + return local; +} + +/* + * release a local endpoint + */ +void rxrpc_put_local(struct rxrpc_local *local) +{ + _enter("%p{u=%d}", local, atomic_read(&local->usage)); + + ASSERTCMP(atomic_read(&local->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock_bh(&rxrpc_local_lock); + if (unlikely(atomic_dec_and_test(&local->usage))) { + _debug("destroy local"); + rxrpc_queue_work(&local->destroyer); + } + write_unlock_bh(&rxrpc_local_lock); + _leave(""); +} + +/* + * destroy a local endpoint + */ +static void rxrpc_destroy_local(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, destroyer); + + _enter("%p{%d}", local, atomic_read(&local->usage)); + + down_write(&rxrpc_local_sem); + + write_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) { + write_unlock_bh(&rxrpc_local_lock); + up_write(&rxrpc_local_sem); + _leave(" [resurrected]"); + return; + } + + list_del(&local->link); + local->socket->sk->sk_user_data = NULL; + write_unlock_bh(&rxrpc_local_lock); + + downgrade_write(&rxrpc_local_sem); + + ASSERT(list_empty(&local->services)); + ASSERT(!work_pending(&local->acceptor)); + ASSERT(!work_pending(&local->rejecter)); + + /* finish cleaning up the local descriptor */ + rxrpc_purge_queue(&local->accept_queue); + rxrpc_purge_queue(&local->reject_queue); + local->socket->ops->shutdown(local->socket, 2); + sock_release(local->socket); + + up_read(&rxrpc_local_sem); + + _net("DESTROY LOCAL %d", local->debug_id); + kfree(local); + + if (list_empty(&rxrpc_locals)) + wake_up_all(&rxrpc_local_wq); + + _leave(""); +} + +/* + * preemptively destroy all local endpoints rather than waiting for + * them to be destroyed + */ +void __exit rxrpc_destroy_all_locals(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if
(!list_empty(&rxrpc_locals)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_local_wq, &myself); + + while (!list_empty(&rxrpc_locals)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_local_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c new file mode 100644 index 00000000000..591c4422205 --- /dev/null +++ b/net/rxrpc/ar-output.c @@ -0,0 +1,734 @@ +/* RxRPC packet transmission + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/circ_buf.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +int rxrpc_resend_timeout = 4; + +static int rxrpc_send_data(struct kiocb *iocb, + struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len); + +/* + * extract control messages from the sendmsg() control buffer + */ +static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long *user_call_ID, + enum rxrpc_command *command, + u32 *abort_code, + bool server) +{ + struct cmsghdr *cmsg; + int len; + + *command = RXRPC_CMD_SEND_DATA; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_USER_CALL_ID: + if (msg->msg_flags & MSG_CMSG_COMPAT) { + if (len != sizeof(u32)) + return -EINVAL; + *user_call_ID = *(u32 *) CMSG_DATA(cmsg); + } else { + if (len != sizeof(unsigned long)) + return -EINVAL; + *user_call_ID = *(unsigned long *) + CMSG_DATA(cmsg); + } + _debug("User Call ID %lx", *user_call_ID); + break; + + case RXRPC_ABORT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_SEND_ABORT; + if (len != sizeof(*abort_code)) + return -EINVAL; + *abort_code = *(unsigned int *) CMSG_DATA(cmsg); + _debug("Abort %x", *abort_code); + if (*abort_code == 0) + return -EINVAL; + break; + + case RXRPC_ACCEPT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_ACCEPT; + if (len != 0) + return -EINVAL; + if (!server) + return -EISCONN; + break; + + default: + return -EINVAL; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * abort a call, sending an ABORT packet to the peer + */ +static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = abort_code; + set_bit(RXRPC_CALL_ABORT, &call->events); + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + rxrpc_queue_call(call); + } + + write_unlock_bh(&call->state_lock); +} + +/* + * send a message forming part of a client call through an RxRPC 
socket + * - caller holds the socket locked + * - the socket may be either a client socket or a server socket + */ +int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, + struct rxrpc_transport *trans, struct msghdr *msg, + size_t len) +{ + struct rxrpc_conn_bundle *bundle; + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + struct key *key; + __be16 service_id; + u32 abort_code = 0; + int ret; + + _enter(""); + + ASSERT(trans != NULL); + + ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, + false); + if (ret < 0) + return ret; + + bundle = NULL; + if (trans) { + service_id = rx->service_id; + if (msg->msg_name) { + struct sockaddr_rxrpc *srx = + (struct sockaddr_rxrpc *) msg->msg_name; + service_id = htons(srx->srx_service); + } + key = rx->key; + if (key && !rx->key->payload.data) + key = NULL; + bundle = rxrpc_get_bundle(rx, trans, key, service_id, + GFP_KERNEL); + if (IS_ERR(bundle)) + return PTR_ERR(bundle); + } + + call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, + abort_code == 0, GFP_KERNEL); + if (trans) + rxrpc_put_bundle(trans, bundle); + if (IS_ERR(call)) { + _leave(" = %ld", PTR_ERR(call)); + return PTR_ERR(call); + } + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + /* it's too late for this call */ + ret = -ESHUTDOWN; + } else if (cmd == RXRPC_CMD_SEND_ABORT) { + rxrpc_send_abort(call, abort_code); + } else if (cmd != RXRPC_CMD_SEND_DATA) { + ret = -EINVAL; + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + /* request phase complete for this client call */ + ret = -EPROTO; + } else { + ret = rxrpc_send_data(iocb, rx, call, msg, len); + } + + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @call: The call to send data through + * @msg: The data to send + * @len: The amount of data to send + * + * Allow a kernel service to send data on a call. The call must be in a state + * appropriate to sending data. No control data should be supplied in @msg, + * nor should an address be supplied. MSG_MORE should be flagged if there's + * more data to come, otherwise this data will end the transmission phase.
+ */ +int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, + size_t len) +{ + int ret; + + _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + + ASSERTCMP(msg->msg_name, ==, NULL); + ASSERTCMP(msg->msg_control, ==, NULL); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; /* it's too late for this call */ + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + ret = -EPROTO; /* request phase complete for this client call */ + } else { + mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = rxrpc_send_data(NULL, call->socket, call, msg, len); + set_fs(oldfs); + } + + release_sock(&call->socket->sk); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_send_data); + +/* + * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @call: The call to be aborted + * @abort_code: The abort code to stick into the ABORT packet + * + * Allow a kernel service to abort a call, if it's still in an abortable state. + */ +void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + _enter("{%d},%d", call->debug_id, abort_code); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_send_abort(call, abort_code); + + release_sock(&call->socket->sk); + _leave(""); +} + +EXPORT_SYMBOL(rxrpc_kernel_abort_call); + +/* + * send a message through a server socket + * - caller holds the socket locked + */ +int rxrpc_server_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, + struct msghdr *msg, size_t len) +{ + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + u32 abort_code = 0; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, + true); + if (ret < 0) + return ret; + + if (cmd == RXRPC_CMD_ACCEPT) { + call = rxrpc_accept_call(rx, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call); + return 0; + } + + call = rxrpc_find_server_call(rx, user_call_ID); + if (!call) + return -EBADSLT; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; + goto out; + } + + switch (cmd) { + case RXRPC_CMD_SEND_DATA: + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Tx phase not yet begun for this call */ + ret = -EPROTO; + break; + } + + ret = rxrpc_send_data(iocb, rx, call, msg, len); + break; + + case RXRPC_CMD_SEND_ABORT: + rxrpc_send_abort(call, abort_code); + break; + default: + BUG(); + } + + out: + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * send a packet through the transport endpoint + */ +int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) +{ + struct kvec iov[1]; + struct msghdr msg; + int ret, opt; + + _enter(",{%d}", skb->len); + + iov[0].iov_base = skb->head; + iov[0].iov_len = skb->len; + + msg.msg_name = &trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + /* send the packet with the don't fragment bit set if we currently + * think it's 
small enough */ + if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) { + down_read(&trans->local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + up_read(&trans->local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + + _leave(" = %d [%u]", ret, trans->peer->maxdata); + return ret; + } + +send_fragmentable: + /* attempt to send this message with fragmentation enabled */ + _debug("send fragment"); + + down_write(&trans->local->defrag_sem); + opt = IP_PMTUDISC_DONT; + ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + opt = IP_PMTUDISC_DO; + kernel_setsockopt(trans->local->socket, SOL_IP, + IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); + } + + up_write(&trans->local->defrag_sem); + _leave(" = %d [frag %u]", ret, trans->peer->maxdata); + return ret; +} + +/* + * wait for space to appear in the transmit/ACK window + * - caller holds the socket locked + */ +static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret; + + _enter(",{%d},%ld", + CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz), + *timeo); + + add_wait_queue(&call->tx_waitq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = 0; + if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + release_sock(&rx->sk); + *timeo = schedule_timeout(*timeo); + lock_sock(&rx->sk); + } + + remove_wait_queue(&call->tx_waitq, &myself); + set_current_state(TASK_RUNNING); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to schedule an instant Tx resend + */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call) +{ + read_lock_bh(&call->state_lock); + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * queue a packet for transmission, set the resend timer and attempt + * to send the packet immediately + */ +static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool last) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; + + _net("queue skb %p [%d]", skb, call->acks_head); + + ASSERT(call->acks_window != NULL); + call->acks_window[call->acks_head] = (unsigned long) skb; + smp_wmb(); + call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + _debug("________awaiting reply/ACK__________"); + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_ACK_REQUEST: + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + if (!last) + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock_bh(&call->state_lock); + } + + _proto("Tx 
DATA %%%u { #%u }", + ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + + sp->need_resend = 0; + sp->resend_at = jiffies + rxrpc_resend_timeout * HZ; + if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { + _debug("run timer"); + call->resend_timer.expires = sp->resend_at; + add_timer(&call->resend_timer); + } + + /* attempt to cancel the rx-ACK timer, deferring reply transmission if + * we're ACK'ing the request phase of an incoming call */ + ret = -EAGAIN; + if (try_to_del_timer_sync(&call->ack_timer) >= 0) { + /* the packet may be freed by rxrpc_process_call() before this + * returns */ + ret = rxrpc_send_packet(call->conn->trans, skb); + _net("sent skb %p", skb); + } else { + _debug("failed to delete ACK timer"); + } + + if (ret < 0) { + _debug("need instant resend %d", ret); + sp->need_resend = 1; + rxrpc_instant_resend(call); + } + + _leave(""); +} + +/* + * send data through a socket + * - must be called in process context + * - caller holds the socket locked + */ +static int rxrpc_send_data(struct kiocb *iocb, + struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + struct rxrpc_skb_priv *sp; + unsigned char __user *from; + struct sk_buff *skb; + struct iovec *iov; + struct sock *sk = &rx->sk; + long timeo; + bool more; + int ret, ioc, segment, copied; + + _enter(",,,{%zu},%zu", msg->msg_iovlen, len); + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + /* this should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + return -EPIPE; + + iov = msg->msg_iov; + ioc = msg->msg_iovlen - 1; + from = iov->iov_base; + segment = iov->iov_len; + iov++; + more = msg->msg_flags & MSG_MORE; + + skb = call->tx_pending; + call->tx_pending = NULL; + + copied = 0; + do { + int copy; + + if (segment > len) + segment = len; + + _debug("SEGMENT %d @%p", segment, from); + + if (!skb) { + size_t size, chunk, max, space; + + _debug("alloc"); + + if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) <= 0) { + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + ret = rxrpc_wait_for_tx_window(rx, call, + &timeo); + if (ret < 0) + goto maybe_error; + } + + max = call->conn->trans->peer->maxdata; + max -= call->conn->security_size; + max &= ~(call->conn->size_align - 1UL); + + chunk = max; + if (chunk > len && !more) + chunk = len; + + space = chunk + call->conn->size_align; + space &= ~(call->conn->size_align - 1UL); + + size = space + call->conn->header_size; + + _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + + /* create a buffer that we can retain until it's ACK'd */ + skb = sock_alloc_send_skb( + sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto maybe_error; + + rxrpc_new_skb(skb); + + _debug("ALLOC SEND %p", skb); + + ASSERTCMP(skb->mark, ==, 0); + + _debug("HS: %u", call->conn->header_size); + skb_reserve(skb, call->conn->header_size); + skb->len += call->conn->header_size; + + sp = rxrpc_skb(skb); + sp->remain = chunk; + if (sp->remain > skb_tailroom(skb)) + sp->remain = skb_tailroom(skb); + + _net("skb: hr %d, tr %d, hl %d, rm %d", + skb_headroom(skb), + skb_tailroom(skb), + skb_headlen(skb), + sp->remain); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + _debug("append"); + sp = rxrpc_skb(skb); + + /* append next segment of data to the current buffer */ + copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > segment) + copy = segment; + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret 
= skb_add_data(skb, from, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + + len -= copy; + segment -= copy; + from += copy; + while (segment == 0 && ioc > 0) { + from = iov->iov_base; + segment = iov->iov_len; + iov++; + ioc--; + } + if (len == 0) { + segment = 0; + ioc = 0; + } + + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state > RXRPC_CALL_COMPLETE) + goto call_aborted; + + /* add the packet to the send queue if it's now full */ + if (sp->remain <= 0 || (segment == 0 && !more)) { + struct rxrpc_connection *conn = call->conn; + size_t pad; + + /* pad out if we're using security */ + if (conn->security) { + pad = conn->security_size + skb->mark; + pad = conn->size_align - pad; + pad &= conn->size_align - 1; + _debug("pad %zu", pad); + if (pad) + memset(skb_put(skb, pad), 0, pad); + } + + sp->hdr.epoch = conn->epoch; + sp->hdr.cid = call->cid; + sp->hdr.callNumber = call->call_id; + sp->hdr.seq = + htonl(atomic_inc_return(&call->sequence)); + sp->hdr.serial = + htonl(atomic_inc_return(&conn->serial)); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.userStatus = 0; + sp->hdr.securityIndex = conn->security_ix; + sp->hdr._rsvd = 0; + sp->hdr.serviceId = conn->service_id; + + sp->hdr.flags = conn->out_clientflag; + if (len == 0 && !more) + sp->hdr.flags |= RXRPC_LAST_PACKET; + else if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) > 1) + sp->hdr.flags |= RXRPC_MORE_PACKETS; + + ret = rxrpc_secure_packet( + call, skb, skb->mark, + skb->head + sizeof(struct rxrpc_header)); + if (ret < 0) + goto out; + + memcpy(skb->head, &sp->hdr, + sizeof(struct rxrpc_header)); + rxrpc_queue_packet(call, skb, segment == 0 && !more); + skb = NULL; + } + + } while (segment > 0); + +out: + call->tx_pending = skb; + _leave(" = %d", ret); + return ret; + +call_aborted: + rxrpc_free_skb(skb); + if (call->state == RXRPC_CALL_NETWORK_ERROR) + ret = call->conn->trans->peer->net_error; + else + ret = -ECONNABORTED; + _leave(" = %d", ret); + return ret; + +maybe_error: + if (copied) + ret = copied; + goto out; + +efault: + ret = -EFAULT; + goto out; +} diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c new file mode 100644 index 00000000000..ce08b78647c --- /dev/null +++ b/net/rxrpc/ar-peer.c @@ -0,0 +1,316 @@ +/* RxRPC remote transport endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include <net/ip.h> +#include <net/route.h> +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_peers); +static DEFINE_RWLOCK(rxrpc_peer_lock); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq); + +static void rxrpc_destroy_peer(struct work_struct *work); + +/* + * assess the MTU size for the network interface through which this peer is + * reached + */ +static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) +{ + struct rtable *rt; + struct flowi fl; + int ret; + + peer->if_mtu = 1500; + + memset(&fl, 0, sizeof(fl)); + + switch (peer->srx.transport.family) { + case AF_INET: + fl.oif = 0; + fl.proto = IPPROTO_UDP; + fl.nl_u.ip4_u.saddr = 0; + fl.nl_u.ip4_u.daddr = peer->srx.transport.sin.sin_addr.s_addr; + fl.nl_u.ip4_u.tos = 0; + /* assume AFS.CM talking to AFS.FS */ + fl.uli_u.ports.sport = htons(7001); + fl.uli_u.ports.dport = htons(7000); + break; + default: + BUG(); + } + + ret = ip_route_output_key(&rt, &fl); + if (ret < 0) { + kleave(" [route err %d]", ret); + return; + } + + peer->if_mtu = dst_mtu(&rt->u.dst); + dst_release(&rt->u.dst); + + kleave(" [if_mtu %u]", peer->if_mtu); +} + +/* + * allocate a new peer + */ +static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = kzalloc(sizeof(struct rxrpc_peer), gfp); + if (peer) { + INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer); + INIT_LIST_HEAD(&peer->link); + INIT_LIST_HEAD(&peer->error_targets); + spin_lock_init(&peer->lock); + atomic_set(&peer->usage, 1); + peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&peer->srx, srx, sizeof(*srx)); + + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (srx->transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (srx->transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_header); + peer->maxdata = peer->mtu - peer->hdrsize; + } + + _leave(" = %p", peer); + return peer; +} + +/* + * obtain a remote transport endpoint for the specified address + */ +struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp) +{ + struct rxrpc_peer *peer, *candidate; + const char *new = "old"; + int usage; + + _enter("{%d,%d,%u.%u.%u.%u+%hu}", + srx->transport_type, + srx->transport_len, + NIPQUAD(srx->transport.sin.sin_addr), + ntohs(srx->transport.sin.sin_port)); + + /* search the peer list first */ + read_lock_bh(&rxrpc_peer_lock); + list_for_each_entry(peer, &rxrpc_peers, link) { + _debug("check PEER %d { u=%d t=%d l=%d }", + peer->debug_id, + atomic_read(&peer->usage), + peer->srx.transport_type, + peer->srx.transport_len); + + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_peer; + } + read_unlock_bh(&rxrpc_peer_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_peer(srx, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_peer_lock); + + list_for_each_entry(peer, &rxrpc_peers, link) { + 
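/*
 * [Editor's note.] This second scan is the standard optimistic-allocation
 * pattern: the first search ran under the read lock, the candidate peer
 * was allocated with no lock held, and the list must be re-checked under
 * the write lock in case another thread inserted the same peer in the
 * meantime; the found_extant_second path below then discards the unneeded
 * candidate with kfree().
 */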
if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + peer = candidate; + candidate = NULL; + + list_add_tail(&peer->link, &rxrpc_peers); + write_unlock_bh(&rxrpc_peer_lock); + new = "new"; + +success: + _net("PEER %s %d {%d,%u,%u.%u.%u.%u+%hu}", + new, + peer->debug_id, + peer->srx.transport_type, + peer->srx.transport.family, + NIPQUAD(peer->srx.transport.sin.sin_addr), + ntohs(peer->srx.transport.sin.sin_port)); + + _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); + return peer; + + /* we found the peer in the list immediately */ +found_extant_peer: + usage = atomic_inc_return(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + goto success; + + /* we found the peer on the second time through the list */ +found_extant_second: + usage = atomic_inc_return(&peer->usage); + write_unlock_bh(&rxrpc_peer_lock); + kfree(candidate); + goto success; +} + +/* + * find the peer associated with a packet + */ +struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local, + __be32 addr, __be16 port) +{ + struct rxrpc_peer *peer; + + _enter(""); + + /* search the peer list */ + read_lock_bh(&rxrpc_peer_lock); + + if (local->srx.transport.family == AF_INET && + local->srx.transport_type == SOCK_DGRAM + ) { + list_for_each_entry(peer, &rxrpc_peers, link) { + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == SOCK_DGRAM && + peer->srx.transport.family == AF_INET && + peer->srx.transport.sin.sin_port == port && + peer->srx.transport.sin.sin_addr.s_addr == addr) + goto found_UDP_peer; + } + + goto new_UDP_peer; + } + + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EAFNOSUPPORT"); + return ERR_PTR(-EAFNOSUPPORT); + +found_UDP_peer: + _net("Rx UDP DGRAM from peer %d", peer->debug_id); + atomic_inc(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = %p", peer); + return peer; + +new_UDP_peer: + _net("Rx UDP DGRAM from NEW peer"); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EBUSY [new]"); + return ERR_PTR(-EBUSY); +} + +/* + * release a remote transport endpoint + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + _enter("%p{u=%d}", peer, atomic_read(&peer->usage)); + + ASSERTCMP(atomic_read(&peer->usage), >, 0); + + if (likely(!atomic_dec_and_test(&peer->usage))) { + _leave(" [in use]"); + return; + } + + rxrpc_queue_work(&peer->destroyer); + _leave(""); +} + +/* + * destroy a remote transport endpoint + */ +static void rxrpc_destroy_peer(struct work_struct *work) +{ + struct rxrpc_peer *peer = + container_of(work, struct rxrpc_peer, destroyer); + + _enter("%p{%d}", peer, atomic_read(&peer->usage)); + + write_lock_bh(&rxrpc_peer_lock); + list_del(&peer->link); + write_unlock_bh(&rxrpc_peer_lock); + + _net("DESTROY PEER %d", peer->debug_id); + kfree(peer); + + if (list_empty(&rxrpc_peers)) + wake_up_all(&rxrpc_peer_wq); + _leave(""); +} + +/* + * preemptively destroy all the peer records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_peers(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if (!list_empty(&rxrpc_peers)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_peer_wq, &myself); + + while (!list_empty(&rxrpc_peers)) { + schedule(); + 
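/*
 * [Editor's note.] The task must mark itself TASK_UNINTERRUPTIBLE again on
 * every pass because schedule() returns with the task back in TASK_RUNNING
 * state once rxrpc_destroy_peer() has done wake_up_all() on rxrpc_peer_wq;
 * the loop exits only when the peer list has fully drained.
 */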
set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_peer_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c new file mode 100644 index 00000000000..58f4b4e5cec --- /dev/null +++ b/net/rxrpc/ar-proc.c @@ -0,0 +1,247 @@ +/* /proc/net/ support for AF_RXRPC + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static const char *rxrpc_conn_states[] = { + [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ", + [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ", + [RXRPC_CONN_SERVER] = "SvSecure", + [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_NETWORK_ERROR] = "NetError", +}; + +const char *rxrpc_call_states[] = { + [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", + [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", + [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", + [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_SECURING] = "SvSecure", + [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", + [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", + [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", + [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", + [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", + [RXRPC_CALL_COMPLETE] = "Complete", + [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", + [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CALL_NETWORK_ERROR] = "NetError", + [RXRPC_CALL_DEAD] = "Dead ", +}; + +/* + * generate a list of extant and dead calls in /proc/net/rxrpc_calls + */ +static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + read_lock(&rxrpc_call_lock); + if (!pos) + return SEQ_START_TOKEN; + pos--; + + list_for_each(_p, &rxrpc_calls) + if (!pos--) + break; + + return _p != &rxrpc_calls ? _p : NULL; +} + +static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_calls.next : _p->next; + + return _p != &rxrpc_calls ? 
_p : NULL; +} + +static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_call_lock); +} + +static int rxrpc_call_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_transport *trans; + struct rxrpc_call *call; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID CallID End Use State Abort " + " UserID\n"); + return 0; + } + + call = list_entry(v, struct rxrpc_call, link); + trans = call->conn->trans; + + sprintf(lbuff, NIPQUAD_FMT":%u", + NIPQUAD(trans->local->srx.transport.sin.sin_addr), + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, NIPQUAD_FMT":%u", + NIPQUAD(trans->peer->srx.transport.sin.sin_addr), + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %-8.8s %08x %lx\n", + lbuff, + rbuff, + ntohs(call->conn->service_id), + ntohl(call->conn->cid), + ntohl(call->call_id), + call->conn->in_clientflag ? "Svc" : "Clt", + atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->abort_code, + call->user_call_ID); + + return 0; +} + +static struct seq_operations rxrpc_call_seq_ops = { + .start = rxrpc_call_seq_start, + .next = rxrpc_call_seq_next, + .stop = rxrpc_call_seq_stop, + .show = rxrpc_call_seq_show, +}; + +static int rxrpc_call_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_call_seq_ops); +} + +struct file_operations rxrpc_call_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_call_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * generate a list of extant virtual connections in /proc/net/rxrpc_conns + */ +static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + read_lock(&rxrpc_connection_lock); + if (!pos) + return SEQ_START_TOKEN; + pos--; + + list_for_each(_p, &rxrpc_connections) + if (!pos--) + break; + + return _p != &rxrpc_connections ? _p : NULL; +} + +static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_connections.next : _p->next; + + return _p != &rxrpc_connections ? _p : NULL; +} + +static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_connection_lock); +} + +static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID Calls End Use State Key " + " Serial ISerial\n" + ); + return 0; + } + + conn = list_entry(v, struct rxrpc_connection, link); + trans = conn->trans; + + sprintf(lbuff, NIPQUAD_FMT":%u", + NIPQUAD(trans->local->srx.transport.sin.sin_addr), + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, NIPQUAD_FMT":%u", + NIPQUAD(trans->peer->srx.transport.sin.sin_addr), + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %s %08x %08x %08x\n", + lbuff, + rbuff, + ntohs(conn->service_id), + ntohl(conn->cid), + conn->call_counter, + conn->in_clientflag ? 
"Svc" : "Clt", + atomic_read(&conn->usage), + rxrpc_conn_states[conn->state], + key_serial(conn->key), + atomic_read(&conn->serial), + atomic_read(&conn->hi_serial)); + + return 0; +} + +static struct seq_operations rxrpc_connection_seq_ops = { + .start = rxrpc_connection_seq_start, + .next = rxrpc_connection_seq_next, + .stop = rxrpc_connection_seq_stop, + .show = rxrpc_connection_seq_show, +}; + + +static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_connection_seq_ops); +} + +struct file_operations rxrpc_connection_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_connection_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c new file mode 100644 index 00000000000..f19121d4795 --- /dev/null +++ b/net/rxrpc/ar-recvmsg.c @@ -0,0 +1,437 @@ +/* RxRPC recvmsg() implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +/* + * removal a call's user ID from the socket tree to make the user ID available + * again and so that it won't be seen again in association with that call + */ +void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + _debug("RELEASE CALL %d", call->debug_id); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + write_lock_bh(&rx->call_lock); + rb_erase(&call->sock_node, &call->socket->calls); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + write_unlock_bh(&rx->call_lock); + } + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call = NULL, *continue_call = NULL; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + long timeo; + int copy, ret, ullen, offset, copied = 0; + u32 abort_code; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 
4 : sizeof(unsigned long); + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + msg->msg_flags |= MSG_MORE; + + lock_sock(&rx->sk); + + for (;;) { + /* return immediately if a client socket has no outstanding + * calls */ + if (RB_EMPTY_ROOT(&rx->calls)) { + if (copied) + goto out; + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + return -ENODATA; + } + } + + /* get the next message on the Rx queue */ + skb = skb_peek(&rx->sk.sk_receive_queue); + if (!skb) { + /* nothing remains on the queue */ + if (copied && + (msg->msg_flags & MSG_PEEK || timeo == 0)) + goto out; + + /* wait for a message to turn up */ + release_sock(&rx->sk); + prepare_to_wait_exclusive(rx->sk.sk_sleep, &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (skb_queue_empty(&rx->sk.sk_receive_queue)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); + } + finish_wait(rx->sk.sk_sleep, &wait); + lock_sock(&rx->sk); + continue; + } + + peek_next_packet: + sp = rxrpc_skb(skb); + call = sp->call; + ASSERT(call != NULL); + + _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); + + /* make sure we wait for the state to be updated in this call */ + spin_lock_bh(&call->lock); + spin_unlock_bh(&call->lock); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + _debug("packet from released call"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + continue; + } + + /* determine whether to continue last data receive */ + if (continue_call) { + _debug("maybe cont"); + if (call != continue_call || + skb->mark != RXRPC_SKB_MARK_DATA) { + release_sock(&rx->sk); + rxrpc_put_call(continue_call); + _leave(" = %d [noncont]", copied); + return copied; + } + } + + rxrpc_get_call(call); + + /* copy the peer address and timestamp */ + if (!continue_call) { + if (msg->msg_name && msg->msg_namelen > 0) + memcpy(msg->msg_name, &call->conn->trans->peer->srx, + sizeof(call->conn->trans->peer->srx)); + sock_recv_timestamp(msg, &rx->sk, skb); + } + + /* receive the message */ + if (skb->mark != RXRPC_SKB_MARK_DATA) + goto receive_non_data_message; + + _debug("recvmsg DATA #%u { %d, %d }", + ntohl(sp->hdr.seq), skb->len, sp->offset); + + if (!continue_call) { + /* only set the control data once per recvmsg() */ + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + } + + ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); + ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); + call->rx_data_recv = ntohl(sp->hdr.seq); + + ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + + offset = sp->offset; + copy = skb->len - offset; + if (copy > len - copied) + copy = len - copied; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + ret = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, copy); + } else { + ret = skb_copy_and_csum_datagram_iovec(skb, offset, + msg->msg_iov); + if (ret == -EINVAL) + goto csum_copy_error; + } + + if (ret < 0) + goto copy_error; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d+%d", copy, copied); + + offset += copy; + copied += copy; + + if (!(flags & MSG_PEEK)) + sp->offset = offset; + + if (sp->offset < skb->len) { + _debug("buffer full"); + ASSERTCMP(copied, ==, len); + break; + } + + /* we transferred the whole data packet */ + if (sp->hdr.flags &
RXRPC_LAST_PACKET) { + _debug("last"); + if (call->conn->out_clientflag) { + /* last byte of reply received */ + ret = copied; + goto terminal_message; + } + + /* last bit of request received */ + if (!(flags & MSG_PEEK)) { + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != + skb) + BUG(); + rxrpc_free_skb(skb); + } + msg->msg_flags &= ~MSG_MORE; + break; + } + + /* move on to the next data message */ + _debug("next"); + if (!continue_call) + continue_call = sp->call; + else + rxrpc_put_call(call); + call = NULL; + + if (flags & MSG_PEEK) { + _debug("peek next"); + skb = skb->next; + if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) + break; + goto peek_next_packet; + } + + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + + /* end of non-terminal data packet reception for the moment */ + _debug("end rcv data"); +out: + release_sock(&rx->sk); + if (call) + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d [data]", copied); + return copied; + + /* handle non-DATA messages such as aborts, incoming connections and + * final ACKs */ +receive_non_data_message: + _debug("non-data"); + + if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { + _debug("RECV NEW CALL"); + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); + if (ret < 0) + goto copy_error; + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + goto out; + } + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + BUG(); + case RXRPC_SKB_MARK_FINAL_ACK: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); + break; + case RXRPC_SKB_MARK_BUSY: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_NET_ERROR: + _debug("RECV NET ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ERROR: + _debug("RECV LOCAL ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, + &abort_code); + break; + default: + BUG(); + break; + } + + if (ret < 0) + goto copy_error; + +terminal_message: + _debug("terminal"); + msg->msg_flags &= ~MSG_MORE; + msg->msg_flags |= MSG_EOR; + + if (!(flags & MSG_PEEK)) { + _net("free terminal skb %p", skb); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + rxrpc_remove_user_ID(rx, call); + } + + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +copy_error: + _debug("copy error"); + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +csum_copy_error: + _debug("csum error"); + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + rxrpc_kill_skb(skb); + skb_kill_datagram(&rx->sk, skb, flags); + rxrpc_put_call(call); + return -EAGAIN; + +wait_interrupted: + ret = sock_intr_errno(timeo); +wait_error: + finish_wait(rx->sk.sk_sleep, &wait); + if (continue_call) + rxrpc_put_call(continue_call); + if 
(copied) + copied = ret; + _leave(" = %d [waitfail %d]", copied, ret); + return copied; + +} + +/** + * rxrpc_kernel_data_delivered - Record delivery of data message + * @skb: Message holding data + * + * Record the delivery of a data message. This permits RxRPC to keep its + * tracking correct. The socket buffer will be deleted. + */ +void rxrpc_kernel_data_delivered(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); + ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); + call->rx_data_recv = ntohl(sp->hdr.seq); + + ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + rxrpc_free_skb(skb); +} + +EXPORT_SYMBOL(rxrpc_kernel_data_delivered); + +/** + * rxrpc_kernel_is_data_last - Determine if data message is last one + * @skb: Message holding data + * + * Determine if data message is last one for the parent call. + */ +bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + + return sp->hdr.flags & RXRPC_LAST_PACKET; +} + +EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + +/** + * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message + * @skb: Message indicating an abort + * + * Get the abort code from an RxRPC abort message. + */ +u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT); + + return sp->call->abort_code; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); + +/** + * rxrpc_kernel_get_error_number - Get the error number from an RxRPC error message + * @skb: Message indicating an error + * + * Get the error number from an RxRPC error message. + */ +int rxrpc_kernel_get_error_number(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + return sp->error; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_error_number); diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c new file mode 100644 index 00000000000..60d1d364430 --- /dev/null +++ b/net/rxrpc/ar-security.c @@ -0,0 +1,258 @@ +/* RxRPC security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/udp.h> +#include <linux/crypto.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_security_methods); +static DECLARE_RWSEM(rxrpc_security_sem); + +/* + * get an RxRPC security module + */ +static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec) +{ + return try_module_get(sec->owner) ?
sec : NULL; +} + +/* + * release an RxRPC security module + */ +static void rxrpc_security_put(struct rxrpc_security *sec) +{ + module_put(sec->owner); +} + +/* + * look up an rxrpc security module + */ +struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +{ + struct rxrpc_security *sec = NULL; + + _enter(""); + + down_read(&rxrpc_security_sem); + + list_for_each_entry(sec, &rxrpc_security_methods, link) { + if (sec->security_index == security_index) { + if (unlikely(!rxrpc_security_get(sec))) + break; + goto out; + } + } + + sec = NULL; +out: + up_read(&rxrpc_security_sem); + _leave(" = %p [%s]", sec, sec ? sec->name : ""); + return sec; +} + +/** + * rxrpc_register_security - register an RxRPC security handler + * @sec: security module + * + * register an RxRPC security handler for use by RxRPC + */ +int rxrpc_register_security(struct rxrpc_security *sec) +{ + struct rxrpc_security *psec; + int ret; + + _enter(""); + down_write(&rxrpc_security_sem); + + ret = -EEXIST; + list_for_each_entry(psec, &rxrpc_security_methods, link) { + if (psec->security_index == sec->security_index) + goto out; + } + + list_add(&sec->link, &rxrpc_security_methods); + + printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n", + sec->security_index, sec->name); + ret = 0; + +out: + up_write(&rxrpc_security_sem); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL_GPL(rxrpc_register_security); + +/** + * rxrpc_unregister_security - unregister an RxRPC security handler + * @sec: security module + * + * unregister an RxRPC security handler + */ +void rxrpc_unregister_security(struct rxrpc_security *sec) +{ + + _enter(""); + down_write(&rxrpc_security_sem); + list_del_init(&sec->link); + up_write(&rxrpc_security_sem); + + printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n", + sec->security_index, sec->name); +} + +EXPORT_SYMBOL_GPL(rxrpc_unregister_security); + +/* + * initialise the security on a client connection + */ +int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) +{ + struct rxrpc_security *sec; + struct key *key = conn->key; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(key)); + + if (!key) + return 0; + + ret = key_validate(key); + if (ret < 0) + return ret; + + sec = rxrpc_security_lookup(key->type_data.x[0]); + if (!sec) + return -EKEYREJECTED; + conn->security = sec; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) { + rxrpc_security_put(conn->security); + conn->security = NULL; + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * initialise the security on a server connection + */ +int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +{ + struct rxrpc_security *sec; + struct rxrpc_local *local = conn->trans->local; + struct rxrpc_sock *rx; + struct key *key; + key_ref_t kref; + char kdesc[5+1+3+1]; + + _enter(""); + + sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix); + + sec = rxrpc_security_lookup(conn->security_ix); + if (!sec) { + _leave(" = -ENOKEY [lookup]"); + return -ENOKEY; + } + + /* find the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->service_id == conn->service_id) + goto found_service; + } + + /* the service appears to have died */ + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = -ENOENT"); + return -ENOENT; + +found_service: + if (!rx->securities) { + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = -ENOKEY"); + return 
-ENOKEY; + } + + /* look through the service's keyring */ + kref = keyring_search(make_key_ref(rx->securities, 1UL), + &key_type_rxrpc_s, kdesc); + if (IS_ERR(kref)) { + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = %ld [search]", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + key = key_ref_to_ptr(kref); + read_unlock_bh(&local->services_lock); + + conn->server_key = key; + conn->security = sec; + + _leave(" = 0"); + return 0; +} + +/* + * secure a packet prior to transmission + */ +int rxrpc_secure_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + size_t data_size, + void *sechdr) +{ + if (call->conn->security) + return call->conn->security->secure_packet( + call, skb, data_size, sechdr); + return 0; +} + +/* + * verify the security of a received packet + */ +int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb, + u32 *_abort_code) +{ + if (call->conn->security) + return call->conn->security->verify_packet( + call, skb, _abort_code); + return 0; +} + +/* + * clear connection security + */ +void rxrpc_clear_conn_security(struct rxrpc_connection *conn) +{ + _enter("{%d}", conn->debug_id); + + if (conn->security) { + conn->security->clear(conn); + rxrpc_security_put(conn->security); + conn->security = NULL; + } + + key_put(conn->key); + key_put(conn->server_key); +} diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c new file mode 100644 index 00000000000..de755e04d29 --- /dev/null +++ b/net/rxrpc/ar-skbuff.c @@ -0,0 +1,132 @@ +/* ar-skbuff.c: socket buffer destruction handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +/* + * set up for the ACK at the end of the receive phase when we discard the final + * receive phase data packet + * - called with softirqs disabled + */ +static void rxrpc_request_final_ACK(struct rxrpc_call *call) +{ + /* the call may be aborted before we have a chance to ACK it */ + write_lock(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + call->state = RXRPC_CALL_CLIENT_FINAL_ACK; + _debug("request final ACK"); + + /* get an extra ref on the call for the final-ACK generator to + * release */ + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + if (try_to_del_timer_sync(&call->ack_timer) >= 0) + rxrpc_queue_call(call); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + default: + break; + } + + write_unlock(&call->state_lock); +} + +/* + * drop the bottom ACK off of the call ACK window and advance the window + */ +static void rxrpc_hard_ACK_data(struct rxrpc_call *call, + struct rxrpc_skb_priv *sp) +{ + int loop; + u32 seq; + + spin_lock_bh(&call->lock); + + _debug("hard ACK #%u", ntohl(sp->hdr.seq)); + + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + call->ackr_window[loop] >>= 1; + call->ackr_window[loop] |= + call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); + } + + seq = ntohl(sp->hdr.seq); + ASSERTCMP(seq, ==, call->rx_data_eaten + 1); + call->rx_data_eaten = seq; + + if (call->ackr_win_top < UINT_MAX) + call->ackr_win_top++; + + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_post, >=, call->rx_data_recv); + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_recv, >=, call->rx_data_eaten); + + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + rxrpc_request_final_ACK(call); + } else if (atomic_dec_and_test(&call->ackr_not_idle) && + test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + _debug("send Rx idle ACK"); + __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, + true); + } + + spin_unlock_bh(&call->lock); +} + +/* + * destroy a packet that has an RxRPC control buffer + * - advance the hard-ACK state of the parent call (done here in case something + * in the kernel bypasses recvmsg() and steals the packet directly off of the + * socket receive queue) + */ +void rxrpc_packet_destructor(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + _enter("%p{%p}", skb, call); + + if (call) { + /* send the final ACK on a client call */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + rxrpc_hard_ACK_data(call, sp); + rxrpc_put_call(call); + sp->call = NULL; + } + + if (skb->sk) + sock_rfree(skb); + _leave(""); +} + +/** + * rxrpc_kernel_free_skb - Free an RxRPC socket buffer + * @skb: The socket buffer to be freed + * + * Let RxRPC free its own socket buffer, permitting it to maintain debug + * accounting. + */ +void rxrpc_kernel_free_skb(struct sk_buff *skb) +{ + rxrpc_free_skb(skb); +} + +EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c new file mode 100644 index 00000000000..d43d78f1930 --- /dev/null +++ b/net/rxrpc/ar-transport.c @@ -0,0 +1,276 @@ +/* RxRPC point-to-point transport session management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static void rxrpc_transport_reaper(struct work_struct *work); + +static LIST_HEAD(rxrpc_transports); +static DEFINE_RWLOCK(rxrpc_transport_lock); +static unsigned long rxrpc_transport_timeout = 3600 * 24; +static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); + +/* + * allocate a new transport session manager + */ +static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans; + + _enter(""); + + trans = kzalloc(sizeof(struct rxrpc_transport), gfp); + if (trans) { + trans->local = local; + trans->peer = peer; + INIT_LIST_HEAD(&trans->link); + trans->bundles = RB_ROOT; + trans->client_conns = RB_ROOT; + trans->server_conns = RB_ROOT; + skb_queue_head_init(&trans->error_queue); + spin_lock_init(&trans->client_lock); + rwlock_init(&trans->conn_lock); + atomic_set(&trans->usage, 1); + trans->debug_id = atomic_inc_return(&rxrpc_debug_id); + + if (peer->srx.transport.family == AF_INET) { + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + INIT_WORK(&trans->error_handler, + rxrpc_UDP_error_handler); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + } + + _leave(" = %p", trans); + return trans; +} + +/* + * obtain a transport session for the nominated endpoints + */ +struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans, *candidate; + const char *new = "old"; + int usage; + + _enter("{%u.%u.%u.%u+%hu},{%u.%u.%u.%u+%hu},", + NIPQUAD(local->srx.transport.sin.sin_addr), + ntohs(local->srx.transport.sin.sin_port), + NIPQUAD(peer->srx.transport.sin.sin_addr), + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list first */ + read_lock_bh(&rxrpc_transport_lock); + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + read_unlock_bh(&rxrpc_transport_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_transport(local, peer, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + trans = candidate; + candidate = NULL; + + rxrpc_get_local(trans->local); + atomic_inc(&trans->peer->usage); + list_add_tail(&trans->link, &rxrpc_transports); + write_unlock_bh(&rxrpc_transport_lock); + new = "new"; + +success: + _net("TRANSPORT %s %d local %d -> peer %d", + new, + trans->debug_id, + trans->local->debug_id, + trans->peer->debug_id); + + _leave(" = %p {u=%d}", trans, atomic_read(&trans->usage)); + return trans; + + /* we found the transport in the list immediately */ +found_extant_transport: + usage = atomic_inc_return(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + goto 
success; + + /* we found the transport on the second time through the list */ +found_extant_second: + usage = atomic_inc_return(&trans->usage); + write_unlock_bh(&rxrpc_transport_lock); + kfree(candidate); + goto success; +} + +/* + * find the transport connecting two endpoints + */ +struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer) +{ + struct rxrpc_transport *trans; + + _enter("{%u.%u.%u.%u+%hu},{%u.%u.%u.%u+%hu},", + NIPQUAD(local->srx.transport.sin.sin_addr), + ntohs(local->srx.transport.sin.sin_port), + NIPQUAD(peer->srx.transport.sin.sin_addr), + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list */ + read_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = NULL"); + return NULL; + +found_extant_transport: + atomic_inc(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = %p", trans); + return trans; +} + +/* + * release a transport session + */ +void rxrpc_put_transport(struct rxrpc_transport *trans) +{ + _enter("%p{u=%d}", trans, atomic_read(&trans->usage)); + + ASSERTCMP(atomic_read(&trans->usage), >, 0); + + trans->put_time = xtime.tv_sec; + if (unlikely(atomic_dec_and_test(&trans->usage))) + _debug("zombie"); + /* let the reaper determine the timeout to avoid a race with + * overextending the timeout if the reaper is running at the + * same time */ + rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + _leave(""); +} + +/* + * clean up a transport session + */ +static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) +{ + _net("DESTROY TRANS %d", trans->debug_id); + + rxrpc_purge_queue(&trans->error_queue); + + rxrpc_put_local(trans->local); + rxrpc_put_peer(trans->peer); + kfree(trans); +} + +/* + * reap dead transports that have passed their expiry date + */ +static void rxrpc_transport_reaper(struct work_struct *work) +{ + struct rxrpc_transport *trans, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = xtime.tv_sec; + earliest = ULONG_MAX; + + /* extract all the transports that have been dead too long */ + write_lock_bh(&rxrpc_transport_lock); + list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) { + _debug("reap TRANS %d { u=%d t=%ld }", + trans->debug_id, atomic_read(&trans->usage), + (long) now - (long) trans->put_time); + + if (likely(atomic_read(&trans->usage) > 0)) + continue; + + reap_time = trans->put_time + rxrpc_transport_timeout; + if (reap_time <= now) + list_move_tail(&trans->link, &graveyard); + else if (reap_time < earliest) + earliest = reap_time; + } + write_unlock_bh(&rxrpc_transport_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_transport_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + trans = list_entry(graveyard.next, struct rxrpc_transport, + link); + list_del_init(&trans->link); + + ASSERTCMP(atomic_read(&trans->usage), ==, 0); + rxrpc_cleanup_transport(trans); + } + + _leave(""); +} + +/* + * preemptively destroy all the transport session records rather than waiting + * for them to time out + */ +void __exit rxrpc_destroy_all_transports(void) +{ + _enter(""); + + rxrpc_transport_timeout = 0; + cancel_delayed_work(&rxrpc_transport_reap); 
+ rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + + _leave(""); +} diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c deleted file mode 100644 index d07122b57e0..00000000000 --- a/net/rxrpc/call.c +++ /dev/null @@ -1,2277 +0,0 @@ -/* call.c: Rx call routines - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include "internal.h" - -__RXACCT_DECL(atomic_t rxrpc_call_count); -__RXACCT_DECL(atomic_t rxrpc_message_count); - -LIST_HEAD(rxrpc_calls); -DECLARE_RWSEM(rxrpc_calls_sem); - -unsigned rxrpc_call_rcv_timeout = HZ/3; -static unsigned rxrpc_call_acks_timeout = HZ/3; -static unsigned rxrpc_call_dfr_ack_timeout = HZ/20; -static unsigned short rxrpc_call_max_resend = HZ/10; - -const char *rxrpc_call_states[] = { - "COMPLETE", - "ERROR", - "SRVR_RCV_OPID", - "SRVR_RCV_ARGS", - "SRVR_GOT_ARGS", - "SRVR_SND_REPLY", - "SRVR_RCV_FINAL_ACK", - "CLNT_SND_ARGS", - "CLNT_RCV_REPLY", - "CLNT_GOT_REPLY" -}; - -const char *rxrpc_call_error_states[] = { - "NO_ERROR", - "LOCAL_ABORT", - "PEER_ABORT", - "LOCAL_ERROR", - "REMOTE_ERROR" -}; - -const char *rxrpc_pkts[] = { - "?00", - "data", "ack", "busy", "abort", "ackall", "chall", "resp", "debug", - "?09", "?10", "?11", "?12", "?13", "?14", "?15" -}; - -static const char *rxrpc_acks[] = { - "---", "REQ", "DUP", "SEQ", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", - "-?-" -}; - -static const char _acktype[] = "NA-"; - -static void rxrpc_call_receive_packet(struct rxrpc_call *call); -static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, - struct rxrpc_message *msg); -static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, - struct rxrpc_message *msg); -static void rxrpc_call_definitively_ACK(struct rxrpc_call *call, - rxrpc_seq_t higest); -static void rxrpc_call_resend(struct rxrpc_call *call, rxrpc_seq_t highest); -static int __rxrpc_call_read_data(struct rxrpc_call *call); - -static int rxrpc_call_record_ACK(struct rxrpc_call *call, - struct rxrpc_message *msg, - rxrpc_seq_t seq, - size_t count); - -static int rxrpc_call_flush(struct rxrpc_call *call); - -#define _state(call) \ - _debug("[[[ state %s ]]]", rxrpc_call_states[call->app_call_state]); - -static void rxrpc_call_default_attn_func(struct rxrpc_call *call) -{ - wake_up(&call->waitq); -} - -static void rxrpc_call_default_error_func(struct rxrpc_call *call) -{ - wake_up(&call->waitq); -} - -static void rxrpc_call_default_aemap_func(struct rxrpc_call *call) -{ - switch (call->app_err_state) { - case RXRPC_ESTATE_LOCAL_ABORT: - call->app_abort_code = -call->app_errno; - case RXRPC_ESTATE_PEER_ABORT: - call->app_errno = -ECONNABORTED; - default: - break; - } -} - -static void __rxrpc_call_acks_timeout(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _debug("ACKS TIMEOUT %05lu", jiffies - call->cjif); - - call->flags |= RXRPC_CALL_ACKS_TIMO; - rxrpc_krxiod_queue_call(call); -} - -static void __rxrpc_call_rcv_timeout(unsigned long _call) -{ - struct rxrpc_call *call = (struct 
rxrpc_call *) _call; - - _debug("RCV TIMEOUT %05lu", jiffies - call->cjif); - - call->flags |= RXRPC_CALL_RCV_TIMO; - rxrpc_krxiod_queue_call(call); -} - -static void __rxrpc_call_ackr_timeout(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _debug("ACKR TIMEOUT %05lu",jiffies - call->cjif); - - call->flags |= RXRPC_CALL_ACKR_TIMO; - rxrpc_krxiod_queue_call(call); -} - -/*****************************************************************************/ -/* - * calculate a timeout based on an RTT value - */ -static inline unsigned long __rxrpc_rtt_based_timeout(struct rxrpc_call *call, - unsigned long val) -{ - unsigned long expiry = call->conn->peer->rtt / (1000000 / HZ); - - expiry += 10; - if (expiry < HZ / 25) - expiry = HZ / 25; - if (expiry > HZ) - expiry = HZ; - - _leave(" = %lu jiffies", expiry); - return jiffies + expiry; -} /* end __rxrpc_rtt_based_timeout() */ - -/*****************************************************************************/ -/* - * create a new call record - */ -static inline int __rxrpc_create_call(struct rxrpc_connection *conn, - struct rxrpc_call **_call) -{ - struct rxrpc_call *call; - - _enter("%p", conn); - - /* allocate and initialise a call record */ - call = (struct rxrpc_call *) get_zeroed_page(GFP_KERNEL); - if (!call) { - _leave(" ENOMEM"); - return -ENOMEM; - } - - atomic_set(&call->usage, 1); - - init_waitqueue_head(&call->waitq); - spin_lock_init(&call->lock); - INIT_LIST_HEAD(&call->link); - INIT_LIST_HEAD(&call->acks_pendq); - INIT_LIST_HEAD(&call->rcv_receiveq); - INIT_LIST_HEAD(&call->rcv_krxiodq_lk); - INIT_LIST_HEAD(&call->app_readyq); - INIT_LIST_HEAD(&call->app_unreadyq); - INIT_LIST_HEAD(&call->app_link); - INIT_LIST_HEAD(&call->app_attn_link); - - init_timer(&call->acks_timeout); - call->acks_timeout.data = (unsigned long) call; - call->acks_timeout.function = __rxrpc_call_acks_timeout; - - init_timer(&call->rcv_timeout); - call->rcv_timeout.data = (unsigned long) call; - call->rcv_timeout.function = __rxrpc_call_rcv_timeout; - - init_timer(&call->ackr_dfr_timo); - call->ackr_dfr_timo.data = (unsigned long) call; - call->ackr_dfr_timo.function = __rxrpc_call_ackr_timeout; - - call->conn = conn; - call->ackr_win_bot = 1; - call->ackr_win_top = call->ackr_win_bot + RXRPC_CALL_ACK_WINDOW_SIZE - 1; - call->ackr_prev_seq = 0; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_attn_func = rxrpc_call_default_attn_func; - call->app_error_func = rxrpc_call_default_error_func; - call->app_aemap_func = rxrpc_call_default_aemap_func; - call->app_scr_alloc = call->app_scratch; - - call->cjif = jiffies; - - _leave(" = 0 (%p)", call); - - *_call = call; - - return 0; -} /* end __rxrpc_create_call() */ - -/*****************************************************************************/ -/* - * create a new call record for outgoing calls - */ -int rxrpc_create_call(struct rxrpc_connection *conn, - rxrpc_call_attn_func_t attn, - rxrpc_call_error_func_t error, - rxrpc_call_aemap_func_t aemap, - struct rxrpc_call **_call) -{ - DECLARE_WAITQUEUE(myself, current); - - struct rxrpc_call *call; - int ret, cix, loop; - - _enter("%p", conn); - - /* allocate and initialise a call record */ - ret = __rxrpc_create_call(conn, &call); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - call->app_call_state = RXRPC_CSTATE_CLNT_SND_ARGS; - if (attn) - call->app_attn_func = attn; - if (error) - call->app_error_func = error; - if (aemap) - call->app_aemap_func = aemap; - - _state(call); - - spin_lock(&conn->lock); - 
set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&conn->chanwait, &myself); - - try_again: - /* try to find an unused channel */ - for (cix = 0; cix < 4; cix++) - if (!conn->channels[cix]) - goto obtained_chan; - - /* no free channels - wait for one to become available */ - ret = -EINTR; - if (signal_pending(current)) - goto error_unwait; - - spin_unlock(&conn->lock); - - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock(&conn->lock); - goto try_again; - - /* got a channel - now attach to the connection */ - obtained_chan: - remove_wait_queue(&conn->chanwait, &myself); - set_current_state(TASK_RUNNING); - - /* concoct a unique call number */ - next_callid: - call->call_id = htonl(++conn->call_counter); - for (loop = 0; loop < 4; loop++) - if (conn->channels[loop] && - conn->channels[loop]->call_id == call->call_id) - goto next_callid; - - rxrpc_get_connection(conn); - conn->channels[cix] = call; /* assign _after_ done callid check loop */ - do_gettimeofday(&conn->atime); - call->chan_ix = htonl(cix); - - spin_unlock(&conn->lock); - - down_write(&rxrpc_calls_sem); - list_add_tail(&call->call_link, &rxrpc_calls); - up_write(&rxrpc_calls_sem); - - __RXACCT(atomic_inc(&rxrpc_call_count)); - *_call = call; - - _leave(" = 0 (call=%p cix=%u)", call, cix); - return 0; - - error_unwait: - remove_wait_queue(&conn->chanwait, &myself); - set_current_state(TASK_RUNNING); - spin_unlock(&conn->lock); - - free_page((unsigned long) call); - _leave(" = %d", ret); - return ret; -} /* end rxrpc_create_call() */ - -/*****************************************************************************/ -/* - * create a new call record for incoming calls - */ -int rxrpc_incoming_call(struct rxrpc_connection *conn, - struct rxrpc_message *msg, - struct rxrpc_call **_call) -{ - struct rxrpc_call *call; - unsigned cix; - int ret; - - cix = ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK; - - _enter("%p,%u,%u", conn, ntohl(msg->hdr.callNumber), cix); - - /* allocate and initialise a call record */ - ret = __rxrpc_create_call(conn, &call); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - call->pkt_rcv_count = 1; - call->app_call_state = RXRPC_CSTATE_SRVR_RCV_OPID; - call->app_mark = sizeof(uint32_t); - - _state(call); - - /* attach to the connection */ - ret = -EBUSY; - call->chan_ix = htonl(cix); - call->call_id = msg->hdr.callNumber; - - spin_lock(&conn->lock); - - if (!conn->channels[cix] || - conn->channels[cix]->app_call_state == RXRPC_CSTATE_COMPLETE || - conn->channels[cix]->app_call_state == RXRPC_CSTATE_ERROR - ) { - conn->channels[cix] = call; - rxrpc_get_connection(conn); - ret = 0; - } - - spin_unlock(&conn->lock); - - if (ret < 0) { - free_page((unsigned long) call); - call = NULL; - } - - if (ret == 0) { - down_write(&rxrpc_calls_sem); - list_add_tail(&call->call_link, &rxrpc_calls); - up_write(&rxrpc_calls_sem); - __RXACCT(atomic_inc(&rxrpc_call_count)); - *_call = call; - } - - _leave(" = %d [%p]", ret, call); - return ret; -} /* end rxrpc_incoming_call() */ - -/*****************************************************************************/ -/* - * free a call record - */ -void rxrpc_put_call(struct rxrpc_call *call) -{ - struct rxrpc_connection *conn = call->conn; - struct rxrpc_message *msg; - - _enter("%p{u=%d}",call,atomic_read(&call->usage)); - - /* sanity check */ - if (atomic_read(&call->usage) <= 0) - BUG(); - - /* to prevent a race, the decrement and the de-list must be effectively - * atomic */ - spin_lock(&conn->lock); - if (likely(!atomic_dec_and_test(&call->usage))) { 
- spin_unlock(&conn->lock); - _leave(""); - return; - } - - if (conn->channels[ntohl(call->chan_ix)] == call) - conn->channels[ntohl(call->chan_ix)] = NULL; - - spin_unlock(&conn->lock); - - wake_up(&conn->chanwait); - - rxrpc_put_connection(conn); - - /* clear the timers and dequeue from krxiod */ - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - - rxrpc_krxiod_dequeue_call(call); - - /* clean up the contents of the struct */ - if (call->snd_nextmsg) - rxrpc_put_message(call->snd_nextmsg); - - if (call->snd_ping) - rxrpc_put_message(call->snd_ping); - - while (!list_empty(&call->acks_pendq)) { - msg = list_entry(call->acks_pendq.next, - struct rxrpc_message, link); - list_del(&msg->link); - rxrpc_put_message(msg); - } - - while (!list_empty(&call->rcv_receiveq)) { - msg = list_entry(call->rcv_receiveq.next, - struct rxrpc_message, link); - list_del(&msg->link); - rxrpc_put_message(msg); - } - - while (!list_empty(&call->app_readyq)) { - msg = list_entry(call->app_readyq.next, - struct rxrpc_message, link); - list_del(&msg->link); - rxrpc_put_message(msg); - } - - while (!list_empty(&call->app_unreadyq)) { - msg = list_entry(call->app_unreadyq.next, - struct rxrpc_message, link); - list_del(&msg->link); - rxrpc_put_message(msg); - } - - module_put(call->owner); - - down_write(&rxrpc_calls_sem); - list_del(&call->call_link); - up_write(&rxrpc_calls_sem); - - __RXACCT(atomic_dec(&rxrpc_call_count)); - free_page((unsigned long) call); - - _leave(" [destroyed]"); -} /* end rxrpc_put_call() */ - -/*****************************************************************************/ -/* - * actually generate a normal ACK - */ -static inline int __rxrpc_call_gen_normal_ACK(struct rxrpc_call *call, - rxrpc_seq_t seq) -{ - struct rxrpc_message *msg; - struct kvec diov[3]; - __be32 aux[4]; - int delta, ret; - - /* ACKs default to DELAY */ - if (!call->ackr.reason) - call->ackr.reason = RXRPC_ACK_DELAY; - - _proto("Rx %05lu Sending ACK { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - jiffies - call->cjif, - ntohs(call->ackr.maxSkew), - ntohl(call->ackr.firstPacket), - ntohl(call->ackr.previousPacket), - ntohl(call->ackr.serial), - rxrpc_acks[call->ackr.reason], - call->ackr.nAcks); - - aux[0] = htonl(call->conn->peer->if_mtu); /* interface MTU */ - aux[1] = htonl(1444); /* max MTU */ - aux[2] = htonl(16); /* rwind */ - aux[3] = htonl(4); /* max packets */ - - diov[0].iov_len = sizeof(struct rxrpc_ackpacket); - diov[0].iov_base = &call->ackr; - diov[1].iov_len = call->ackr_pend_cnt + 3; - diov[1].iov_base = call->ackr_array; - diov[2].iov_len = sizeof(aux); - diov[2].iov_base = &aux; - - /* build and send the message */ - ret = rxrpc_conn_newmsg(call->conn,call, RXRPC_PACKET_TYPE_ACK, - 3, diov, GFP_KERNEL, &msg); - if (ret < 0) - goto out; - - msg->seq = seq; - msg->hdr.seq = htonl(seq); - msg->hdr.flags |= RXRPC_SLOW_START_OK; - - ret = rxrpc_conn_sendmsg(call->conn, msg); - rxrpc_put_message(msg); - if (ret < 0) - goto out; - call->pkt_snd_count++; - - /* count how many actual ACKs there were at the front */ - for (delta = 0; delta < call->ackr_pend_cnt; delta++) - if (call->ackr_array[delta] != RXRPC_ACK_TYPE_ACK) - break; - - call->ackr_pend_cnt -= delta; /* all ACK'd to this point */ - - /* crank the ACK window around */ - if (delta == 0) { - /* un-ACK'd window */ - } - else if (delta < RXRPC_CALL_ACK_WINDOW_SIZE) { - /* partially ACK'd window - * - shuffle down to avoid losing out-of-sequence packets - */ - call->ackr_win_bot += delta; - 
call->ackr_win_top += delta; - - memmove(&call->ackr_array[0], - &call->ackr_array[delta], - call->ackr_pend_cnt); - - memset(&call->ackr_array[call->ackr_pend_cnt], - RXRPC_ACK_TYPE_NACK, - sizeof(call->ackr_array) - call->ackr_pend_cnt); - } - else { - /* fully ACK'd window - * - just clear the whole thing - */ - memset(&call->ackr_array, - RXRPC_ACK_TYPE_NACK, - sizeof(call->ackr_array)); - } - - /* clear this ACK */ - memset(&call->ackr, 0, sizeof(call->ackr)); - - out: - if (!call->app_call_state) - printk("___ STATE 0 ___\n"); - return ret; -} /* end __rxrpc_call_gen_normal_ACK() */ - -/*****************************************************************************/ -/* - * note the reception of a packet in the call's ACK records and generate an - * appropriate ACK packet if necessary - * - returns 0 if packet should be processed, 1 if packet should be ignored - * and -ve on an error - */ -static int rxrpc_call_generate_ACK(struct rxrpc_call *call, - struct rxrpc_header *hdr, - struct rxrpc_ackpacket *ack) -{ - struct rxrpc_message *msg; - rxrpc_seq_t seq; - unsigned offset; - int ret = 0, err; - u8 special_ACK, do_ACK, force; - - _enter("%p,%p { seq=%d tp=%d fl=%02x }", - call, hdr, ntohl(hdr->seq), hdr->type, hdr->flags); - - seq = ntohl(hdr->seq); - offset = seq - call->ackr_win_bot; - do_ACK = RXRPC_ACK_DELAY; - special_ACK = 0; - force = (seq == 1); - - if (call->ackr_high_seq < seq) - call->ackr_high_seq = seq; - - /* deal with generation of obvious special ACKs first */ - if (ack && ack->reason == RXRPC_ACK_PING) { - special_ACK = RXRPC_ACK_PING_RESPONSE; - ret = 1; - goto gen_ACK; - } - - if (seq < call->ackr_win_bot) { - special_ACK = RXRPC_ACK_DUPLICATE; - ret = 1; - goto gen_ACK; - } - - if (seq >= call->ackr_win_top) { - special_ACK = RXRPC_ACK_EXCEEDS_WINDOW; - ret = 1; - goto gen_ACK; - } - - if (call->ackr_array[offset] != RXRPC_ACK_TYPE_NACK) { - special_ACK = RXRPC_ACK_DUPLICATE; - ret = 1; - goto gen_ACK; - } - - /* okay... it's a normal data packet inside the ACK window */ - call->ackr_array[offset] = RXRPC_ACK_TYPE_ACK; - - if (offset < call->ackr_pend_cnt) { - } - else if (offset > call->ackr_pend_cnt) { - do_ACK = RXRPC_ACK_OUT_OF_SEQUENCE; - call->ackr_pend_cnt = offset; - goto gen_ACK; - } - - if (hdr->flags & RXRPC_REQUEST_ACK) { - do_ACK = RXRPC_ACK_REQUESTED; - } - - /* generate an ACK on the final packet of a reply just received */ - if (hdr->flags & RXRPC_LAST_PACKET) { - if (call->conn->out_clientflag) - force = 1; - } - else if (!(hdr->flags & RXRPC_MORE_PACKETS)) { - do_ACK = RXRPC_ACK_REQUESTED; - } - - /* re-ACK packets previously received out-of-order */ - for (offset++; offset < RXRPC_CALL_ACK_WINDOW_SIZE; offset++) - if (call->ackr_array[offset] != RXRPC_ACK_TYPE_ACK) - break; - - call->ackr_pend_cnt = offset; - - /* generate an ACK if we fill up the window */ - if (call->ackr_pend_cnt >= RXRPC_CALL_ACK_WINDOW_SIZE) - force = 1; - - gen_ACK: - _debug("%05lu ACKs pend=%u norm=%s special=%s%s", - jiffies - call->cjif, - call->ackr_pend_cnt, - rxrpc_acks[do_ACK], - rxrpc_acks[special_ACK], - force ? " immediate" : - do_ACK == RXRPC_ACK_REQUESTED ? " merge-req" : - hdr->flags & RXRPC_LAST_PACKET ? 
" finalise" : - " defer" - ); - - /* send any pending normal ACKs if need be */ - if (call->ackr_pend_cnt > 0) { - /* fill out the appropriate form */ - call->ackr.bufferSpace = htons(RXRPC_CALL_ACK_WINDOW_SIZE); - call->ackr.maxSkew = htons(min(call->ackr_high_seq - seq, - 65535U)); - call->ackr.firstPacket = htonl(call->ackr_win_bot); - call->ackr.previousPacket = call->ackr_prev_seq; - call->ackr.serial = hdr->serial; - call->ackr.nAcks = call->ackr_pend_cnt; - - if (do_ACK == RXRPC_ACK_REQUESTED) - call->ackr.reason = do_ACK; - - /* generate the ACK immediately if necessary */ - if (special_ACK || force) { - err = __rxrpc_call_gen_normal_ACK( - call, do_ACK == RXRPC_ACK_DELAY ? 0 : seq); - if (err < 0) { - ret = err; - goto out; - } - } - } - - if (call->ackr.reason == RXRPC_ACK_REQUESTED) - call->ackr_dfr_seq = seq; - - /* start the ACK timer if not running if there are any pending deferred - * ACKs */ - if (call->ackr_pend_cnt > 0 && - call->ackr.reason != RXRPC_ACK_REQUESTED && - !timer_pending(&call->ackr_dfr_timo) - ) { - unsigned long timo; - - timo = rxrpc_call_dfr_ack_timeout + jiffies; - - _debug("START ACKR TIMER for cj=%lu", timo - call->cjif); - - spin_lock(&call->lock); - mod_timer(&call->ackr_dfr_timo, timo); - spin_unlock(&call->lock); - } - else if ((call->ackr_pend_cnt == 0 || - call->ackr.reason == RXRPC_ACK_REQUESTED) && - timer_pending(&call->ackr_dfr_timo) - ) { - /* stop timer if no pending ACKs */ - _debug("CLEAR ACKR TIMER"); - del_timer_sync(&call->ackr_dfr_timo); - } - - /* send a special ACK if one is required */ - if (special_ACK) { - struct rxrpc_ackpacket ack; - struct kvec diov[2]; - uint8_t acks[1] = { RXRPC_ACK_TYPE_ACK }; - - /* fill out the appropriate form */ - ack.bufferSpace = htons(RXRPC_CALL_ACK_WINDOW_SIZE); - ack.maxSkew = htons(min(call->ackr_high_seq - seq, - 65535U)); - ack.firstPacket = htonl(call->ackr_win_bot); - ack.previousPacket = call->ackr_prev_seq; - ack.serial = hdr->serial; - ack.reason = special_ACK; - ack.nAcks = 0; - - _proto("Rx Sending s-ACK" - " { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks[ack.reason], - ack.nAcks); - - diov[0].iov_len = sizeof(struct rxrpc_ackpacket); - diov[0].iov_base = &ack; - diov[1].iov_len = sizeof(acks); - diov[1].iov_base = acks; - - /* build and send the message */ - err = rxrpc_conn_newmsg(call->conn,call, RXRPC_PACKET_TYPE_ACK, - hdr->seq ? 
2 : 1, diov, - GFP_KERNEL, - &msg); - if (err < 0) { - ret = err; - goto out; - } - - msg->seq = seq; - msg->hdr.seq = htonl(seq); - msg->hdr.flags |= RXRPC_SLOW_START_OK; - - err = rxrpc_conn_sendmsg(call->conn, msg); - rxrpc_put_message(msg); - if (err < 0) { - ret = err; - goto out; - } - call->pkt_snd_count++; - } - - out: - if (hdr->seq) - call->ackr_prev_seq = hdr->seq; - - _leave(" = %d", ret); - return ret; -} /* end rxrpc_call_generate_ACK() */ - -/*****************************************************************************/ -/* - * handle work to be done on a call - * - includes packet reception and timeout processing - */ -void rxrpc_call_do_stuff(struct rxrpc_call *call) -{ - _enter("%p{flags=%lx}", call, call->flags); - - /* handle packet reception */ - if (call->flags & RXRPC_CALL_RCV_PKT) { - _debug("- receive packet"); - call->flags &= ~RXRPC_CALL_RCV_PKT; - rxrpc_call_receive_packet(call); - } - - /* handle overdue ACKs */ - if (call->flags & RXRPC_CALL_ACKS_TIMO) { - _debug("- overdue ACK timeout"); - call->flags &= ~RXRPC_CALL_ACKS_TIMO; - rxrpc_call_resend(call, call->snd_seq_count); - } - - /* handle lack of reception */ - if (call->flags & RXRPC_CALL_RCV_TIMO) { - _debug("- reception timeout"); - call->flags &= ~RXRPC_CALL_RCV_TIMO; - rxrpc_call_abort(call, -EIO); - } - - /* handle deferred ACKs */ - if (call->flags & RXRPC_CALL_ACKR_TIMO || - (call->ackr.nAcks > 0 && call->ackr.reason == RXRPC_ACK_REQUESTED) - ) { - _debug("- deferred ACK timeout: cj=%05lu r=%s n=%u", - jiffies - call->cjif, - rxrpc_acks[call->ackr.reason], - call->ackr.nAcks); - - call->flags &= ~RXRPC_CALL_ACKR_TIMO; - - if (call->ackr.nAcks > 0 && - call->app_call_state != RXRPC_CSTATE_ERROR) { - /* generate ACK */ - __rxrpc_call_gen_normal_ACK(call, call->ackr_dfr_seq); - call->ackr_dfr_seq = 0; - } - } - - _leave(""); - -} /* end rxrpc_call_do_stuff() */ - -/*****************************************************************************/ -/* - * send an abort message at call or connection level - * - must be called with call->lock held - * - the supplied error code is sent as the packet data - */ -static int __rxrpc_call_abort(struct rxrpc_call *call, int errno) -{ - struct rxrpc_connection *conn = call->conn; - struct rxrpc_message *msg; - struct kvec diov[1]; - int ret; - __be32 _error; - - _enter("%p{%08x},%p{%d},%d", - conn, ntohl(conn->conn_id), call, ntohl(call->call_id), errno); - - /* if this call is already aborted, then just wake up any waiters */ - if (call->app_call_state == RXRPC_CSTATE_ERROR) { - spin_unlock(&call->lock); - call->app_error_func(call); - _leave(" = 0"); - return 0; - } - - rxrpc_get_call(call); - - /* change the state _with_ the lock still held */ - call->app_call_state = RXRPC_CSTATE_ERROR; - call->app_err_state = RXRPC_ESTATE_LOCAL_ABORT; - call->app_errno = errno; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_read_buf = NULL; - call->app_async_read = 0; - - _state(call); - - /* ask the app to translate the error code */ - call->app_aemap_func(call); - - spin_unlock(&call->lock); - - /* flush any outstanding ACKs */ - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - - if (rxrpc_call_is_ack_pending(call)) - __rxrpc_call_gen_normal_ACK(call, 0); - - /* send the abort packet only if we actually traded some other - * packets */ - ret = 0; - if (call->pkt_snd_count || call->pkt_rcv_count) { - /* actually send the abort */ - _proto("Rx Sending Call ABORT { data=%d }", - call->app_abort_code); - - 
_error = htonl(call->app_abort_code); - - diov[0].iov_len = sizeof(_error); - diov[0].iov_base = &_error; - - ret = rxrpc_conn_newmsg(conn, call, RXRPC_PACKET_TYPE_ABORT, - 1, diov, GFP_KERNEL, &msg); - if (ret == 0) { - ret = rxrpc_conn_sendmsg(conn, msg); - rxrpc_put_message(msg); - } - } - - /* tell the app layer to let go */ - call->app_error_func(call); - - rxrpc_put_call(call); - - _leave(" = %d", ret); - return ret; -} /* end __rxrpc_call_abort() */ - -/*****************************************************************************/ -/* - * send an abort message at call or connection level - * - the supplied error code is sent as the packet data - */ -int rxrpc_call_abort(struct rxrpc_call *call, int error) -{ - spin_lock(&call->lock); - - return __rxrpc_call_abort(call, error); - -} /* end rxrpc_call_abort() */ - -/*****************************************************************************/ -/* - * process packets waiting for this call - */ -static void rxrpc_call_receive_packet(struct rxrpc_call *call) -{ - struct rxrpc_message *msg; - struct list_head *_p; - - _enter("%p", call); - - rxrpc_get_call(call); /* must not go away too soon if aborted by - * app-layer */ - - while (!list_empty(&call->rcv_receiveq)) { - /* try to get next packet */ - _p = NULL; - spin_lock(&call->lock); - if (!list_empty(&call->rcv_receiveq)) { - _p = call->rcv_receiveq.next; - list_del_init(_p); - } - spin_unlock(&call->lock); - - if (!_p) - break; - - msg = list_entry(_p, struct rxrpc_message, link); - - _proto("Rx %05lu Received %s packet (%%%u,#%u,%c%c%c%c%c)", - jiffies - call->cjif, - rxrpc_pkts[msg->hdr.type], - ntohl(msg->hdr.serial), - msg->seq, - msg->hdr.flags & RXRPC_JUMBO_PACKET ? 'j' : '-', - msg->hdr.flags & RXRPC_MORE_PACKETS ? 'm' : '-', - msg->hdr.flags & RXRPC_LAST_PACKET ? 'l' : '-', - msg->hdr.flags & RXRPC_REQUEST_ACK ? 'r' : '-', - msg->hdr.flags & RXRPC_CLIENT_INITIATED ? 'C' : 'S' - ); - - switch (msg->hdr.type) { - /* deal with data packets */ - case RXRPC_PACKET_TYPE_DATA: - /* ACK the packet if necessary */ - switch (rxrpc_call_generate_ACK(call, &msg->hdr, - NULL)) { - case 0: /* useful packet */ - rxrpc_call_receive_data_packet(call, msg); - break; - case 1: /* duplicate or out-of-window packet */ - break; - default: - rxrpc_put_message(msg); - goto out; - } - break; - - /* deal with ACK packets */ - case RXRPC_PACKET_TYPE_ACK: - rxrpc_call_receive_ack_packet(call, msg); - break; - - /* deal with abort packets */ - case RXRPC_PACKET_TYPE_ABORT: { - __be32 _dbuf, *dp; - - dp = skb_header_pointer(msg->pkt, msg->offset, - sizeof(_dbuf), &_dbuf); - if (dp == NULL) - printk("Rx Received short ABORT packet\n"); - - _proto("Rx Received Call ABORT { data=%d }", - (dp ? ntohl(*dp) : 0)); - - spin_lock(&call->lock); - call->app_call_state = RXRPC_CSTATE_ERROR; - call->app_err_state = RXRPC_ESTATE_PEER_ABORT; - call->app_abort_code = (dp ? 
ntohl(*dp) : 0); - call->app_errno = -ECONNABORTED; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_read_buf = NULL; - call->app_async_read = 0; - - /* ask the app to translate the error code */ - call->app_aemap_func(call); - _state(call); - spin_unlock(&call->lock); - call->app_error_func(call); - break; - } - default: - /* deal with other packet types */ - _proto("Rx Unsupported packet type %u (#%u)", - msg->hdr.type, msg->seq); - break; - } - - rxrpc_put_message(msg); - } - - out: - rxrpc_put_call(call); - _leave(""); -} /* end rxrpc_call_receive_packet() */ - -/*****************************************************************************/ -/* - * process next data packet - * - as the next data packet arrives: - * - it is queued on app_readyq _if_ it is the next one expected - * (app_ready_seq+1) - * - it is queued on app_unreadyq _if_ it is not the next one expected - * - if a packet placed on app_readyq completely fills a hole leading up to - * the first packet on app_unreadyq, then packets now in sequence are - * tranferred to app_readyq - * - the application layer can only see packets on app_readyq - * (app_ready_qty bytes) - * - the application layer is prodded every time a new packet arrives - */ -static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, - struct rxrpc_message *msg) -{ - const struct rxrpc_operation *optbl, *op; - struct rxrpc_message *pmsg; - struct list_head *_p; - int ret, lo, hi, rmtimo; - __be32 opid; - - _enter("%p{%u},%p{%u}", call, ntohl(call->call_id), msg, msg->seq); - - rxrpc_get_message(msg); - - /* add to the unready queue if we'd have to create a hole in the ready - * queue otherwise */ - if (msg->seq != call->app_ready_seq + 1) { - _debug("Call add packet %d to unreadyq", msg->seq); - - /* insert in seq order */ - list_for_each(_p, &call->app_unreadyq) { - pmsg = list_entry(_p, struct rxrpc_message, link); - if (pmsg->seq > msg->seq) - break; - } - - list_add_tail(&msg->link, _p); - - _leave(" [unreadyq]"); - return; - } - - /* next in sequence - simply append into the call's ready queue */ - _debug("Call add packet %d to readyq (+%Zd => %Zd bytes)", - msg->seq, msg->dsize, call->app_ready_qty); - - spin_lock(&call->lock); - call->app_ready_seq = msg->seq; - call->app_ready_qty += msg->dsize; - list_add_tail(&msg->link, &call->app_readyq); - - /* move unready packets to the readyq if we got rid of a hole */ - while (!list_empty(&call->app_unreadyq)) { - pmsg = list_entry(call->app_unreadyq.next, - struct rxrpc_message, link); - - if (pmsg->seq != call->app_ready_seq + 1) - break; - - /* next in sequence - just move list-to-list */ - _debug("Call transfer packet %d to readyq (+%Zd => %Zd bytes)", - pmsg->seq, pmsg->dsize, call->app_ready_qty); - - call->app_ready_seq = pmsg->seq; - call->app_ready_qty += pmsg->dsize; - list_move_tail(&pmsg->link, &call->app_readyq); - } - - /* see if we've got the last packet yet */ - if (!list_empty(&call->app_readyq)) { - pmsg = list_entry(call->app_readyq.prev, - struct rxrpc_message, link); - if (pmsg->hdr.flags & RXRPC_LAST_PACKET) { - call->app_last_rcv = 1; - _debug("Last packet on readyq"); - } - } - - switch (call->app_call_state) { - /* do nothing if call already aborted */ - case RXRPC_CSTATE_ERROR: - spin_unlock(&call->lock); - _leave(" [error]"); - return; - - /* extract the operation ID from an incoming call if that's not - * yet been done */ - case RXRPC_CSTATE_SRVR_RCV_OPID: - spin_unlock(&call->lock); - - /* handle as yet insufficient data for the operation ID */ - if 
(call->app_ready_qty < 4) { - if (call->app_last_rcv) - /* trouble - last packet seen */ - rxrpc_call_abort(call, -EINVAL); - - _leave(""); - return; - } - - /* pull the operation ID out of the buffer */ - ret = rxrpc_call_read_data(call, &opid, sizeof(opid), 0); - if (ret < 0) { - printk("Unexpected error from read-data: %d\n", ret); - if (call->app_call_state != RXRPC_CSTATE_ERROR) - rxrpc_call_abort(call, ret); - _leave(""); - return; - } - call->app_opcode = ntohl(opid); - - /* locate the operation in the available ops table */ - optbl = call->conn->service->ops_begin; - lo = 0; - hi = call->conn->service->ops_end - optbl; - - while (lo < hi) { - int mid = (hi + lo) / 2; - op = &optbl[mid]; - if (call->app_opcode == op->id) - goto found_op; - if (call->app_opcode > op->id) - lo = mid + 1; - else - hi = mid; - } - - /* search failed */ - kproto("Rx Client requested operation %d from %s service", - call->app_opcode, call->conn->service->name); - rxrpc_call_abort(call, -EINVAL); - _leave(" [inval]"); - return; - - found_op: - _proto("Rx Client requested operation %s from %s service", - op->name, call->conn->service->name); - - /* we're now waiting for the argument block (unless the call - * was aborted) */ - spin_lock(&call->lock); - if (call->app_call_state == RXRPC_CSTATE_SRVR_RCV_OPID || - call->app_call_state == RXRPC_CSTATE_SRVR_SND_REPLY) { - if (!call->app_last_rcv) - call->app_call_state = - RXRPC_CSTATE_SRVR_RCV_ARGS; - else if (call->app_ready_qty > 0) - call->app_call_state = - RXRPC_CSTATE_SRVR_GOT_ARGS; - else - call->app_call_state = - RXRPC_CSTATE_SRVR_SND_REPLY; - call->app_mark = op->asize; - call->app_user = op->user; - } - spin_unlock(&call->lock); - - _state(call); - break; - - case RXRPC_CSTATE_SRVR_RCV_ARGS: - /* change state if just received last packet of arg block */ - if (call->app_last_rcv) - call->app_call_state = RXRPC_CSTATE_SRVR_GOT_ARGS; - spin_unlock(&call->lock); - - _state(call); - break; - - case RXRPC_CSTATE_CLNT_RCV_REPLY: - /* change state if just received last packet of reply block */ - rmtimo = 0; - if (call->app_last_rcv) { - call->app_call_state = RXRPC_CSTATE_CLNT_GOT_REPLY; - rmtimo = 1; - } - spin_unlock(&call->lock); - - if (rmtimo) { - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - } - - _state(call); - break; - - default: - /* deal with data reception in an unexpected state */ - printk("Unexpected state [[[ %u ]]]\n", call->app_call_state); - __rxrpc_call_abort(call, -EBADMSG); - _leave(""); - return; - } - - if (call->app_call_state == RXRPC_CSTATE_CLNT_RCV_REPLY && - call->app_last_rcv) - BUG(); - - /* otherwise just invoke the data function whenever we can satisfy its desire for more - * data - */ - _proto("Rx Received Op Data: st=%u qty=%Zu mk=%Zu%s", - call->app_call_state, call->app_ready_qty, call->app_mark, - call->app_last_rcv ? 
" last-rcvd" : ""); - - spin_lock(&call->lock); - - ret = __rxrpc_call_read_data(call); - switch (ret) { - case 0: - spin_unlock(&call->lock); - call->app_attn_func(call); - break; - case -EAGAIN: - spin_unlock(&call->lock); - break; - case -ECONNABORTED: - spin_unlock(&call->lock); - break; - default: - __rxrpc_call_abort(call, ret); - break; - } - - _state(call); - - _leave(""); - -} /* end rxrpc_call_receive_data_packet() */ - -/*****************************************************************************/ -/* - * received an ACK packet - */ -static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, - struct rxrpc_message *msg) -{ - struct rxrpc_ackpacket _ack, *ap; - rxrpc_serial_net_t serial; - rxrpc_seq_t seq; - int ret; - - _enter("%p{%u},%p{%u}", call, ntohl(call->call_id), msg, msg->seq); - - /* extract the basic ACK record */ - ap = skb_header_pointer(msg->pkt, msg->offset, sizeof(_ack), &_ack); - if (ap == NULL) { - printk("Rx Received short ACK packet\n"); - return; - } - msg->offset += sizeof(_ack); - - serial = ap->serial; - seq = ntohl(ap->firstPacket); - - _proto("Rx Received ACK %%%d { b=%hu m=%hu f=%u p=%u s=%u r=%s n=%u }", - ntohl(msg->hdr.serial), - ntohs(ap->bufferSpace), - ntohs(ap->maxSkew), - seq, - ntohl(ap->previousPacket), - ntohl(serial), - rxrpc_acks[ap->reason], - call->ackr.nAcks - ); - - /* check the other side isn't ACK'ing a sequence number I haven't sent - * yet */ - if (ap->nAcks > 0 && - (seq > call->snd_seq_count || - seq + ap->nAcks - 1 > call->snd_seq_count)) { - printk("Received ACK (#%u-#%u) for unsent packet\n", - seq, seq + ap->nAcks - 1); - rxrpc_call_abort(call, -EINVAL); - _leave(""); - return; - } - - /* deal with RTT calculation */ - if (serial) { - struct rxrpc_message *rttmsg; - - /* find the prompting packet */ - spin_lock(&call->lock); - if (call->snd_ping && call->snd_ping->hdr.serial == serial) { - /* it was a ping packet */ - rttmsg = call->snd_ping; - call->snd_ping = NULL; - spin_unlock(&call->lock); - - if (rttmsg) { - rttmsg->rttdone = 1; - rxrpc_peer_calculate_rtt(call->conn->peer, - rttmsg, msg); - rxrpc_put_message(rttmsg); - } - } - else { - struct list_head *_p; - - /* it ought to be a data packet - look in the pending - * ACK list */ - list_for_each(_p, &call->acks_pendq) { - rttmsg = list_entry(_p, struct rxrpc_message, - link); - if (rttmsg->hdr.serial == serial) { - if (rttmsg->rttdone) - /* never do RTT twice without - * resending */ - break; - - rttmsg->rttdone = 1; - rxrpc_peer_calculate_rtt( - call->conn->peer, rttmsg, msg); - break; - } - } - spin_unlock(&call->lock); - } - } - - switch (ap->reason) { - /* deal with negative/positive acknowledgement of data - * packets */ - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - rxrpc_call_definitively_ACK(call, seq - 1); - - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - call->snd_resend_cnt = 0; - ret = rxrpc_call_record_ACK(call, msg, seq, ap->nAcks); - if (ret < 0) - rxrpc_call_abort(call, ret); - break; - - /* respond to ping packets immediately */ - case RXRPC_ACK_PING: - rxrpc_call_generate_ACK(call, &msg->hdr, ap); - break; - - /* only record RTT on ping response packets */ - case RXRPC_ACK_PING_RESPONSE: - if (call->snd_ping) { - struct rxrpc_message *rttmsg; - - /* only do RTT stuff if the response matches the - * retained ping */ - rttmsg = NULL; - spin_lock(&call->lock); - if (call->snd_ping && - call->snd_ping->hdr.serial == ap->serial) { - rttmsg = call->snd_ping; - 
call->snd_ping = NULL; - } - spin_unlock(&call->lock); - - if (rttmsg) { - rttmsg->rttdone = 1; - rxrpc_peer_calculate_rtt(call->conn->peer, - rttmsg, msg); - rxrpc_put_message(rttmsg); - } - } - break; - - default: - printk("Unsupported ACK reason %u\n", ap->reason); - break; - } - - _leave(""); -} /* end rxrpc_call_receive_ack_packet() */ - -/*****************************************************************************/ -/* - * record definitive ACKs for all messages up to and including the one with the - * 'highest' seq - */ -static void rxrpc_call_definitively_ACK(struct rxrpc_call *call, - rxrpc_seq_t highest) -{ - struct rxrpc_message *msg; - int now_complete; - - _enter("%p{ads=%u},%u", call, call->acks_dftv_seq, highest); - - while (call->acks_dftv_seq < highest) { - call->acks_dftv_seq++; - - _proto("Definitive ACK on packet #%u", call->acks_dftv_seq); - - /* discard those at front of queue until message with highest - * ACK is found */ - spin_lock(&call->lock); - msg = NULL; - if (!list_empty(&call->acks_pendq)) { - msg = list_entry(call->acks_pendq.next, - struct rxrpc_message, link); - list_del_init(&msg->link); /* dequeue */ - if (msg->state == RXRPC_MSG_SENT) - call->acks_pend_cnt--; - } - spin_unlock(&call->lock); - - /* insanity check */ - if (!msg) - panic("%s(): acks_pendq unexpectedly empty\n", - __FUNCTION__); - - if (msg->seq != call->acks_dftv_seq) - panic("%s(): Packet #%u expected at front of acks_pendq" - " (#%u found)\n", - __FUNCTION__, call->acks_dftv_seq, msg->seq); - - /* discard the message */ - msg->state = RXRPC_MSG_DONE; - rxrpc_put_message(msg); - } - - /* if all sent packets are definitively ACK'd then prod any sleepers just in case */ - now_complete = 0; - spin_lock(&call->lock); - if (call->acks_dftv_seq == call->snd_seq_count) { - if (call->app_call_state != RXRPC_CSTATE_COMPLETE) { - call->app_call_state = RXRPC_CSTATE_COMPLETE; - _state(call); - now_complete = 1; - } - } - spin_unlock(&call->lock); - - if (now_complete) { - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - call->app_attn_func(call); - } - - _leave(""); -} /* end rxrpc_call_definitively_ACK() */ - -/*****************************************************************************/ -/* - * record the specified amount of ACKs/NAKs - */ -static int rxrpc_call_record_ACK(struct rxrpc_call *call, - struct rxrpc_message *msg, - rxrpc_seq_t seq, - size_t count) -{ - struct rxrpc_message *dmsg; - struct list_head *_p; - rxrpc_seq_t highest; - unsigned ix; - size_t chunk; - char resend, now_complete; - u8 acks[16]; - - _enter("%p{apc=%u ads=%u},%p,%u,%Zu", - call, call->acks_pend_cnt, call->acks_dftv_seq, - msg, seq, count); - - /* handle re-ACK'ing of definitively ACK'd packets (may be out-of-order - * ACKs) */ - if (seq <= call->acks_dftv_seq) { - unsigned delta = call->acks_dftv_seq - seq; - - if (count <= delta) { - _leave(" = 0 [all definitively ACK'd]"); - return 0; - } - - seq += delta; - count -= delta; - msg->offset += delta; - } - - highest = seq + count - 1; - resend = 0; - while (count > 0) { - /* extract up to 16 ACK slots at a time */ - chunk = min(count, sizeof(acks)); - count -= chunk; - - memset(acks, 2, sizeof(acks)); - - if (skb_copy_bits(msg->pkt, msg->offset, &acks, chunk) < 0) { - printk("Rx Received short ACK packet\n"); - _leave(" = -EINVAL"); - return -EINVAL; - } - msg->offset += chunk; - - /* check that the ACK set is valid */ - for (ix = 0; ix < chunk; ix++) { - switch (acks[ix]) { - case 
RXRPC_ACK_TYPE_ACK: - break; - case RXRPC_ACK_TYPE_NACK: - resend = 1; - break; - default: - printk("Rx Received unsupported ACK state" - " %u\n", acks[ix]); - _leave(" = -EINVAL"); - return -EINVAL; - } - } - - _proto("Rx ACK of packets #%u-#%u " - "[%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c] (pend=%u)", - seq, (unsigned) (seq + chunk - 1), - _acktype[acks[0x0]], - _acktype[acks[0x1]], - _acktype[acks[0x2]], - _acktype[acks[0x3]], - _acktype[acks[0x4]], - _acktype[acks[0x5]], - _acktype[acks[0x6]], - _acktype[acks[0x7]], - _acktype[acks[0x8]], - _acktype[acks[0x9]], - _acktype[acks[0xA]], - _acktype[acks[0xB]], - _acktype[acks[0xC]], - _acktype[acks[0xD]], - _acktype[acks[0xE]], - _acktype[acks[0xF]], - call->acks_pend_cnt - ); - - /* mark the packets in the ACK queue as being provisionally - * ACK'd */ - ix = 0; - spin_lock(&call->lock); - - /* find the first packet ACK'd/NAK'd here */ - list_for_each(_p, &call->acks_pendq) { - dmsg = list_entry(_p, struct rxrpc_message, link); - if (dmsg->seq == seq) - goto found_first; - _debug("- %u: skipping #%u", ix, dmsg->seq); - } - goto bad_queue; - - found_first: - do { - _debug("- %u: processing #%u (%c) apc=%u", - ix, dmsg->seq, _acktype[acks[ix]], - call->acks_pend_cnt); - - if (acks[ix] == RXRPC_ACK_TYPE_ACK) { - if (dmsg->state == RXRPC_MSG_SENT) - call->acks_pend_cnt--; - dmsg->state = RXRPC_MSG_ACKED; - } - else { - if (dmsg->state == RXRPC_MSG_ACKED) - call->acks_pend_cnt++; - dmsg->state = RXRPC_MSG_SENT; - } - ix++; - seq++; - - _p = dmsg->link.next; - dmsg = list_entry(_p, struct rxrpc_message, link); - } while(ix < chunk && - _p != &call->acks_pendq && - dmsg->seq == seq); - - if (ix < chunk) - goto bad_queue; - - spin_unlock(&call->lock); - } - - if (resend) - rxrpc_call_resend(call, highest); - - /* if all packets are provisionally ACK'd, then wake up anyone who's - * waiting for that */ - now_complete = 0; - spin_lock(&call->lock); - if (call->acks_pend_cnt == 0) { - if (call->app_call_state == RXRPC_CSTATE_SRVR_RCV_FINAL_ACK) { - call->app_call_state = RXRPC_CSTATE_COMPLETE; - _state(call); - } - now_complete = 1; - } - spin_unlock(&call->lock); - - if (now_complete) { - _debug("- wake up waiters"); - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - call->app_attn_func(call); - } - - _leave(" = 0 (apc=%u)", call->acks_pend_cnt); - return 0; - - bad_queue: - panic("%s(): acks_pendq in bad state (packet #%u absent)\n", - __FUNCTION__, seq); - -} /* end rxrpc_call_record_ACK() */ - -/*****************************************************************************/ -/* - * transfer data from the ready packet queue to the asynchronous read buffer - * - since this func is the only one going to look at packets queued on - * app_readyq, we don't need a lock to modify or access them, only to modify - * the queue pointers - * - called with call->lock held - * - the buffer must be in kernel space - * - returns: - * 0 if buffer filled - * -EAGAIN if buffer not filled and more data to come - * -EBADMSG if last packet received and insufficient data left - * -ECONNABORTED if the call has in an error state - */ -static int __rxrpc_call_read_data(struct rxrpc_call *call) -{ - struct rxrpc_message *msg; - size_t qty; - int ret; - - _enter("%p{as=%d buf=%p qty=%Zu/%Zu}", - call, - call->app_async_read, call->app_read_buf, - call->app_ready_qty, call->app_mark); - - /* check the state */ - switch (call->app_call_state) { - case RXRPC_CSTATE_SRVR_RCV_ARGS: - case RXRPC_CSTATE_CLNT_RCV_REPLY: - 
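rxrpc_call_record_ACK() above walks the ACK bitmap in windows of at most 16 slots: each window is copied out of the (possibly fragmented) packet, pre-filled with a "no information" marker, validated so that anything other than ACK or NACK rejects the packet, and any NACK schedules a resend pass. A userspace sketch of that windowed walk, with memcpy() standing in for skb_copy_bits(); the numeric slot values are illustrative stand-ins for the RXRPC_ACK_TYPE_* constants:

#include <stdio.h>
#include <string.h>

#define ACK_TYPE_NACK 0
#define ACK_TYPE_ACK  1

static int record_acks(const unsigned char *pkt, size_t count)
{
        unsigned char acks[16];
        int resend = 0;

        while (count > 0) {
                size_t chunk = count < sizeof(acks) ? count : sizeof(acks);
                size_t ix;

                memset(acks, 2, sizeof(acks));  /* 2 = no info, as above */
                memcpy(acks, pkt, chunk);       /* stand-in for skb_copy_bits() */
                pkt += chunk;
                count -= chunk;

                for (ix = 0; ix < chunk; ix++) {
                        switch (acks[ix]) {
                        case ACK_TYPE_ACK:
                                break;
                        case ACK_TYPE_NACK:
                                resend = 1;     /* at least one hole to refill */
                                break;
                        default:
                                return -1;      /* malformed ACK packet */
                        }
                }
        }
        return resend;
}

int main(void)
{
        unsigned char bitmap[20];
        memset(bitmap, ACK_TYPE_ACK, sizeof(bitmap));
        bitmap[17] = ACK_TYPE_NACK;
        printf("resend=%d\n", record_acks(bitmap, sizeof(bitmap)));
        return 0;
}

The fixed 16-byte window bounds stack usage however many packets the ACK covers, which is why the kernel code loops rather than copying the whole bitmap at once.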
if (call->app_last_rcv) { - printk("%s(%p,%p,%Zd):" - " Inconsistent call state (%s, last pkt)", - __FUNCTION__, - call, call->app_read_buf, call->app_mark, - rxrpc_call_states[call->app_call_state]); - BUG(); - } - break; - - case RXRPC_CSTATE_SRVR_RCV_OPID: - case RXRPC_CSTATE_SRVR_GOT_ARGS: - case RXRPC_CSTATE_CLNT_GOT_REPLY: - break; - - case RXRPC_CSTATE_SRVR_SND_REPLY: - if (!call->app_last_rcv) { - printk("%s(%p,%p,%Zd):" - " Inconsistent call state (%s, not last pkt)", - __FUNCTION__, - call, call->app_read_buf, call->app_mark, - rxrpc_call_states[call->app_call_state]); - BUG(); - } - _debug("Trying to read data from call in SND_REPLY state"); - break; - - case RXRPC_CSTATE_ERROR: - _leave(" = -ECONNABORTED"); - return -ECONNABORTED; - - default: - printk("reading in unexpected state [[[ %u ]]]\n", - call->app_call_state); - BUG(); - } - - /* handle the case of not having an async buffer */ - if (!call->app_async_read) { - if (call->app_mark == RXRPC_APP_MARK_EOF) { - ret = call->app_last_rcv ? 0 : -EAGAIN; - } - else { - if (call->app_mark >= call->app_ready_qty) { - call->app_mark = RXRPC_APP_MARK_EOF; - ret = 0; - } - else { - ret = call->app_last_rcv ? -EBADMSG : -EAGAIN; - } - } - - _leave(" = %d [no buf]", ret); - return 0; - } - - while (!list_empty(&call->app_readyq) && call->app_mark > 0) { - msg = list_entry(call->app_readyq.next, - struct rxrpc_message, link); - - /* drag as much data as we need out of this packet */ - qty = min(call->app_mark, msg->dsize); - - _debug("reading %Zu from skb=%p off=%lu", - qty, msg->pkt, msg->offset); - - if (call->app_read_buf) - if (skb_copy_bits(msg->pkt, msg->offset, - call->app_read_buf, qty) < 0) - panic("%s: Failed to copy data from packet:" - " (%p,%p,%Zd)", - __FUNCTION__, - call, call->app_read_buf, qty); - - /* if that packet is now empty, discard it */ - call->app_ready_qty -= qty; - msg->dsize -= qty; - - if (msg->dsize == 0) { - list_del_init(&msg->link); - rxrpc_put_message(msg); - } - else { - msg->offset += qty; - } - - call->app_mark -= qty; - if (call->app_read_buf) - call->app_read_buf += qty; - } - - if (call->app_mark == 0) { - call->app_async_read = 0; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_read_buf = NULL; - - /* adjust the state if used up all packets */ - if (list_empty(&call->app_readyq) && call->app_last_rcv) { - switch (call->app_call_state) { - case RXRPC_CSTATE_SRVR_RCV_OPID: - call->app_call_state = RXRPC_CSTATE_SRVR_SND_REPLY; - call->app_mark = RXRPC_APP_MARK_EOF; - _state(call); - del_timer_sync(&call->rcv_timeout); - break; - case RXRPC_CSTATE_SRVR_GOT_ARGS: - call->app_call_state = RXRPC_CSTATE_SRVR_SND_REPLY; - _state(call); - del_timer_sync(&call->rcv_timeout); - break; - default: - call->app_call_state = RXRPC_CSTATE_COMPLETE; - _state(call); - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->ackr_dfr_timo); - del_timer_sync(&call->rcv_timeout); - break; - } - } - - _leave(" = 0"); - return 0; - } - - if (call->app_last_rcv) { - _debug("Insufficient data (%Zu/%Zu)", - call->app_ready_qty, call->app_mark); - call->app_async_read = 0; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_read_buf = NULL; - - _leave(" = -EBADMSG"); - return -EBADMSG; - } - - _leave(" = -EAGAIN"); - return -EAGAIN; -} /* end __rxrpc_call_read_data() */ - -/*****************************************************************************/ -/* - * attempt to read the specified amount of data from the call's ready queue - * into the buffer provided - * - since this func is the only one going to look at 
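The drain loop above takes exactly app_mark bytes off the front of the ready queue, consuming packets partially where necessary: a partially consumed packet just has its offset advanced and stays at the head. A userspace sketch of that consumption pattern (the packet structure is illustrative):

#include <stdio.h>
#include <string.h>

struct pkt {
        const char *data;
        size_t off, len;
        struct pkt *next;
};

static size_t read_bytes(struct pkt **head, char *buf, size_t mark)
{
        size_t copied = 0;

        while (*head && mark > 0) {
                struct pkt *p = *head;
                size_t qty = mark < p->len ? mark : p->len;

                memcpy(buf + copied, p->data + p->off, qty);
                copied += qty;
                mark -= qty;
                p->off += qty;          /* partial packet: advance, keep */
                p->len -= qty;
                if (p->len == 0)
                        *head = p->next;        /* fully consumed: drop it */
        }
        return copied;
}

int main(void)
{
        struct pkt b = { "world", 0, 5, NULL };
        struct pkt a = { "hello ", 0, 6, &b };
        struct pkt *q = &a;
        char buf[12] = "";

        printf("%zu\n", read_bytes(&q, buf, 8));        /* 8: "hello wo" */
        printf("%zu\n", read_bytes(&q, buf, 8));        /* 3: the rest, "rld" */
        return 0;
}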
packets queued on - * app_readyq, we don't need a lock to modify or access them, only to modify - * the queue pointers - * - if the buffer pointer is NULL, then data is merely drained, not copied - * - if flags&RXRPC_CALL_READ_BLOCK, then the function will wait until there is - * enough data or an error will be generated - * - note that the caller must have added the calling task to the call's wait - * queue beforehand - * - if flags&RXRPC_CALL_READ_ALL, then an error will be generated if this - * function doesn't read all available data - */ -int rxrpc_call_read_data(struct rxrpc_call *call, - void *buffer, size_t size, int flags) -{ - int ret; - - _enter("%p{arq=%Zu},%p,%Zd,%x", - call, call->app_ready_qty, buffer, size, flags); - - spin_lock(&call->lock); - - if (unlikely(!!call->app_read_buf)) { - spin_unlock(&call->lock); - _leave(" = -EBUSY"); - return -EBUSY; - } - - call->app_mark = size; - call->app_read_buf = buffer; - call->app_async_read = 1; - call->app_read_count++; - - /* read as much data as possible */ - ret = __rxrpc_call_read_data(call); - switch (ret) { - case 0: - if (flags & RXRPC_CALL_READ_ALL && - (!call->app_last_rcv || call->app_ready_qty > 0)) { - _leave(" = -EBADMSG"); - __rxrpc_call_abort(call, -EBADMSG); - return -EBADMSG; - } - - spin_unlock(&call->lock); - call->app_attn_func(call); - _leave(" = 0"); - return ret; - - case -ECONNABORTED: - spin_unlock(&call->lock); - _leave(" = %d [aborted]", ret); - return ret; - - default: - __rxrpc_call_abort(call, ret); - _leave(" = %d", ret); - return ret; - - case -EAGAIN: - spin_unlock(&call->lock); - - if (!(flags & RXRPC_CALL_READ_BLOCK)) { - _leave(" = -EAGAIN"); - return -EAGAIN; - } - - /* wait for the data to arrive */ - _debug("blocking for data arrival"); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (!call->app_async_read || signal_pending(current)) - break; - schedule(); - } - set_current_state(TASK_RUNNING); - - if (signal_pending(current)) { - _leave(" = -EINTR"); - return -EINTR; - } - - if (call->app_call_state == RXRPC_CSTATE_ERROR) { - _leave(" = -ECONNABORTED"); - return -ECONNABORTED; - } - - _leave(" = 0"); - return 0; - } - -} /* end rxrpc_call_read_data() */ - -/*****************************************************************************/ -/* - * write data to a call - * - the data may not be sent immediately if it doesn't fill a buffer - * - if we can't queue all the data for buffering now, siov[] will have been - * adjusted to take account of what has been sent - */ -int rxrpc_call_write_data(struct rxrpc_call *call, - size_t sioc, - struct kvec *siov, - u8 rxhdr_flags, - gfp_t alloc_flags, - int dup_data, - size_t *size_sent) -{ - struct rxrpc_message *msg; - struct kvec *sptr; - size_t space, size, chunk, tmp; - char *buf; - int ret; - - _enter("%p,%Zu,%p,%02x,%x,%d,%p", - call, sioc, siov, rxhdr_flags, alloc_flags, dup_data, - size_sent); - - *size_sent = 0; - size = 0; - ret = -EINVAL; - - /* can't send more if we've sent last packet from this end */ - switch (call->app_call_state) { - case RXRPC_CSTATE_SRVR_SND_REPLY: - case RXRPC_CSTATE_CLNT_SND_ARGS: - break; - case RXRPC_CSTATE_ERROR: - ret = call->app_errno; - default: - goto out; - } - - /* calculate how much data we've been given */ - sptr = siov; - for (; sioc > 0; sptr++, sioc--) { - if (!sptr->iov_len) - continue; - - if (!sptr->iov_base) - goto out; - - size += sptr->iov_len; - } - - _debug("- size=%Zu mtu=%Zu", size, call->conn->mtu_size); - - do { - /* make sure there's a message under construction */ - if 
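The blocking branch of rxrpc_call_read_data() above uses the classic lost-wakeup-safe sleep idiom. The task is assumed to already be on the call's wait queue (the function banner makes that the caller's obligation); the essential ordering is that set_current_state() happens *before* the condition is re-tested, so a wakeup racing with the test still leaves the task runnable instead of sleeping forever. A hedged sketch of the shape, with the condition abstracted into a hypothetical predicate:

for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (condition_met() || signal_pending(current)) /* hypothetical test */
                break;
        schedule();             /* sleep until woken or signalled */
}
set_current_state(TASK_RUNNING);

if (signal_pending(current))
        return -EINTR;          /* interrupted: let the caller unwind */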
(!call->snd_nextmsg) { - /* no - allocate a message with no data yet attached */ - ret = rxrpc_conn_newmsg(call->conn, call, - RXRPC_PACKET_TYPE_DATA, - 0, NULL, alloc_flags, - &call->snd_nextmsg); - if (ret < 0) - goto out; - _debug("- allocated new message [ds=%Zu]", - call->snd_nextmsg->dsize); - } - - msg = call->snd_nextmsg; - msg->hdr.flags |= rxhdr_flags; - - /* deal with zero-length terminal packet */ - if (size == 0) { - if (rxhdr_flags & RXRPC_LAST_PACKET) { - ret = rxrpc_call_flush(call); - if (ret < 0) - goto out; - } - break; - } - - /* work out how much space current packet has available */ - space = call->conn->mtu_size - msg->dsize; - chunk = min(space, size); - - _debug("- [before] space=%Zu chunk=%Zu", space, chunk); - - while (!siov->iov_len) - siov++; - - /* if we are going to have to duplicate the data then coalesce - * it too */ - if (dup_data) { - /* don't allocate more that 1 page at a time */ - if (chunk > PAGE_SIZE) - chunk = PAGE_SIZE; - - /* allocate a data buffer and attach to the message */ - buf = kmalloc(chunk, alloc_flags); - if (unlikely(!buf)) { - if (msg->dsize == - sizeof(struct rxrpc_header)) { - /* discard an empty msg and wind back - * the seq counter */ - rxrpc_put_message(msg); - call->snd_nextmsg = NULL; - call->snd_seq_count--; - } - - ret = -ENOMEM; - goto out; - } - - tmp = msg->dcount++; - set_bit(tmp, &msg->dfree); - msg->data[tmp].iov_base = buf; - msg->data[tmp].iov_len = chunk; - msg->dsize += chunk; - *size_sent += chunk; - size -= chunk; - - /* load the buffer with data */ - while (chunk > 0) { - tmp = min(chunk, siov->iov_len); - memcpy(buf, siov->iov_base, tmp); - buf += tmp; - siov->iov_base += tmp; - siov->iov_len -= tmp; - if (!siov->iov_len) - siov++; - chunk -= tmp; - } - } - else { - /* we want to attach the supplied buffers directly */ - while (chunk > 0 && - msg->dcount < RXRPC_MSG_MAX_IOCS) { - tmp = msg->dcount++; - msg->data[tmp].iov_base = siov->iov_base; - msg->data[tmp].iov_len = siov->iov_len; - msg->dsize += siov->iov_len; - *size_sent += siov->iov_len; - size -= siov->iov_len; - chunk -= siov->iov_len; - siov++; - } - } - - _debug("- [loaded] chunk=%Zu size=%Zu", chunk, size); - - /* dispatch the message when full, final or requesting ACK */ - if (msg->dsize >= call->conn->mtu_size || rxhdr_flags) { - ret = rxrpc_call_flush(call); - if (ret < 0) - goto out; - } - - } while(size > 0); - - ret = 0; - out: - _leave(" = %d (%Zd queued, %Zd rem)", ret, *size_sent, size); - return ret; - -} /* end rxrpc_call_write_data() */ - -/*****************************************************************************/ -/* - * flush outstanding packets to the network - */ -static int rxrpc_call_flush(struct rxrpc_call *call) -{ - struct rxrpc_message *msg; - int ret = 0; - - _enter("%p", call); - - rxrpc_get_call(call); - - /* if there's a packet under construction, then dispatch it now */ - if (call->snd_nextmsg) { - msg = call->snd_nextmsg; - call->snd_nextmsg = NULL; - - if (msg->hdr.flags & RXRPC_LAST_PACKET) { - msg->hdr.flags &= ~RXRPC_MORE_PACKETS; - if (call->app_call_state != RXRPC_CSTATE_CLNT_SND_ARGS) - msg->hdr.flags |= RXRPC_REQUEST_ACK; - } - else { - msg->hdr.flags |= RXRPC_MORE_PACKETS; - } - - _proto("Sending DATA message { ds=%Zu dc=%u df=%02lu }", - msg->dsize, msg->dcount, msg->dfree); - - /* queue and adjust call state */ - spin_lock(&call->lock); - list_add_tail(&msg->link, &call->acks_pendq); - - /* decide what to do depending on current state and if this is - * the last packet */ - ret = -EINVAL; - switch 
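The dup_data path above coalesces caller-supplied iovecs into freshly allocated buffers of at most one page, advancing the source iovecs as they are consumed so the caller can see exactly how far a partial send got. A userspace sketch of that coalescing step (PAGE_SZ and the helper are illustrative):

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/uio.h>

static void *coalesce(struct iovec **piov, size_t chunk)
{
        char *buf = malloc(chunk), *p = buf;
        struct iovec *iov = *piov;

        while (chunk > 0) {
                size_t tmp = chunk < iov->iov_len ? chunk : iov->iov_len;

                memcpy(p, iov->iov_base, tmp);
                p += tmp;
                iov->iov_base = (char *)iov->iov_base + tmp;
                iov->iov_len -= tmp;
                if (!iov->iov_len)
                        iov++;          /* this source vector is used up */
                chunk -= tmp;
        }
        *piov = iov;                    /* report consumption to the caller */
        return buf;
}

int main(void)
{
        char a[] = "hello ", b[] = "world";
        struct iovec iov[2] = {
                { a, sizeof(a) - 1 }, { b, sizeof(b) }  /* include the NUL */
        };
        struct iovec *cursor = iov;
        char *buf = coalesce(&cursor, sizeof(a) - 1 + sizeof(b));

        printf("%s\n", buf);    /* hello world */
        free(buf);
        return 0;
}

The alternative branch in the code above skips the copy entirely and attaches the caller's iovecs to the message directly, trading the copy for the requirement that the caller keep the buffers alive until the packet is ACK'd.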
(call->app_call_state) { - case RXRPC_CSTATE_SRVR_SND_REPLY: - if (msg->hdr.flags & RXRPC_LAST_PACKET) { - call->app_call_state = - RXRPC_CSTATE_SRVR_RCV_FINAL_ACK; - _state(call); - } - break; - - case RXRPC_CSTATE_CLNT_SND_ARGS: - if (msg->hdr.flags & RXRPC_LAST_PACKET) { - call->app_call_state = - RXRPC_CSTATE_CLNT_RCV_REPLY; - _state(call); - } - break; - - case RXRPC_CSTATE_ERROR: - ret = call->app_errno; - default: - spin_unlock(&call->lock); - goto out; - } - - call->acks_pend_cnt++; - - mod_timer(&call->acks_timeout, - __rxrpc_rtt_based_timeout(call, - rxrpc_call_acks_timeout)); - - spin_unlock(&call->lock); - - ret = rxrpc_conn_sendmsg(call->conn, msg); - if (ret == 0) - call->pkt_snd_count++; - } - - out: - rxrpc_put_call(call); - - _leave(" = %d", ret); - return ret; - -} /* end rxrpc_call_flush() */ - -/*****************************************************************************/ -/* - * resend NAK'd or unacknowledged packets up to the highest one specified - */ -static void rxrpc_call_resend(struct rxrpc_call *call, rxrpc_seq_t highest) -{ - struct rxrpc_message *msg; - struct list_head *_p; - rxrpc_seq_t seq = 0; - - _enter("%p,%u", call, highest); - - _proto("Rx Resend required"); - - /* handle too many resends */ - if (call->snd_resend_cnt >= rxrpc_call_max_resend) { - _debug("Aborting due to too many resends (rcv=%d)", - call->pkt_rcv_count); - rxrpc_call_abort(call, - call->pkt_rcv_count > 0 ? -EIO : -ETIMEDOUT); - _leave(""); - return; - } - - spin_lock(&call->lock); - call->snd_resend_cnt++; - for (;;) { - /* determine which the next packet we might need to ACK is */ - if (seq <= call->acks_dftv_seq) - seq = call->acks_dftv_seq; - seq++; - - if (seq > highest) - break; - - /* look for the packet in the pending-ACK queue */ - list_for_each(_p, &call->acks_pendq) { - msg = list_entry(_p, struct rxrpc_message, link); - if (msg->seq == seq) - goto found_msg; - } - - panic("%s(%p,%d):" - " Inconsistent pending-ACK queue (ds=%u sc=%u sq=%u)\n", - __FUNCTION__, call, highest, - call->acks_dftv_seq, call->snd_seq_count, seq); - - found_msg: - if (msg->state != RXRPC_MSG_SENT) - continue; /* only un-ACK'd packets */ - - rxrpc_get_message(msg); - spin_unlock(&call->lock); - - /* send each message again (and ignore any errors we might - * incur) */ - _proto("Resending DATA message { ds=%Zu dc=%u df=%02lu }", - msg->dsize, msg->dcount, msg->dfree); - - if (rxrpc_conn_sendmsg(call->conn, msg) == 0) - call->pkt_snd_count++; - - rxrpc_put_message(msg); - - spin_lock(&call->lock); - } - - /* reset the timeout */ - mod_timer(&call->acks_timeout, - __rxrpc_rtt_based_timeout(call, rxrpc_call_acks_timeout)); - - spin_unlock(&call->lock); - - _leave(""); -} /* end rxrpc_call_resend() */ - -/*****************************************************************************/ -/* - * handle an ICMP error being applied to a call - */ -void rxrpc_call_handle_error(struct rxrpc_call *call, int local, int errno) -{ - _enter("%p{%u},%d", call, ntohl(call->call_id), errno); - - /* if this call is already aborted, then just wake up any waiters */ - if (call->app_call_state == RXRPC_CSTATE_ERROR) { - call->app_error_func(call); - } - else { - /* tell the app layer what happened */ - spin_lock(&call->lock); - call->app_call_state = RXRPC_CSTATE_ERROR; - _state(call); - if (local) - call->app_err_state = RXRPC_ESTATE_LOCAL_ERROR; - else - call->app_err_state = RXRPC_ESTATE_REMOTE_ERROR; - call->app_errno = errno; - call->app_mark = RXRPC_APP_MARK_EOF; - call->app_read_buf = NULL; - call->app_async_read = 0; 
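rxrpc_call_resend() above has to call a routine that may block while logically iterating a spinlock-protected queue, so each candidate is pinned with a reference, the lock is dropped for the send, and the scan resumes by sequence number rather than by a now-possibly-stale list cursor. A distilled sketch of one iteration under those assumptions, using the source's own helpers:

/* one iteration of the resend scan: msg was located on call->acks_pendq
 * under call->lock and is still in RXRPC_MSG_SENT state (ACK'd packets
 * are skipped) */
rxrpc_get_message(msg);         /* pin: an ACK arriving while the lock is
                                 * dropped must not free it under us */
spin_unlock(&call->lock);

/* the send may block; errors are deliberately ignored - the ACK
 * timeout will trigger another resend pass if needed */
if (rxrpc_conn_sendmsg(call->conn, msg) == 0)
        call->pkt_snd_count++;

rxrpc_put_message(msg);
spin_lock(&call->lock);
/* the list may have changed while unlocked: the outer loop rescans
 * from the next sequence number, not from a saved list pointer */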
- - /* map the error */ - call->app_aemap_func(call); - - del_timer_sync(&call->acks_timeout); - del_timer_sync(&call->rcv_timeout); - del_timer_sync(&call->ackr_dfr_timo); - - spin_unlock(&call->lock); - - call->app_error_func(call); - } - - _leave(""); -} /* end rxrpc_call_handle_error() */ diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c deleted file mode 100644 index a7c929a9fdc..00000000000 --- a/net/rxrpc/connection.c +++ /dev/null @@ -1,777 +0,0 @@ -/* connection.c: Rx connection routines - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include <linux/udp.h> -#include <linux/ip.h> -#include <net/sock.h> -#include <asm/uaccess.h> -#include "internal.h" - -__RXACCT_DECL(atomic_t rxrpc_connection_count); - -LIST_HEAD(rxrpc_conns); -DECLARE_RWSEM(rxrpc_conns_sem); -unsigned long rxrpc_conn_timeout = 60 * 60; - -static void rxrpc_conn_do_timeout(struct rxrpc_connection *conn); - -static void __rxrpc_conn_timeout(rxrpc_timer_t *timer) -{ - struct rxrpc_connection *conn = - list_entry(timer, struct rxrpc_connection, timeout); - - _debug("Rx CONN TIMEOUT [%p{u=%d}]", conn, atomic_read(&conn->usage)); - - rxrpc_conn_do_timeout(conn); -} - -static const struct rxrpc_timer_ops rxrpc_conn_timer_ops = { - .timed_out = __rxrpc_conn_timeout, -}; - -/*****************************************************************************/ -/* - * create a new connection record - */ -static inline int __rxrpc_create_connection(struct rxrpc_peer *peer, - struct rxrpc_connection **_conn) -{ - struct rxrpc_connection *conn; - - _enter("%p",peer); - - /* allocate and initialise a connection record */ - conn = kzalloc(sizeof(struct rxrpc_connection), GFP_KERNEL); - if (!conn) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - atomic_set(&conn->usage, 1); - - INIT_LIST_HEAD(&conn->link); - INIT_LIST_HEAD(&conn->id_link); - init_waitqueue_head(&conn->chanwait); - spin_lock_init(&conn->lock); - rxrpc_timer_init(&conn->timeout, &rxrpc_conn_timer_ops); - - do_gettimeofday(&conn->atime); - conn->mtu_size = 1024; - conn->peer = peer; - conn->trans = peer->trans; - - __RXACCT(atomic_inc(&rxrpc_connection_count)); - *_conn = conn; - _leave(" = 0 (%p)", conn); - - return 0; -} /* end __rxrpc_create_connection() */ - -/*****************************************************************************/ -/* - * create a new connection record for outgoing connections - */ -int rxrpc_create_connection(struct rxrpc_transport *trans, - __be16 port, - __be32 addr, - uint16_t service_id, - void *security, - struct rxrpc_connection **_conn) -{ - struct rxrpc_connection *candidate, *conn; - struct rxrpc_peer *peer; - struct list_head *_p; - __be32 connid; - int ret; - - _enter("%p{%hu},%u,%hu", trans, trans->port, ntohs(port), service_id); - - /* get a peer record */ - ret = rxrpc_peer_lookup(trans, addr, &peer); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - /* allocate and initialise a connection record */ - ret = __rxrpc_create_connection(peer, 
&candidate); - if (ret < 0) { - rxrpc_put_peer(peer); - _leave(" = %d", ret); - return ret; - } - - /* fill in the specific bits */ - candidate->addr.sin_family = AF_INET; - candidate->addr.sin_port = port; - candidate->addr.sin_addr.s_addr = addr; - - candidate->in_epoch = rxrpc_epoch; - candidate->out_epoch = rxrpc_epoch; - candidate->in_clientflag = 0; - candidate->out_clientflag = RXRPC_CLIENT_INITIATED; - candidate->service_id = htons(service_id); - - /* invent a unique connection ID */ - write_lock(&peer->conn_idlock); - - try_next_id: - connid = htonl(peer->conn_idcounter & RXRPC_CIDMASK); - peer->conn_idcounter += RXRPC_MAXCALLS; - - list_for_each(_p, &peer->conn_idlist) { - conn = list_entry(_p, struct rxrpc_connection, id_link); - if (connid == conn->conn_id) - goto try_next_id; - if (connid > conn->conn_id) - break; - } - - _debug("selected candidate conn ID %x.%u", - ntohl(peer->addr.s_addr), ntohl(connid)); - - candidate->conn_id = connid; - list_add_tail(&candidate->id_link, _p); - - write_unlock(&peer->conn_idlock); - - /* attach to peer */ - candidate->peer = peer; - - write_lock(&peer->conn_lock); - - /* search the peer's transport graveyard list */ - spin_lock(&peer->conn_gylock); - list_for_each(_p, &peer->conn_graveyard) { - conn = list_entry(_p, struct rxrpc_connection, link); - if (conn->addr.sin_port == candidate->addr.sin_port && - conn->security_ix == candidate->security_ix && - conn->service_id == candidate->service_id && - conn->in_clientflag == 0) - goto found_in_graveyard; - } - spin_unlock(&peer->conn_gylock); - - /* pick the new candidate */ - _debug("created connection: {%08x} [out]", ntohl(candidate->conn_id)); - atomic_inc(&peer->conn_count); - conn = candidate; - candidate = NULL; - - make_active: - list_add_tail(&conn->link, &peer->conn_active); - write_unlock(&peer->conn_lock); - - if (candidate) { - write_lock(&peer->conn_idlock); - list_del(&candidate->id_link); - write_unlock(&peer->conn_idlock); - - __RXACCT(atomic_dec(&rxrpc_connection_count)); - kfree(candidate); - } - else { - down_write(&rxrpc_conns_sem); - list_add_tail(&conn->proc_link, &rxrpc_conns); - up_write(&rxrpc_conns_sem); - } - - *_conn = conn; - _leave(" = 0 (%p)", conn); - - return 0; - - /* handle resurrecting a connection from the graveyard */ - found_in_graveyard: - _debug("resurrecting connection: {%08x} [out]", ntohl(conn->conn_id)); - rxrpc_get_connection(conn); - rxrpc_krxtimod_del_timer(&conn->timeout); - list_del_init(&conn->link); - spin_unlock(&peer->conn_gylock); - goto make_active; -} /* end rxrpc_create_connection() */ - -/*****************************************************************************/ -/* - * lookup the connection for an incoming packet - * - create a new connection record for unrecorded incoming connections - */ -int rxrpc_connection_lookup(struct rxrpc_peer *peer, - struct rxrpc_message *msg, - struct rxrpc_connection **_conn) -{ - struct rxrpc_connection *conn, *candidate = NULL; - struct list_head *_p; - struct sk_buff *pkt = msg->pkt; - int ret, fresh = 0; - __be32 x_epoch, x_connid; - __be16 x_port, x_servid; - __u32 x_secix; - u8 x_clflag; - - _enter("%p{{%hu}},%u,%hu", - peer, - peer->trans->port, - ntohs(pkt->h.uh->source), - ntohs(msg->hdr.serviceId)); - - x_port = pkt->h.uh->source; - x_epoch = msg->hdr.epoch; - x_clflag = msg->hdr.flags & RXRPC_CLIENT_INITIATED; - x_connid = htonl(ntohl(msg->hdr.cid) & RXRPC_CIDMASK); - x_servid = msg->hdr.serviceId; - x_secix = msg->hdr.securityIndex; - - /* [common case] search the transport's active list 
first */ - read_lock(&peer->conn_lock); - list_for_each(_p, &peer->conn_active) { - conn = list_entry(_p, struct rxrpc_connection, link); - if (conn->addr.sin_port == x_port && - conn->in_epoch == x_epoch && - conn->conn_id == x_connid && - conn->security_ix == x_secix && - conn->service_id == x_servid && - conn->in_clientflag == x_clflag) - goto found_active; - } - read_unlock(&peer->conn_lock); - - /* [uncommon case] not active - * - create a candidate for a new record if an inbound connection - * - only examine the graveyard for an outbound connection - */ - if (x_clflag) { - ret = __rxrpc_create_connection(peer, &candidate); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - /* fill in the specifics */ - candidate->addr.sin_family = AF_INET; - candidate->addr.sin_port = x_port; - candidate->addr.sin_addr.s_addr = pkt->nh.iph->saddr; - candidate->in_epoch = x_epoch; - candidate->out_epoch = x_epoch; - candidate->in_clientflag = RXRPC_CLIENT_INITIATED; - candidate->out_clientflag = 0; - candidate->conn_id = x_connid; - candidate->service_id = x_servid; - candidate->security_ix = x_secix; - } - - /* search the active list again, just in case it appeared whilst we - * were busy */ - write_lock(&peer->conn_lock); - list_for_each(_p, &peer->conn_active) { - conn = list_entry(_p, struct rxrpc_connection, link); - if (conn->addr.sin_port == x_port && - conn->in_epoch == x_epoch && - conn->conn_id == x_connid && - conn->security_ix == x_secix && - conn->service_id == x_servid && - conn->in_clientflag == x_clflag) - goto found_active_second_chance; - } - - /* search the transport's graveyard list */ - spin_lock(&peer->conn_gylock); - list_for_each(_p, &peer->conn_graveyard) { - conn = list_entry(_p, struct rxrpc_connection, link); - if (conn->addr.sin_port == x_port && - conn->in_epoch == x_epoch && - conn->conn_id == x_connid && - conn->security_ix == x_secix && - conn->service_id == x_servid && - conn->in_clientflag == x_clflag) - goto found_in_graveyard; - } - spin_unlock(&peer->conn_gylock); - - /* outbound connections aren't created here */ - if (!x_clflag) { - write_unlock(&peer->conn_lock); - _leave(" = -ENOENT"); - return -ENOENT; - } - - /* we can now add the new candidate to the list */ - _debug("created connection: {%08x} [in]", ntohl(candidate->conn_id)); - rxrpc_get_peer(peer); - conn = candidate; - candidate = NULL; - atomic_inc(&peer->conn_count); - fresh = 1; - - make_active: - list_add_tail(&conn->link, &peer->conn_active); - - success_uwfree: - write_unlock(&peer->conn_lock); - - if (candidate) { - write_lock(&peer->conn_idlock); - list_del(&candidate->id_link); - write_unlock(&peer->conn_idlock); - - __RXACCT(atomic_dec(&rxrpc_connection_count)); - kfree(candidate); - } - - if (fresh) { - down_write(&rxrpc_conns_sem); - list_add_tail(&conn->proc_link, &rxrpc_conns); - up_write(&rxrpc_conns_sem); - } - - success: - *_conn = conn; - _leave(" = 0 (%p)", conn); - return 0; - - /* handle the connection being found in the active list straight off */ - found_active: - rxrpc_get_connection(conn); - read_unlock(&peer->conn_lock); - goto success; - - /* handle resurrecting a connection from the graveyard */ - found_in_graveyard: - _debug("resurrecting connection: {%08x} [in]", ntohl(conn->conn_id)); - rxrpc_get_peer(peer); - rxrpc_get_connection(conn); - rxrpc_krxtimod_del_timer(&conn->timeout); - list_del_init(&conn->link); - spin_unlock(&peer->conn_gylock); - goto make_active; - - /* handle finding the connection on the second time through the active - * list */ - 
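The lookup above is an optimistic double-checked pattern: the common case is satisfied under the cheap read lock; only on a miss is a candidate allocated (with no locks held, so the allocation may sleep), and the list is searched a second time under the write lock because another CPU may have inserted the same connection in the window between the two searches. A hedged sketch of the shape; find_conn() and alloc_conn() are hypothetical helpers standing in for the list walks and __rxrpc_create_connection():

read_lock(&peer->conn_lock);
conn = find_conn(peer, key);            /* hypothetical list-walk helper */
read_unlock(&peer->conn_lock);
if (conn)
        return conn;

candidate = alloc_conn(key);            /* may sleep: no locks held here */

write_lock(&peer->conn_lock);
conn = find_conn(peer, key);            /* second chance: did we race? */
if (!conn) {
        list_add_tail(&candidate->link, &peer->conn_active);
        conn = candidate;
        candidate = NULL;
}
write_unlock(&peer->conn_lock);

kfree(candidate);                       /* NULL-safe: frees only the loser */
return conn;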
found_active_second_chance: - rxrpc_get_connection(conn); - goto success_uwfree; - -} /* end rxrpc_connection_lookup() */ - -/*****************************************************************************/ -/* - * finish using a connection record - * - it will be transferred to the peer's connection graveyard when refcount - * reaches 0 - */ -void rxrpc_put_connection(struct rxrpc_connection *conn) -{ - struct rxrpc_peer *peer; - - if (!conn) - return; - - _enter("%p{u=%d p=%hu}", - conn, atomic_read(&conn->usage), ntohs(conn->addr.sin_port)); - - peer = conn->peer; - spin_lock(&peer->conn_gylock); - - /* sanity check */ - if (atomic_read(&conn->usage) <= 0) - BUG(); - - if (likely(!atomic_dec_and_test(&conn->usage))) { - spin_unlock(&peer->conn_gylock); - _leave(""); - return; - } - - /* move to graveyard queue */ - _debug("burying connection: {%08x}", ntohl(conn->conn_id)); - list_move_tail(&conn->link, &peer->conn_graveyard); - - rxrpc_krxtimod_add_timer(&conn->timeout, rxrpc_conn_timeout * HZ); - - spin_unlock(&peer->conn_gylock); - - rxrpc_put_peer(conn->peer); - - _leave(" [killed]"); -} /* end rxrpc_put_connection() */ - -/*****************************************************************************/ -/* - * free a connection record - */ -static void rxrpc_conn_do_timeout(struct rxrpc_connection *conn) -{ - struct rxrpc_peer *peer; - - _enter("%p{u=%d p=%hu}", - conn, atomic_read(&conn->usage), ntohs(conn->addr.sin_port)); - - peer = conn->peer; - - if (atomic_read(&conn->usage) < 0) - BUG(); - - /* remove from graveyard if still dead */ - spin_lock(&peer->conn_gylock); - if (atomic_read(&conn->usage) == 0) { - list_del_init(&conn->link); - } - else { - conn = NULL; - } - spin_unlock(&peer->conn_gylock); - - if (!conn) { - _leave(""); - return; /* resurrected */ - } - - _debug("--- Destroying Connection %p{%08x} ---", - conn, ntohl(conn->conn_id)); - - down_write(&rxrpc_conns_sem); - list_del(&conn->proc_link); - up_write(&rxrpc_conns_sem); - - write_lock(&peer->conn_idlock); - list_del(&conn->id_link); - write_unlock(&peer->conn_idlock); - - __RXACCT(atomic_dec(&rxrpc_connection_count)); - kfree(conn); - - /* if the graveyard is now empty, wake up anyone waiting for that */ - if (atomic_dec_and_test(&peer->conn_count)) - wake_up(&peer->conn_gy_waitq); - - _leave(" [destroyed]"); -} /* end rxrpc_conn_do_timeout() */ - -/*****************************************************************************/ -/* - * clear all connection records from a peer endpoint - */ -void rxrpc_conn_clearall(struct rxrpc_peer *peer) -{ - DECLARE_WAITQUEUE(myself, current); - - struct rxrpc_connection *conn; - int err; - - _enter("%p", peer); - - /* there shouldn't be any active conns remaining */ - if (!list_empty(&peer->conn_active)) - BUG(); - - /* manually timeout all conns in the graveyard */ - spin_lock(&peer->conn_gylock); - while (!list_empty(&peer->conn_graveyard)) { - conn = list_entry(peer->conn_graveyard.next, - struct rxrpc_connection, link); - err = rxrpc_krxtimod_del_timer(&conn->timeout); - spin_unlock(&peer->conn_gylock); - - if (err == 0) - rxrpc_conn_do_timeout(conn); - - spin_lock(&peer->conn_gylock); - } - spin_unlock(&peer->conn_gylock); - - /* wait for the the conn graveyard to be completely cleared */ - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&peer->conn_gy_waitq, &myself); - - while (atomic_read(&peer->conn_count) != 0) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - remove_wait_queue(&peer->conn_gy_waitq, &myself); - 
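rxrpc_put_connection() and rxrpc_conn_do_timeout() above implement a park-and-resurrect scheme: dropping the last reference does not free the connection but moves it to a graveyard with a reap timer, so an immediately following lookup can resurrect it cheaply; only if the timer fires while the usage count is still zero is the object actually destroyed. A hedged sketch of the two halves (struct conn, the graveyard list/lock and start_reap_timer() are illustrative stand-ins):

void put_conn(struct conn *conn)
{
        spin_lock(&graveyard_lock);
        if (atomic_dec_and_test(&conn->usage)) {
                /* last ref: park, don't free - a lookup may want it back */
                list_move_tail(&conn->link, &graveyard);
                start_reap_timer(conn);         /* hypothetical helper */
        }
        spin_unlock(&graveyard_lock);
}

void reap_timer_fired(struct conn *conn)
{
        spin_lock(&graveyard_lock);
        if (atomic_read(&conn->usage) > 0) {
                spin_unlock(&graveyard_lock);
                return;                 /* resurrected while buried */
        }
        list_del_init(&conn->link);
        spin_unlock(&graveyard_lock);
        kfree(conn);                    /* really dead now */
}

Doing the usage check and the graveyard manipulation under the same lock is what makes resurrection race-free against the reaper.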
set_current_state(TASK_RUNNING); - - _leave(""); -} /* end rxrpc_conn_clearall() */ - -/*****************************************************************************/ -/* - * allocate and prepare a message for sending out through the transport - * endpoint - */ -int rxrpc_conn_newmsg(struct rxrpc_connection *conn, - struct rxrpc_call *call, - uint8_t type, - int dcount, - struct kvec diov[], - gfp_t alloc_flags, - struct rxrpc_message **_msg) -{ - struct rxrpc_message *msg; - int loop; - - _enter("%p{%d},%p,%u", conn, ntohs(conn->addr.sin_port), call, type); - - if (dcount > 3) { - _leave(" = -EINVAL"); - return -EINVAL; - } - - msg = kzalloc(sizeof(struct rxrpc_message), alloc_flags); - if (!msg) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - atomic_set(&msg->usage, 1); - - INIT_LIST_HEAD(&msg->link); - - msg->state = RXRPC_MSG_PREPARED; - - msg->hdr.epoch = conn->out_epoch; - msg->hdr.cid = conn->conn_id | (call ? call->chan_ix : 0); - msg->hdr.callNumber = call ? call->call_id : 0; - msg->hdr.type = type; - msg->hdr.flags = conn->out_clientflag; - msg->hdr.securityIndex = conn->security_ix; - msg->hdr.serviceId = conn->service_id; - - /* generate sequence numbers for data packets */ - if (call) { - switch (type) { - case RXRPC_PACKET_TYPE_DATA: - msg->seq = ++call->snd_seq_count; - msg->hdr.seq = htonl(msg->seq); - break; - case RXRPC_PACKET_TYPE_ACK: - /* ACK sequence numbers are complicated. The following - * may be wrong: - * - jumbo packet ACKs should have a seq number - * - normal ACKs should not - */ - default: - break; - } - } - - msg->dcount = dcount + 1; - msg->dsize = sizeof(msg->hdr); - msg->data[0].iov_len = sizeof(msg->hdr); - msg->data[0].iov_base = &msg->hdr; - - for (loop=0; loop < dcount; loop++) { - msg->dsize += diov[loop].iov_len; - msg->data[loop+1].iov_len = diov[loop].iov_len; - msg->data[loop+1].iov_base = diov[loop].iov_base; - } - - __RXACCT(atomic_inc(&rxrpc_message_count)); - *_msg = msg; - _leave(" = 0 (%p) #%d", msg, atomic_read(&rxrpc_message_count)); - return 0; -} /* end rxrpc_conn_newmsg() */ - -/*****************************************************************************/ -/* - * free a message - */ -void __rxrpc_put_message(struct rxrpc_message *msg) -{ - int loop; - - _enter("%p #%d", msg, atomic_read(&rxrpc_message_count)); - - if (msg->pkt) - kfree_skb(msg->pkt); - rxrpc_put_connection(msg->conn); - - for (loop = 0; loop < 8; loop++) - if (test_bit(loop, &msg->dfree)) - kfree(msg->data[loop].iov_base); - - __RXACCT(atomic_dec(&rxrpc_message_count)); - kfree(msg); - - _leave(""); -} /* end __rxrpc_put_message() */ - -/*****************************************************************************/ -/* - * send a message out through the transport endpoint - */ -int rxrpc_conn_sendmsg(struct rxrpc_connection *conn, - struct rxrpc_message *msg) -{ - struct msghdr msghdr; - int ret; - - _enter("%p{%d}", conn, ntohs(conn->addr.sin_port)); - - /* fill in some fields in the header */ - spin_lock(&conn->lock); - msg->hdr.serial = htonl(++conn->serial_counter); - msg->rttdone = 0; - spin_unlock(&conn->lock); - - /* set up the message to be transmitted */ - msghdr.msg_name = &conn->addr; - msghdr.msg_namelen = sizeof(conn->addr); - msghdr.msg_control = NULL; - msghdr.msg_controllen = 0; - msghdr.msg_flags = MSG_CONFIRM | MSG_DONTWAIT; - - _net("Sending message type %d of %Zd bytes to %08x:%d", - msg->hdr.type, - msg->dsize, - ntohl(conn->addr.sin_addr.s_addr), - ntohs(conn->addr.sin_port)); - - /* send the message */ - ret = 
kernel_sendmsg(conn->trans->socket, &msghdr, - msg->data, msg->dcount, msg->dsize); - if (ret < 0) { - msg->state = RXRPC_MSG_ERROR; - } else { - msg->state = RXRPC_MSG_SENT; - ret = 0; - - spin_lock(&conn->lock); - do_gettimeofday(&conn->atime); - msg->stamp = conn->atime; - spin_unlock(&conn->lock); - } - - _leave(" = %d", ret); - - return ret; -} /* end rxrpc_conn_sendmsg() */ - -/*****************************************************************************/ -/* - * deal with a subsequent call packet - */ -int rxrpc_conn_receive_call_packet(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct rxrpc_message *msg) -{ - struct rxrpc_message *pmsg; - struct dst_entry *dst; - struct list_head *_p; - unsigned cix, seq; - int ret = 0; - - _enter("%p,%p,%p", conn, call, msg); - - if (!call) { - cix = ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK; - - spin_lock(&conn->lock); - call = conn->channels[cix]; - - if (!call || call->call_id != msg->hdr.callNumber) { - spin_unlock(&conn->lock); - rxrpc_trans_immediate_abort(conn->trans, msg, -ENOENT); - goto out; - } - else { - rxrpc_get_call(call); - spin_unlock(&conn->lock); - } - } - else { - rxrpc_get_call(call); - } - - _proto("Received packet %%%u [%u] on call %hu:%u:%u", - ntohl(msg->hdr.serial), - ntohl(msg->hdr.seq), - ntohs(msg->hdr.serviceId), - ntohl(conn->conn_id), - ntohl(call->call_id)); - - call->pkt_rcv_count++; - - dst = msg->pkt->dst; - if (dst && dst->dev) - conn->peer->if_mtu = - dst->dev->mtu - dst->dev->hard_header_len; - - /* queue on the call in seq order */ - rxrpc_get_message(msg); - seq = msg->seq; - - spin_lock(&call->lock); - list_for_each(_p, &call->rcv_receiveq) { - pmsg = list_entry(_p, struct rxrpc_message, link); - if (pmsg->seq > seq) - break; - } - list_add_tail(&msg->link, _p); - - /* reset the activity timeout */ - call->flags |= RXRPC_CALL_RCV_PKT; - mod_timer(&call->rcv_timeout,jiffies + rxrpc_call_rcv_timeout * HZ); - - spin_unlock(&call->lock); - - rxrpc_krxiod_queue_call(call); - - rxrpc_put_call(call); - out: - _leave(" = %d", ret); - return ret; -} /* end rxrpc_conn_receive_call_packet() */ - -/*****************************************************************************/ -/* - * handle an ICMP error being applied to a connection - */ -void rxrpc_conn_handle_error(struct rxrpc_connection *conn, - int local, int errno) -{ - struct rxrpc_call *calls[4]; - int loop; - - _enter("%p{%d},%d", conn, ntohs(conn->addr.sin_port), errno); - - /* get a ref to all my calls in one go */ - memset(calls, 0, sizeof(calls)); - spin_lock(&conn->lock); - - for (loop = 3; loop >= 0; loop--) { - if (conn->channels[loop]) { - calls[loop] = conn->channels[loop]; - rxrpc_get_call(calls[loop]); - } - } - - spin_unlock(&conn->lock); - - /* now kick them all */ - for (loop = 3; loop >= 0; loop--) { - if (calls[loop]) { - rxrpc_call_handle_error(calls[loop], local, errno); - rxrpc_put_call(calls[loop]); - } - } - - _leave(""); -} /* end rxrpc_conn_handle_error() */ diff --git a/net/rxrpc/internal.h b/net/rxrpc/internal.h deleted file mode 100644 index cc0c5795a10..00000000000 --- a/net/rxrpc/internal.h +++ /dev/null @@ -1,106 +0,0 @@ -/* internal.h: internal Rx RPC stuff - * - * Copyright (c) 2002 David Howells (dhowells@redhat.com). 
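The send path above is the standard pattern for transmitting from kernel space over a UDP transport socket: a kvec array whose slot 0 is the wire header, a msghdr whose msg_name carries the per-datagram destination, and kernel_sendmsg(), which takes the vector count and total byte length explicitly and needs no user-space address games. A hedged sketch of the same idiom as a free-standing helper (the function and its parameters are illustrative):

static int example_send(struct socket *sock, struct sockaddr_in *dest,
                        struct rxrpc_header *hdr, void *payload, size_t plen)
{
        struct kvec iov[2];
        struct msghdr msghdr = {
                .msg_name       = dest,         /* UDP: dest per datagram */
                .msg_namelen    = sizeof(*dest),
                .msg_flags      = MSG_DONTWAIT,
        };

        iov[0].iov_base = hdr;                  /* slot 0: the wire header */
        iov[0].iov_len  = sizeof(*hdr);
        iov[1].iov_base = payload;
        iov[1].iov_len  = plen;

        /* kernel_sendmsg() accepts kernel-space iovecs directly */
        return kernel_sendmsg(sock, &msghdr, iov, 2, sizeof(*hdr) + plen);
}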
- */ - -#ifndef RXRPC_INTERNAL_H -#define RXRPC_INTERNAL_H - -#include <linux/compiler.h> -#include <linux/kernel.h> - -/* - * debug accounting - */ -#if 1 -#define __RXACCT_DECL(X) X -#define __RXACCT(X) do { X; } while(0) -#else -#define __RXACCT_DECL(X) -#define __RXACCT(X) do { } while(0) -#endif - -__RXACCT_DECL(extern atomic_t rxrpc_transport_count); -__RXACCT_DECL(extern atomic_t rxrpc_peer_count); -__RXACCT_DECL(extern atomic_t rxrpc_connection_count); -__RXACCT_DECL(extern atomic_t rxrpc_call_count); -__RXACCT_DECL(extern atomic_t rxrpc_message_count); - -/* - * debug tracing - */ -#define kenter(FMT, a...) printk("==> %s("FMT")\n",__FUNCTION__ , ##a) -#define kleave(FMT, a...) printk("<== %s()"FMT"\n",__FUNCTION__ , ##a) -#define kdebug(FMT, a...) printk(" "FMT"\n" , ##a) -#define kproto(FMT, a...) printk("### "FMT"\n" , ##a) -#define knet(FMT, a...) printk(" "FMT"\n" , ##a) - -#if 0 -#define _enter(FMT, a...) kenter(FMT , ##a) -#define _leave(FMT, a...) kleave(FMT , ##a) -#define _debug(FMT, a...) kdebug(FMT , ##a) -#define _proto(FMT, a...) kproto(FMT , ##a) -#define _net(FMT, a...) knet(FMT , ##a) -#else -#define _enter(FMT, a...) do { if (rxrpc_ktrace) kenter(FMT , ##a); } while(0) -#define _leave(FMT, a...) do { if (rxrpc_ktrace) kleave(FMT , ##a); } while(0) -#define _debug(FMT, a...) do { if (rxrpc_kdebug) kdebug(FMT , ##a); } while(0) -#define _proto(FMT, a...) do { if (rxrpc_kproto) kproto(FMT , ##a); } while(0) -#define _net(FMT, a...) do { if (rxrpc_knet) knet (FMT , ##a); } while(0) -#endif - -static inline void rxrpc_discard_my_signals(void) -{ - while (signal_pending(current)) { - siginfo_t sinfo; - - spin_lock_irq(¤t->sighand->siglock); - dequeue_signal(current, ¤t->blocked, &sinfo); - spin_unlock_irq(¤t->sighand->siglock); - } -} - -/* - * call.c - */ -extern struct list_head rxrpc_calls; -extern struct rw_semaphore rxrpc_calls_sem; - -/* - * connection.c - */ -extern struct list_head rxrpc_conns; -extern struct rw_semaphore rxrpc_conns_sem; -extern unsigned long rxrpc_conn_timeout; - -extern void rxrpc_conn_clearall(struct rxrpc_peer *peer); - -/* - * peer.c - */ -extern struct list_head rxrpc_peers; -extern struct rw_semaphore rxrpc_peers_sem; -extern unsigned long rxrpc_peer_timeout; - -extern void rxrpc_peer_calculate_rtt(struct rxrpc_peer *peer, - struct rxrpc_message *msg, - struct rxrpc_message *resp); - -extern void rxrpc_peer_clearall(struct rxrpc_transport *trans); - - -/* - * proc.c - */ -#ifdef CONFIG_PROC_FS -extern int rxrpc_proc_init(void); -extern void rxrpc_proc_cleanup(void); -#endif - -/* - * transport.c - */ -extern struct list_head rxrpc_proc_transports; -extern struct rw_semaphore rxrpc_proc_transports_sem; - -#endif /* RXRPC_INTERNAL_H */ diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c deleted file mode 100644 index bbbcd6c2404..00000000000 --- a/net/rxrpc/krxiod.c +++ /dev/null @@ -1,262 +0,0 @@ -/* krxiod.c: Rx I/O daemon - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
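internal.h above shows a two-level trace-macro pattern: the k*() macros always print, while the _*() wrappers either alias them directly (compile-time debugging) or gate them on a runtime flag, selected by flipping a single #if. A minimal userspace sketch of the same arrangement (variadic-macro ## syntax as accepted by GCC/Clang; the runtime knob is a stand-in for rxrpc_ktrace):

#include <stdio.h>

static int my_ktrace = 1;               /* runtime knob */

#define kenter(FMT, ...) printf("==> %s(" FMT ")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) printf("<== %s()" FMT "\n", __func__, ##__VA_ARGS__)

#if 0   /* unconditional tracing */
#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
#else   /* runtime-gated tracing */
#define _enter(FMT, ...) do { if (my_ktrace) kenter(FMT, ##__VA_ARGS__); } while (0)
#define _leave(FMT, ...) do { if (my_ktrace) kleave(FMT, ##__VA_ARGS__); } while (0)
#endif

static int demo(int x)
{
        _enter("%d", x);
        _leave(" = %d", x * 2);
        return x * 2;
}

int main(void)
{
        demo(21);
        return 0;
}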
- */ - -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/freezer.h> -#include <rxrpc/krxiod.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/call.h> -#include "internal.h" - -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_krxiod_sleepq); -static DECLARE_COMPLETION(rxrpc_krxiod_dead); - -static atomic_t rxrpc_krxiod_qcount = ATOMIC_INIT(0); - -static LIST_HEAD(rxrpc_krxiod_transportq); -static DEFINE_SPINLOCK(rxrpc_krxiod_transportq_lock); - -static LIST_HEAD(rxrpc_krxiod_callq); -static DEFINE_SPINLOCK(rxrpc_krxiod_callq_lock); - -static volatile int rxrpc_krxiod_die; - -/*****************************************************************************/ -/* - * Rx I/O daemon - */ -static int rxrpc_krxiod(void *arg) -{ - DECLARE_WAITQUEUE(krxiod,current); - - printk("Started krxiod %d\n",current->pid); - - daemonize("krxiod"); - - /* loop around waiting for work to do */ - do { - /* wait for work or to be told to exit */ - _debug("### Begin Wait"); - if (!atomic_read(&rxrpc_krxiod_qcount)) { - set_current_state(TASK_INTERRUPTIBLE); - - add_wait_queue(&rxrpc_krxiod_sleepq, &krxiod); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_read(&rxrpc_krxiod_qcount) || - rxrpc_krxiod_die || - signal_pending(current)) - break; - - schedule(); - } - - remove_wait_queue(&rxrpc_krxiod_sleepq, &krxiod); - set_current_state(TASK_RUNNING); - } - _debug("### End Wait"); - - /* do work if been given some to do */ - _debug("### Begin Work"); - - /* see if there's a transport in need of attention */ - if (!list_empty(&rxrpc_krxiod_transportq)) { - struct rxrpc_transport *trans = NULL; - - spin_lock_irq(&rxrpc_krxiod_transportq_lock); - - if (!list_empty(&rxrpc_krxiod_transportq)) { - trans = list_entry( - rxrpc_krxiod_transportq.next, - struct rxrpc_transport, - krxiodq_link); - - list_del_init(&trans->krxiodq_link); - atomic_dec(&rxrpc_krxiod_qcount); - - /* make sure it hasn't gone away and doesn't go - * away */ - if (atomic_read(&trans->usage)>0) - rxrpc_get_transport(trans); - else - trans = NULL; - } - - spin_unlock_irq(&rxrpc_krxiod_transportq_lock); - - if (trans) { - rxrpc_trans_receive_packet(trans); - rxrpc_put_transport(trans); - } - } - - /* see if there's a call in need of attention */ - if (!list_empty(&rxrpc_krxiod_callq)) { - struct rxrpc_call *call = NULL; - - spin_lock_irq(&rxrpc_krxiod_callq_lock); - - if (!list_empty(&rxrpc_krxiod_callq)) { - call = list_entry(rxrpc_krxiod_callq.next, - struct rxrpc_call, - rcv_krxiodq_lk); - list_del_init(&call->rcv_krxiodq_lk); - atomic_dec(&rxrpc_krxiod_qcount); - - /* make sure it hasn't gone away and doesn't go - * away */ - if (atomic_read(&call->usage) > 0) { - _debug("@@@ KRXIOD" - " Begin Attend Call %p", call); - rxrpc_get_call(call); - } - else { - call = NULL; - } - } - - spin_unlock_irq(&rxrpc_krxiod_callq_lock); - - if (call) { - rxrpc_call_do_stuff(call); - rxrpc_put_call(call); - _debug("@@@ KRXIOD End Attend Call %p", call); - } - } - - _debug("### End Work"); - - try_to_freeze(); - - /* discard pending signals */ - rxrpc_discard_my_signals(); - - } while (!rxrpc_krxiod_die); - - /* and that's all */ - complete_and_exit(&rxrpc_krxiod_dead, 0); - -} /* end rxrpc_krxiod() */ - -/*****************************************************************************/ -/* - * start up a krxiod daemon - */ -int __init rxrpc_krxiod_init(void) -{ - return kernel_thread(rxrpc_krxiod, NULL, 0); - -} /* end rxrpc_krxiod_init() */ - 
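The daemon above predates the kthread API: daemonize() and a raw kernel_thread() plus a volatile die flag and a completion were already legacy when this code was removed. A hedged sketch of how the same consumer loop looks on the kthread interface, where kthread_should_stop() replaces the die flag and wait_event_interruptible() packages the add_wait_queue/schedule dance; the my_* names and drain_queues() are illustrative stand-ins:

static DECLARE_WAIT_QUEUE_HEAD(my_sleepq);      /* stand-in globals */
static atomic_t my_qcount = ATOMIC_INIT(0);

static int my_iod(void *arg)
{
        while (!kthread_should_stop()) {
                wait_event_interruptible(my_sleepq,
                                         atomic_read(&my_qcount) ||
                                         kthread_should_stop());
                drain_queues();         /* hypothetical: the transport and
                                         * call work done in the loop above */
        }
        return 0;
}

/* startup:  task = kthread_run(my_iod, NULL, "krxiod");
 * shutdown: kthread_stop(task);  - wakes the thread and waits for exit,
 *           replacing the die flag + completion pair */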
-/*****************************************************************************/ -/* - * kill the krxiod daemon and wait for it to complete - */ -void rxrpc_krxiod_kill(void) -{ - rxrpc_krxiod_die = 1; - wake_up_all(&rxrpc_krxiod_sleepq); - wait_for_completion(&rxrpc_krxiod_dead); - -} /* end rxrpc_krxiod_kill() */ - -/*****************************************************************************/ -/* - * queue a transport for attention by krxiod - */ -void rxrpc_krxiod_queue_transport(struct rxrpc_transport *trans) -{ - unsigned long flags; - - _enter(""); - - if (list_empty(&trans->krxiodq_link)) { - spin_lock_irqsave(&rxrpc_krxiod_transportq_lock, flags); - - if (list_empty(&trans->krxiodq_link)) { - if (atomic_read(&trans->usage) > 0) { - list_add_tail(&trans->krxiodq_link, - &rxrpc_krxiod_transportq); - atomic_inc(&rxrpc_krxiod_qcount); - } - } - - spin_unlock_irqrestore(&rxrpc_krxiod_transportq_lock, flags); - wake_up_all(&rxrpc_krxiod_sleepq); - } - - _leave(""); - -} /* end rxrpc_krxiod_queue_transport() */ - -/*****************************************************************************/ -/* - * dequeue a transport from krxiod's attention queue - */ -void rxrpc_krxiod_dequeue_transport(struct rxrpc_transport *trans) -{ - unsigned long flags; - - _enter(""); - - spin_lock_irqsave(&rxrpc_krxiod_transportq_lock, flags); - if (!list_empty(&trans->krxiodq_link)) { - list_del_init(&trans->krxiodq_link); - atomic_dec(&rxrpc_krxiod_qcount); - } - spin_unlock_irqrestore(&rxrpc_krxiod_transportq_lock, flags); - - _leave(""); - -} /* end rxrpc_krxiod_dequeue_transport() */ - -/*****************************************************************************/ -/* - * queue a call for attention by krxiod - */ -void rxrpc_krxiod_queue_call(struct rxrpc_call *call) -{ - unsigned long flags; - - if (list_empty(&call->rcv_krxiodq_lk)) { - spin_lock_irqsave(&rxrpc_krxiod_callq_lock, flags); - if (atomic_read(&call->usage) > 0) { - list_add_tail(&call->rcv_krxiodq_lk, - &rxrpc_krxiod_callq); - atomic_inc(&rxrpc_krxiod_qcount); - } - spin_unlock_irqrestore(&rxrpc_krxiod_callq_lock, flags); - } - wake_up_all(&rxrpc_krxiod_sleepq); - -} /* end rxrpc_krxiod_queue_call() */ - -/*****************************************************************************/ -/* - * dequeue a call from krxiod's attention queue - */ -void rxrpc_krxiod_dequeue_call(struct rxrpc_call *call) -{ - unsigned long flags; - - spin_lock_irqsave(&rxrpc_krxiod_callq_lock, flags); - if (!list_empty(&call->rcv_krxiodq_lk)) { - list_del_init(&call->rcv_krxiodq_lk); - atomic_dec(&rxrpc_krxiod_qcount); - } - spin_unlock_irqrestore(&rxrpc_krxiod_callq_lock, flags); - -} /* end rxrpc_krxiod_dequeue_call() */ diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c deleted file mode 100644 index 9a1e7f5e034..00000000000 --- a/net/rxrpc/krxsecd.c +++ /dev/null @@ -1,270 +0,0 @@ -/* krxsecd.c: Rx security daemon - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * This daemon deals with: - * - consulting the application as to whether inbound peers and calls should be authorised - * - generating security challenges for inbound connections - * - responding to security challenges on outbound connections - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <rxrpc/krxsecd.h> -#include <rxrpc/transport.h> -#include <rxrpc/connection.h> -#include <rxrpc/message.h> -#include <rxrpc/peer.h> -#include <rxrpc/call.h> -#include <linux/udp.h> -#include <linux/ip.h> -#include <linux/freezer.h> -#include <net/sock.h> -#include "internal.h" - -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_krxsecd_sleepq); -static DECLARE_COMPLETION(rxrpc_krxsecd_dead); -static volatile int rxrpc_krxsecd_die; - -static atomic_t rxrpc_krxsecd_qcount; - -/* queue of unprocessed inbound messages with seqno #1 and - * RXRPC_CLIENT_INITIATED flag set */ -static LIST_HEAD(rxrpc_krxsecd_initmsgq); -static DEFINE_SPINLOCK(rxrpc_krxsecd_initmsgq_lock); - -static void rxrpc_krxsecd_process_incoming_call(struct rxrpc_message *msg); - -/*****************************************************************************/ -/* - * Rx security daemon - */ -static int rxrpc_krxsecd(void *arg) -{ - DECLARE_WAITQUEUE(krxsecd, current); - - int die; - - printk("Started krxsecd %d\n", current->pid); - - daemonize("krxsecd"); - - /* loop around waiting for work to do */ - do { - /* wait for work or to be told to exit */ - _debug("### Begin Wait"); - if (!atomic_read(&rxrpc_krxsecd_qcount)) { - set_current_state(TASK_INTERRUPTIBLE); - - add_wait_queue(&rxrpc_krxsecd_sleepq, &krxsecd); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_read(&rxrpc_krxsecd_qcount) || - rxrpc_krxsecd_die || - signal_pending(current)) - break; - - schedule(); - } - - remove_wait_queue(&rxrpc_krxsecd_sleepq, &krxsecd); - set_current_state(TASK_RUNNING); - } - die = rxrpc_krxsecd_die; - _debug("### End Wait"); - - /* see if there're incoming calls in need of authenticating */ - _debug("### Begin Inbound Calls"); - - if (!list_empty(&rxrpc_krxsecd_initmsgq)) { - struct rxrpc_message *msg = NULL; - - spin_lock(&rxrpc_krxsecd_initmsgq_lock); - - if (!list_empty(&rxrpc_krxsecd_initmsgq)) { - msg = list_entry(rxrpc_krxsecd_initmsgq.next, - struct rxrpc_message, link); - list_del_init(&msg->link); - atomic_dec(&rxrpc_krxsecd_qcount); - } - - spin_unlock(&rxrpc_krxsecd_initmsgq_lock); - - if (msg) { - rxrpc_krxsecd_process_incoming_call(msg); - rxrpc_put_message(msg); - } - } - - _debug("### End Inbound Calls"); - - try_to_freeze(); - - /* discard pending signals */ - rxrpc_discard_my_signals(); - - } while (!die); - - /* and that's all */ - complete_and_exit(&rxrpc_krxsecd_dead, 0); - -} /* end rxrpc_krxsecd() */ - -/*****************************************************************************/ -/* - * start up a krxsecd daemon - */ -int __init rxrpc_krxsecd_init(void) -{ - return kernel_thread(rxrpc_krxsecd, NULL, 0); - -} /* end rxrpc_krxsecd_init() */ - -/*****************************************************************************/ -/* - * kill the krxsecd daemon and wait for it to complete - */ -void rxrpc_krxsecd_kill(void) -{ - rxrpc_krxsecd_die = 1; - wake_up_all(&rxrpc_krxsecd_sleepq); - wait_for_completion(&rxrpc_krxsecd_dead); - -} /* end rxrpc_krxsecd_kill() */ - -/*****************************************************************************/ -/* - * clear all pending incoming calls for the specified transport 
- */ -void rxrpc_krxsecd_clear_transport(struct rxrpc_transport *trans) -{ - LIST_HEAD(tmp); - - struct rxrpc_message *msg; - struct list_head *_p, *_n; - - _enter("%p",trans); - - /* move all the messages for this transport onto a temp list */ - spin_lock(&rxrpc_krxsecd_initmsgq_lock); - - list_for_each_safe(_p, _n, &rxrpc_krxsecd_initmsgq) { - msg = list_entry(_p, struct rxrpc_message, link); - if (msg->trans == trans) { - list_move_tail(&msg->link, &tmp); - atomic_dec(&rxrpc_krxsecd_qcount); - } - } - - spin_unlock(&rxrpc_krxsecd_initmsgq_lock); - - /* zap all messages on the temp list */ - while (!list_empty(&tmp)) { - msg = list_entry(tmp.next, struct rxrpc_message, link); - list_del_init(&msg->link); - rxrpc_put_message(msg); - } - - _leave(""); -} /* end rxrpc_krxsecd_clear_transport() */ - -/*****************************************************************************/ -/* - * queue a message on the incoming calls list - */ -void rxrpc_krxsecd_queue_incoming_call(struct rxrpc_message *msg) -{ - _enter("%p", msg); - - /* queue for processing by krxsecd */ - spin_lock(&rxrpc_krxsecd_initmsgq_lock); - - if (!rxrpc_krxsecd_die) { - rxrpc_get_message(msg); - list_add_tail(&msg->link, &rxrpc_krxsecd_initmsgq); - atomic_inc(&rxrpc_krxsecd_qcount); - } - - spin_unlock(&rxrpc_krxsecd_initmsgq_lock); - - wake_up(&rxrpc_krxsecd_sleepq); - - _leave(""); -} /* end rxrpc_krxsecd_queue_incoming_call() */ - -/*****************************************************************************/ -/* - * process the initial message of an incoming call - */ -void rxrpc_krxsecd_process_incoming_call(struct rxrpc_message *msg) -{ - struct rxrpc_transport *trans = msg->trans; - struct rxrpc_service *srv; - struct rxrpc_call *call; - struct list_head *_p; - unsigned short sid; - int ret; - - _enter("%p{tr=%p}", msg, trans); - - ret = rxrpc_incoming_call(msg->conn, msg, &call); - if (ret < 0) - goto out; - - /* find the matching service on the transport */ - sid = ntohs(msg->hdr.serviceId); - srv = NULL; - - spin_lock(&trans->lock); - list_for_each(_p, &trans->services) { - srv = list_entry(_p, struct rxrpc_service, link); - if (srv->service_id == sid && try_module_get(srv->owner)) { - /* found a match (made sure it won't vanish) */ - _debug("found service '%s'", srv->name); - call->owner = srv->owner; - break; - } - } - spin_unlock(&trans->lock); - - /* report the new connection - * - the func must inc the call's usage count to keep it - */ - ret = -ENOENT; - if (_p != &trans->services) { - /* attempt to accept the call */ - call->conn->service = srv; - call->app_attn_func = srv->attn_func; - call->app_error_func = srv->error_func; - call->app_aemap_func = srv->aemap_func; - - ret = srv->new_call(call); - - /* send an abort if an error occurred */ - if (ret < 0) { - rxrpc_call_abort(call, ret); - } - else { - /* formally receive and ACK the new packet */ - ret = rxrpc_conn_receive_call_packet(call->conn, - call, msg); - } - } - - rxrpc_put_call(call); - out: - if (ret < 0) - rxrpc_trans_immediate_abort(trans, msg, ret); - - _leave(" (%d)", ret); -} /* end rxrpc_krxsecd_process_incoming_call() */ diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c deleted file mode 100644 index 9a9b6132dba..00000000000 --- a/net/rxrpc/krxtimod.c +++ /dev/null @@ -1,204 +0,0 @@ -/* krxtimod.c: RXRPC timeout daemon - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/completion.h> -#include <linux/freezer.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/krxtimod.h> -#include <asm/errno.h> -#include "internal.h" - -static DECLARE_COMPLETION(krxtimod_alive); -static DECLARE_COMPLETION(krxtimod_dead); -static DECLARE_WAIT_QUEUE_HEAD(krxtimod_sleepq); -static int krxtimod_die; - -static LIST_HEAD(krxtimod_list); -static DEFINE_SPINLOCK(krxtimod_lock); - -static int krxtimod(void *arg); - -/*****************************************************************************/ -/* - * start the timeout daemon - */ -int rxrpc_krxtimod_start(void) -{ - int ret; - - ret = kernel_thread(krxtimod, NULL, 0); - if (ret < 0) - return ret; - - wait_for_completion(&krxtimod_alive); - - return ret; -} /* end rxrpc_krxtimod_start() */ - -/*****************************************************************************/ -/* - * stop the timeout daemon - */ -void rxrpc_krxtimod_kill(void) -{ - /* get rid of my daemon */ - krxtimod_die = 1; - wake_up(&krxtimod_sleepq); - wait_for_completion(&krxtimod_dead); - -} /* end rxrpc_krxtimod_kill() */ - -/*****************************************************************************/ -/* - * timeout processing daemon - */ -static int krxtimod(void *arg) -{ - DECLARE_WAITQUEUE(myself, current); - - rxrpc_timer_t *timer; - - printk("Started krxtimod %d\n", current->pid); - - daemonize("krxtimod"); - - complete(&krxtimod_alive); - - /* loop around looking for things to attend to */ - loop: - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&krxtimod_sleepq, &myself); - - for (;;) { - unsigned long jif; - long timeout; - - /* deal with the server being asked to die */ - if (krxtimod_die) { - remove_wait_queue(&krxtimod_sleepq, &myself); - _leave(""); - complete_and_exit(&krxtimod_dead, 0); - } - - try_to_freeze(); - - /* discard pending signals */ - rxrpc_discard_my_signals(); - - /* work out the time to elapse before the next event */ - spin_lock(&krxtimod_lock); - if (list_empty(&krxtimod_list)) { - timeout = MAX_SCHEDULE_TIMEOUT; - } - else { - timer = list_entry(krxtimod_list.next, - rxrpc_timer_t, link); - timeout = timer->timo_jif; - jif = jiffies; - - if (time_before_eq((unsigned long) timeout, jif)) - goto immediate; - - else { - timeout = (long) timeout - (long) jiffies; - } - } - spin_unlock(&krxtimod_lock); - - schedule_timeout(timeout); - - set_current_state(TASK_INTERRUPTIBLE); - } - - /* the thing on the front of the queue needs processing - * - we come here with the lock held and timer pointing to the expired - * entry - */ - immediate: - remove_wait_queue(&krxtimod_sleepq, &myself); - set_current_state(TASK_RUNNING); - - _debug("@@@ Begin Timeout of %p", timer); - - /* dequeue the timer */ - list_del_init(&timer->link); - spin_unlock(&krxtimod_lock); - - /* call the timeout function */ - timer->ops->timed_out(timer); - - _debug("@@@ End Timeout"); - goto loop; - -} /* end krxtimod() */ - -/*****************************************************************************/ -/* - * (re-)queue a timer - */ -void rxrpc_krxtimod_add_timer(rxrpc_timer_t *timer, unsigned long timeout) -{ - struct list_head *_p; - rxrpc_timer_t 
*ptimer; - - _enter("%p,%lu", timer, timeout); - - spin_lock(&krxtimod_lock); - - list_del(&timer->link); - - /* the timer was deferred or reset - put it back in the queue at the - * right place */ - timer->timo_jif = jiffies + timeout; - - list_for_each(_p, &krxtimod_list) { - ptimer = list_entry(_p, rxrpc_timer_t, link); - if (time_before(timer->timo_jif, ptimer->timo_jif)) - break; - } - - list_add_tail(&timer->link, _p); /* insert before stopping point */ - - spin_unlock(&krxtimod_lock); - - wake_up(&krxtimod_sleepq); - - _leave(""); -} /* end rxrpc_krxtimod_add_timer() */ - -/*****************************************************************************/ -/* - * dequeue a timer - * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued - */ -int rxrpc_krxtimod_del_timer(rxrpc_timer_t *timer) -{ - int ret = 0; - - _enter("%p", timer); - - spin_lock(&krxtimod_lock); - - if (list_empty(&timer->link)) - ret = -ENOENT; - else - list_del_init(&timer->link); - - spin_unlock(&krxtimod_lock); - - wake_up(&krxtimod_sleepq); - - _leave(" = %d", ret); - return ret; -} /* end rxrpc_krxtimod_del_timer() */ diff --git a/net/rxrpc/main.c b/net/rxrpc/main.c deleted file mode 100644 index baec1f7fd8b..00000000000 --- a/net/rxrpc/main.c +++ /dev/null @@ -1,180 +0,0 @@ -/* main.c: Rx RPC interface - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/krxiod.h> -#include <rxrpc/krxsecd.h> -#include <rxrpc/krxtimod.h> -#include <rxrpc/transport.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include "internal.h" - -MODULE_DESCRIPTION("Rx RPC implementation"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -__be32 rxrpc_epoch; - -/*****************************************************************************/ -/* - * initialise the Rx module - */ -static int __init rxrpc_initialise(void) -{ - int ret; - - /* my epoch value */ - rxrpc_epoch = htonl(xtime.tv_sec); - - /* register the /proc interface */ -#ifdef CONFIG_PROC_FS - ret = rxrpc_proc_init(); - if (ret<0) - return ret; -#endif - - /* register the sysctl files */ -#ifdef CONFIG_SYSCTL - ret = rxrpc_sysctl_init(); - if (ret<0) - goto error_proc; -#endif - - /* start the krxtimod daemon */ - ret = rxrpc_krxtimod_start(); - if (ret<0) - goto error_sysctl; - - /* start the krxiod daemon */ - ret = rxrpc_krxiod_init(); - if (ret<0) - goto error_krxtimod; - - /* start the krxsecd daemon */ - ret = rxrpc_krxsecd_init(); - if (ret<0) - goto error_krxiod; - - kdebug("\n\n"); - - return 0; - - error_krxiod: - rxrpc_krxiod_kill(); - error_krxtimod: - rxrpc_krxtimod_kill(); - error_sysctl: -#ifdef CONFIG_SYSCTL - rxrpc_sysctl_cleanup(); - error_proc: -#endif -#ifdef CONFIG_PROC_FS - rxrpc_proc_cleanup(); -#endif - return ret; -} /* end rxrpc_initialise() */ - -module_init(rxrpc_initialise); - -/*****************************************************************************/ -/* - * clean up the Rx module - */ -static void __exit rxrpc_cleanup(void) -{ - kenter(""); - - __RXACCT(printk("Outstanding Messages : %d\n", - atomic_read(&rxrpc_message_count))); - 
__RXACCT(printk("Outstanding Calls : %d\n", - atomic_read(&rxrpc_call_count))); - __RXACCT(printk("Outstanding Connections: %d\n", - atomic_read(&rxrpc_connection_count))); - __RXACCT(printk("Outstanding Peers : %d\n", - atomic_read(&rxrpc_peer_count))); - __RXACCT(printk("Outstanding Transports : %d\n", - atomic_read(&rxrpc_transport_count))); - - rxrpc_krxsecd_kill(); - rxrpc_krxiod_kill(); - rxrpc_krxtimod_kill(); -#ifdef CONFIG_SYSCTL - rxrpc_sysctl_cleanup(); -#endif -#ifdef CONFIG_PROC_FS - rxrpc_proc_cleanup(); -#endif - - __RXACCT(printk("Outstanding Messages : %d\n", - atomic_read(&rxrpc_message_count))); - __RXACCT(printk("Outstanding Calls : %d\n", - atomic_read(&rxrpc_call_count))); - __RXACCT(printk("Outstanding Connections: %d\n", - atomic_read(&rxrpc_connection_count))); - __RXACCT(printk("Outstanding Peers : %d\n", - atomic_read(&rxrpc_peer_count))); - __RXACCT(printk("Outstanding Transports : %d\n", - atomic_read(&rxrpc_transport_count))); - - kleave(""); -} /* end rxrpc_cleanup() */ - -module_exit(rxrpc_cleanup); - -/*****************************************************************************/ -/* - * clear the dead space between task_struct and kernel stack - * - called by supplying -finstrument-functions to gcc - */ -#if 0 -void __cyg_profile_func_enter (void *this_fn, void *call_site) -__attribute__((no_instrument_function)); - -void __cyg_profile_func_enter (void *this_fn, void *call_site) -{ - asm volatile(" movl %%esp,%%edi \n" - " andl %0,%%edi \n" - " addl %1,%%edi \n" - " movl %%esp,%%ecx \n" - " subl %%edi,%%ecx \n" - " shrl $2,%%ecx \n" - " movl $0xedededed,%%eax \n" - " rep stosl \n" - : - : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) - : "eax", "ecx", "edi", "memory", "cc" - ); -} - -void __cyg_profile_func_exit(void *this_fn, void *call_site) -__attribute__((no_instrument_function)); - -void __cyg_profile_func_exit(void *this_fn, void *call_site) -{ - asm volatile(" movl %%esp,%%edi \n" - " andl %0,%%edi \n" - " addl %1,%%edi \n" - " movl %%esp,%%ecx \n" - " subl %%edi,%%ecx \n" - " shrl $2,%%ecx \n" - " movl $0xdadadada,%%eax \n" - " rep stosl \n" - : - : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) - : "eax", "ecx", "edi", "memory", "cc" - ); -} -#endif diff --git a/net/rxrpc/peer.c b/net/rxrpc/peer.c deleted file mode 100644 index 8a275157a3b..00000000000 --- a/net/rxrpc/peer.c +++ /dev/null @@ -1,398 +0,0 @@ -/* peer.c: Rx RPC peer management - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include <linux/udp.h> -#include <linux/ip.h> -#include <net/sock.h> -#include <asm/uaccess.h> -#include <asm/div64.h> -#include "internal.h" - -__RXACCT_DECL(atomic_t rxrpc_peer_count); -LIST_HEAD(rxrpc_peers); -DECLARE_RWSEM(rxrpc_peers_sem); -unsigned long rxrpc_peer_timeout = 12 * 60 * 60; - -static void rxrpc_peer_do_timeout(struct rxrpc_peer *peer); - -static void __rxrpc_peer_timeout(rxrpc_timer_t *timer) -{ - struct rxrpc_peer *peer = - list_entry(timer, struct rxrpc_peer, timeout); - - _debug("Rx PEER TIMEOUT [%p{u=%d}]", peer, atomic_read(&peer->usage)); - - rxrpc_peer_do_timeout(peer); -} - -static const struct rxrpc_timer_ops rxrpc_peer_timer_ops = { - .timed_out = __rxrpc_peer_timeout, -}; - -/*****************************************************************************/ -/* - * create a peer record - */ -static int __rxrpc_create_peer(struct rxrpc_transport *trans, __be32 addr, - struct rxrpc_peer **_peer) -{ - struct rxrpc_peer *peer; - - _enter("%p,%08x", trans, ntohl(addr)); - - /* allocate and initialise a peer record */ - peer = kzalloc(sizeof(struct rxrpc_peer), GFP_KERNEL); - if (!peer) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - atomic_set(&peer->usage, 1); - - INIT_LIST_HEAD(&peer->link); - INIT_LIST_HEAD(&peer->proc_link); - INIT_LIST_HEAD(&peer->conn_idlist); - INIT_LIST_HEAD(&peer->conn_active); - INIT_LIST_HEAD(&peer->conn_graveyard); - spin_lock_init(&peer->conn_gylock); - init_waitqueue_head(&peer->conn_gy_waitq); - rwlock_init(&peer->conn_idlock); - rwlock_init(&peer->conn_lock); - atomic_set(&peer->conn_count, 0); - spin_lock_init(&peer->lock); - rxrpc_timer_init(&peer->timeout, &rxrpc_peer_timer_ops); - - peer->addr.s_addr = addr; - - peer->trans = trans; - peer->ops = trans->peer_ops; - - __RXACCT(atomic_inc(&rxrpc_peer_count)); - *_peer = peer; - _leave(" = 0 (%p)", peer); - - return 0; -} /* end __rxrpc_create_peer() */ - -/*****************************************************************************/ -/* - * find a peer record on the specified transport - * - returns (if successful) with peer record usage incremented - * - resurrects it from the graveyard if found there - */ -int rxrpc_peer_lookup(struct rxrpc_transport *trans, __be32 addr, - struct rxrpc_peer **_peer) -{ - struct rxrpc_peer *peer, *candidate = NULL; - struct list_head *_p; - int ret; - - _enter("%p{%hu},%08x", trans, trans->port, ntohl(addr)); - - /* [common case] search the transport's active list first */ - read_lock(&trans->peer_lock); - list_for_each(_p, &trans->peer_active) { - peer = list_entry(_p, struct rxrpc_peer, link); - if (peer->addr.s_addr == addr) - goto found_active; - } - read_unlock(&trans->peer_lock); - - /* [uncommon case] not active - create a candidate for a new record */ - ret = __rxrpc_create_peer(trans, addr, &candidate); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - /* search the active list again, just in case it appeared whilst we - * were busy */ - write_lock(&trans->peer_lock); - list_for_each(_p, &trans->peer_active) { - peer = list_entry(_p, struct rxrpc_peer, link); - if (peer->addr.s_addr == addr) - goto found_active_second_chance; - } - - /* search the transport's graveyard list */ - spin_lock(&trans->peer_gylock); - list_for_each(_p, &trans->peer_graveyard) { - peer = 
list_entry(_p, struct rxrpc_peer, link); - if (peer->addr.s_addr == addr) - goto found_in_graveyard; - } - spin_unlock(&trans->peer_gylock); - - /* we can now add the new candidate to the list - * - tell the application layer that this peer has been added - */ - rxrpc_get_transport(trans); - peer = candidate; - candidate = NULL; - - if (peer->ops && peer->ops->adding) { - ret = peer->ops->adding(peer); - if (ret < 0) { - write_unlock(&trans->peer_lock); - __RXACCT(atomic_dec(&rxrpc_peer_count)); - kfree(peer); - rxrpc_put_transport(trans); - _leave(" = %d", ret); - return ret; - } - } - - atomic_inc(&trans->peer_count); - - make_active: - list_add_tail(&peer->link, &trans->peer_active); - - success_uwfree: - write_unlock(&trans->peer_lock); - - if (candidate) { - __RXACCT(atomic_dec(&rxrpc_peer_count)); - kfree(candidate); - } - - if (list_empty(&peer->proc_link)) { - down_write(&rxrpc_peers_sem); - list_add_tail(&peer->proc_link, &rxrpc_peers); - up_write(&rxrpc_peers_sem); - } - - success: - *_peer = peer; - - _leave(" = 0 (%p{u=%d cc=%d})", - peer, - atomic_read(&peer->usage), - atomic_read(&peer->conn_count)); - return 0; - - /* handle the peer being found in the active list straight off */ - found_active: - rxrpc_get_peer(peer); - read_unlock(&trans->peer_lock); - goto success; - - /* handle resurrecting a peer from the graveyard */ - found_in_graveyard: - rxrpc_get_peer(peer); - rxrpc_get_transport(peer->trans); - rxrpc_krxtimod_del_timer(&peer->timeout); - list_del_init(&peer->link); - spin_unlock(&trans->peer_gylock); - goto make_active; - - /* handle finding the peer on the second time through the active - * list */ - found_active_second_chance: - rxrpc_get_peer(peer); - goto success_uwfree; - -} /* end rxrpc_peer_lookup() */ - -/*****************************************************************************/ -/* - * finish with a peer record - * - it gets sent to the graveyard from where it can be resurrected or timed - * out - */ -void rxrpc_put_peer(struct rxrpc_peer *peer) -{ - struct rxrpc_transport *trans = peer->trans; - - _enter("%p{cc=%d a=%08x}", - peer, - atomic_read(&peer->conn_count), - ntohl(peer->addr.s_addr)); - - /* sanity check */ - if (atomic_read(&peer->usage) <= 0) - BUG(); - - write_lock(&trans->peer_lock); - spin_lock(&trans->peer_gylock); - if (likely(!atomic_dec_and_test(&peer->usage))) { - spin_unlock(&trans->peer_gylock); - write_unlock(&trans->peer_lock); - _leave(""); - return; - } - - /* move to graveyard queue */ - list_del(&peer->link); - write_unlock(&trans->peer_lock); - - list_add_tail(&peer->link, &trans->peer_graveyard); - - BUG_ON(!list_empty(&peer->conn_active)); - - rxrpc_krxtimod_add_timer(&peer->timeout, rxrpc_peer_timeout * HZ); - - spin_unlock(&trans->peer_gylock); - - rxrpc_put_transport(trans); - - _leave(" [killed]"); -} /* end rxrpc_put_peer() */ - -/*****************************************************************************/ -/* - * handle a peer timing out in the graveyard - * - called from krxtimod - */ -static void rxrpc_peer_do_timeout(struct rxrpc_peer *peer) -{ - struct rxrpc_transport *trans = peer->trans; - - _enter("%p{u=%d cc=%d a=%08x}", - peer, - atomic_read(&peer->usage), - atomic_read(&peer->conn_count), - ntohl(peer->addr.s_addr)); - - BUG_ON(atomic_read(&peer->usage) < 0); - - /* remove from graveyard if still dead */ - spin_lock(&trans->peer_gylock); - if (atomic_read(&peer->usage) == 0) - list_del_init(&peer->link); - else - peer = NULL; - spin_unlock(&trans->peer_gylock); - - if (!peer) { - _leave(""); - return; 
/* resurrected */ - } - - /* clear all connections on this peer */ - rxrpc_conn_clearall(peer); - - BUG_ON(!list_empty(&peer->conn_active)); - BUG_ON(!list_empty(&peer->conn_graveyard)); - - /* inform the application layer */ - if (peer->ops && peer->ops->discarding) - peer->ops->discarding(peer); - - if (!list_empty(&peer->proc_link)) { - down_write(&rxrpc_peers_sem); - list_del(&peer->proc_link); - up_write(&rxrpc_peers_sem); - } - - __RXACCT(atomic_dec(&rxrpc_peer_count)); - kfree(peer); - - /* if the graveyard is now empty, wake up anyone waiting for that */ - if (atomic_dec_and_test(&trans->peer_count)) - wake_up(&trans->peer_gy_waitq); - - _leave(" [destroyed]"); -} /* end rxrpc_peer_do_timeout() */ - -/*****************************************************************************/ -/* - * clear all peer records from a transport endpoint - */ -void rxrpc_peer_clearall(struct rxrpc_transport *trans) -{ - DECLARE_WAITQUEUE(myself,current); - - struct rxrpc_peer *peer; - int err; - - _enter("%p",trans); - - /* there shouldn't be any active peers remaining */ - BUG_ON(!list_empty(&trans->peer_active)); - - /* manually time out all peers in the graveyard */ - spin_lock(&trans->peer_gylock); - while (!list_empty(&trans->peer_graveyard)) { - peer = list_entry(trans->peer_graveyard.next, - struct rxrpc_peer, link); - _debug("Clearing peer %p\n", peer); - err = rxrpc_krxtimod_del_timer(&peer->timeout); - spin_unlock(&trans->peer_gylock); - - if (err == 0) - rxrpc_peer_do_timeout(peer); - - spin_lock(&trans->peer_gylock); - } - spin_unlock(&trans->peer_gylock); - - /* wait for the peer graveyard to be completely cleared */ - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&trans->peer_gy_waitq, &myself); - - while (atomic_read(&trans->peer_count) != 0) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - remove_wait_queue(&trans->peer_gy_waitq, &myself); - set_current_state(TASK_RUNNING); - - _leave(""); -} /* end rxrpc_peer_clearall() */ - -/*****************************************************************************/ -/* - * calculate and cache the Round-Trip-Time for a message and its response - */ -void rxrpc_peer_calculate_rtt(struct rxrpc_peer *peer, - struct rxrpc_message *msg, - struct rxrpc_message *resp) -{ - unsigned long long rtt; - int loop; - - _enter("%p,%p,%p", peer, msg, resp); - - /* calculate the latest RTT */ - rtt = resp->stamp.tv_sec - msg->stamp.tv_sec; - rtt *= 1000000UL; - rtt += resp->stamp.tv_usec - msg->stamp.tv_usec; - - /* add to cache */ - peer->rtt_cache[peer->rtt_point] = rtt; - peer->rtt_point++; - peer->rtt_point %= RXRPC_RTT_CACHE_SIZE; - - if (peer->rtt_usage < RXRPC_RTT_CACHE_SIZE) - peer->rtt_usage++; - - /* recalculate RTT */ - rtt = 0; - for (loop = peer->rtt_usage - 1; loop >= 0; loop--) - rtt += peer->rtt_cache[loop]; - - do_div(rtt, peer->rtt_usage); - peer->rtt = rtt; - - _leave(" RTT=%lu.%lums", - (long) (peer->rtt / 1000), (long) (peer->rtt % 1000)); - -} /* end rxrpc_peer_calculate_rtt() */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c deleted file mode 100644 index 8551c879e45..00000000000 --- a/net/rxrpc/proc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* proc.c: /proc interface for RxRPC - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <rxrpc/rxrpc.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include "internal.h" - -static struct proc_dir_entry *proc_rxrpc; - -static int rxrpc_proc_transports_open(struct inode *inode, struct file *file); -static void *rxrpc_proc_transports_start(struct seq_file *p, loff_t *pos); -static void *rxrpc_proc_transports_next(struct seq_file *p, void *v, loff_t *pos); -static void rxrpc_proc_transports_stop(struct seq_file *p, void *v); -static int rxrpc_proc_transports_show(struct seq_file *m, void *v); - -static struct seq_operations rxrpc_proc_transports_ops = { - .start = rxrpc_proc_transports_start, - .next = rxrpc_proc_transports_next, - .stop = rxrpc_proc_transports_stop, - .show = rxrpc_proc_transports_show, -}; - -static const struct file_operations rxrpc_proc_transports_fops = { - .open = rxrpc_proc_transports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int rxrpc_proc_peers_open(struct inode *inode, struct file *file); -static void *rxrpc_proc_peers_start(struct seq_file *p, loff_t *pos); -static void *rxrpc_proc_peers_next(struct seq_file *p, void *v, loff_t *pos); -static void rxrpc_proc_peers_stop(struct seq_file *p, void *v); -static int rxrpc_proc_peers_show(struct seq_file *m, void *v); - -static struct seq_operations rxrpc_proc_peers_ops = { - .start = rxrpc_proc_peers_start, - .next = rxrpc_proc_peers_next, - .stop = rxrpc_proc_peers_stop, - .show = rxrpc_proc_peers_show, -}; - -static const struct file_operations rxrpc_proc_peers_fops = { - .open = rxrpc_proc_peers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int rxrpc_proc_conns_open(struct inode *inode, struct file *file); -static void *rxrpc_proc_conns_start(struct seq_file *p, loff_t *pos); -static void *rxrpc_proc_conns_next(struct seq_file *p, void *v, loff_t *pos); -static void rxrpc_proc_conns_stop(struct seq_file *p, void *v); -static int rxrpc_proc_conns_show(struct seq_file *m, void *v); - -static struct seq_operations rxrpc_proc_conns_ops = { - .start = rxrpc_proc_conns_start, - .next = rxrpc_proc_conns_next, - .stop = rxrpc_proc_conns_stop, - .show = rxrpc_proc_conns_show, -}; - -static const struct file_operations rxrpc_proc_conns_fops = { - .open = rxrpc_proc_conns_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int rxrpc_proc_calls_open(struct inode *inode, struct file *file); -static void *rxrpc_proc_calls_start(struct seq_file *p, loff_t *pos); -static void *rxrpc_proc_calls_next(struct seq_file *p, void *v, loff_t *pos); -static void rxrpc_proc_calls_stop(struct seq_file *p, void *v); -static int rxrpc_proc_calls_show(struct seq_file *m, void *v); - -static struct seq_operations rxrpc_proc_calls_ops = { - .start = rxrpc_proc_calls_start, - .next = rxrpc_proc_calls_next, - .stop = rxrpc_proc_calls_stop, - .show = rxrpc_proc_calls_show, -}; - -static const struct file_operations rxrpc_proc_calls_fops = { - .open = 
rxrpc_proc_calls_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *rxrpc_call_states7[] = { - "complet", - "error ", - "rcv_op ", - "rcv_arg", - "got_arg", - "snd_rpl", - "fin_ack", - "snd_arg", - "rcv_rpl", - "got_rpl" -}; - -static const char *rxrpc_call_error_states7[] = { - "no_err ", - "loc_abt", - "rmt_abt", - "loc_err", - "rmt_err" -}; - -/*****************************************************************************/ -/* - * initialise the /proc/net/rxrpc/ directory - */ -int rxrpc_proc_init(void) -{ - struct proc_dir_entry *p; - - proc_rxrpc = proc_mkdir("rxrpc", proc_net); - if (!proc_rxrpc) - goto error; - proc_rxrpc->owner = THIS_MODULE; - - p = create_proc_entry("calls", 0, proc_rxrpc); - if (!p) - goto error_proc; - p->proc_fops = &rxrpc_proc_calls_fops; - p->owner = THIS_MODULE; - - p = create_proc_entry("connections", 0, proc_rxrpc); - if (!p) - goto error_calls; - p->proc_fops = &rxrpc_proc_conns_fops; - p->owner = THIS_MODULE; - - p = create_proc_entry("peers", 0, proc_rxrpc); - if (!p) - goto error_conns; - p->proc_fops = &rxrpc_proc_peers_fops; - p->owner = THIS_MODULE; - - p = create_proc_entry("transports", 0, proc_rxrpc); - if (!p) - goto error_peers; - p->proc_fops = &rxrpc_proc_transports_fops; - p->owner = THIS_MODULE; - - return 0; - - error_peers: - remove_proc_entry("peers", proc_rxrpc); - error_conns: - remove_proc_entry("connections", proc_rxrpc); - error_calls: - remove_proc_entry("calls", proc_rxrpc); - error_proc: - remove_proc_entry("rxrpc", proc_net); - error: - return -ENOMEM; -} /* end rxrpc_proc_init() */ - -/*****************************************************************************/ -/* - * clean up the /proc/net/rxrpc/ directory - */ -void rxrpc_proc_cleanup(void) -{ - remove_proc_entry("transports", proc_rxrpc); - remove_proc_entry("peers", proc_rxrpc); - remove_proc_entry("connections", proc_rxrpc); - remove_proc_entry("calls", proc_rxrpc); - - remove_proc_entry("rxrpc", proc_net); - -} /* end rxrpc_proc_cleanup() */ - -/*****************************************************************************/ -/* - * open "/proc/net/rxrpc/transports" which provides a summary of extant transports - */ -static int rxrpc_proc_transports_open(struct inode *inode, struct file *file) -{ - struct seq_file *m; - int ret; - - ret = seq_open(file, &rxrpc_proc_transports_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE(inode)->data; - - return 0; -} /* end rxrpc_proc_transports_open() */ - -/*****************************************************************************/ -/* - * set up the iterator to start reading from the transports list and return the first item - */ -static void *rxrpc_proc_transports_start(struct seq_file *m, loff_t *_pos) -{ - struct list_head *_p; - loff_t pos = *_pos; - - /* lock the list against modification */ - down_read(&rxrpc_proc_transports_sem); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &rxrpc_proc_transports) - if (!pos--) - break; - - return _p != &rxrpc_proc_transports ? _p : NULL; -} /* end rxrpc_proc_transports_start() */ - -/*****************************************************************************/ -/* - * move to next transport in transports list - */ -static void *rxrpc_proc_transports_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? rxrpc_proc_transports.next : _p->next; - - return _p != &rxrpc_proc_transports ? 
_p : NULL; -} /* end rxrpc_proc_transports_next() */ - -/*****************************************************************************/ -/* - * clean up after reading from the transports list - */ -static void rxrpc_proc_transports_stop(struct seq_file *p, void *v) -{ - up_read(&rxrpc_proc_transports_sem); - -} /* end rxrpc_proc_transports_stop() */ - -/*****************************************************************************/ -/* - * display a header line followed by a load of transport lines - */ -static int rxrpc_proc_transports_show(struct seq_file *m, void *v) -{ - struct rxrpc_transport *trans = - list_entry(v, struct rxrpc_transport, proc_link); - - /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { - seq_puts(m, "LOCAL USE\n"); - return 0; - } - - /* display one transport per line on subsequent lines */ - seq_printf(m, "%5hu %3d\n", - trans->port, - atomic_read(&trans->usage) - ); - - return 0; -} /* end rxrpc_proc_transports_show() */ - -/*****************************************************************************/ -/* - * open "/proc/net/rxrpc/peers" which provides a summary of extant peers - */ -static int rxrpc_proc_peers_open(struct inode *inode, struct file *file) -{ - struct seq_file *m; - int ret; - - ret = seq_open(file, &rxrpc_proc_peers_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE(inode)->data; - - return 0; -} /* end rxrpc_proc_peers_open() */ - -/*****************************************************************************/ -/* - * set up the iterator to start reading from the peers list and return the - * first item - */ -static void *rxrpc_proc_peers_start(struct seq_file *m, loff_t *_pos) -{ - struct list_head *_p; - loff_t pos = *_pos; - - /* lock the list against modification */ - down_read(&rxrpc_peers_sem); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &rxrpc_peers) - if (!pos--) - break; - - return _p != &rxrpc_peers ? _p : NULL; -} /* end rxrpc_proc_peers_start() */ - -/*****************************************************************************/ -/* - * move to next peer in peers list - */ -static void *rxrpc_proc_peers_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? rxrpc_peers.next : _p->next; - - return _p != &rxrpc_peers ? 
_p : NULL; -} /* end rxrpc_proc_peers_next() */ - -/*****************************************************************************/ -/* - * clean up after reading from the peers list - */ -static void rxrpc_proc_peers_stop(struct seq_file *p, void *v) -{ - up_read(&rxrpc_peers_sem); - -} /* end rxrpc_proc_peers_stop() */ - -/*****************************************************************************/ -/* - * display a header line followed by a load of peer lines - */ -static int rxrpc_proc_peers_show(struct seq_file *m, void *v) -{ - struct rxrpc_peer *peer = list_entry(v, struct rxrpc_peer, proc_link); - long timeout; - - /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { - seq_puts(m, "LOCAL REMOTE USAGE CONNS TIMEOUT" - " MTU RTT(uS)\n"); - return 0; - } - - /* display one peer per line on subsequent lines */ - timeout = 0; - if (!list_empty(&peer->timeout.link)) - timeout = (long) peer->timeout.timo_jif - - (long) jiffies; - - seq_printf(m, "%5hu %08x %5d %5d %8ld %5Zu %7lu\n", - peer->trans->port, - ntohl(peer->addr.s_addr), - atomic_read(&peer->usage), - atomic_read(&peer->conn_count), - timeout, - peer->if_mtu, - (long) peer->rtt - ); - - return 0; -} /* end rxrpc_proc_peers_show() */ - -/*****************************************************************************/ -/* - * open "/proc/net/rxrpc/connections" which provides a summary of extant - * connections - */ -static int rxrpc_proc_conns_open(struct inode *inode, struct file *file) -{ - struct seq_file *m; - int ret; - - ret = seq_open(file, &rxrpc_proc_conns_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE(inode)->data; - - return 0; -} /* end rxrpc_proc_conns_open() */ - -/*****************************************************************************/ -/* - * set up the iterator to start reading from the conns list and return the - * first item - */ -static void *rxrpc_proc_conns_start(struct seq_file *m, loff_t *_pos) -{ - struct list_head *_p; - loff_t pos = *_pos; - - /* lock the list against modification */ - down_read(&rxrpc_conns_sem); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &rxrpc_conns) - if (!pos--) - break; - - return _p != &rxrpc_conns ? _p : NULL; -} /* end rxrpc_proc_conns_start() */ - -/*****************************************************************************/ -/* - * move to next conn in conns list - */ -static void *rxrpc_proc_conns_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? rxrpc_conns.next : _p->next; - - return _p != &rxrpc_conns ? 
_p : NULL; -} /* end rxrpc_proc_conns_next() */ - -/*****************************************************************************/ -/* - * clean up after reading from the conns list - */ -static void rxrpc_proc_conns_stop(struct seq_file *p, void *v) -{ - up_read(&rxrpc_conns_sem); - -} /* end rxrpc_proc_conns_stop() */ - -/*****************************************************************************/ -/* - * display a header line followed by a load of conn lines - */ -static int rxrpc_proc_conns_show(struct seq_file *m, void *v) -{ - struct rxrpc_connection *conn; - long timeout; - - conn = list_entry(v, struct rxrpc_connection, proc_link); - - /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { - seq_puts(m, - "LOCAL REMOTE RPORT SRVC CONN END SERIALNO " - "CALLNO MTU TIMEOUT" - "\n"); - return 0; - } - - /* display one conn per line on subsequent lines */ - timeout = 0; - if (!list_empty(&conn->timeout.link)) - timeout = (long) conn->timeout.timo_jif - - (long) jiffies; - - seq_printf(m, - "%5hu %08x %5hu %04hx %08x %-3.3s %08x %08x %5Zu %8ld\n", - conn->trans->port, - ntohl(conn->addr.sin_addr.s_addr), - ntohs(conn->addr.sin_port), - ntohs(conn->service_id), - ntohl(conn->conn_id), - conn->out_clientflag ? "CLT" : "SRV", - conn->serial_counter, - conn->call_counter, - conn->mtu_size, - timeout - ); - - return 0; -} /* end rxrpc_proc_conns_show() */ - -/*****************************************************************************/ -/* - * open "/proc/net/rxrpc/calls" which provides a summary of extant calls - */ -static int rxrpc_proc_calls_open(struct inode *inode, struct file *file) -{ - struct seq_file *m; - int ret; - - ret = seq_open(file, &rxrpc_proc_calls_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE(inode)->data; - - return 0; -} /* end rxrpc_proc_calls_open() */ - -/*****************************************************************************/ -/* - * set up the iterator to start reading from the calls list and return the - * first item - */ -static void *rxrpc_proc_calls_start(struct seq_file *m, loff_t *_pos) -{ - struct list_head *_p; - loff_t pos = *_pos; - - /* lock the list against modification */ - down_read(&rxrpc_calls_sem); - - /* allow for the header line */ - if (!pos) - return SEQ_START_TOKEN; - pos--; - - /* find the n'th element in the list */ - list_for_each(_p, &rxrpc_calls) - if (!pos--) - break; - - return _p != &rxrpc_calls ? _p : NULL; -} /* end rxrpc_proc_calls_start() */ - -/*****************************************************************************/ -/* - * move to next call in calls list - */ -static void *rxrpc_proc_calls_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct list_head *_p; - - (*pos)++; - - _p = v; - _p = (v == SEQ_START_TOKEN) ? rxrpc_calls.next : _p->next; - - return _p != &rxrpc_calls ? 
_p : NULL; -} /* end rxrpc_proc_calls_next() */ - -/*****************************************************************************/ -/* - * clean up after reading from the calls list - */ -static void rxrpc_proc_calls_stop(struct seq_file *p, void *v) -{ - up_read(&rxrpc_calls_sem); - -} /* end rxrpc_proc_calls_stop() */ - -/*****************************************************************************/ -/* - * display a header line followed by a load of call lines - */ -static int rxrpc_proc_calls_show(struct seq_file *m, void *v) -{ - struct rxrpc_call *call = list_entry(v, struct rxrpc_call, call_link); - - /* display header on line 1 */ - if (v == SEQ_START_TOKEN) { - seq_puts(m, - "LOCAL REMOT SRVC CONN CALL DIR USE " - " L STATE OPCODE ABORT ERRNO\n" - ); - return 0; - } - - /* display one call per line on subsequent lines */ - seq_printf(m, - "%5hu %5hu %04hx %08x %08x %s %3u%c" - " %c %-7.7s %6d %08x %5d\n", - call->conn->trans->port, - ntohs(call->conn->addr.sin_port), - ntohs(call->conn->service_id), - ntohl(call->conn->conn_id), - ntohl(call->call_id), - call->conn->service ? "SVC" : "CLT", - atomic_read(&call->usage), - waitqueue_active(&call->waitq) ? 'w' : ' ', - call->app_last_rcv ? 'Y' : '-', - (call->app_call_state!=RXRPC_CSTATE_ERROR ? - rxrpc_call_states7[call->app_call_state] : - rxrpc_call_error_states7[call->app_err_state]), - call->app_opcode, - call->app_abort_code, - call->app_errno - ); - - return 0; -} /* end rxrpc_proc_calls_show() */ diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c new file mode 100644 index 00000000000..5ec705144e1 --- /dev/null +++ b/net/rxrpc/rxkad.c @@ -0,0 +1,1154 @@ +/* Kerberos-based RxRPC security + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/udp.h> +#include <linux/crypto.h> +#include <linux/scatterlist.h> +#include <linux/ctype.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#define rxrpc_debug rxkad_debug +#include "ar-internal.h" + +#define RXKAD_VERSION 2 +#define MAXKRB5TICKETLEN 1024 +#define RXKAD_TKT_TYPE_KERBEROS_V5 256 +#define ANAME_SZ 40 /* size of authentication name */ +#define INST_SZ 40 /* size of principal's instance */ +#define REALM_SZ 40 /* size of principal's auth domain */ +#define SNAME_SZ 40 /* size of service name */ + +unsigned rxrpc_debug; +module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(rxrpc_debug, "rxkad debugging mask"); + +struct rxkad_level1_hdr { + __be32 data_size; /* true data size (excluding padding) */ +}; + +struct rxkad_level2_hdr { + __be32 data_size; /* true data size (excluding padding) */ + __be32 checksum; /* decrypted data checksum */ +}; + +MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos)"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +/* + * this holds a pinned cipher so that keventd doesn't get called by the cipher + * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE + * packets + */ +static struct crypto_blkcipher *rxkad_ci; +static DEFINE_MUTEX(rxkad_ci_mutex); + +/* + * initialise connection security + */ +static int rxkad_init_connection_security(struct rxrpc_connection *conn) +{ + struct rxrpc_key_payload *payload; + struct crypto_blkcipher *ci; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + + payload = conn->key->payload.data; + conn->security_ix = payload->k.security_index; + + ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _debug("no cipher"); + ret = PTR_ERR(ci); + goto error; + } + + if (crypto_blkcipher_setkey(ci, payload->k.session_key, + sizeof(payload->k.session_key)) < 0) + BUG(); + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + break; + case RXRPC_SECURITY_AUTH: + conn->size_align = 8; + conn->security_size = sizeof(struct rxkad_level1_hdr); + conn->header_size += sizeof(struct rxkad_level1_hdr); + break; + case RXRPC_SECURITY_ENCRYPT: + conn->size_align = 8; + conn->security_size = sizeof(struct rxkad_level2_hdr); + conn->header_size += sizeof(struct rxkad_level2_hdr); + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + conn->cipher = ci; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * prime the encryption state with the invariant parts of a connection's + * description + */ +static void rxkad_prime_packet_security(struct rxrpc_connection *conn) +{ + struct rxrpc_key_payload *payload; + struct blkcipher_desc desc; + struct scatterlist sg[2]; + struct rxrpc_crypt iv; + struct { + __be32 x[4]; + } tmpbuf __attribute__((aligned(16))); /* must all be in same page */ + + _enter(""); + + if (!conn->key) + return; + + payload = conn->key->payload.data; + memcpy(&iv, payload->k.session_key, sizeof(iv)); + + desc.tfm = conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + tmpbuf.x[0] = conn->epoch; + tmpbuf.x[1] = conn->cid; + tmpbuf.x[2] = 0; + tmpbuf.x[3] = htonl(conn->security_ix); + + memset(sg, 0, sizeof(sg)); + sg_set_buf(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_set_buf(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); + 
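+ /* (note: conn->csum_iv was just loaded from the second half of the
+  *  encrypted { epoch, cid, 0, security_ix } block; the assertion below
+  *  merely checks that the in-place copy took) */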
ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]); + + _leave(""); +} + +/* + * partially encrypt a packet (level 1 security) + */ +static int rxkad_secure_packet_auth(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 data_size, + void *sechdr) +{ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + struct rxkad_level1_hdr hdr; + __be32 first; /* first four bytes of data and padding */ + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + u16 check; + + sp = rxrpc_skb(skb); + + _enter(""); + + check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + data_size |= (u32) check << 16; + + tmpbuf.hdr.data_size = htonl(data_size); + memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); + + /* start the encryption afresh */ + memset(&iv, 0, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + memset(sg, 0, sizeof(sg)); + sg_set_buf(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_set_buf(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + memcpy(sechdr, &tmpbuf, sizeof(tmpbuf)); + + _leave(" = 0"); + return 0; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 data_size, + void *sechdr) +{ + const struct rxrpc_key_payload *payload; + struct rxkad_level2_hdr rxkhdr + __attribute__((aligned(8))); /* must be all on one page */ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[16]; + struct sk_buff *trailer; + unsigned len; + u16 check; + int nsg; + + sp = rxrpc_skb(skb); + + _enter(""); + + check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + + rxkhdr.data_size = htonl(data_size | (u32) check << 16); + rxkhdr.checksum = 0; + + /* encrypt from the session key */ + payload = call->conn->key->payload.data; + memcpy(&iv, payload->k.session_key, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + memset(sg, 0, sizeof(sg[0]) * 2); + sg_set_buf(&sg[0], sechdr, sizeof(rxkhdr)); + sg_set_buf(&sg[1], &rxkhdr, sizeof(rxkhdr)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr)); + + /* we want to encrypt the skbuff in-place */ + nsg = skb_cow_data(skb, 0, &trailer); + if (nsg < 0 || nsg > 16) + return -ENOMEM; + + len = data_size + call->conn->size_align - 1; + len &= ~(call->conn->size_align - 1); + + skb_to_sgvec(skb, sg, 0, len); + crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + + _leave(" = 0"); + return 0; +} + +/* + * checksum an RxRPC packet header + */ +static int rxkad_secure_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + size_t data_size, + void *sechdr) +{ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + __be32 x[2]; + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + __be32 x; + int ret; + + sp = rxrpc_skb(skb); + + _enter("{%d{%x}},{#%u},%zu,", + call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq), + data_size); + + if (!call->conn->cipher) + return 0; + + ret = key_validate(call->conn->key); + if (ret < 0) + return ret; + + /* continue encrypting from where we left off */ + memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + /* calculate the security checksum */ + x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); 
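+ /* (the seed word carries the channel number in its top bits; the low
+  *  30 bits of the packet sequence number are ORed in below, and the
+  *  pair { callNumber, seed } is then encrypted with the connection's
+  *  checksum IV and folded down to a 16-bit header checksum) */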
+ x |= sp->hdr.seq & __constant_cpu_to_be32(0x3fffffff); + tmpbuf.x[0] = sp->hdr.callNumber; + tmpbuf.x[1] = x; + + memset(&sg, 0, sizeof(sg)); + sg_set_buf(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_set_buf(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + x = ntohl(tmpbuf.x[1]); + x = (x >> 16) & 0xffff; + if (x == 0) + x = 1; /* zero checksums are not permitted */ + sp->hdr.cksum = htons(x); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + ret = 0; + break; + case RXRPC_SECURITY_AUTH: + ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr); + break; + case RXRPC_SECURITY_ENCRYPT: + ret = rxkad_secure_packet_encrypt(call, skb, data_size, + sechdr); + break; + default: + ret = -EPERM; + break; + } + + _leave(" = %d [set %hx]", ret, x); + return ret; +} + +/* + * decrypt partial encryption on a packet (level 1 security) + */ +static int rxkad_verify_packet_auth(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxkad_level1_hdr sechdr; + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct sk_buff *trailer; + u32 data_size, buf; + u16 check; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* we want to decrypt the skbuff in-place */ + if (skb_cow_data(skb, 0, &trailer) < 0) + goto nomem; + + skb_to_sgvec(skb, sg, 0, 8); + + /* start the decryption afresh */ + memset(&iv, 0, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8); + + /* remove the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) + goto datalen_error; + if (!skb_pull(skb, sizeof(sechdr))) + BUG(); + + buf = ntohl(sechdr.data_size); + data_size = buf & 0xffff; + + check = buf >> 16; + check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check &= 0xffff; + if (check != 0) { + *_abort_code = RXKADSEALEDINCON; + goto protocol_error; + } + + /* shorten the packet to remove the padding */ + if (data_size > skb->len) + goto datalen_error; + else if (data_size < skb->len) + skb->len = data_size; + + _leave(" = 0 [dlen=%x]", data_size); + return 0; + +datalen_error: + *_abort_code = RXKADDATALEN; +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * wholly decrypt a packet (level 2 security) + */ +static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + const struct rxrpc_key_payload *payload; + struct rxkad_level2_hdr sechdr; + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist _sg[4], *sg; + struct sk_buff *trailer; + u32 data_size, buf; + u16 check; + int nsg; + + _enter(",{%d}", skb->len); + + sp = rxrpc_skb(skb); + + /* we want to decrypt the skbuff in-place */ + nsg = skb_cow_data(skb, 0, &trailer); + if (nsg < 0) + goto nomem; + + sg = _sg; + if (unlikely(nsg > 4)) { + sg = kmalloc(sizeof(*sg) * nsg, GFP_NOIO); + if (!sg) + goto nomem; + } + + skb_to_sgvec(skb, sg, 0, skb->len); + + /* decrypt from the session key */ + payload = call->conn->key->payload.data; + memcpy(&iv, payload->k.session_key, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len); + if (sg != _sg) + kfree(sg); + + /* remove the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, 
sizeof(sechdr)) < 0) + goto datalen_error; + if (!skb_pull(skb, sizeof(sechdr))) + BUG(); + + buf = ntohl(sechdr.data_size); + data_size = buf & 0xffff; + + check = buf >> 16; + check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check &= 0xffff; + if (check != 0) { + *_abort_code = RXKADSEALEDINCON; + goto protocol_error; + } + + /* shorten the packet to remove the padding */ + if (data_size > skb->len) + goto datalen_error; + else if (data_size < skb->len) + skb->len = data_size; + + _leave(" = 0 [dlen=%x]", data_size); + return 0; + +datalen_error: + *_abort_code = RXKADDATALEN; +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * verify the security on a received packet + */ +static int rxkad_verify_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct blkcipher_desc desc; + struct rxrpc_skb_priv *sp; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + __be32 x[2]; + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + __be32 x; + __be16 cksum; + int ret; + + sp = rxrpc_skb(skb); + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), + ntohl(sp->hdr.seq)); + + if (!call->conn->cipher) + return 0; + + if (sp->hdr.securityIndex != 2) { + *_abort_code = RXKADINCONSISTENCY; + _leave(" = -EPROTO [not rxkad]"); + return -EPROTO; + } + + /* continue encrypting from where we left off */ + memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + /* validate the security checksum */ + x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); + x |= sp->hdr.seq & __constant_cpu_to_be32(0x3fffffff); + tmpbuf.x[0] = call->call_id; + tmpbuf.x[1] = x; + + memset(&sg, 0, sizeof(sg)); + sg_set_buf(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_set_buf(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + x = ntohl(tmpbuf.x[1]); + x = (x >> 16) & 0xffff; + if (x == 0) + x = 1; /* zero checksums are not permitted */ + + cksum = htons(x); + if (sp->hdr.cksum != cksum) { + *_abort_code = RXKADSEALEDINCON; + _leave(" = -EPROTO [csum failed]"); + return -EPROTO; + } + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + ret = 0; + break; + case RXRPC_SECURITY_AUTH: + ret = rxkad_verify_packet_auth(call, skb, _abort_code); + break; + case RXRPC_SECURITY_ENCRYPT: + ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); + break; + default: + ret = -ENOANO; + break; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * issue a challenge + */ +static int rxkad_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxkad_challenge challenge; + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return ret; + + get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce)); + + challenge.version = htonl(2); + challenge.nonce = htonl(conn->security_nonce); + challenge.min_level = htonl(0); + challenge.__padding = 0; + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = conn->epoch; + hdr.cid = conn->cid; + hdr.callNumber = 0; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_CHALLENGE; + hdr.flags = 
conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = conn->service_id; + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &challenge; + iov[1].iov_len = sizeof(challenge); + + len = iov[0].iov_len + iov[1].iov_len; + + hdr.serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial)); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * send a Kerberos security response + */ +static int rxkad_send_response(struct rxrpc_connection *conn, + struct rxrpc_header *hdr, + struct rxkad_response *resp, + const struct rxkad_key *s2) +{ + struct msghdr msg; + struct kvec iov[3]; + size_t len; + int ret; + + _enter(""); + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->epoch = conn->epoch; + hdr->seq = 0; + hdr->type = RXRPC_PACKET_TYPE_RESPONSE; + hdr->flags = conn->out_clientflag; + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + iov[1].iov_base = resp; + iov[1].iov_len = sizeof(*resp); + iov[2].iov_base = (void *) s2->ticket; + iov[2].iov_len = s2->ticket_len; + + len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; + + hdr->serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx RESPONSE %%%u", ntohl(hdr->serial)); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * calculate the response checksum + */ +static void rxkad_calc_response_checksum(struct rxkad_response *response) +{ + u32 csum = 1000003; + int loop; + u8 *p = (u8 *) response; + + for (loop = sizeof(*response); loop > 0; loop--) + csum = csum * 0x10204081 + *p++; + + response->encrypted.checksum = htonl(csum); +} + +/* + * load a scatterlist with a potentially split-page buffer + */ +static void rxkad_sg_set_buf2(struct scatterlist sg[2], + void *buf, size_t buflen) +{ + + memset(sg, 0, sizeof(*sg) * 2); + + sg_set_buf(&sg[0], buf, buflen); + if (sg[0].offset + buflen > PAGE_SIZE) { + /* the buffer was split over two pages */ + sg[0].length = PAGE_SIZE - sg[0].offset; + sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length); + } + + ASSERTCMP(sg[0].length + sg[1].length, ==, buflen); +} + +/* + * encrypt the response packet with the session key + */ +static void rxkad_encrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxkad_key *s2) +{ + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist ssg[2], dsg[2]; + + /* encrypt from the session key */ + memcpy(&iv, s2->session_key, sizeof(iv)); + desc.tfm = conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + rxkad_sg_set_buf2(ssg, &resp->encrypted, sizeof(resp->encrypted)); + memcpy(dsg, ssg, sizeof(dsg)); + crypto_blkcipher_encrypt_iv(&desc, dsg, ssg, sizeof(resp->encrypted)); +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + const struct rxrpc_key_payload *payload; + struct rxkad_challenge challenge; + struct rxkad_response resp + __attribute__((aligned(8))); /* must be aligned for crypto */ 
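+ /* (the response assembled below echoes the challenge nonce + 1, the
+  *  current call ID on each of the connection's four channels and the
+  *  negotiated security level, all sealed with the session key) */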
+ struct rxrpc_skb_priv *sp; + u32 version, nonce, min_level, abort_code; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (!conn->key) { + _leave(" = -EPROTO [no key]"); + return -EPROTO; + } + + ret = key_validate(conn->key); + if (ret < 0) { + *_abort_code = RXKADEXPIRED; + return ret; + } + + abort_code = RXKADPACKETSHORT; + sp = rxrpc_skb(skb); + if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + goto protocol_error; + + version = ntohl(challenge.version); + nonce = ntohl(challenge.nonce); + min_level = ntohl(challenge.min_level); + + _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", + ntohl(sp->hdr.serial), version, nonce, min_level); + + abort_code = RXKADINCONSISTENCY; + if (version != RXKAD_VERSION) + goto protocol_error; + + abort_code = RXKADLEVELFAIL; + if (conn->security_level < min_level) + goto protocol_error; + + payload = conn->key->payload.data; + + /* build the response packet */ + memset(&resp, 0, sizeof(resp)); + + resp.version = RXKAD_VERSION; + resp.encrypted.epoch = conn->epoch; + resp.encrypted.cid = conn->cid; + resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.encrypted.call_id[0] = + (conn->channels[0] ? conn->channels[0]->call_id : 0); + resp.encrypted.call_id[1] = + (conn->channels[1] ? conn->channels[1]->call_id : 0); + resp.encrypted.call_id[2] = + (conn->channels[2] ? conn->channels[2]->call_id : 0); + resp.encrypted.call_id[3] = + (conn->channels[3] ? conn->channels[3]->call_id : 0); + resp.encrypted.inc_nonce = htonl(nonce + 1); + resp.encrypted.level = htonl(conn->security_level); + resp.kvno = htonl(payload->k.kvno); + resp.ticket_len = htonl(payload->k.ticket_len); + + /* calculate the response checksum and then do the encryption */ + rxkad_calc_response_checksum(&resp); + rxkad_encrypt_response(conn, &resp, &payload->k); + return rxkad_send_response(conn, &sp->hdr, &resp, &payload->k); + +protocol_error: + *_abort_code = abort_code; + _leave(" = -EPROTO [%d]", abort_code); + return -EPROTO; +} + +/* + * decrypt the kerberos IV ticket in the response + */ +static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, + void *ticket, size_t ticket_len, + struct rxrpc_crypt *_session_key, + time_t *_expiry, + u32 *_abort_code) +{ + struct blkcipher_desc desc; + struct rxrpc_crypt iv, key; + struct scatterlist ssg[1], dsg[1]; + struct in_addr addr; + unsigned life; + time_t issue, now; + bool little_endian; + int ret; + u8 *p, *q, *name, *end; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key)); + + *_expiry = 0; + + ret = key_validate(conn->server_key); + if (ret < 0) { + switch (ret) { + case -EKEYEXPIRED: + *_abort_code = RXKADEXPIRED; + goto error; + default: + *_abort_code = RXKADNOAUTH; + goto error; + } + } + + ASSERT(conn->server_key->payload.data != NULL); + ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); + + memcpy(&iv, &conn->server_key->type_data, sizeof(iv)); + + desc.tfm = conn->server_key->payload.data; + desc.info = iv.x; + desc.flags = 0; + + sg_init_one(&ssg[0], ticket, ticket_len); + memcpy(dsg, ssg, sizeof(dsg)); + crypto_blkcipher_decrypt_iv(&desc, dsg, ssg, ticket_len); + + p = ticket; + end = p + ticket_len; + +#define Z(size) \ + ({ \ + u8 *__str = p; \ + q = memchr(p, 0, end - p); \ + if (!q || q - p > (size)) \ + goto bad_ticket; \ + for (; p < q; p++) \ + if (!isprint(*p)) \ + goto bad_ticket; \ + p++; \ + __str; \ + }) + + /* extract the ticket flags */ + _debug("KIV FLAGS: %x", *p); + little_endian = *p & 1; + p++; + + /* extract the authentication name 
*/ + name = Z(ANAME_SZ); + _debug("KIV ANAME: %s", name); + + /* extract the principal's instance */ + name = Z(INST_SZ); + _debug("KIV INST : %s", name); + + /* extract the principal's authentication domain */ + name = Z(REALM_SZ); + _debug("KIV REALM: %s", name); + + if (end - p < 4 + 8 + 4 + 2) + goto bad_ticket; + + /* get the IPv4 address of the entity that requested the ticket */ + memcpy(&addr, p, sizeof(addr)); + p += 4; + _debug("KIV ADDR : "NIPQUAD_FMT, NIPQUAD(addr)); + + /* get the session key from the ticket */ + memcpy(&key, p, sizeof(key)); + p += 8; + _debug("KIV KEY : %08x %08x", ntohl(key.n[0]), ntohl(key.n[1])); + memcpy(_session_key, &key, sizeof(key)); + + /* get the ticket's lifetime */ + life = *p++ * 5 * 60; + _debug("KIV LIFE : %u", life); + + /* get the issue time of the ticket */ + if (little_endian) { + __le32 stamp; + memcpy(&stamp, p, 4); + issue = le32_to_cpu(stamp); + } else { + __be32 stamp; + memcpy(&stamp, p, 4); + issue = be32_to_cpu(stamp); + } + p += 4; + now = xtime.tv_sec; + _debug("KIV ISSUE: %lx [%lx]", issue, now); + + /* check the ticket is in date */ + if (issue > now) { + *_abort_code = RXKADNOAUTH; + ret = -EKEYREJECTED; + goto error; + } + + if (issue < now - life) { + *_abort_code = RXKADEXPIRED; + ret = -EKEYEXPIRED; + goto error; + } + + *_expiry = issue + life; + + /* get the service name */ + name = Z(SNAME_SZ); + _debug("KIV SNAME: %s", name); + + /* get the service instance name */ + name = Z(INST_SZ); + _debug("KIV SINST: %s", name); + + ret = 0; +error: + _leave(" = %d", ret); + return ret; + +bad_ticket: + *_abort_code = RXKADBADTICKET; + ret = -EBADMSG; + goto error; +} + +/* + * decrypt the response packet + */ +static void rxkad_decrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxrpc_crypt *session_key) +{ + struct blkcipher_desc desc; + struct scatterlist ssg[2], dsg[2]; + struct rxrpc_crypt iv; + + _enter(",,%08x%08x", + ntohl(session_key->n[0]), ntohl(session_key->n[1])); + + ASSERT(rxkad_ci != NULL); + + mutex_lock(&rxkad_ci_mutex); + if (crypto_blkcipher_setkey(rxkad_ci, session_key->x, + sizeof(*session_key)) < 0) + BUG(); + + memcpy(&iv, session_key, sizeof(iv)); + desc.tfm = rxkad_ci; + desc.info = iv.x; + desc.flags = 0; + + rxkad_sg_set_buf2(ssg, &resp->encrypted, sizeof(resp->encrypted)); + memcpy(dsg, ssg, sizeof(dsg)); + crypto_blkcipher_decrypt_iv(&desc, dsg, ssg, sizeof(resp->encrypted)); + mutex_unlock(&rxkad_ci_mutex); + + _leave(""); +} + +/* + * verify a response + */ +static int rxkad_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxkad_response response + __attribute__((aligned(8))); /* must be aligned for crypto */ + struct rxrpc_skb_priv *sp; + struct rxrpc_crypt session_key; + time_t expiry; + void *ticket; + u32 abort_code, version, kvno, ticket_len, csum, level; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + + abort_code = RXKADPACKETSHORT; + if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + goto protocol_error; + if (!pskb_pull(skb, sizeof(response))) + BUG(); + + version = ntohl(response.version); + ticket_len = ntohl(response.ticket_len); + kvno = ntohl(response.kvno); + sp = rxrpc_skb(skb); + _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", + ntohl(sp->hdr.serial), version, kvno, ticket_len); + + abort_code = RXKADINCONSISTENCY; + if (version != RXKAD_VERSION) + goto protocol_error; + + abort_code = RXKADTICKETLEN; + if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) + goto protocol_error; + + abort_code = RXKADUNKNOWNKEY; + if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) + goto protocol_error; + + /* extract the kerberos ticket and decrypt and decode it */ + ticket = kmalloc(ticket_len, GFP_NOFS); + if (!ticket) + return -ENOMEM; + + abort_code = RXKADPACKETSHORT; + if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + goto protocol_error_free; + + ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, + &expiry, &abort_code); + if (ret < 0) { + *_abort_code = abort_code; + kfree(ticket); + return ret; + } + + /* use the session key from inside the ticket to decrypt the + * response */ + rxkad_decrypt_response(conn, &response, &session_key); + + abort_code = RXKADSEALEDINCON; + if (response.encrypted.epoch != conn->epoch) + goto protocol_error_free; + if (response.encrypted.cid != conn->cid) + goto protocol_error_free; + if (ntohl(response.encrypted.securityIndex) != conn->security_ix) + goto protocol_error_free; + csum = response.encrypted.checksum; + response.encrypted.checksum = 0; + rxkad_calc_response_checksum(&response); + if (response.encrypted.checksum != csum) + goto protocol_error_free; + + if (ntohl(response.encrypted.call_id[0]) > INT_MAX || + ntohl(response.encrypted.call_id[1]) > INT_MAX || + ntohl(response.encrypted.call_id[2]) > INT_MAX || + ntohl(response.encrypted.call_id[3]) > INT_MAX) + goto protocol_error_free; + + abort_code = RXKADOUTOFSEQUENCE; + if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1)) + goto protocol_error_free; + + abort_code = RXKADLEVELFAIL; + level = ntohl(response.encrypted.level); + if (level > RXRPC_SECURITY_ENCRYPT) + goto protocol_error_free; + conn->security_level = level; + + /* create a key to hold the security data and expiration time - after + * this the connection security can be handled in exactly the same way + * as for a client connection */ + ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); + if (ret < 0) { + kfree(ticket); + return ret; + } + + kfree(ticket); + _leave(" = 0"); + return 0; + +protocol_error_free: + kfree(ticket); +protocol_error: + *_abort_code = abort_code; + _leave(" = -EPROTO [%d]", abort_code); + return -EPROTO; +} + +/* + * clear the connection security + */ +static void rxkad_clear(struct rxrpc_connection *conn) +{ + _enter(""); + + if (conn->cipher) + crypto_free_blkcipher(conn->cipher); +} + +/* + * RxRPC Kerberos-based security + */ +static struct rxrpc_security rxkad = { + .owner = THIS_MODULE, + .name = "rxkad", + .security_index = RXKAD_VERSION, + .init_connection_security = rxkad_init_connection_security, + .prime_packet_security = rxkad_prime_packet_security, + .secure_packet = rxkad_secure_packet, + .verify_packet = rxkad_verify_packet, + .issue_challenge = rxkad_issue_challenge, + .respond_to_challenge = rxkad_respond_to_challenge, + .verify_response = rxkad_verify_response, + .clear = rxkad_clear, +}; + +static __init int rxkad_init(void) +{ + _enter(""); + + /* pin the cipher we need so that the crypto layer doesn't invoke + * keventd to go get it */ + rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(rxkad_ci)) + return PTR_ERR(rxkad_ci); + + return rxrpc_register_security(&rxkad); +} + +module_init(rxkad_init); + +static __exit void rxkad_exit(void) +{ + _enter(""); + + rxrpc_unregister_security(&rxkad); + crypto_free_blkcipher(rxkad_ci); +} + +module_exit(rxkad_exit);
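For reference, the response checksum used on both the respond and verify paths above is a plain rolling multiply-accumulate over every byte of the response, seeded with 1000003 and folded with the multiplier 0x10204081; the kernel stores the result in network byte order. A self-contained userspace sketch, where the zeroed buffer stands in for struct rxkad_response with its checksum field cleared:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t rxkad_csum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t csum = 1000003;	/* seed used by rxkad */

	while (len--)
		csum = csum * 0x10204081 + *p++;	/* wraps mod 2^32 */
	return csum;
}

int main(void)
{
	uint8_t resp[64];

	memset(resp, 0, sizeof(resp));	/* checksum field zeroed beforehand */
	printf("csum = %08x\n", rxkad_csum(resp, sizeof(resp)));
	return 0;
}

diff --git a/net/rxrpc/rxrpc_syms.c b/net/rxrpc/rxrpc_syms.c deleted file mode 100644 index 9896fd87a4d..00000000000 ---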
a/net/rxrpc/rxrpc_syms.c +++ /dev/null @@ -1,34 +0,0 @@ -/* rxrpc_syms.c: exported Rx RPC layer interface symbols - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> - -#include <rxrpc/transport.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/krxiod.h> - -/* call.c */ -EXPORT_SYMBOL(rxrpc_create_call); -EXPORT_SYMBOL(rxrpc_put_call); -EXPORT_SYMBOL(rxrpc_call_abort); -EXPORT_SYMBOL(rxrpc_call_read_data); -EXPORT_SYMBOL(rxrpc_call_write_data); - -/* connection.c */ -EXPORT_SYMBOL(rxrpc_create_connection); -EXPORT_SYMBOL(rxrpc_put_connection); - -/* transport.c */ -EXPORT_SYMBOL(rxrpc_create_transport); -EXPORT_SYMBOL(rxrpc_put_transport); -EXPORT_SYMBOL(rxrpc_add_service); -EXPORT_SYMBOL(rxrpc_del_service); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c deleted file mode 100644 index 884290754af..00000000000 --- a/net/rxrpc/sysctl.c +++ /dev/null @@ -1,121 +0,0 @@ -/* sysctl.c: Rx RPC control - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/sysctl.h> -#include <rxrpc/types.h> -#include <rxrpc/rxrpc.h> -#include <asm/errno.h> -#include "internal.h" - -int rxrpc_ktrace; -int rxrpc_kdebug; -int rxrpc_kproto; -int rxrpc_knet; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *rxrpc_sysctl = NULL; - -static ctl_table rxrpc_sysctl_table[] = { - { - .ctl_name = 1, - .procname = "kdebug", - .data = &rxrpc_kdebug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = 2, - .procname = "ktrace", - .data = &rxrpc_ktrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = 3, - .procname = "kproto", - .data = &rxrpc_kproto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = 4, - .procname = "knet", - .data = &rxrpc_knet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = 5, - .procname = "peertimo", - .data = &rxrpc_peer_timeout, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax - }, - { - .ctl_name = 6, - .procname = "conntimo", - .data = &rxrpc_conn_timeout, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax - }, - { .ctl_name = 0 } -}; - -static ctl_table rxrpc_dir_sysctl_table[] = { - { - .ctl_name = 1, - .procname = "rxrpc", - .maxlen = 0, - .mode = 0555, - .child = rxrpc_sysctl_table - }, - { .ctl_name = 0 } -}; -#endif /* CONFIG_SYSCTL */ - -/*****************************************************************************/ -/* - * initialise the sysctl stuff for Rx RPC - */ -int rxrpc_sysctl_init(void) -{ -#ifdef CONFIG_SYSCTL - rxrpc_sysctl = register_sysctl_table(rxrpc_dir_sysctl_table); - if (!rxrpc_sysctl) - return -ENOMEM; 
-#endif /* CONFIG_SYSCTL */ - - return 0; -} /* end rxrpc_sysctl_init() */ - -/*****************************************************************************/ -/* - * clean up the sysctl stuff for Rx RPC - */ -void rxrpc_sysctl_cleanup(void) -{ -#ifdef CONFIG_SYSCTL - if (rxrpc_sysctl) { - unregister_sysctl_table(rxrpc_sysctl); - rxrpc_sysctl = NULL; - } -#endif /* CONFIG_SYSCTL */ - -} /* end rxrpc_sysctl_cleanup() */ diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c deleted file mode 100644 index 8e57be2df93..00000000000 --- a/net/rxrpc/transport.c +++ /dev/null @@ -1,846 +0,0 @@ -/* transport.c: Rx Transport routines - * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/slab.h> -#include <linux/module.h> -#include <rxrpc/transport.h> -#include <rxrpc/peer.h> -#include <rxrpc/connection.h> -#include <rxrpc/call.h> -#include <rxrpc/message.h> -#include <rxrpc/krxiod.h> -#include <rxrpc/krxsecd.h> -#include <linux/udp.h> -#include <linux/in.h> -#include <linux/in6.h> -#include <linux/icmp.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/ip.h> -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -#include <linux/ipv6.h> /* this should _really_ be in errqueue.h.. */ -#endif -#include <linux/errqueue.h> -#include <asm/uaccess.h> -#include "internal.h" - -struct errormsg { - struct cmsghdr cmsg; /* control message header */ - struct sock_extended_err ee; /* extended error information */ - struct sockaddr_in icmp_src; /* ICMP packet source address */ -}; - -static DEFINE_SPINLOCK(rxrpc_transports_lock); -static struct list_head rxrpc_transports = LIST_HEAD_INIT(rxrpc_transports); - -__RXACCT_DECL(atomic_t rxrpc_transport_count); -LIST_HEAD(rxrpc_proc_transports); -DECLARE_RWSEM(rxrpc_proc_transports_sem); - -static void rxrpc_data_ready(struct sock *sk, int count); -static void rxrpc_error_report(struct sock *sk); -static int rxrpc_trans_receive_new_call(struct rxrpc_transport *trans, - struct list_head *msgq); -static void rxrpc_trans_receive_error_report(struct rxrpc_transport *trans); - -/*****************************************************************************/ -/* - * create a new transport endpoint using the specified UDP port - */ -int rxrpc_create_transport(unsigned short port, - struct rxrpc_transport **_trans) -{ - struct rxrpc_transport *trans; - struct sockaddr_in sin; - mm_segment_t oldfs; - struct sock *sock; - int ret, opt; - - _enter("%hu", port); - - trans = kzalloc(sizeof(struct rxrpc_transport), GFP_KERNEL); - if (!trans) - return -ENOMEM; - - atomic_set(&trans->usage, 1); - INIT_LIST_HEAD(&trans->services); - INIT_LIST_HEAD(&trans->link); - INIT_LIST_HEAD(&trans->krxiodq_link); - spin_lock_init(&trans->lock); - INIT_LIST_HEAD(&trans->peer_active); - INIT_LIST_HEAD(&trans->peer_graveyard); - spin_lock_init(&trans->peer_gylock); - init_waitqueue_head(&trans->peer_gy_waitq); - rwlock_init(&trans->peer_lock); - atomic_set(&trans->peer_count, 0); - trans->port = port; - - /* create a UDP socket to be my actual transport endpoint */ - ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &trans->socket); - if (ret < 0) - goto error; - - /* use the specified port */ - if (port) { - memset(&sin, 0, 
sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_port = htons(port); - ret = trans->socket->ops->bind(trans->socket, - (struct sockaddr *) &sin, - sizeof(sin)); - if (ret < 0) - goto error; - } - - opt = 1; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = trans->socket->ops->setsockopt(trans->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - set_fs(oldfs); - - spin_lock(&rxrpc_transports_lock); - list_add(&trans->link, &rxrpc_transports); - spin_unlock(&rxrpc_transports_lock); - - /* set the socket up */ - sock = trans->socket->sk; - sock->sk_user_data = trans; - sock->sk_data_ready = rxrpc_data_ready; - sock->sk_error_report = rxrpc_error_report; - - down_write(&rxrpc_proc_transports_sem); - list_add_tail(&trans->proc_link, &rxrpc_proc_transports); - up_write(&rxrpc_proc_transports_sem); - - __RXACCT(atomic_inc(&rxrpc_transport_count)); - - *_trans = trans; - _leave(" = 0 (%p)", trans); - return 0; - - error: - /* finish cleaning up the transport (not really needed here, but...) */ - if (trans->socket) - trans->socket->ops->shutdown(trans->socket, 2); - - /* close the socket */ - if (trans->socket) { - trans->socket->sk->sk_user_data = NULL; - sock_release(trans->socket); - trans->socket = NULL; - } - - kfree(trans); - - - _leave(" = %d", ret); - return ret; -} /* end rxrpc_create_transport() */ - -/*****************************************************************************/ -/* - * destroy a transport endpoint - */ -void rxrpc_put_transport(struct rxrpc_transport *trans) -{ - _enter("%p{u=%d p=%hu}", - trans, atomic_read(&trans->usage), trans->port); - - BUG_ON(atomic_read(&trans->usage) <= 0); - - /* to prevent a race, the decrement and the dequeue must be - * effectively atomic */ - spin_lock(&rxrpc_transports_lock); - if (likely(!atomic_dec_and_test(&trans->usage))) { - spin_unlock(&rxrpc_transports_lock); - _leave(""); - return; - } - - list_del(&trans->link); - spin_unlock(&rxrpc_transports_lock); - - /* finish cleaning up the transport */ - if (trans->socket) - trans->socket->ops->shutdown(trans->socket, 2); - - rxrpc_krxsecd_clear_transport(trans); - rxrpc_krxiod_dequeue_transport(trans); - - /* discard all peer information */ - rxrpc_peer_clearall(trans); - - down_write(&rxrpc_proc_transports_sem); - list_del(&trans->proc_link); - up_write(&rxrpc_proc_transports_sem); - __RXACCT(atomic_dec(&rxrpc_transport_count)); - - /* close the socket */ - if (trans->socket) { - trans->socket->sk->sk_user_data = NULL; - sock_release(trans->socket); - trans->socket = NULL; - } - - kfree(trans); - - _leave(""); -} /* end rxrpc_put_transport() */ - -/*****************************************************************************/ -/* - * add a service to a transport to be listened upon - */ -int rxrpc_add_service(struct rxrpc_transport *trans, - struct rxrpc_service *newsrv) -{ - struct rxrpc_service *srv; - struct list_head *_p; - int ret = -EEXIST; - - _enter("%p{%hu},%p{%hu}", - trans, trans->port, newsrv, newsrv->service_id); - - /* verify that the service ID is not already present */ - spin_lock(&trans->lock); - - list_for_each(_p, &trans->services) { - srv = list_entry(_p, struct rxrpc_service, link); - if (srv->service_id == newsrv->service_id) - goto out; - } - - /* okay - add the transport to the list */ - list_add_tail(&newsrv->link, &trans->services); - rxrpc_get_transport(trans); - ret = 0; - - out: - spin_unlock(&trans->lock); - - _leave("= %d", ret); - return ret; -} /* end rxrpc_add_service() */ - 
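The IP_RECVERR option enabled on the transport socket in rxrpc_create_transport() above is what feeds rxrpc_trans_receive_error_report() further down: ICMP failures are queued on the socket's error queue and read back with MSG_ERRQUEUE as struct sock_extended_err control messages. A userspace sketch of the same machinery; nothing is sent here, so on a quiet socket the queue will simply be empty:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

int main(void)
{
	struct sock_extended_err *ee;
	struct cmsghdr *cmsg;
	struct msghdr msg;
	char cbuf[512];
	int fd, on = 1;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on));

	/* ... send datagrams; an ICMP error queues an entry ... */

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return 0;	/* error queue empty */

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_IP || cmsg->cmsg_type != IP_RECVERR)
			continue;
		ee = (struct sock_extended_err *) CMSG_DATA(cmsg);
		printf("err=%u origin=%u type=%u code=%u\n",
		       ee->ee_errno, ee->ee_origin, ee->ee_type, ee->ee_code);
	}
	return 0;
}

The kernel side does the equivalent with kernel_recvmsg() on trans->socket, then maps the ICMP type/code pairs onto errno values, as the switch in rxrpc_trans_receive_error_report() below shows.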
-/*****************************************************************************/ -/* - * remove a service from a transport - */ -void rxrpc_del_service(struct rxrpc_transport *trans, struct rxrpc_service *srv) -{ - _enter("%p{%hu},%p{%hu}", trans, trans->port, srv, srv->service_id); - - spin_lock(&trans->lock); - list_del(&srv->link); - spin_unlock(&trans->lock); - - rxrpc_put_transport(trans); - - _leave(""); -} /* end rxrpc_del_service() */ - -/*****************************************************************************/ -/* - * INET callback when data has been received on the socket. - */ -static void rxrpc_data_ready(struct sock *sk, int count) -{ - struct rxrpc_transport *trans; - - _enter("%p{t=%p},%d", sk, sk->sk_user_data, count); - - /* queue the transport for attention by krxiod */ - trans = (struct rxrpc_transport *) sk->sk_user_data; - if (trans) - rxrpc_krxiod_queue_transport(trans); - - /* wake up anyone waiting on the socket */ - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - - _leave(""); -} /* end rxrpc_data_ready() */ - -/*****************************************************************************/ -/* - * INET callback when an ICMP error packet is received - * - sk->err is error (EHOSTUNREACH, EPROTO or EMSGSIZE) - */ -static void rxrpc_error_report(struct sock *sk) -{ - struct rxrpc_transport *trans; - - _enter("%p{t=%p}", sk, sk->sk_user_data); - - /* queue the transport for attention by krxiod */ - trans = (struct rxrpc_transport *) sk->sk_user_data; - if (trans) { - trans->error_rcvd = 1; - rxrpc_krxiod_queue_transport(trans); - } - - /* wake up anyone waiting on the socket */ - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - - _leave(""); -} /* end rxrpc_error_report() */ - -/*****************************************************************************/ -/* - * split a message up, allocating message records and filling them in - * from the contents of a socket buffer - */ -static int rxrpc_incoming_msg(struct rxrpc_transport *trans, - struct sk_buff *pkt, - struct list_head *msgq) -{ - struct rxrpc_message *msg; - int ret; - - _enter(""); - - msg = kzalloc(sizeof(struct rxrpc_message), GFP_KERNEL); - if (!msg) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - atomic_set(&msg->usage, 1); - list_add_tail(&msg->link,msgq); - - /* dig out the Rx routing parameters */ - if (skb_copy_bits(pkt, sizeof(struct udphdr), - &msg->hdr, sizeof(msg->hdr)) < 0) { - ret = -EBADMSG; - goto error; - } - - msg->trans = trans; - msg->state = RXRPC_MSG_RECEIVED; - skb_get_timestamp(pkt, &msg->stamp); - if (msg->stamp.tv_sec == 0) { - do_gettimeofday(&msg->stamp); - if (pkt->sk) - sock_enable_timestamp(pkt->sk); - } - msg->seq = ntohl(msg->hdr.seq); - - /* attach the packet */ - skb_get(pkt); - msg->pkt = pkt; - - msg->offset = sizeof(struct udphdr) + sizeof(struct rxrpc_header); - msg->dsize = msg->pkt->len - msg->offset; - - _net("Rx Received packet from %s (%08x;%08x,%1x,%d,%s,%02x,%d,%d)", - msg->hdr.flags & RXRPC_CLIENT_INITIATED ? 
"client" : "server", - ntohl(msg->hdr.epoch), - (ntohl(msg->hdr.cid) & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT, - ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK, - ntohl(msg->hdr.callNumber), - rxrpc_pkts[msg->hdr.type], - msg->hdr.flags, - ntohs(msg->hdr.serviceId), - msg->hdr.securityIndex); - - __RXACCT(atomic_inc(&rxrpc_message_count)); - - /* split off jumbo packets */ - while (msg->hdr.type == RXRPC_PACKET_TYPE_DATA && - msg->hdr.flags & RXRPC_JUMBO_PACKET - ) { - struct rxrpc_jumbo_header jumbo; - struct rxrpc_message *jumbomsg = msg; - - _debug("split jumbo packet"); - - /* quick sanity check */ - ret = -EBADMSG; - if (msg->dsize < - RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) - goto error; - if (msg->hdr.flags & RXRPC_LAST_PACKET) - goto error; - - /* dig out the secondary header */ - if (skb_copy_bits(pkt, msg->offset + RXRPC_JUMBO_DATALEN, - &jumbo, sizeof(jumbo)) < 0) - goto error; - - /* allocate a new message record */ - ret = -ENOMEM; - msg = kmemdup(jumbomsg, sizeof(struct rxrpc_message), GFP_KERNEL); - if (!msg) - goto error; - - list_add_tail(&msg->link, msgq); - - /* adjust the jumbo packet */ - jumbomsg->dsize = RXRPC_JUMBO_DATALEN; - - /* attach the packet here too */ - skb_get(pkt); - - /* adjust the parameters */ - msg->seq++; - msg->hdr.seq = htonl(msg->seq); - msg->hdr.serial = htonl(ntohl(msg->hdr.serial) + 1); - msg->offset += RXRPC_JUMBO_DATALEN + - sizeof(struct rxrpc_jumbo_header); - msg->dsize -= RXRPC_JUMBO_DATALEN + - sizeof(struct rxrpc_jumbo_header); - msg->hdr.flags = jumbo.flags; - msg->hdr._rsvd = jumbo._rsvd; - - _net("Rx Split jumbo packet from %s" - " (%08x;%08x,%1x,%d,%s,%02x,%d,%d)", - msg->hdr.flags & RXRPC_CLIENT_INITIATED ? "client" : "server", - ntohl(msg->hdr.epoch), - (ntohl(msg->hdr.cid) & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT, - ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK, - ntohl(msg->hdr.callNumber), - rxrpc_pkts[msg->hdr.type], - msg->hdr.flags, - ntohs(msg->hdr.serviceId), - msg->hdr.securityIndex); - - __RXACCT(atomic_inc(&rxrpc_message_count)); - } - - _leave(" = 0 #%d", atomic_read(&rxrpc_message_count)); - return 0; - - error: - while (!list_empty(msgq)) { - msg = list_entry(msgq->next, struct rxrpc_message, link); - list_del_init(&msg->link); - - rxrpc_put_message(msg); - } - - _leave(" = %d", ret); - return ret; -} /* end rxrpc_incoming_msg() */ - -/*****************************************************************************/ -/* - * accept a new call - * - called from krxiod in process context - */ -void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) -{ - struct rxrpc_message *msg; - struct rxrpc_peer *peer; - struct sk_buff *pkt; - int ret; - __be32 addr; - __be16 port; - - LIST_HEAD(msgq); - - _enter("%p{%d}", trans, trans->port); - - for (;;) { - /* deal with outstanting errors first */ - if (trans->error_rcvd) - rxrpc_trans_receive_error_report(trans); - - /* attempt to receive a packet */ - pkt = skb_recv_datagram(trans->socket->sk, 0, 1, &ret); - if (!pkt) { - if (ret == -EAGAIN) { - _leave(" EAGAIN"); - return; - } - - /* an icmp error may have occurred */ - rxrpc_krxiod_queue_transport(trans); - _leave(" error %d\n", ret); - return; - } - - /* we'll probably need to checksum it (didn't call - * sock_recvmsg) */ - if (skb_checksum_complete(pkt)) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" CSUM failed"); - return; - } - - addr = pkt->nh.iph->saddr; - port = pkt->h.uh->source; - - _net("Rx Received UDP packet from %08x:%04hu", - ntohl(addr), ntohs(port)); - - /* unmarshall the Rx parameters and 
split jumbo packets */ - ret = rxrpc_incoming_msg(trans, pkt, &msgq); - if (ret < 0) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" bad packet"); - return; - } - - BUG_ON(list_empty(&msgq)); - - msg = list_entry(msgq.next, struct rxrpc_message, link); - - /* locate the record for the peer from which it - * originated */ - ret = rxrpc_peer_lookup(trans, addr, &peer); - if (ret < 0) { - kdebug("Rx No connections from that peer"); - rxrpc_trans_immediate_abort(trans, msg, -EINVAL); - goto finished_msg; - } - - /* try and find a matching connection */ - ret = rxrpc_connection_lookup(peer, msg, &msg->conn); - if (ret < 0) { - kdebug("Rx Unknown Connection"); - rxrpc_trans_immediate_abort(trans, msg, -EINVAL); - rxrpc_put_peer(peer); - goto finished_msg; - } - rxrpc_put_peer(peer); - - /* deal with the first packet of a new call */ - if (msg->hdr.flags & RXRPC_CLIENT_INITIATED && - msg->hdr.type == RXRPC_PACKET_TYPE_DATA && - ntohl(msg->hdr.seq) == 1 - ) { - _debug("Rx New server call"); - rxrpc_trans_receive_new_call(trans, &msgq); - goto finished_msg; - } - - /* deal with subsequent packet(s) of call */ - _debug("Rx Call packet"); - while (!list_empty(&msgq)) { - msg = list_entry(msgq.next, struct rxrpc_message, link); - list_del_init(&msg->link); - - ret = rxrpc_conn_receive_call_packet(msg->conn, NULL, msg); - if (ret < 0) { - rxrpc_trans_immediate_abort(trans, msg, ret); - rxrpc_put_message(msg); - goto finished_msg; - } - - rxrpc_put_message(msg); - } - - goto finished_msg; - - /* dispose of the packets */ - finished_msg: - while (!list_empty(&msgq)) { - msg = list_entry(msgq.next, struct rxrpc_message, link); - list_del_init(&msg->link); - - rxrpc_put_message(msg); - } - kfree_skb(pkt); - } - - _leave(""); - -} /* end rxrpc_trans_receive_packet() */ - -/*****************************************************************************/ -/* - * accept a new call from a client trying to connect to one of my services - * - called in process context - */ -static int rxrpc_trans_receive_new_call(struct rxrpc_transport *trans, - struct list_head *msgq) -{ - struct rxrpc_message *msg; - - _enter(""); - - /* only bother with the first packet */ - msg = list_entry(msgq->next, struct rxrpc_message, link); - list_del_init(&msg->link); - rxrpc_krxsecd_queue_incoming_call(msg); - rxrpc_put_message(msg); - - _leave(" = 0"); - - return 0; -} /* end rxrpc_trans_receive_new_call() */ - -/*****************************************************************************/ -/* - * perform an immediate abort without connection or call structures - */ -int rxrpc_trans_immediate_abort(struct rxrpc_transport *trans, - struct rxrpc_message *msg, - int error) -{ - struct rxrpc_header ahdr; - struct sockaddr_in sin; - struct msghdr msghdr; - struct kvec iov[2]; - __be32 _error; - int len, ret; - - _enter("%p,%p,%d", trans, msg, error); - - /* don't abort an abort packet */ - if (msg->hdr.type == RXRPC_PACKET_TYPE_ABORT) { - _leave(" = 0"); - return 0; - } - - _error = htonl(-error); - - /* set up the message to be transmitted */ - memcpy(&ahdr, &msg->hdr, sizeof(ahdr)); - ahdr.epoch = msg->hdr.epoch; - ahdr.serial = htonl(1); - ahdr.seq = 0; - ahdr.type = RXRPC_PACKET_TYPE_ABORT; - ahdr.flags = RXRPC_LAST_PACKET; - ahdr.flags |= ~msg->hdr.flags & RXRPC_CLIENT_INITIATED; - - iov[0].iov_len = sizeof(ahdr); - iov[0].iov_base = &ahdr; - iov[1].iov_len = sizeof(_error); - iov[1].iov_base = &_error; - - len = sizeof(ahdr) + sizeof(_error); - - memset(&sin,0,sizeof(sin)); - sin.sin_family = AF_INET; - 
sin.sin_port = msg->pkt->h.uh->source; - sin.sin_addr.s_addr = msg->pkt->nh.iph->saddr; - - msghdr.msg_name = &sin; - msghdr.msg_namelen = sizeof(sin); - msghdr.msg_control = NULL; - msghdr.msg_controllen = 0; - msghdr.msg_flags = MSG_DONTWAIT; - - _net("Sending message type %d of %d bytes to %08x:%d", - ahdr.type, - len, - ntohl(sin.sin_addr.s_addr), - ntohs(sin.sin_port)); - - /* send the message */ - ret = kernel_sendmsg(trans->socket, &msghdr, iov, 2, len); - - _leave(" = %d", ret); - return ret; -} /* end rxrpc_trans_immediate_abort() */ - -/*****************************************************************************/ -/* - * receive an ICMP error report and percolate it to all connections - * heading to the affected host or port - */ -static void rxrpc_trans_receive_error_report(struct rxrpc_transport *trans) -{ - struct rxrpc_connection *conn; - struct sockaddr_in sin; - struct rxrpc_peer *peer; - struct list_head connq, *_p; - struct errormsg emsg; - struct msghdr msg; - __be16 port; - int local, err; - - _enter("%p", trans); - - for (;;) { - trans->error_rcvd = 0; - - /* try and receive an error message */ - msg.msg_name = &sin; - msg.msg_namelen = sizeof(sin); - msg.msg_control = &emsg; - msg.msg_controllen = sizeof(emsg); - msg.msg_flags = 0; - - err = kernel_recvmsg(trans->socket, &msg, NULL, 0, 0, - MSG_ERRQUEUE | MSG_DONTWAIT | MSG_TRUNC); - - if (err == -EAGAIN) { - _leave(""); - return; - } - - if (err < 0) { - printk("%s: unable to recv an error report: %d\n", - __FUNCTION__, err); - _leave(""); - return; - } - - msg.msg_controllen = (char *) msg.msg_control - (char *) &emsg; - - if (msg.msg_controllen < sizeof(emsg.cmsg) || - msg.msg_namelen < sizeof(sin)) { - printk("%s: short control message" - " (nlen=%u clen=%Zu fl=%x)\n", - __FUNCTION__, - msg.msg_namelen, - msg.msg_controllen, - msg.msg_flags); - continue; - } - - _net("Rx Received control message" - " { len=%Zu level=%u type=%u }", - emsg.cmsg.cmsg_len, - emsg.cmsg.cmsg_level, - emsg.cmsg.cmsg_type); - - if (sin.sin_family != AF_INET) { - printk("Rx Ignoring error report with non-INET address" - " (fam=%u)", - sin.sin_family); - continue; - } - - _net("Rx Received message pertaining to host addr=%x port=%hu", - ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); - - if (emsg.cmsg.cmsg_level != SOL_IP || - emsg.cmsg.cmsg_type != IP_RECVERR) { - printk("Rx Ignoring unknown error report" - " { level=%u type=%u }", - emsg.cmsg.cmsg_level, - emsg.cmsg.cmsg_type); - continue; - } - - if (msg.msg_controllen < sizeof(emsg.cmsg) + sizeof(emsg.ee)) { - printk("%s: short error message (%Zu)\n", - __FUNCTION__, msg.msg_controllen); - _leave(""); - return; - } - - port = sin.sin_port; - - switch (emsg.ee.ee_origin) { - case SO_EE_ORIGIN_ICMP: - local = 0; - switch (emsg.ee.ee_type) { - case ICMP_DEST_UNREACH: - switch (emsg.ee.ee_code) { - case ICMP_NET_UNREACH: - _net("Rx Received ICMP Network Unreachable"); - port = 0; - err = -ENETUNREACH; - break; - case ICMP_HOST_UNREACH: - _net("Rx Received ICMP Host Unreachable"); - port = 0; - err = -EHOSTUNREACH; - break; - case ICMP_PORT_UNREACH: - _net("Rx Received ICMP Port Unreachable"); - err = -ECONNREFUSED; - break; - case ICMP_NET_UNKNOWN: - _net("Rx Received ICMP Unknown Network"); - port = 0; - err = -ENETUNREACH; - break; - case ICMP_HOST_UNKNOWN: - _net("Rx Received ICMP Unknown Host"); - port = 0; - err = -EHOSTUNREACH; - break; - default: - _net("Rx Received ICMP DestUnreach { code=%u }", - emsg.ee.ee_code); - err = emsg.ee.ee_errno; - break; - } - break; - - case 
ICMP_TIME_EXCEEDED: - _net("Rx Received ICMP TTL Exceeded"); - err = emsg.ee.ee_errno; - break; - - default: - _proto("Rx Received ICMP error { type=%u code=%u }", - emsg.ee.ee_type, emsg.ee.ee_code); - err = emsg.ee.ee_errno; - break; - } - break; - - case SO_EE_ORIGIN_LOCAL: - _proto("Rx Received local error { error=%d }", - emsg.ee.ee_errno); - local = 1; - err = emsg.ee.ee_errno; - break; - - case SO_EE_ORIGIN_NONE: - case SO_EE_ORIGIN_ICMP6: - default: - _proto("Rx Received error report { orig=%u }", - emsg.ee.ee_origin); - local = 0; - err = emsg.ee.ee_errno; - break; - } - - /* find all the connections between this transport and the - * affected destination */ - INIT_LIST_HEAD(&connq); - - if (rxrpc_peer_lookup(trans, sin.sin_addr.s_addr, - &peer) == 0) { - read_lock(&peer->conn_lock); - list_for_each(_p, &peer->conn_active) { - conn = list_entry(_p, struct rxrpc_connection, - link); - if (port && conn->addr.sin_port != port) - continue; - if (!list_empty(&conn->err_link)) - continue; - - rxrpc_get_connection(conn); - list_add_tail(&conn->err_link, &connq); - } - read_unlock(&peer->conn_lock); - - /* service all those connections */ - while (!list_empty(&connq)) { - conn = list_entry(connq.next, - struct rxrpc_connection, - err_link); - list_del(&conn->err_link); - - rxrpc_conn_handle_error(conn, local, err); - - rxrpc_put_connection(conn); - } - - rxrpc_put_peer(peer); - } - } - - _leave(""); - return; -} /* end rxrpc_trans_receive_error_report() */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index f4544dd8647..475df8449be 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -46,62 +46,6 @@ config NET_SCH_FIFO if NET_SCHED -choice - prompt "Packet scheduler clock source" - default NET_SCH_CLK_GETTIMEOFDAY - ---help--- - Packet schedulers need a monotonic clock that increments at a static - rate. The kernel provides several suitable interfaces, each with - different properties: - - - high resolution (us or better) - - fast to read (minimal locking, no i/o access) - - synchronized on all processors - - handles cpu clock frequency changes - - but nothing provides all of the above. - -config NET_SCH_CLK_JIFFIES - bool "Timer interrupt" - ---help--- - Say Y here if you want to use the timer interrupt (jiffies) as clock - source. This clock source is fast, synchronized on all processors and - handles cpu clock frequency changes, but its resolution is too low - for accurate shaping except at very low speed. - -config NET_SCH_CLK_GETTIMEOFDAY - bool "gettimeofday" - ---help--- - Say Y here if you want to use gettimeofday as clock source. This clock - source has high resolution, is synchronized on all processors and - handles cpu clock frequency changes, but it is slow. - - Choose this if you need a high resolution clock source but can't use - the CPU's cycle counter. - -# don't allow on SMP x86 because they can have unsynchronized TSCs. -# gettimeofday is a good alternative -config NET_SCH_CLK_CPU - bool "CPU cycle counter" - depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 - ---help--- - Say Y here if you want to use the CPU's cycle counter as clock source. - This is a cheap and high resolution clock source, but on some - architectures it is not synchronized on all processors and doesn't - handle cpu clock frequency changes. 
- - The useable cycle counters are: - - x86/x86_64 - Timestamp Counter - alpha - Cycle Counter - sparc64 - %ticks register - ppc64 - Time base - ia64 - Interval Time Counter - - Choose this if your CPU's cycle counter is working properly. - -endchoice - comment "Queueing/Scheduling" config NET_SCH_CBQ diff --git a/net/sched/act_api.c b/net/sched/act_api.c index cb21617a567..711dd26c95c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -25,12 +25,12 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/act_api.h> +#include <net/netlink.h> void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) { @@ -93,15 +93,15 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, continue; a->priv = p; a->order = n_i; - r = (struct rtattr*) skb->tail; + r = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, a->order, 0, NULL); err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - skb_trim(skb, (u8*)r - skb->data); + nlmsg_trim(skb, r); goto done; } - r->rta_len = skb->tail - (u8*)r; + r->rta_len = skb_tail_pointer(skb) - (u8 *)r; n_i++; if (n_i >= TCA_ACT_MAX_PRIO) goto done; @@ -114,7 +114,7 @@ done: return n_i; rtattr_failure: - skb_trim(skb, (u8*)r - skb->data); + nlmsg_trim(skb, r); goto done; } @@ -125,7 +125,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, struct rtattr *r ; int i= 0, n_i = 0; - r = (struct rtattr*) skb->tail; + r = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, a->order, 0, NULL); RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); for (i = 0; i < (hinfo->hmask + 1); i++) { @@ -140,11 +140,11 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, } } RTA_PUT(skb, TCA_FCNT, 4, &n_i); - r->rta_len = skb->tail - (u8*)r; + r->rta_len = skb_tail_pointer(skb) - (u8 *)r; return n_i; rtattr_failure: - skb_trim(skb, (u8*)r - skb->data); + nlmsg_trim(skb, r); return -EINVAL; } @@ -423,7 +423,7 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *r; if (a->ops == NULL || a->ops->dump == NULL) @@ -432,15 +432,15 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); if (tcf_action_copy_stats(skb, a, 0)) goto rtattr_failure; - r = (struct rtattr*) skb->tail; + r = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_OPTIONS, 0, NULL); if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { - r->rta_len = skb->tail - (u8*)r; + r->rta_len = skb_tail_pointer(skb) - (u8 *)r; return err; } rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -449,17 +449,17 @@ tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { struct tc_action *a; int err = -EINVAL; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *r ; while ((a = act) != NULL) { - r = (struct rtattr*) skb->tail; + r = (struct rtattr *)skb_tail_pointer(skb); act = a->next; RTA_PUT(skb, a->order, 0, NULL); err = tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; - r->rta_len = skb->tail - (u8*)r; + r->rta_len = skb_tail_pointer(skb) - (u8 *)r; } return 0; @@ -467,7 +467,7 @@ tcf_action_dump(struct sk_buff *skb, struct tc_action *act, 
int bind, int ref) rtattr_failure: err = -EINVAL; errout: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return err; } @@ -635,7 +635,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, { struct tcamsg *t; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *x; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); @@ -645,20 +645,20 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr*) skb->tail; + x = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); if (tcf_action_dump(skb, a, bind, ref) < 0) goto rtattr_failure; - x->rta_len = skb->tail - (u8*)x; + x->rta_len = skb_tail_pointer(skb) - (u8 *)x; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -767,7 +767,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) return -ENOBUFS; } - b = (unsigned char *)skb->tail; + b = skb_tail_pointer(skb); if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) goto err_out; @@ -783,16 +783,16 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; + x = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); if (err < 0) goto rtattr_failure; - x->rta_len = skb->tail - (u8 *) x; + x->rta_len = skb_tail_pointer(skb) - (u8 *)x; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); @@ -884,7 +884,7 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, if (!skb) return -ENOBUFS; - b = (unsigned char *)skb->tail; + b = skb_tail_pointer(skb); nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); t = NLMSG_DATA(nlh); @@ -892,15 +892,15 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr*) skb->tail; + x = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); if (tcf_action_dump(skb, a, 0, 0) < 0) goto rtattr_failure; - x->rta_len = skb->tail - (u8*)x; + x->rta_len = skb_tail_pointer(skb) - (u8 *)x; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; NETLINK_CB(skb).dst_group = RTNLGRP_TC; err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); @@ -1015,7 +1015,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *x; struct tc_action_ops *a_o; struct tc_action a; @@ -1048,7 +1048,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; + x = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); ret = a_o->walk(skb, cb, RTM_GETACTION, &a); @@ -1056,12 +1056,12 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) goto rtattr_failure; if (ret > 0) { - x->rta_len = skb->tail - (u8 *) x; + x->rta_len = skb_tail_pointer(skb) - (u8 *)x; ret = skb->len; } else - skb_trim(skb, (u8*)x - skb->data); + nlmsg_trim(skb, x); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = 
skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).pid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); @@ -1070,20 +1070,15 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) rtattr_failure: nlmsg_failure: module_put(a_o->owner); - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return skb->len; } static int __init tc_action_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; - - if (link_p) { - link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; - } + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL); + rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL); + rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action); return 0; } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 87d0faf3286..7517f379154 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/proc_fs.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_gact.h> @@ -155,7 +156,7 @@ static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_gact opt; struct tcf_gact *gact = a->priv; struct tcf_t t; @@ -181,7 +182,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 47f0b132423..00b05f422d4 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,6 +30,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/kmod.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> @@ -245,7 +246,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = a->priv; struct ipt_entry_target *t; struct tcf_t tm; @@ -277,7 +278,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); kfree(t); return -1; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3e93683e9ab..de21c92faaa 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/proc_fs.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_mirred.h> @@ -206,7 +207,7 @@ bad_mirred: static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = a->priv; struct tc_mirred opt; struct tcf_t t; @@ -225,7 +226,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b);  return -1; }
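All of these dump conversions follow one pattern: save the current message tail, emit an attribute header, fill in the payload, patch the attribute's length from the new tail, and on failure trim the buffer back to the saved mark, which is what nlmsg_trim() does. A userspace analogue of that mark-and-rollback pattern; the buffer type and helper here are invented for illustration, not the kernel's API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf { uint8_t data[256]; size_t tail; };

/* append a {len, type, payload} attribute; length patched by caller */
static uint16_t *put_attr(struct buf *b, uint16_t type,
			  const void *payload, size_t len)
{
	uint16_t *hdr = (uint16_t *)(b->data + b->tail);

	if (b->tail + 4 + len > sizeof(b->data))
		return NULL;		/* caller rolls back */
	hdr[0] = 0;			/* length patched from the new tail */
	hdr[1] = type;
	memcpy(hdr + 2, payload, len);
	b->tail += 4 + len;
	return hdr;
}

int main(void)
{
	struct buf b = { .tail = 0 };
	size_t mark = b.tail;		/* like b = skb_tail_pointer(skb) */
	uint16_t *attr = put_attr(&b, 1, "abc", 3);

	if (!attr) {
		b.tail = mark;		/* like nlmsg_trim(skb, b) */
		return 1;
	}
	attr[0] = b.tail - mark;	/* like r->rta_len = tail - (u8 *)r */
	printf("attr type %u len %u\n", attr[1], attr[0]);
	return 0;
}

Routing everything through skb_tail_pointer() and nlmsg_trim() instead of touching skb->tail directly keeps these dumpers correct however the tail is represented inside struct sk_buff.

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c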
index 3d6a2fcc9ce..45b3cda86a2 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/proc_fs.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> @@ -136,7 +137,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, } } - pptr = skb->nh.raw; + pptr = skb_network_header(skb); spin_lock(&p->tcf_lock); @@ -195,7 +196,7 @@ done: static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = a->priv; struct tc_pedit *opt; struct tcf_t t; @@ -226,7 +227,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); kfree(opt); return -1; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 10a5a5c36f7..616f465f407 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -30,6 +30,7 @@ #include <linux/init.h> #include <net/sock.h> #include <net/act_api.h> +#include <net/netlink.h> #define L2T(p,L) ((p)->tcfp_R_tab->data[(L)>>(p)->tcfp_R_tab->rate.cell_log]) #define L2T_P(p,L) ((p)->tcfp_P_tab->data[(L)>>(p)->tcfp_P_tab->rate.cell_log]) @@ -80,7 +81,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c continue; a->priv = p; a->order = index; - r = (struct rtattr*) skb->tail; + r = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, a->order, 0, NULL); if (type == RTM_DELACTION) err = tcf_action_dump_1(skb, a, 0, 1); @@ -88,10 +89,10 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - skb_trim(skb, (u8*)r - skb->data); + nlmsg_trim(skb, r); goto done; } - r->rta_len = skb->tail - (u8*)r; + r->rta_len = skb_tail_pointer(skb) - (u8 *)r; n_i++; } } @@ -102,7 +103,7 @@ done: return n_i; rtattr_failure: - skb_trim(skb, (u8*)r - skb->data); + nlmsg_trim(skb, r); goto done; } #endif @@ -240,7 +241,7 @@ override: if (ret != ACT_P_CREATED) return ret; - PSCHED_GET_TIME(police->tcfp_t_c); + police->tcfp_t_c = psched_get_time(); police->tcf_index = parm->index ? 
parm->index : tcf_hash_new_index(&police_idx_gen, &police_hash_info); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -295,10 +296,9 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, return police->tcfp_result; } - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, police->tcfp_t_c, - police->tcfp_burst); + now = psched_get_time(); + toks = psched_tdiff_bounded(now, police->tcfp_t_c, + police->tcfp_burst); if (police->tcfp_P_tab) { ptoks = toks + police->tcfp_ptoks; if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) @@ -326,7 +326,7 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = a->priv; struct tc_police opt; @@ -355,7 +355,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -494,7 +494,7 @@ struct tcf_police *tcf_police_locate(struct rtattr *rta, struct rtattr *est) } if (police->tcfp_P_tab) police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); - PSCHED_GET_TIME(police->tcfp_t_c); + police->tcfp_t_c = psched_get_time(); police->tcf_index = parm->index ? parm->index : tcf_police_new_index(); police->tcf_action = parm->action; @@ -542,9 +542,9 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *police) return police->tcfp_result; } - PSCHED_GET_TIME(now); - toks = PSCHED_TDIFF_SAFE(now, police->tcfp_t_c, - police->tcfp_burst); + now = psched_get_time(); + toks = psched_tdiff_bounded(now, police->tcfp_t_c, + police->tcfp_burst); if (police->tcfp_P_tab) { ptoks = toks + police->tcfp_ptoks; if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) @@ -572,7 +572,7 @@ EXPORT_SYMBOL(tcf_police); int tcf_police_dump(struct sk_buff *skb, struct tcf_police *police) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_police opt; opt.index = police->tcf_index; @@ -598,7 +598,7 @@ int tcf_police_dump(struct sk_buff *skb, struct tcf_police *police) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index c7971182af0..36e1edad599 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -16,6 +16,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #define TCA_ACT_SIMP 22 @@ -155,7 +156,7 @@ static inline int tcf_simp_cleanup(struct tc_action *a, int bind) static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = a->priv; struct tc_defact opt; struct tcf_t t; @@ -173,7 +174,7 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5c6ffdb77d2..ebf94edf047 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -29,9 +29,10 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/netlink.h> +#include <net/netlink.h> #include <net/sock.h> 
#include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -323,7 +324,7 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); @@ -340,12 +341,12 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) goto rtattr_failure; } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -399,7 +400,6 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) return skb->len; - read_lock(&qdisc_tree_lock); if (!tcm->tcm_parent) q = dev->qdisc_sleeping; else @@ -456,7 +456,6 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock(&qdisc_tree_lock); dev_put(dev); return skb->len; } @@ -563,30 +562,30 @@ tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct rtattr * p_rta = (struct rtattr*) skb->tail; + struct rtattr *p_rta = (struct rtattr *)skb_tail_pointer(skb); if (exts->action->type != TCA_OLD_COMPAT) { RTA_PUT(skb, map->action, 0, NULL); if (tcf_action_dump(skb, exts->action, 0, 0) < 0) goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; + p_rta->rta_len = skb_tail_pointer(skb) - (u8 *)p_rta; } else if (map->police) { RTA_PUT(skb, map->police, 0, NULL); if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; + p_rta->rta_len = skb_tail_pointer(skb) - (u8 *)p_rta; } } #elif defined CONFIG_NET_CLS_POLICE if (map->police && exts->police) { - struct rtattr * p_rta = (struct rtattr*) skb->tail; + struct rtattr *p_rta = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, map->police, 0, NULL); if (tcf_police_dump(skb, exts->police) < 0) goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; + p_rta->rta_len = skb_tail_pointer(skb) - (u8 *)p_rta; } #endif return 0; @@ -614,18 +613,11 @@ rtattr_failure: __attribute__ ((unused)) static int __init tc_filter_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, + tc_dump_tfilter); - /* Setup rtnetlink links. It is made here to avoid - exporting large number of public symbols. 
- */ - - if (link_p) { - link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; - } return 0; } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 4a91f082a81..c885412d79d 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -245,7 +246,7 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; if (f == NULL) @@ -263,11 +264,11 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto rtattr_failure; - rta->rta_len = (skb->tail - b); + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 5dbb9d451f7..bbec4a0d4dc 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -38,6 +38,7 @@ #include <linux/notifier.h> #include <linux/netfilter.h> #include <net/ip.h> +#include <net/netlink.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -348,7 +349,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, { struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f = (struct fw_filter*)fh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; if (f == NULL) @@ -374,7 +375,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) goto rtattr_failure; @@ -382,7 +383,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index abc47cc48ad..cc941d0ee3a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -28,6 +28,7 @@ #include <linux/etherdevice.h> #include <linux/notifier.h> #include <net/ip.h> +#include <net/netlink.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -88,9 +89,9 @@ static __inline__ int route4_fastmap_hash(u32 id, int iif) static inline void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) { - spin_lock_bh(&dev->queue_lock); + qdisc_lock_tree(dev); memset(head->fastmap, 0, sizeof(head->fastmap)); - spin_unlock_bh(&dev->queue_lock); + qdisc_unlock_tree(dev); } static inline void @@ -562,7 +563,7 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct route4_filter *f = (struct route4_filter*)fh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; u32 id; @@ -591,7 +592,7 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) 
- b; if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) goto rtattr_failure; @@ -599,7 +600,7 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c index 1d4a1fb1760..0a683c07c64 100644 --- a/net/sched/cls_rsvp.c +++ b/net/sched/cls_rsvp.c @@ -31,6 +31,7 @@ #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 7853621a04c..22f9ede70e8 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -143,9 +143,9 @@ static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, u8 tunnelid = 0; u8 *xprt; #if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr = skb->nh.ipv6h; + struct ipv6hdr *nhptr = ipv6_hdr(skb); #else - struct iphdr *nhptr = skb->nh.iph; + struct iphdr *nhptr = ip_hdr(skb); #endif restart: @@ -160,7 +160,7 @@ restart: dst = &nhptr->daddr; protocol = nhptr->protocol; xprt = ((u8*)nhptr) + (nhptr->ihl<<2); - if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + if (nhptr->frag_off & htons(IP_MF|IP_OFFSET)) return -1; #endif @@ -593,7 +593,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, { struct rsvp_filter *f = (struct rsvp_filter*)fh; struct rsvp_session *s; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; struct tc_rsvp_pinfo pinfo; @@ -623,14 +623,14 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) goto rtattr_failure; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c index a2979d89798..93b6abed57d 100644 --- a/net/sched/cls_rsvp6.c +++ b/net/sched/cls_rsvp6.c @@ -34,6 +34,7 @@ #include <net/sock.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/netlink.h> #define RSVP_DST_LEN 4 #define RSVP_ID "rsvp6" diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 7563fdcef4b..47ac0c55642 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -12,6 +12,7 @@ #include <linux/netdevice.h> #include <net/ip.h> #include <net/act_api.h> +#include <net/netlink.h> #include <net/pkt_cls.h> #include <net/route.h> @@ -448,7 +449,7 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, { struct tcindex_data *p = PRIV(tp); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", @@ -463,7 +464,7 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), &p->fall_through); - rta->rta_len = skb->tail-b; + rta->rta_len = skb_tail_pointer(skb) - b; } else { if (p->perfect) { t->tcm_handle = r-p->perfect; @@ -486,7 +487,7 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) goto rtattr_failure; - rta->rta_len = skb->tail-b; + rta->rta_len = 
skb_tail_pointer(skb) - b; if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) goto rtattr_failure; @@ -495,7 +496,7 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 0bcb16928d2..c7a347bd6d7 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -50,6 +50,7 @@ #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/ip.h> +#include <net/netlink.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -119,7 +120,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re } stack[TC_U32_MAXDEPTH]; struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; - u8 *ptr = skb->nh.raw; + u8 *ptr = skb_network_header(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; @@ -213,7 +214,7 @@ check_terminal: off2 = 0; } - if (ptr < skb->tail) + if (ptr < skb_tail_pointer(skb)) goto next_ht; } @@ -435,7 +436,7 @@ static void u32_destroy(struct tcf_proto *tp) BUG_TRAP(ht->refcnt == 0); kfree(ht); - }; + } kfree(tp_c); } @@ -718,7 +719,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode*)fh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; if (n == NULL) @@ -765,14 +766,14 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, #endif } - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; if (TC_U32_KEY(n->handle)) if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) goto rtattr_failure; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index cd0600c6796..0a2a7fe08de 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -22,7 +22,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tc_u32_key *key = (struct tc_u32_key *) em->data; - unsigned char *ptr = skb->nh.raw; + const unsigned char *ptr = skb_network_header(skb); if (info) { if (info->ptr) diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 959c306c571..63146d339d8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -418,17 +418,19 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; - struct rtattr * top_start = (struct rtattr*) skb->tail; - struct rtattr * list_start; + u8 *tail; + struct rtattr *top_start = (struct rtattr *)skb_tail_pointer(skb); + struct rtattr *list_start; RTA_PUT(skb, tlv, 0, NULL); RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); - list_start = (struct rtattr *) skb->tail; + list_start = (struct rtattr *)skb_tail_pointer(skb); RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { - struct rtattr *match_start = (struct rtattr*) skb->tail; + struct rtattr *match_start = (struct rtattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? 
em->ops->kind : TCF_EM_CONTAINER, @@ -447,11 +449,12 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) } else if (em->datalen > 0) RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); - match_start->rta_len = skb->tail - (u8*) match_start; + tail = skb_tail_pointer(skb); + match_start->rta_len = tail - (u8 *)match_start; } - list_start->rta_len = skb->tail - (u8 *) list_start; - top_start->rta_len = skb->tail - (u8 *) top_start; + list_start->rta_len = tail - (u8 *)list_start; + top_start->rta_len = tail - (u8 *)top_start; return 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ecc988af4a9..bec600af03c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -27,14 +27,15 @@ #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/bitops.h> +#include <linux/hrtimer.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> @@ -190,7 +191,7 @@ int unregister_qdisc(struct Qdisc_ops *qops) (root qdisc, all its children, children of children etc.) */ -static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle) +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; @@ -201,16 +202,6 @@ static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle) return NULL; } -struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) -{ - struct Qdisc *q; - - read_lock(&qdisc_tree_lock); - q = __qdisc_lookup(dev, handle); - read_unlock(&qdisc_tree_lock); - return q; -} - static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; @@ -291,6 +282,48 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) } } +static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) +{ + struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, + timer); + struct net_device *dev = wd->qdisc->dev; + + wd->qdisc->flags &= ~TCQ_F_THROTTLED; + smp_wmb(); + if (spin_trylock(&dev->queue_lock)) { + qdisc_run(dev); + spin_unlock(&dev->queue_lock); + } else + netif_schedule(dev); + + return HRTIMER_NORESTART; +} + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + wd->timer.function = qdisc_watchdog; + wd->qdisc = qdisc; +} +EXPORT_SYMBOL(qdisc_watchdog_init); + +void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +{ + ktime_t time; + + wd->qdisc->flags |= TCQ_F_THROTTLED; + time = ktime_set(0, 0); + time = ktime_add_ns(time, PSCHED_US2NS(expires)); + hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); +} +EXPORT_SYMBOL(qdisc_watchdog_schedule); + +void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) +{ + hrtimer_cancel(&wd->timer); + wd->qdisc->flags &= ~TCQ_F_THROTTLED; +} +EXPORT_SYMBOL(qdisc_watchdog_cancel); /* Allocate an unique handle from space managed by kernel */ @@ -362,7 +395,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; while ((parentid = sch->parent)) { - sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid)); + sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid)); cops = sch->ops->cl_ops; if (cops->qlen_notify) { cl = cops->get(sch, parentid); @@ -467,12 +500,16 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; + 
sch->stats_lock = &dev->ingress_lock; handle = TC_H_MAKE(TC_H_INGRESS, 0); - } else if (handle == 0) { - handle = qdisc_alloc_handle(dev); - err = -ENOMEM; - if (handle == 0) - goto err_out3; + } else { + sch->stats_lock = &dev->queue_lock; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } } sch->handle = handle; @@ -621,9 +658,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return err; if (q) { qdisc_notify(skb, n, clid, q, NULL); - spin_lock_bh(&dev->queue_lock); + qdisc_lock_tree(dev); qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); + qdisc_unlock_tree(dev); } } else { qdisc_notify(skb, n, clid, NULL, q); @@ -756,17 +793,17 @@ graft: err = qdisc_graft(dev, p, clid, q, &old_q); if (err) { if (q) { - spin_lock_bh(&dev->queue_lock); + qdisc_lock_tree(dev); qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); + qdisc_unlock_tree(dev); } return err; } qdisc_notify(skb, n, clid, old_q, q); if (old_q) { - spin_lock_bh(&dev->queue_lock); + qdisc_lock_tree(dev); qdisc_destroy(old_q); - spin_unlock_bh(&dev->queue_lock); + qdisc_unlock_tree(dev); } } return 0; @@ -777,7 +814,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); @@ -811,12 +848,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto rtattr_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -857,12 +894,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; read_lock(&dev_base_lock); - for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + idx = 0; + for_each_netdev(dev) { if (idx < s_idx) - continue; + goto cont; if (idx > s_idx) s_q_idx = 0; - read_lock(&qdisc_tree_lock); q_idx = 0; list_for_each_entry(q, &dev->qdisc_list, list) { if (q_idx < s_q_idx) { @@ -870,13 +907,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) continue; } if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock(&qdisc_tree_lock); + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; - } q_idx++; } - read_unlock(&qdisc_tree_lock); +cont: + idx++; } done: @@ -1015,7 +1051,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; @@ -1040,12 +1076,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto rtattr_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1099,7 +1135,6 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (t < s_t || !q->ops->cl_ops || (tcm->tcm_parent && @@ -1121,7 
+1156,6 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) break; t++; } - read_unlock(&qdisc_tree_lock); cb->args[0] = t; @@ -1146,7 +1180,7 @@ reclassify: for ( ; tp; tp = tp->next) { if ((tp->protocol == protocol || - tp->protocol == __constant_htons(ETH_P_ALL)) && + tp->protocol == htons(ETH_P_ALL)) && (err = tp->classify(skb, tp, res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT if ( TC_ACT_RECLASSIFY == err) { @@ -1175,15 +1209,31 @@ reclassify: return -1; } -static int psched_us_per_tick = 1; -static int psched_tick_per_us = 1; +void tcf_destroy(struct tcf_proto *tp) +{ + tp->ops->destroy(tp); + module_put(tp->ops->owner); + kfree(tp); +} + +void tcf_destroy_chain(struct tcf_proto *fl) +{ + struct tcf_proto *tp; + + while ((tp = fl) != NULL) { + fl = tp->next; + tcf_destroy(tp); + } +} +EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { seq_printf(seq, "%08x %08x %08x %08x\n", - psched_tick_per_us, psched_us_per_tick, - 1000000, HZ); + (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1), + 1000000, + (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES)); return 0; } @@ -1202,101 +1252,19 @@ static const struct file_operations psched_fops = { }; #endif -#ifdef CONFIG_NET_SCH_CLK_CPU -psched_tdiff_t psched_clock_per_hz; -int psched_clock_scale; -EXPORT_SYMBOL(psched_clock_per_hz); -EXPORT_SYMBOL(psched_clock_scale); - -psched_time_t psched_time_base; -cycles_t psched_time_mark; -EXPORT_SYMBOL(psched_time_mark); -EXPORT_SYMBOL(psched_time_base); - -/* - * Periodically adjust psched_time_base to avoid overflow - * with 32-bit get_cycles(). Safe up to 4GHz CPU. - */ -static void psched_tick(unsigned long); -static DEFINE_TIMER(psched_timer, psched_tick, 0, 0); - -static void psched_tick(unsigned long dummy) -{ - if (sizeof(cycles_t) == sizeof(u32)) { - psched_time_t dummy_stamp; - PSCHED_GET_TIME(dummy_stamp); - psched_timer.expires = jiffies + 1*HZ; - add_timer(&psched_timer); - } -} - -int __init psched_calibrate_clock(void) -{ - psched_time_t stamp, stamp1; - struct timeval tv, tv1; - psched_tdiff_t delay; - long rdelay; - unsigned long stop; - - psched_tick(0); - stop = jiffies + HZ/10; - PSCHED_GET_TIME(stamp); - do_gettimeofday(&tv); - while (time_before(jiffies, stop)) { - barrier(); - cpu_relax(); - } - PSCHED_GET_TIME(stamp1); - do_gettimeofday(&tv1); - - delay = PSCHED_TDIFF(stamp1, stamp); - rdelay = tv1.tv_usec - tv.tv_usec; - rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; - if (rdelay > delay) - return -1; - delay /= rdelay; - psched_tick_per_us = delay; - while ((delay>>=1) != 0) - psched_clock_scale++; - psched_us_per_tick = 1<<psched_clock_scale; - psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; - return 0; -} -#endif - static int __init pktsched_init(void) { - struct rtnetlink_link *link_p; - -#ifdef CONFIG_NET_SCH_CLK_CPU - if (psched_calibrate_clock() < 0) - return -1; -#elif defined(CONFIG_NET_SCH_CLK_JIFFIES) - psched_tick_per_us = HZ<<PSCHED_JSCALE; - psched_us_per_tick = 1000000; -#endif - - link_p = rtnetlink_links[PF_UNSPEC]; - - /* Setup rtnetlink links. It is made here to avoid - exporting large number of public symbols. 
- */ - - if (link_p) { - link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; - link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; - link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; - } - register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); proc_net_fops_create("psched", 0, &psched_fops); + rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc); + rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass); + return 0; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index afb3bbd571f..be7d299acd7 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -14,6 +14,7 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/file.h> /* for fput */ +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -157,19 +158,6 @@ static unsigned long atm_tc_bind_filter(struct Qdisc *sch, return atm_tc_get(sch,classid); } - -static void destroy_filters(struct atm_flow_data *flow) -{ - struct tcf_proto *filter; - - while ((filter = flow->filter_list)) { - DPRINTK("destroy_filters: destroying filter %p\n",filter); - flow->filter_list = filter->next; - tcf_destroy(filter); - } -} - - /* * atm_tc_put handles all destructions, including the ones that are explicitly * requested (atm_tc_destroy, etc.). The assumption here is that we never drop @@ -194,7 +182,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) *prev = flow->next; DPRINTK("atm_tc_put: qdisc %p\n",flow->q); qdisc_destroy(flow->q); - destroy_filters(flow); + tcf_destroy_chain(flow->filter_list); if (flow->sock) { DPRINTK("atm_tc_put: f_count %d\n", file_count(flow->sock->file)); @@ -503,7 +491,7 @@ static void sch_atm_dequeue(unsigned long data) } D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); /* remove any LL header somebody else has attached */ - skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + skb_pull(skb, skb_network_offset(skb)); if (skb_headroom(skb) < flow->hdr_len) { struct sk_buff *new; @@ -513,7 +501,7 @@ static void sch_atm_dequeue(unsigned long data) skb = new; } D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", - skb->nh.iph,skb->data); + skb_network_header(skb), skb->data); ATM_SKB(skb)->vcc = flow->vcc; memcpy(skb_push(skb,flow->hdr_len),flow->hdr, flow->hdr_len); @@ -610,7 +598,7 @@ static void atm_tc_destroy(struct Qdisc *sch) DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); /* races ? 
*/ while ((flow = p->flows)) { - destroy_filters(flow); + tcf_destroy_chain(flow->filter_list); if (flow->ref > 1) printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, flow->ref); @@ -631,7 +619,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, { struct atm_qdisc_data *p = PRIV(sch); struct atm_flow_data *flow = (struct atm_flow_data *) cl; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", @@ -661,11 +649,11 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); } - rta->rta_len = skb->tail-b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb,b-skb->data); + nlmsg_trim(skb, b); return -1; } static int diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 76c92e710a3..a294542cb8e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -29,6 +29,7 @@ #include <linux/etherdevice.h> #include <linux/notifier.h> #include <net/ip.h> +#include <net/netlink.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -112,7 +113,7 @@ struct cbq_class /* Overlimit strategy parameters */ void (*overlimit)(struct cbq_class *cl); - long penalty; + psched_tdiff_t penalty; /* General scheduler (WRR) parameters */ long allot; @@ -143,7 +144,7 @@ struct cbq_class psched_time_t undertime; long avgidle; long deficit; /* Saved deficit for WRR */ - unsigned long penalized; + psched_time_t penalized; struct gnet_stats_basic bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; @@ -180,12 +181,12 @@ struct cbq_sched_data psched_time_t now_rt; /* Cached real time */ unsigned pmask; - struct timer_list delay_timer; - struct timer_list wd_timer; /* Watchdog timer, + struct hrtimer delay_timer; + struct qdisc_watchdog watchdog; /* Watchdog timer, started when CBQ has backlog, but cannot transmit just now */ - long wd_expires; + psched_tdiff_t wd_expires; int toplevel; u32 hgenerator; }; @@ -384,12 +385,12 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); - PSCHED_TADD2(q->now, incr, now); + now = psched_get_time(); + incr = now - q->now_rt; + now = q->now + incr; do { - if (PSCHED_TLESS(cl->undertime, now)) { + if (cl->undertime < now) { q->toplevel = cl->level; return; } @@ -473,7 +474,7 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) static void cbq_ovl_classic(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; if (!cl->delayed) { delay += cl->offtime; @@ -491,7 +492,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) cl->avgidle = cl->minidle; if (delay <= 0) delay = 1; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; cl->xstats.overactions++; cl->delayed = 1; @@ -508,7 +509,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) psched_tdiff_t base_delay = q->wd_expires; for (b = cl->borrow; b; b = b->borrow) { - delay = PSCHED_TDIFF(b->undertime, q->now); + delay = b->undertime - q->now; if (delay < base_delay) { if (delay <= 0) delay = 1; @@ -546,27 +547,32 @@ static void cbq_ovl_rclassic(struct cbq_class *cl) static void cbq_ovl_delay(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay 
= PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; if (!cl->delayed) { - unsigned long sched = jiffies; + psched_time_t sched = q->now; + ktime_t expires; delay += cl->offtime; if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); if (cl->avgidle < cl->minidle) cl->avgidle = cl->minidle; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; if (delay > 0) { - sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + sched += delay + cl->penalty; cl->penalized = sched; cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - if (del_timer(&q->delay_timer) && - (long)(q->delay_timer.expires - sched) > 0) - q->delay_timer.expires = sched; - add_timer(&q->delay_timer); + + expires = ktime_set(0, 0); + expires = ktime_add_ns(expires, PSCHED_US2NS(sched)); + if (hrtimer_try_to_cancel(&q->delay_timer) && + ktime_to_ns(ktime_sub(q->delay_timer.expires, + expires)) > 0) + q->delay_timer.expires = expires; + hrtimer_restart(&q->delay_timer); cl->delayed = 1; cl->xstats.overactions++; return; @@ -583,7 +589,7 @@ static void cbq_ovl_lowprio(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - cl->penalized = jiffies + cl->penalty; + cl->penalized = q->now + cl->penalty; if (cl->cpriority != cl->priority2) { cl->cpriority = cl->priority2; @@ -604,27 +610,19 @@ static void cbq_ovl_drop(struct cbq_class *cl) cbq_ovl_classic(cl); } -static void cbq_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, + psched_time_t now) { struct cbq_class *cl; struct cbq_class *cl_prev = q->active[prio]; - unsigned long now = jiffies; - unsigned long sched = now; + psched_time_t sched = now; if (cl_prev == NULL) - return now; + return 0; do { cl = cl_prev->next_alive; - if ((long)(now - cl->penalized) > 0) { + if (now - cl->penalized > 0) { cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; cl->cpriority = cl->priority; @@ -640,30 +638,34 @@ static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) } cl = cl_prev->next_alive; - } else if ((long)(sched - cl->penalized) > 0) + } else if (sched - cl->penalized > 0) sched = cl->penalized; } while ((cl_prev = cl) != q->active[prio]); - return (long)(sched - now); + return sched - now; } -static void cbq_undelay(unsigned long arg) +static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) { - struct Qdisc *sch = (struct Qdisc*)arg; - struct cbq_sched_data *q = qdisc_priv(sch); - long delay = 0; + struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, + delay_timer); + struct Qdisc *sch = q->watchdog.qdisc; + psched_time_t now; + psched_tdiff_t delay = 0; unsigned pmask; + now = psched_get_time(); + pmask = q->pmask; q->pmask = 0; while (pmask) { int prio = ffz(~pmask); - long tmp; + psched_tdiff_t tmp; pmask &= ~(1<<prio); - tmp = cbq_undelay_prio(q, prio); + tmp = cbq_undelay_prio(q, prio, now); if (tmp > 0) { q->pmask |= 1<<prio; if (tmp < delay || delay == 0) @@ -672,12 +674,16 @@ static void cbq_undelay(unsigned long arg) } if (delay) { - q->delay_timer.expires = jiffies + delay; - add_timer(&q->delay_timer); + ktime_t time; + + time = ktime_set(0, 0); + time = ktime_add_ns(time, PSCHED_US2NS(now + delay)); + hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); } sch->flags &= 
~TCQ_F_THROTTLED; netif_schedule(sch->dev); + return HRTIMER_NORESTART; } @@ -732,7 +738,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, if (cl && q->toplevel >= borrowed->level) { if (cl->q->q.qlen > 1) { do { - if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { + if (borrowed->undertime == PSCHED_PASTPERFECT) { q->toplevel = borrowed->level; return; } @@ -770,7 +776,7 @@ cbq_update(struct cbq_sched_data *q) idle = (now - last) - last_pktlen/rate */ - idle = PSCHED_TDIFF(q->now, cl->last); + idle = q->now - cl->last; if ((unsigned long)idle > 128*1024*1024) { avgidle = cl->maxidle; } else { @@ -814,13 +820,11 @@ cbq_update(struct cbq_sched_data *q) idle -= L2T(&q->link, len); idle += L2T(cl, len); - PSCHED_AUDIT_TDIFF(idle); - - PSCHED_TADD2(q->now, idle, cl->undertime); + cl->undertime = q->now + idle; } else { /* Underlimit */ - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; if (avgidle > cl->maxidle) cl->avgidle = cl->maxidle; else @@ -841,8 +845,7 @@ cbq_under_limit(struct cbq_class *cl) if (cl->tparent == NULL) return cl; - if (PSCHED_IS_PASTPERFECT(cl->undertime) || - !PSCHED_TLESS(q->now, cl->undertime)) { + if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { cl->delayed = 0; return cl; } @@ -865,8 +868,7 @@ cbq_under_limit(struct cbq_class *cl) } if (cl->level > q->toplevel) return NULL; - } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && - PSCHED_TLESS(q->now, cl->undertime)); + } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); cl->delayed = 0; return cl; @@ -1001,8 +1003,8 @@ cbq_dequeue(struct Qdisc *sch) psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); + now = psched_get_time(); + incr = now - q->now_rt; if (q->tx_class) { psched_tdiff_t incr2; @@ -1014,12 +1016,12 @@ cbq_dequeue(struct Qdisc *sch) cbq_time = max(real_time, work); */ incr2 = L2T(&q->link, q->tx_len); - PSCHED_TADD(q->now, incr2); + q->now += incr2; cbq_update(q); if ((incr -= incr2) < 0) incr = 0; } - PSCHED_TADD(q->now, incr); + q->now += incr; q->now_rt = now; for (;;) { @@ -1051,11 +1053,11 @@ cbq_dequeue(struct Qdisc *sch) */ if (q->toplevel == TC_CBQ_MAXLEVEL && - PSCHED_IS_PASTPERFECT(q->link.undertime)) + q->link.undertime == PSCHED_PASTPERFECT) break; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_SET_PASTPERFECT(q->link.undertime); + q->link.undertime = PSCHED_PASTPERFECT; } /* No packets in scheduler or nobody wants to give them to us :-( @@ -1063,13 +1065,9 @@ cbq_dequeue(struct Qdisc *sch) if (sch->q.qlen) { sch->qstats.overlimits++; - if (q->wd_expires) { - long delay = PSCHED_US2JIFFIE(q->wd_expires); - if (delay <= 0) - delay = 1; - mod_timer(&q->wd_timer, jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - } + if (q->wd_expires) + qdisc_watchdog_schedule(&q->watchdog, + now + q->wd_expires); } return NULL; } @@ -1276,10 +1274,10 @@ cbq_reset(struct Qdisc* sch) q->pmask = 0; q->tx_class = NULL; q->tx_borrowed = NULL; - del_timer(&q->wd_timer); - del_timer(&q->delay_timer); + qdisc_watchdog_cancel(&q->watchdog); + hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) @@ -1290,7 +1288,7 @@ cbq_reset(struct Qdisc* sch) qdisc_reset(cl->q); cl->next_alive = NULL; - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; cl->avgidle = cl->maxidle; cl->deficit = cl->quantum; cl->cpriority = cl->priority; @@ 
-1379,7 +1377,7 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) default: return -EINVAL; } - cl->penalty = (ovl->penalty*HZ)/1000; + cl->penalty = ovl->penalty; return 0; } @@ -1446,14 +1444,11 @@ static int cbq_init(struct Qdisc *sch, struct rtattr *opt) q->link.minidle = -0x7FFFFFFF; q->link.stats_lock = &sch->dev->queue_lock; - init_timer(&q->wd_timer); - q->wd_timer.data = (unsigned long)sch; - q->wd_timer.function = cbq_watchdog; - init_timer(&q->delay_timer); - q->delay_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; cbq_link_class(&q->link); @@ -1467,19 +1462,19 @@ static int cbq_init(struct Qdisc *sch, struct rtattr *opt) static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_lssopt opt; opt.flags = 0; @@ -1498,13 +1493,13 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; opt.flags = 0; @@ -1516,30 +1511,30 @@ static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_ovl opt; opt.strategy = cl->ovl_strategy; opt.priority2 = cl->priority2+1; opt.pad = 0; - opt.penalty = (cl->penalty*1000)/HZ; + opt.penalty = cl->penalty; RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_fopt opt; if (cl->split || cl->defmap) { @@ -1551,14 +1546,14 @@ static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } #ifdef CONFIG_NET_CLS_POLICE static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_police opt; if (cl->police) { @@ -1570,7 +1565,7 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } #endif @@ -1592,18 +1587,18 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cbq_sched_data *q = 
qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; rta = (struct rtattr*)b; RTA_PUT(skb, TCA_OPTIONS, 0, NULL); if (cbq_dump_attr(skb, &q->link) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1621,7 +1616,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct cbq_class *cl = (struct cbq_class*)arg; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; if (cl->tparent) @@ -1635,11 +1630,11 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, RTA_PUT(skb, TCA_OPTIONS, 0, NULL); if (cbq_dump_attr(skb, cl) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1654,8 +1649,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; - if (!PSCHED_IS_PASTPERFECT(cl->undertime)) - cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || #ifdef CONFIG_NET_ESTIMATOR @@ -1722,23 +1717,13 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid) return 0; } -static void cbq_destroy_filters(struct cbq_class *cl) -{ - struct tcf_proto *tp; - - while ((tp = cl->filter_list) != NULL) { - cl->filter_list = tp->next; - tcf_destroy(tp); - } -} - static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(sch); BUG_TRAP(!cl->filters); - cbq_destroy_filters(cl); + tcf_destroy_chain(cl->filter_list); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); #ifdef CONFIG_NET_ESTIMATOR @@ -1765,7 +1750,7 @@ cbq_destroy(struct Qdisc* sch) */ for (h = 0; h < 16; h++) for (cl = q->classes[h]; cl; cl = cl->next) - cbq_destroy_filters(cl); + tcf_destroy_chain(cl->filter_list); for (h = 0; h < 16; h++) { struct cbq_class *next; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 96324cf4e6a..3c6fd181263 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -216,17 +216,17 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) /* FIXME: Safe with non-linear skbs? 
--RR */ switch (skb->protocol) { case __constant_htons(ETH_P_IP): - skb->tc_index = ipv4_get_dsfield(skb->nh.iph) + skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; break; case __constant_htons(ETH_P_IPV6): - skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) + skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; break; default: skb->tc_index = 0; break; - }; + } } if (TC_H_MAJ(skb->priority) == sch->handle) @@ -257,7 +257,7 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) if (p->default_index != NO_DEFAULT_INDEX) skb->tc_index = p->default_index; break; - }; + } } err = p->q->enqueue(skb,p->q); @@ -292,11 +292,11 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) switch (skb->protocol) { case __constant_htons(ETH_P_IP): - ipv4_change_dsfield(skb->nh.iph, p->mask[index], + ipv4_change_dsfield(ip_hdr(skb), p->mask[index], p->value[index]); break; case __constant_htons(ETH_P_IPV6): - ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index], + ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], p->value[index]); break; default: @@ -310,7 +310,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) "unsupported protocol %d\n", ntohs(skb->protocol)); break; - }; + } return skb; } @@ -412,16 +412,10 @@ static void dsmark_reset(struct Qdisc *sch) static void dsmark_destroy(struct Qdisc *sch) { struct dsmark_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } - + tcf_destroy_chain(p->filter_list); qdisc_destroy(p->q); kfree(p->mask); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 52eb3439d7c..3385ee59254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -36,34 +36,27 @@ /* Main transmission queue. */ -/* Main qdisc structure lock. - - However, modifications - to data, participating in scheduling must be additionally - protected with dev->queue_lock spinlock. - - The idea is the following: - - enqueue, dequeue are serialized via top level device - spinlock dev->queue_lock. - - tree walking is protected by read_lock(qdisc_tree_lock) - and this lock is used only in process context. - - updates to tree are made only under rtnl semaphore, - hence this lock may be made without local bh disabling. - - qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! +/* Modifications to data participating in scheduling must be protected with + * dev->queue_lock spinlock. + * + * The idea is the following: + * - enqueue, dequeue are serialized via top level device + * spinlock dev->queue_lock. + * - ingress filtering is serialized via top level device + * spinlock dev->ingress_lock. + * - updates to tree and tree walking are only done under the rtnl mutex. 
*/ -DEFINE_RWLOCK(qdisc_tree_lock); void qdisc_lock_tree(struct net_device *dev) { - write_lock(&qdisc_tree_lock); spin_lock_bh(&dev->queue_lock); + spin_lock(&dev->ingress_lock); } void qdisc_unlock_tree(struct net_device *dev) { + spin_unlock(&dev->ingress_lock); spin_unlock_bh(&dev->queue_lock); - write_unlock(&qdisc_tree_lock); } /* @@ -442,7 +435,6 @@ struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) sch->dequeue = ops->dequeue; sch->dev = dev; dev_hold(dev); - sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); return sch; @@ -458,6 +450,7 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops, sch = qdisc_alloc(dev, ops); if (IS_ERR(sch)) goto errout; + sch->stats_lock = &dev->queue_lock; sch->parent = parentid; if (!ops->init || ops->init(sch, NULL) == 0) @@ -528,15 +521,11 @@ void dev_activate(struct net_device *dev) printk(KERN_INFO "%s: activation failed\n", dev->name); return; } - write_lock(&qdisc_tree_lock); list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock(&qdisc_tree_lock); } else { qdisc = &noqueue_qdisc; } - write_lock(&qdisc_tree_lock); dev->qdisc_sleeping = qdisc; - write_unlock(&qdisc_tree_lock); } if (!netif_carrier_ok(dev)) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 407c6fb1ba1..9d124c4ee3a 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -59,13 +59,13 @@ #include <linux/skbuff.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/timer.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <asm/system.h> @@ -192,23 +192,9 @@ struct hfsc_sched struct list_head droplist; /* active leaf class list (for dropping) */ struct sk_buff_head requeue; /* requeued packet */ - struct timer_list wd_timer; /* watchdog timer */ + struct qdisc_watchdog watchdog; /* watchdog timer */ }; -/* - * macros - */ -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY -#include <linux/time.h> -#undef PSCHED_GET_TIME -#define PSCHED_GET_TIME(stamp) \ -do { \ - struct timeval tv; \ - do_gettimeofday(&tv); \ - (stamp) = 1ULL * USEC_PER_SEC * tv.tv_sec + tv.tv_usec; \ -} while (0) -#endif - #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ @@ -394,28 +380,17 @@ cftree_update(struct hfsc_class *cl) * ism: (psched_us/byte) << ISM_SHIFT * dx: psched_us * - * Clock source resolution (CONFIG_NET_SCH_CLK_*) - * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. - * CPU: resolution is between 0.5us and 1us. - * GETTIMEOFDAY: resolution is exactly 1us. + * The clock source resolution with ktime is 1.024us. * * sm and ism are scaled in order to keep effective digits. * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective * digits in decimal using the following table. * - * Note: We can afford the additional accuracy (altq hfsc keeps at most - * 3 effective digits) thanks to the fact that linux clock is bounded - * much more tightly. 
- * * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps * ------------+------------------------------------------------------- - * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3 - * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 - * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 + * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3 * - * 0.5us/byte 160 16 1.6 0.16 0.016 - * us/byte 80 8 0.8 0.08 0.008 - * 1.27us/byte 63 6.3 0.63 0.063 0.0063 + * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125 */ #define SM_SHIFT 20 #define ISM_SHIFT 18 @@ -460,8 +435,8 @@ m2sm(u32 m) u64 sm; sm = ((u64)m << SM_SHIFT); - sm += PSCHED_JIFFIE2US(HZ) - 1; - do_div(sm, PSCHED_JIFFIE2US(HZ)); + sm += PSCHED_TICKS_PER_SEC - 1; + do_div(sm, PSCHED_TICKS_PER_SEC); return sm; } @@ -474,7 +449,7 @@ m2ism(u32 m) if (m == 0) ism = HT_INFINITY; else { - ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); + ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT); ism += m - 1; do_div(ism, m); } @@ -487,7 +462,7 @@ d2dx(u32 d) { u64 dx; - dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); + dx = ((u64)d * PSCHED_TICKS_PER_SEC); dx += USEC_PER_SEC - 1; do_div(dx, USEC_PER_SEC); return dx; @@ -499,7 +474,7 @@ sm2m(u64 sm) { u64 m; - m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; + m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT; return (u32)m; } @@ -510,7 +485,7 @@ dx2d(u64 dx) u64 d; d = dx * USEC_PER_SEC; - do_div(d, PSCHED_JIFFIE2US(HZ)); + do_div(d, PSCHED_TICKS_PER_SEC); return (u32)d; } @@ -654,9 +629,7 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) static void init_ed(struct hfsc_class *cl, unsigned int next_len) { - u64 cur_time; - - PSCHED_GET_TIME(cur_time); + u64 cur_time = psched_get_time(); /* update the deadline curve */ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); @@ -779,7 +752,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (cl->cl_flags & HFSC_USC) { /* class has upper limit curve */ if (cur_time == 0) - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* update the ulimit curve */ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, @@ -1063,7 +1036,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl->cl_parent == NULL && parentid != TC_H_ROOT) return -EINVAL; } - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); sch_tree_lock(sch); if (rsc != NULL) @@ -1149,22 +1122,11 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } static void -hfsc_destroy_filters(struct tcf_proto **fl) -{ - struct tcf_proto *tp; - - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } -} - -static void hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); - hfsc_destroy_filters(&cl->filter_list); + tcf_destroy_chain(cl->filter_list); qdisc_destroy(cl->qdisc); #ifdef CONFIG_NET_ESTIMATOR gen_kill_estimator(&cl->bstats, &cl->rate_est); @@ -1389,7 +1351,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct hfsc_class *cl = (struct hfsc_class *)arg; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta = (struct rtattr *)b; tcm->tcm_parent = cl->cl_parent ? 
cl->cl_parent->classid : TC_H_ROOT; @@ -1400,11 +1362,11 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, RTA_PUT(skb, TCA_OPTIONS, 0, NULL); if (hfsc_dump_curves(skb, cl) < 0) goto rtattr_failure; - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1459,21 +1421,11 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static void -hfsc_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static void -hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +hfsc_schedule_watchdog(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; u64 next_time = 0; - long delay; if ((cl = eltree_get_minel(q)) != NULL) next_time = cl->cl_e; @@ -1482,11 +1434,7 @@ hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) next_time = q->root.cl_cfmin; } WARN_ON(next_time == 0); - delay = next_time - cur_time; - delay = PSCHED_US2JIFFIE(delay); - - sch->flags |= TCQ_F_THROTTLED; - mod_timer(&q->wd_timer, jiffies + delay); + qdisc_watchdog_schedule(&q->watchdog, next_time); } static int @@ -1523,9 +1471,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); - init_timer(&q->wd_timer); - q->wd_timer.function = hfsc_watchdog; - q->wd_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); return 0; } @@ -1595,8 +1541,7 @@ hfsc_reset_qdisc(struct Qdisc *sch) __skb_queue_purge(&q->requeue); q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); - del_timer(&q->wd_timer); - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_watchdog_cancel(&q->watchdog); sch->q.qlen = 0; } @@ -1612,14 +1557,14 @@ hfsc_destroy_qdisc(struct Qdisc *sch) hfsc_destroy_class(sch, cl); } __skb_queue_purge(&q->requeue); - del_timer(&q->wd_timer); + qdisc_watchdog_cancel(&q->watchdog); } static int hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) { struct hfsc_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; qopt.defcls = q->defcls; @@ -1627,7 +1572,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1681,7 +1626,7 @@ hfsc_dequeue(struct Qdisc *sch) if ((skb = __skb_dequeue(&q->requeue))) goto out; - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* * if there are eligible classes, use real-time criteria. 
@@ -1698,7 +1643,7 @@ hfsc_dequeue(struct Qdisc *sch) cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { sch->qstats.overlimits++; - hfsc_schedule_watchdog(sch, cur_time); + hfsc_schedule_watchdog(sch); return NULL; } } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3c3294d0104..99bcec8dd04 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -50,6 +50,7 @@ #include <linux/skbuff.h> #include <linux/list.h> #include <linux/compiler.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/rbtree.h> @@ -128,7 +129,7 @@ struct htb_class { } un; struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ struct rb_node pq_node; /* node for event queue */ - unsigned long pq_key; /* the same type as jiffies global */ + psched_time_t pq_key; int prio_activity; /* for which prios are we active */ enum htb_cmode cmode; /* current mode of the class */ @@ -179,10 +180,7 @@ struct htb_sched { struct rb_root wait_pq[TC_HTB_MAXDEPTH]; /* time of nearest event per level (row) */ - unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; - - /* cached value of jiffies in dequeue */ - unsigned long jiffies; + psched_time_t near_ev_cache[TC_HTB_MAXDEPTH]; /* whether we hit non-work conserving class during this dequeue; we use */ int nwc_hit; /* this to disable mindelay complaint in dequeue */ @@ -195,7 +193,7 @@ struct htb_sched { int rate2quantum; /* quant = rate / rate2quantum */ psched_time_t now; /* cached dequeue time */ - struct timer_list timer; /* send delay timer */ + struct qdisc_watchdog watchdog; #ifdef HTB_RATECM struct timer_list rttim; /* rate computer timer */ int recmp_bucket; /* which hash bucket to recompute next */ @@ -342,19 +340,19 @@ static void htb_add_to_wait_tree(struct htb_sched *q, { struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; - cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); - if (cl->pq_key == q->jiffies) + cl->pq_key = q->now + delay; + if (cl->pq_key == q->now) cl->pq_key++; /* update the nearest event cache */ - if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) + if (q->near_ev_cache[cl->level] > cl->pq_key) q->near_ev_cache[cl->level] = cl->pq_key; while (*p) { struct htb_class *c; parent = *p; c = rb_entry(parent, struct htb_class, pq_node); - if (time_after_eq(cl->pq_key, c->pq_key)) + if (cl->pq_key >= c->pq_key) p = &parent->rb_right; else p = &parent->rb_left; @@ -679,14 +677,6 @@ static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static void htb_timer(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - sch->flags &= ~TCQ_F_THROTTLED; - wmb(); - netif_schedule(sch->dev); -} - #ifdef HTB_RATECM #define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 static void htb_rate_timer(unsigned long arg) @@ -739,7 +729,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, cl->T = toks while (cl) { - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32) cl->mbuffer); + diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); if (cl->level >= level) { if (cl->level == level) cl->xstats.lends++; @@ -778,11 +768,11 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level * - * Scans event queue for pending events and applies them. Returns jiffies to + * Scans event queue for pending events and applies them. Returns time of * next pending event (0 for no event in pq). - * Note: Aplied are events whose have cl->pq_key <= jiffies. 
+ * Note: Applied are events whose have cl->pq_key <= q->now. */ -static long htb_do_events(struct htb_sched *q, int level) +static psched_time_t htb_do_events(struct htb_sched *q, int level) { int i; @@ -795,18 +785,18 @@ static long htb_do_events(struct htb_sched *q, int level) return 0; cl = rb_entry(p, struct htb_class, pq_node); - if (time_after(cl->pq_key, q->jiffies)) { - return cl->pq_key - q->jiffies; - } + if (cl->pq_key > q->now) + return cl->pq_key; + htb_safe_rb_erase(p, q->wait_pq + level); - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32) cl->mbuffer); + diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } if (net_ratelimit()) printk(KERN_WARNING "htb: too many events !\n"); - return HZ / 10; + return q->now + PSCHED_TICKS_PER_SEC / 10; } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL @@ -958,30 +948,12 @@ next: return skb; } -static void htb_delay_by(struct Qdisc *sch, long delay) -{ - struct htb_sched *q = qdisc_priv(sch); - if (delay <= 0) - delay = 1; - if (unlikely(delay > 5 * HZ)) { - if (net_ratelimit()) - printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); - delay = 5 * HZ; - } - /* why don't use jiffies here ? because expires can be in past */ - mod_timer(&q->timer, q->jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - sch->qstats.overlimits++; -} - static struct sk_buff *htb_dequeue(struct Qdisc *sch) { struct sk_buff *skb = NULL; struct htb_sched *q = qdisc_priv(sch); int level; - long min_delay; - - q->jiffies = jiffies; + psched_time_t next_event; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ skb = __skb_dequeue(&q->direct_queue); @@ -993,23 +965,25 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) if (!sch->q.qlen) goto fin; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); - min_delay = LONG_MAX; + next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; q->nwc_hit = 0; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - long delay; - if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { - delay = htb_do_events(q, level); - q->near_ev_cache[level] = - q->jiffies + (delay ? delay : HZ); + psched_time_t event; + + if (q->now >= q->near_ev_cache[level]) { + event = htb_do_events(q, level); + q->near_ev_cache[level] = event ? event : + PSCHED_TICKS_PER_SEC; } else - delay = q->near_ev_cache[level] - q->jiffies; + event = q->near_ev_cache[level]; + + if (event && next_event > event) + next_event = event; - if (delay && min_delay > delay) - min_delay = delay; m = ~q->row_mask[level]; while (m != (int)(-1)) { int prio = ffz(m); @@ -1022,7 +996,8 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) } } } - htb_delay_by(sch, min_delay > 5 * HZ ? 
5 * HZ : min_delay); + sch->qstats.overlimits++; + qdisc_watchdog_schedule(&q->watchdog, next_event); fin: return skb; } @@ -1075,8 +1050,7 @@ static void htb_reset(struct Qdisc *sch) } } - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->timer); + qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; memset(q->row, 0, sizeof(q->row)); @@ -1113,14 +1087,12 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); - init_timer(&q->timer); + qdisc_watchdog_init(&q->watchdog, sch); skb_queue_head_init(&q->direct_queue); q->direct_qlen = sch->dev->tx_queue_len; if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ q->direct_qlen = 2; - q->timer.function = htb_timer; - q->timer.data = (unsigned long)sch; #ifdef HTB_RATECM init_timer(&q->rttim); @@ -1139,7 +1111,7 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct htb_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; struct tc_htb_glob gopt; spin_lock_bh(&sch->dev->queue_lock); @@ -1152,12 +1124,12 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) rta = (struct rtattr *)b; RTA_PUT(skb, TCA_OPTIONS, 0, NULL); RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; spin_unlock_bh(&sch->dev->queue_lock); return skb->len; rtattr_failure: spin_unlock_bh(&sch->dev->queue_lock); - skb_trim(skb, skb->tail - skb->data); + nlmsg_trim(skb, skb_tail_pointer(skb)); return -1; } @@ -1165,7 +1137,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; struct tc_htb_opt opt; @@ -1188,12 +1160,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.prio = cl->un.leaf.prio; opt.level = cl->level; RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; spin_unlock_bh(&sch->dev->queue_lock); return skb->len; rtattr_failure: spin_unlock_bh(&sch->dev->queue_lock); - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1264,16 +1236,6 @@ static unsigned long htb_get(struct Qdisc *sch, u32 classid) return (unsigned long)cl; } -static void htb_destroy_filters(struct tcf_proto **fl) -{ - struct tcf_proto *tp; - - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } -} - static inline int htb_parent_last_child(struct htb_class *cl) { if (!cl->parent) @@ -1302,7 +1264,7 @@ static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q) parent->un.leaf.prio = parent->prio; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - PSCHED_GET_TIME(parent->t_c); + parent->t_c = psched_get_time(); parent->cmode = HTB_CAN_SEND; } @@ -1317,7 +1279,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_put_rtab(cl->rate); qdisc_put_rtab(cl->ceil); - htb_destroy_filters(&cl->filter_list); + tcf_destroy_chain(cl->filter_list); while (!list_empty(&cl->children)) htb_destroy_class(sch, list_entry(cl->children.next, @@ -1341,7 +1303,7 @@ static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - del_timer_sync(&q->timer); + 
qdisc_watchdog_cancel(&q->watchdog); #ifdef HTB_RATECM del_timer_sync(&q->rttim); #endif @@ -1349,7 +1311,7 @@ static void htb_destroy(struct Qdisc *sch) and surprisingly it worked in 2.4. But it must precede it because filter need its target class alive to be able to call unbind_filter on it (without Oops). */ - htb_destroy_filters(&q->filter_list); + tcf_destroy_chain(q->filter_list); while (!list_empty(&q->root)) htb_destroy_class(sch, list_entry(q->root.next, @@ -1498,8 +1460,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* set class to be in HTB_CAN_SEND state */ cl->tokens = hopt->buffer; cl->ctokens = hopt->cbuffer; - cl->mbuffer = PSCHED_JIFFIE2US(HZ * 60); /* 1min */ - PSCHED_GET_TIME(cl->t_c); + cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ + cl->t_c = psched_get_time(); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index cfe070ee6ee..f8b9f1cdf73 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -16,6 +16,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter.h> #include <linux/smp.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <asm/byteorder.h> #include <asm/uaccess.h> @@ -169,7 +170,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) skb->tc_index = TC_H_MIN(res.classid); result = TC_ACT_OK; break; - }; + } /* backward compat */ #else #ifdef CONFIG_NET_CLS_POLICE @@ -186,7 +187,7 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) sch->bstats.bytes += skb->len; result = NF_ACCEPT; break; - }; + } #else D2PRINTK("Overriding result to ACCEPT\n"); @@ -247,16 +248,11 @@ ing_hook(unsigned int hook, struct sk_buff **pskb, skb->dev ? (*pskb)->dev->name : "(no dev)", skb->len); -/* -revisit later: Use a private since lock dev->queue_lock is also -used on the egress (might slow things for an iota) -*/ - if (dev->qdisc_ingress) { - spin_lock(&dev->queue_lock); + spin_lock(&dev->ingress_lock); if ((q = dev->qdisc_ingress) != NULL) fwres = q->enqueue(skb, q); - spin_unlock(&dev->queue_lock); + spin_unlock(&dev->ingress_lock); } return fwres; @@ -345,14 +341,9 @@ static void ingress_reset(struct Qdisc *sch) static void ingress_destroy(struct Qdisc *sch) { struct ingress_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } + tcf_destroy_chain(p->filter_list); #if 0 /* for future use */ qdisc_destroy(p->q); @@ -362,16 +353,16 @@ static void ingress_destroy(struct Qdisc *sch) static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; rta = (struct rtattr *) b; RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1ccbfb55b0b..5d9d8bc9cc3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -22,6 +22,7 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #define VERSION "1.2" @@ -54,21 +55,22 @@ struct netem_sched_data { struct Qdisc *qdisc; - struct timer_list timer; + struct qdisc_watchdog watchdog; + + psched_tdiff_t latency; + psched_tdiff_t jitter; 
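The htb and ingress hunks above, like the prio hunk further down, delete their open-coded filter teardown loops in favour of a single tcf_destroy_chain() call. The helper itself is outside this diff; judging from the three identical loops it replaces, it is presumably the same walk hoisted into common code, along these lines (exact signature assumed):

void tcf_destroy_chain(struct tcf_proto *fl)
{
	struct tcf_proto *tp;

	while ((tp = fl) != NULL) {
		fl = tp->next;	/* unlink first: tcf_destroy() frees tp */
		tcf_destroy(tp);
	}
}

Consolidating the walk means every qdisc tears its filters down the same way, so a future locking or refcounting change touches one place instead of four.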
- u32 latency; u32 loss; u32 limit; u32 counter; u32 gap; - u32 jitter; u32 duplicate; u32 reorder; u32 corrupt; struct crndstate { - unsigned long last; - unsigned long rho; + u32 last; + u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { @@ -95,12 +97,12 @@ static void init_crandom(struct crndstate *state, unsigned long rho) * Next number depends on last value. * rho is scaled to avoid floating point. */ -static unsigned long get_crandom(struct crndstate *state) +static u32 get_crandom(struct crndstate *state) { u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correllation */ + if (state->rho == 0) /* no correlation */ return net_random(); value = net_random(); @@ -114,11 +116,13 @@ static unsigned long get_crandom(struct crndstate *state) * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ -static long tabledist(unsigned long mu, long sigma, - struct crndstate *state, const struct disttable *dist) +static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, + struct crndstate *state, + const struct disttable *dist) { - long t, x; - unsigned long rnd; + psched_tdiff_t x; + long t; + u32 rnd; if (sigma == 0) return mu; @@ -213,8 +217,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) delay = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - PSCHED_GET_TIME(now); - PSCHED_TADD2(now, delay, cb->time_to_send); + now = psched_get_time(); + cb->time_to_send = now + delay; ++q->counter; ret = q->qdisc->enqueue(skb, q->qdisc); } else { @@ -222,7 +226,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * Do re-ordering by putting one out of N packets at the front * of the queue. */ - PSCHED_GET_TIME(cb->time_to_send); + cb->time_to_send = psched_get_time(); q->counter = 0; ret = q->qdisc->ops->requeue(skb, q->qdisc); } @@ -269,55 +273,43 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + smp_mb(); + if (sch->flags & TCQ_F_THROTTLED) + return NULL; + skb = q->qdisc->dequeue(q->qdisc); if (skb) { const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; - psched_time_t now; + psched_time_t now = psched_get_time(); /* if more time remaining? 
*/ - PSCHED_GET_TIME(now); - - if (PSCHED_TLESS(cb->time_to_send, now)) { + if (cb->time_to_send <= now) { pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; return skb; - } else { - psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); - - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - qdisc_tree_decrease_qlen(q->qdisc, 1); - sch->qstats.drops++; - printk(KERN_ERR "netem: queue discpline %s could not requeue\n", - q->qdisc->ops->id); - } + } - mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); - sch->flags |= TCQ_F_THROTTLED; + if (unlikely(q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS)) { + qdisc_tree_decrease_qlen(q->qdisc, 1); + sch->qstats.drops++; + printk(KERN_ERR "netem: %s could not requeue\n", + q->qdisc->ops->id); } + + qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); } return NULL; } -static void netem_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen); - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); sch->q.qlen = 0; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer_sync(&q->timer); + qdisc_watchdog_cancel(&q->watchdog); } /* Pass size change message down to embedded FIFO */ @@ -438,10 +430,11 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) q->loss = qopt->loss; q->duplicate = qopt->duplicate; - /* for compatiablity with earlier versions. - * if gap is set, need to assume 100% probablity + /* for compatibility with earlier versions. + * if gap is set, need to assume 100% probability */ - q->reorder = ~0; + if (q->gap) + q->reorder = ~0; /* Handle nested options after initial queue options. * Should have put all options in nested format but too late now. 
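netem's retyped get_crandom()/tabledist() pair drives all of its random decisions: a correlated uniform source is shaped by an optional table into the configured delay distribution. The sketch below is a user-space model of the correlation step; the diff only shows the crndstate layout and the rho == 0 fast path, so the exact mixing arithmetic here is an assumption, and random() only supplies 31 bits, which is fine for a model.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct crndstate {
	uint32_t last;	/* previous output, feeds the next draw */
	uint32_t rho;	/* correlation scaled to 0..2^32-1, no floating point */
};

/* Blend a fresh uniform value with the previous output, weighted by rho.
 * rho == 0 degenerates to a plain uniform source. */
static uint32_t get_crandom(struct crndstate *s)
{
	uint64_t value, rho;

	if (s->rho == 0)	/* no correlation */
		return (uint32_t)random();

	value = (uint32_t)random();
	rho = (uint64_t)s->rho + 1;
	s->last = (value * ((1ULL << 32) - rho) + (uint64_t)s->last * rho) >> 32;
	return s->last;
}

int main(void)
{
	struct crndstate st = { .last = 0, .rho = 3865470566u };	/* ~0.9 */

	for (int i = 0; i < 5; i++)
		printf("%u\n", get_crandom(&st));
	return 0;
}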
@@ -487,22 +480,28 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) */ struct fifo_sched_data { u32 limit; + psched_time_t oldest; }; static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct fifo_sched_data *q = qdisc_priv(sch); struct sk_buff_head *list = &sch->q; - const struct netem_skb_cb *ncb - = (const struct netem_skb_cb *)nskb->cb; + psched_time_t tnext = ((struct netem_skb_cb *)nskb->cb)->time_to_send; struct sk_buff *skb; if (likely(skb_queue_len(list) < q->limit)) { + /* Optimize for add at tail */ + if (likely(skb_queue_empty(list) || tnext >= q->oldest)) { + q->oldest = tnext; + return qdisc_enqueue_tail(nskb, sch); + } + skb_queue_reverse_walk(list, skb) { const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; - if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) + if (tnext >= cb->time_to_send) break; } @@ -515,7 +514,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - return qdisc_drop(nskb, sch); + return qdisc_reshape_fail(nskb, sch); } static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) @@ -531,6 +530,7 @@ static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) } else q->limit = max_t(u32, sch->dev->tx_queue_len, 1); + q->oldest = PSCHED_PASTPERFECT; return 0; } @@ -567,9 +567,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) if (!opt) return -EINVAL; - init_timer(&q->timer); - q->timer.function = netem_watchdog; - q->timer.data = (unsigned long) sch; + qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1)); @@ -590,7 +588,7 @@ static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - del_timer_sync(&q->timer); + qdisc_watchdog_cancel(&q->watchdog); qdisc_destroy(q->qdisc); kfree(q->delay_dist); } @@ -598,7 +596,7 @@ static void netem_destroy(struct Qdisc *sch) static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta = (struct rtattr *) b; struct tc_netem_qopt qopt; struct tc_netem_corr cor; @@ -626,12 +624,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) corrupt.correlation = q->corrupt_cor.rho; RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index de889f23f22..269a6e17c6c 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -32,6 +32,7 @@ #include <net/ip.h> #include <net/route.h> #include <linux/skbuff.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> @@ -61,7 +62,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS; case TC_ACT_SHOT: return NULL; - }; + } if (!q->filter_list ) { #else @@ -188,13 +189,8 @@ prio_destroy(struct Qdisc* sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - struct tcf_proto *tp; - - while ((tp = q->filter_list) != NULL) { - q->filter_list = tp->next; - tcf_destroy(tp); - } + tcf_destroy_chain(q->filter_list); for (prio=0; prio<q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -271,7 +267,7 @@ static int prio_init(struct Qdisc *sch, struct rtattr *opt) static int prio_dump(struct Qdisc *sch, struct 
sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; opt.bands = q->bands; @@ -280,7 +276,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 66f32051a99..96dfdf78d32 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -30,6 +30,7 @@ #include <linux/notifier.h> #include <linux/init.h> #include <net/ip.h> +#include <net/netlink.h> #include <linux/ipv6.h> #include <net/route.h> #include <linux/skbuff.h> @@ -137,7 +138,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) switch (skb->protocol) { case __constant_htons(ETH_P_IP): { - struct iphdr *iph = skb->nh.iph; + const struct iphdr *iph = ip_hdr(skb); h = iph->daddr; h2 = iph->saddr^iph->protocol; if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && @@ -152,7 +153,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) } case __constant_htons(ETH_P_IPV6): { - struct ipv6hdr *iph = skb->nh.ipv6h; + struct ipv6hdr *iph = ipv6_hdr(skb); h = iph->daddr.s6_addr32[3]; h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; if (iph->nexthdr == IPPROTO_TCP || @@ -461,7 +462,7 @@ static void sfq_destroy(struct Qdisc *sch) static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_sfq_qopt opt; opt.quantum = q->quantum; @@ -476,7 +477,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 85da8daa61d..53862953baa 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -32,6 +32,7 @@ #include <linux/etherdevice.h> #include <linux/notifier.h> #include <net/ip.h> +#include <net/netlink.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -127,8 +128,8 @@ struct tbf_sched_data long tokens; /* Current number of B tokens */ long ptokens; /* Current number of P tokens */ psched_time_t t_c; /* Time check-point */ - struct timer_list wd_timer; /* Watchdog timer */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ + struct qdisc_watchdog watchdog; /* Watchdog timer */ }; #define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) @@ -185,14 +186,6 @@ static unsigned int tbf_drop(struct Qdisc* sch) return len; } -static void tbf_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - static struct sk_buff *tbf_dequeue(struct Qdisc* sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -202,13 +195,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) if (skb) { psched_time_t now; - long toks, delay; + long toks; long ptoks = 0; unsigned int len = skb->len; - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); + now = psched_get_time(); + toks = psched_tdiff_bounded(now, q->t_c, q->buffer); if (q->P_tab) { ptoks = toks + q->ptokens; @@ -230,12 +222,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) return skb; } - delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); - - if (delay == 0) - delay = 1; - - mod_timer(&q->wd_timer, jiffies+delay); + 
qdisc_watchdog_schedule(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -254,7 +242,6 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) sch->qstats.drops++; } - sch->flags |= TCQ_F_THROTTLED; sch->qstats.overlimits++; } return NULL; @@ -266,11 +253,10 @@ static void tbf_reset(struct Qdisc* sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - PSCHED_GET_TIME(q->t_c); + q->t_c = psched_get_time(); q->tokens = q->buffer; q->ptokens = q->mtu; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->wd_timer); + qdisc_watchdog_cancel(&q->watchdog); } static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit) @@ -377,11 +363,8 @@ static int tbf_init(struct Qdisc* sch, struct rtattr *opt) if (opt == NULL) return -EINVAL; - PSCHED_GET_TIME(q->t_c); - init_timer(&q->wd_timer); - q->wd_timer.function = tbf_watchdog; - q->wd_timer.data = (unsigned long)sch; - + q->t_c = psched_get_time(); + qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; return tbf_change(sch, opt); @@ -391,7 +374,7 @@ static void tbf_destroy(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); - del_timer(&q->wd_timer); + qdisc_watchdog_cancel(&q->watchdog); if (q->P_tab) qdisc_put_rtab(q->P_tab); @@ -404,7 +387,7 @@ static void tbf_destroy(struct Qdisc *sch) static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct rtattr *rta; struct tc_tbf_qopt opt; @@ -420,12 +403,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.mtu = q->mtu; opt.buffer = q->buffer; RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; + rta->rta_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 587123c61af..d24914db786 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -323,7 +323,7 @@ restart: nores = 1; break; } - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); } while ((q = NEXT_SLAVE(q)) != start); if (nores && skb_res == NULL) { diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 78d2ddb5ca1..df94e3cdfba 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -143,7 +143,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ - asoc->max_burst = sctp_max_burst; + asoc->max_burst = sp->max_burst; /* initialize association timers */ asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; @@ -714,8 +714,16 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, /* Record the transition on the transport. */ switch (command) { case SCTP_TRANSPORT_UP: + /* If we are moving from UNCONFIRMED state due + * to heartbeat success, report the SCTP_ADDR_CONFIRMED + * state to the user, otherwise report SCTP_ADDR_AVAILABLE. 
+ */ + if (SCTP_UNCONFIRMED == transport->state && + SCTP_HEARTBEAT_SUCCESS == error) + spc_state = SCTP_ADDR_CONFIRMED; + else + spc_state = SCTP_ADDR_AVAILABLE; transport->state = SCTP_ACTIVE; - spc_state = SCTP_ADDR_AVAILABLE; break; case SCTP_TRANSPORT_DOWN: @@ -725,7 +733,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, default: return; - }; + } /* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the * user. @@ -1095,6 +1103,13 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ssnmap = new->ssnmap; new->ssnmap = NULL; } + + if (!asoc->assoc_id) { + /* get a new association id since we don't have one + * yet. + */ + sctp_assoc_set_id(asoc, GFP_ATOMIC); + } } } @@ -1367,3 +1382,25 @@ out: sctp_read_unlock(&asoc->base.addr_lock); return found; } + +/* Set an association id for a given association */ +int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) +{ + int assoc_id; + int error = 0; +retry: + if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp))) + return -ENOMEM; + + spin_lock_bh(&sctp_assocs_id_lock); + error = idr_get_new_above(&sctp_assocs_id, (void *)asoc, + 1, &assoc_id); + spin_unlock_bh(&sctp_assocs_id_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return error; + + asoc->assoc_id = (sctp_assoc_t) assoc_id; + return error; +} diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 5f5ab28977c..e8c0f7435d7 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -93,8 +93,9 @@ const char *sctp_cname(const sctp_subtype_t cid) return "FWD_TSN"; default: - return "unknown chunk"; - }; + break; + } + return "unknown chunk"; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 71db6687369..885109fb3dd 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -79,14 +79,10 @@ static void sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct sk_buff *skb) { - struct sctphdr *sh; - __u32 cmp, val; struct sk_buff *list = skb_shinfo(skb)->frag_list; - - sh = (struct sctphdr *) skb->h.raw; - cmp = ntohl(sh->checksum); - - val = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); + struct sctphdr *sh = sctp_hdr(skb); + __u32 cmp = ntohl(sh->checksum); + __u32 val = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); for (; list; list = list->next) val = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), @@ -138,14 +134,13 @@ int sctp_rcv(struct sk_buff *skb) if (skb_linearize(skb)) goto discard_it; - sh = (struct sctphdr *) skb->h.raw; + sh = sctp_hdr(skb); /* Pull up the IP and SCTP headers. 
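sctp_assoc_set_id(), added above, wraps the two-phase allocation idiom of the older idr API so that both sctp_assoc_update() and sctp_process_init() (converted later in this diff) can share it. The retry matters: idr_pre_get() preloads free layer nodes outside the lock, and a concurrent allocator can consume them before idr_get_new_above() runs, which then reports -EAGAIN. A generic version of the pattern, with hypothetical wrapper names:

#include <linux/idr.h>
#include <linux/spinlock.h>

static int example_set_id(struct idr *ids, spinlock_t *lock,
			  void *ptr, int *out_id)
{
	int id, error;

retry:
	if (unlikely(!idr_pre_get(ids, GFP_ATOMIC)))	/* preload outside lock */
		return -ENOMEM;

	spin_lock_bh(lock);
	error = idr_get_new_above(ids, ptr, 1, &id);	/* ids start above 0 */
	spin_unlock_bh(lock);
	if (error == -EAGAIN)
		goto retry;	/* preallocation raced away, top it up again */
	if (error)
		return error;

	*out_id = id;
	return 0;
}

Starting above 0 is deliberate in the SCTP case, presumably so that a zero sctp_assoc_t stays free to mean the socket itself in the one-to-many API.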
*/ - __skb_pull(skb, skb->h.raw - skb->data); + __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if ((skb->ip_summed != CHECKSUM_UNNECESSARY) && - (sctp_rcv_checksum(skb) < 0)) + if (!skb_csum_unnecessary(skb) && sctp_rcv_checksum(skb) < 0) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); @@ -154,7 +149,7 @@ int sctp_rcv(struct sk_buff *skb) if (skb->len < sizeof(struct sctp_chunkhdr)) goto discard_it; - family = ipver2af(skb->nh.iph->version); + family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; @@ -510,30 +505,30 @@ void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) void sctp_v4_err(struct sk_buff *skb, __u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; - struct sctphdr *sh = (struct sctphdr *)(skb->data + (iph->ihl <<2)); - int type = skb->h.icmph->type; - int code = skb->h.icmph->code; + const int ihlen = iph->ihl * 4; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; struct sctp_association *asoc = NULL; struct sctp_transport *transport; struct inet_sock *inet; - char *saveip, *savesctp; + sk_buff_data_t saveip, savesctp; int err; - if (skb->len < ((iph->ihl << 2) + 8)) { + if (skb->len < ihlen + 8) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } /* Fix up skb to look at the embedded net header. */ - saveip = skb->nh.raw; - savesctp = skb->h.raw; - skb->nh.iph = iph; - skb->h.raw = (char *)sh; - sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport); - /* Put back, the original pointers. */ - skb->nh.raw = saveip; - skb->h.raw = savesctp; + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, ihlen); + sk = sctp_err_lookup(AF_INET, skb, sctp_hdr(skb), &asoc, &transport); + /* Put back, the original values. */ + skb->network_header = saveip; + skb->transport_header = savesctp; if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -616,7 +611,7 @@ int sctp_rcv_ootb(struct sk_buff *skb) break; ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - if (ch_end > skb->tail) + if (ch_end > skb_tail_pointer(skb)) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the @@ -648,7 +643,7 @@ int sctp_rcv_ootb(struct sk_buff *skb) } ch = (sctp_chunkhdr_t *) ch_end; - } while (ch_end < skb->tail); + } while (ch_end < skb_tail_pointer(skb)); return 0; @@ -905,7 +900,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; - struct sctphdr *sh = (struct sctphdr *) skb->h.raw; + struct sctphdr *sh = sctp_hdr(skb); sctp_chunkhdr_t *ch; union sctp_params params; sctp_init_chunk_t *init; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index c30629e1778..88aa2240754 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -159,16 +159,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) * the skb->tail. */ if (unlikely(skb_is_nonlinear(chunk->skb))) { - if (chunk->chunk_end > chunk->skb->tail) - chunk->chunk_end = chunk->skb->tail; + if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) + chunk->chunk_end = skb_tail_pointer(chunk->skb); } skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. 
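sctp_v4_err() (and its v6 twin in ipv6.c just below) must temporarily point the header accessors at the packet embedded in the ICMP payload, then restore them. The locals change from char * to sk_buff_data_t because, with NET_SKBUFF_DATA_USES_OFFSET configurations, skb->network_header and skb->transport_header are offsets from skb->head rather than pointers, so saving them through a char * would no longer be type-correct everywhere. The pattern in isolation, taken from the hunk above:

sk_buff_data_t saveip, savesctp;

saveip = skb->network_header;			/* stash current offsets */
savesctp = skb->transport_header;
skb_reset_network_header(skb);			/* net header := skb->data */
skb_set_transport_header(skb, ihlen);		/* SCTP header after IP header */
sk = sctp_err_lookup(AF_INET, skb, sctp_hdr(skb), &asoc, &transport);
skb->network_header = saveip;			/* restore before anyone else looks */
skb->transport_header = savesctp;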
*/ - if (chunk->chunk_end < chunk->skb->tail) { + if (chunk->chunk_end < skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; - } else if (chunk->chunk_end > chunk->skb->tail) { + } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { /* RFC 2960, Section 6.10 Bundling * * Partial chunks MUST NOT be placed in an SCTP packet. diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0b9c49b3a10..84cd53635fe 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -122,26 +122,24 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info) { struct inet6_dev *idev; - struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; - struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); struct sock *sk; struct sctp_association *asoc; struct sctp_transport *transport; struct ipv6_pinfo *np; - char *saveip, *savesctp; + sk_buff_data_t saveip, savesctp; int err; idev = in6_dev_get(skb->dev); /* Fix up skb to look at the embedded net header. */ - saveip = skb->nh.raw; - savesctp = skb->h.raw; - skb->nh.ipv6h = iph; - skb->h.raw = (char *)sh; - sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport); + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, offset); + sk = sctp_err_lookup(AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original pointers. */ - skb->nh.raw = saveip; - skb->h.raw = savesctp; + skb->network_header = saveip; + skb->transport_header = savesctp; if (!sk) { ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); goto out; @@ -391,13 +389,13 @@ static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; - sh = (struct sctphdr *) skb->h.raw; + sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; - from = &skb->nh.ipv6h->saddr; + from = &ipv6_hdr(skb)->saddr; } else { *port = sh->dest; - from = &skb->nh.ipv6h->daddr; + from = &ipv6_hdr(skb)->daddr; } ipv6_addr_copy(&addr->v6.sin6_addr, from); } @@ -606,7 +604,7 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) default: retval = SCTP_SCOPE_GLOBAL; break; - }; + } return retval; } @@ -699,7 +697,7 @@ static int sctp_v6_skb_iif(const struct sk_buff *skb) /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { - return *((__u32 *)(skb->nh.ipv6h)) & htonl(1<<20); + return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ @@ -766,19 +764,19 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, if (msgname) { sctp_inet6_msgname(msgname, addr_len); sin6 = (struct sockaddr_in6 *)msgname; - sh = (struct sctphdr *)skb->h.raw; + sh = sctp_hdr(skb); sin6->sin6_port = sh->source; /* Map ipv4 address into v4-mapped-on-v6 address. */ if (sctp_sk(skb->sk)->v4mapped && - skb->nh.iph->version == 4) { + ip_hdr(skb)->version == 4) { sctp_v4_map_v6((union sctp_addr *)sin6); - sin6->sin6_addr.s6_addr32[3] = skb->nh.iph->saddr; + sin6->sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr; return; } /* Otherwise, just copy the v6 address. 
*/ - ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { struct sctp_ulpevent *ev = sctp_skb2event(skb); sin6->sin6_scope_id = ev->iif; @@ -994,45 +992,52 @@ static struct sctp_pf sctp_pf_inet6_specific = { .af = &sctp_ipv6_specific, }; -/* Initialize IPv6 support and register with inet6 stack. */ +/* Initialize IPv6 support and register with socket layer. */ int sctp_v6_init(void) { - int rc = proto_register(&sctpv6_prot, 1); + int rc; + + /* Register the SCTP specific PF_INET6 functions. */ + sctp_register_pf(&sctp_pf_inet6_specific, PF_INET6); + /* Register the SCTP specific AF_INET6 functions. */ + sctp_register_af(&sctp_ipv6_specific); + + rc = proto_register(&sctpv6_prot, 1); if (rc) - goto out; - /* Register inet6 protocol. */ - rc = -EAGAIN; - if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) - goto out_unregister_sctp_proto; + return rc; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ inet6_register_protosw(&sctpv6_seqpacket_protosw); inet6_register_protosw(&sctpv6_stream_protosw); - /* Register the SCTP specific PF_INET6 functions. */ - sctp_register_pf(&sctp_pf_inet6_specific, PF_INET6); - - /* Register the SCTP specific AF_INET6 functions. */ - sctp_register_af(&sctp_ipv6_specific); + return 0; +} +/* Register with inet6 layer. */ +int sctp_v6_add_protocol(void) +{ /* Register notifier for inet6 address additions/deletions. */ register_inet6addr_notifier(&sctp_inet6addr_notifier); - rc = 0; -out: - return rc; -out_unregister_sctp_proto: - proto_unregister(&sctpv6_prot); - goto out; + + if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; + + return 0; } /* IPv6 specific exit support. */ void sctp_v6_exit(void) { - list_del(&sctp_ipv6_specific.list); - inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); - unregister_inet6addr_notifier(&sctp_inet6addr_notifier); proto_unregister(&sctpv6_prot); + list_del(&sctp_ipv6_specific.list); +} + +/* Unregister with inet6 layer. */ +void sctp_v6_del_protocol(void) +{ + inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); + unregister_inet6addr_notifier(&sctp_inet6addr_notifier); } diff --git a/net/sctp/output.c b/net/sctp/output.c index f875fc3ced5..d85543def75 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -176,7 +176,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, case SCTP_XMIT_OK: case SCTP_XMIT_NAGLE_DELAY: break; - }; + } return retval; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 41abfd17627..992f361084b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -338,7 +338,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) SCTP_INC_STATS(SCTP_MIB_OUTORDERCHUNKS); q->empty = 0; break; - }; + } } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); @@ -630,7 +630,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, /* Retrieve a new chunk to bundle. */ lchunk = sctp_list_dequeue(lqueue); break; - }; + } /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit @@ -779,7 +779,7 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) default: /* We built a chunk with an illegal type! */ BUG(); - }; + } } /* Is it OK to send data chunks? 
*/ @@ -1397,7 +1397,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, SCTP_DEBUG_PRINTK("ACKed: %08x", tsn); dbg_prt_state = 0; dbg_ack_tsn = tsn; - }; + } dbg_last_ack_tsn = tsn; #endif /* SCTP_DEBUG */ @@ -1452,7 +1452,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, SCTP_DEBUG_PRINTK("KEPT: %08x",tsn); dbg_prt_state = 1; dbg_kept_tsn = tsn; - }; + } dbg_last_kept_tsn = tsn; #endif /* SCTP_DEBUG */ @@ -1476,7 +1476,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, } else { SCTP_DEBUG_PRINTK("\n"); } - }; + } #endif /* SCTP_DEBUG */ if (transport) { if (bytes_acked) { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e17a823ca90..34bab36637a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -170,7 +170,7 @@ static void sctp_get_local_addr_list(void) struct sctp_af *af; read_lock(&dev_base_lock); - for (dev = dev_base; dev; dev = dev->next) { + for_each_netdev(dev) { __list_for_each(pos, &sctp_address_families) { af = list_entry(pos, struct sctp_af, list); af->copy_addrlist(&sctp_local_addr_list, dev); @@ -235,13 +235,13 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; - sh = (struct sctphdr *) skb->h.raw; + sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; - from = &skb->nh.iph->saddr; + from = &ip_hdr(skb)->saddr; } else { *port = sh->dest; - from = &skb->nh.iph->daddr; + from = &ip_hdr(skb)->daddr; } memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr)); } @@ -530,7 +530,7 @@ static int sctp_v4_skb_iif(const struct sk_buff *skb) /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { - return INET_ECN_is_ce(skb->nh.iph->tos); + return INET_ECN_is_ce(ip_hdr(skb)->tos); } /* Create and initialize a new sk for the socket returned by accept(). */ @@ -731,15 +731,13 @@ static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, /* Initialize and copy out a msgname from an inbound skb. */ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) { - struct sctphdr *sh; - struct sockaddr_in *sin; - if (msgname) { + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in *sin = (struct sockaddr_in *)msgname; + sctp_inet_msgname(msgname, len); - sin = (struct sockaddr_in *)msgname; - sh = (struct sctphdr *)skb->h.raw; sin->sin_port = sh->source; - sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; } } @@ -977,28 +975,14 @@ SCTP_STATIC __init int sctp_init(void) if (!sctp_sanity_check()) goto out; - status = proto_register(&sctp_prot, 1); - if (status) - goto out; - - /* Add SCTP to inet_protos hash table. */ - status = -EAGAIN; - if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) - goto err_add_protocol; - - /* Add SCTP(TCP and UDP style) to inetsw linked list. */ - inet_register_protosw(&sctp_seqpacket_protosw); - inet_register_protosw(&sctp_stream_protosw); - - /* Allocate a cache pools. */ + /* Allocate bind_bucket and chunk caches. 
*/ status = -ENOBUFS; sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", sizeof(struct sctp_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!sctp_bucket_cachep) - goto err_bucket_cachep; + goto out; sctp_chunk_cachep = kmem_cache_create("sctp_chunk", sizeof(struct sctp_chunk), @@ -1044,7 +1028,7 @@ SCTP_STATIC __init int sctp_init(void) sctp_cookie_preserve_enable = 1; /* Max.Burst - 4 */ - sctp_max_burst = SCTP_MAX_BURST; + sctp_max_burst = SCTP_DEFAULT_MAX_BURST; /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) @@ -1155,6 +1139,14 @@ SCTP_STATIC __init int sctp_init(void) INIT_LIST_HEAD(&sctp_address_families); sctp_register_af(&sctp_ipv4_specific); + status = proto_register(&sctp_prot, 1); + if (status) + goto err_proto_register; + + /* Register SCTP(UDP and TCP style) with socket layer. */ + inet_register_protosw(&sctp_seqpacket_protosw); + inet_register_protosw(&sctp_stream_protosw); + status = sctp_v6_init(); if (status) goto err_v6_init; @@ -1168,19 +1160,39 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize the local address list. */ INIT_LIST_HEAD(&sctp_local_addr_list); - sctp_get_local_addr_list(); /* Register notifier for inet address additions/deletions. */ register_inetaddr_notifier(&sctp_inetaddr_notifier); + /* Register SCTP with inet layer. */ + if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) { + status = -EAGAIN; + goto err_add_protocol; + } + + /* Register SCTP with inet6 layer. */ + status = sctp_v6_add_protocol(); + if (status) + goto err_v6_add_protocol; + __unsafe(THIS_MODULE); status = 0; out: return status; +err_v6_add_protocol: + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); + unregister_inetaddr_notifier(&sctp_inetaddr_notifier); +err_add_protocol: + sctp_free_local_addr_list(); + sock_release(sctp_ctl_socket); err_ctl_sock_init: sctp_v6_exit(); err_v6_init: + inet_unregister_protosw(&sctp_stream_protosw); + inet_unregister_protosw(&sctp_seqpacket_protosw); + proto_unregister(&sctp_prot); +err_proto_register: sctp_sysctl_unregister(); list_del(&sctp_ipv4_specific.list); free_pages((unsigned long)sctp_port_hashtable, @@ -1194,19 +1206,13 @@ err_ehash_alloc: sizeof(struct sctp_hashbucket))); err_ahash_alloc: sctp_dbg_objcnt_exit(); -err_init_proc: sctp_proc_exit(); +err_init_proc: cleanup_sctp_mibs(); err_init_mibs: kmem_cache_destroy(sctp_chunk_cachep); err_chunk_cachep: kmem_cache_destroy(sctp_bucket_cachep); -err_bucket_cachep: - inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); - inet_unregister_protosw(&sctp_seqpacket_protosw); - inet_unregister_protosw(&sctp_stream_protosw); -err_add_protocol: - proto_unregister(&sctp_prot); goto out; } @@ -1217,8 +1223,9 @@ SCTP_STATIC __exit void sctp_exit(void) * up all the remaining associations and all that memory. */ - /* Unregister notifier for inet address additions/deletions. */ - unregister_inetaddr_notifier(&sctp_inetaddr_notifier); + /* Unregister with inet6/inet layers. */ + sctp_v6_del_protocol(); + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); /* Free the local address list. */ sctp_free_local_addr_list(); @@ -1226,7 +1233,16 @@ SCTP_STATIC __exit void sctp_exit(void) /* Free the control endpoint. */ sock_release(sctp_ctl_socket); + /* Cleanup v6 initializations. */ sctp_v6_exit(); + + /* Unregister with socket layer. */ + inet_unregister_protosw(&sctp_stream_protosw); + inet_unregister_protosw(&sctp_seqpacket_protosw); + + /* Unregister notifier for inet address additions/deletions. 
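The reshuffled sctp_init() above now builds everything the stack needs (caches, hash tables, proc entries, socket-layer registration) before the final inet_add_protocol()/sctp_v6_add_protocol() calls make SCTP reachable from the wire, and the error ladder unwinds strictly in reverse. Reduced to two steps with illustrative names, the idiom is:

static int __init example_init(void)
{
	int err;

	err = register_socket_layer_bits();	/* safe, no traffic yet */
	if (err)
		goto out;

	err = register_inet_protocol_bits();	/* last: packets may now arrive */
	if (err)
		goto err_protocol;

	return 0;

err_protocol:
	unregister_socket_layer_bits();		/* unwind in reverse order */
out:
	return err;
}

The matching sctp_exit() below mirrors this: unhook from the inet and inet6 layers first, release the endpoint and address list, then unregister from the socket layer and free the caches last.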
*/ + unregister_inetaddr_notifier(&sctp_inetaddr_notifier); + sctp_sysctl_unregister(); list_del(&sctp_ipv4_specific.list); @@ -1238,16 +1254,13 @@ SCTP_STATIC __exit void sctp_exit(void) get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); - kmem_cache_destroy(sctp_chunk_cachep); - kmem_cache_destroy(sctp_bucket_cachep); - sctp_dbg_objcnt_exit(); sctp_proc_exit(); cleanup_sctp_mibs(); - inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); - inet_unregister_protosw(&sctp_seqpacket_protosw); - inet_unregister_protosw(&sctp_stream_protosw); + kmem_cache_destroy(sctp_chunk_cachep); + kmem_cache_destroy(sctp_bucket_cachep); + proto_unregister(&sctp_prot); } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f7fb29d5a0c..8d18f570c2e 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -86,7 +86,7 @@ int sctp_chunk_iif(const struct sctp_chunk *chunk) struct sctp_af *af; int iif = 0; - af = sctp_get_af_specific(ipver2af(chunk->skb->nh.iph->version)); + af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); if (af) iif = af->skb_iif(chunk->skb); @@ -1143,7 +1143,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); - chunk->chunk_end = chunk->skb->tail; + chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } @@ -1168,7 +1168,7 @@ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int off, int len, /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); - chunk->chunk_end = chunk->skb->tail; + chunk->chunk_end = skb_tail_pointer(chunk->skb); out: return err; @@ -1233,7 +1233,7 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ - af = sctp_get_af_specific(ipver2af(skb->nh.iph->version)); + af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); if (unlikely(!af)) goto fail; af->from_skb(&asoc->c.peer_addr, skb, 1); @@ -1939,7 +1939,6 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, * association. */ if (!asoc->temp) { - int assoc_id; int error; asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams, @@ -1947,19 +1946,9 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, if (!asoc->ssnmap) goto clean_up; - retry: - if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp))) + error = sctp_assoc_set_id(asoc, gfp); + if (error) goto clean_up; - spin_lock_bh(&sctp_assocs_id_lock); - error = idr_get_new_above(&sctp_assocs_id, (void *)asoc, 1, - &assoc_id); - spin_unlock_bh(&sctp_assocs_id_lock); - if (error == -EAGAIN) - goto retry; - else if (error) - goto clean_up; - - asoc->assoc_id = (sctp_assoc_t) assoc_id; } /* ADDIP Section 4.1 ASCONF Chunk Procedures @@ -2077,7 +2066,7 @@ static int sctp_process_param(struct sctp_association *asoc, default: /* Just ignore anything else. 
*/ break; - }; + } } break; @@ -2118,7 +2107,7 @@ static int sctp_process_param(struct sctp_association *asoc, SCTP_DEBUG_PRINTK("Ignoring param: %d for association %p.\n", ntohs(param.p->type), asoc); break; - }; + } return retval; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 13556749311..d9fad4f6ffc 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -464,7 +464,7 @@ static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, struct sctp_ulpevent *event; event = sctp_ulpevent_make_assoc_change(asoc,0, SCTP_CANT_STR_ASSOC, - (__u16)error, 0, 0, + (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) @@ -492,8 +492,13 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, /* Cancel any partial delivery in progress. */ sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); - event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, - (__u16)error, 0, 0, + if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, + (__u16)error, 0, 0, chunk, + GFP_ATOMIC); + else + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, + (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, @@ -857,6 +862,33 @@ static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) sk->sk_err = error; } +/* Helper function to generate an association change event */ +static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + u8 state) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0, + asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); +} + +/* Helper function to generate an adaptation indication event */ +static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands, + struct sctp_association *asoc) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); + + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); +} + /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there. @@ -1004,7 +1036,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, status, state, event_type, subtype.chunk); BUG(); break; - }; + } bail: return error; @@ -1480,11 +1512,20 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_SET_SK_ERR: sctp_cmd_set_sk_err(asoc, cmd->obj.error); break; + case SCTP_CMD_ASSOC_CHANGE: + sctp_cmd_assoc_change(commands, asoc, + cmd->obj.u8); + break; + case SCTP_CMD_ADAPTATION_IND: + sctp_cmd_adaptation_ind(commands, asoc); + break; + default: printk(KERN_WARNING "Impossible command: %u, %p\n", cmd->verb, cmd->obj.ptr); break; - }; + } + if (error) break; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index e9097cf614b..f02ce3dddb7 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -186,7 +186,7 @@ sctp_disposition_t sctp_sf_do_4_C(const struct sctp_endpoint *ep, * notification is passed to the upper layer. 
*/ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, - 0, 0, 0, GFP_ATOMIC); + 0, 0, 0, NULL, GFP_ATOMIC); if (ev) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); @@ -629,7 +629,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep, case -SCTP_IERROR_BAD_SIG: default: return sctp_sf_pdiscard(ep, asoc, type, arg, commands); - }; + } } @@ -661,7 +661,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep, ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0, new_asoc->c.sinit_num_ostreams, new_asoc->c.sinit_max_instreams, - GFP_ATOMIC); + NULL, GFP_ATOMIC); if (!ev) goto nomem_ev; @@ -790,7 +790,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep, ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP, 0, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, - GFP_ATOMIC); + NULL, GFP_ATOMIC); if (!ev) goto nomem; @@ -1195,7 +1195,7 @@ static void sctp_tietags_populate(struct sctp_association *new_asoc, new_asoc->c.my_ttag = asoc->c.my_vtag; new_asoc->c.peer_ttag = asoc->c.peer_vtag; break; - }; + } /* Other parameters for the endpoint SHOULD be copied from the * existing parameters of the association (e.g. number of @@ -1625,7 +1625,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep, ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0, new_asoc->c.sinit_num_ostreams, new_asoc->c.sinit_max_instreams, - GFP_ATOMIC); + NULL, GFP_ATOMIC); if (!ev) goto nomem_ev; @@ -1656,7 +1656,6 @@ static sctp_disposition_t sctp_sf_do_dupcook_b(const struct sctp_endpoint *ep, struct sctp_association *new_asoc) { sctp_init_chunk_t *peer_init; - struct sctp_ulpevent *ev; struct sctp_chunk *repl; /* new_asoc is a brand-new association, so these are not yet @@ -1687,34 +1686,28 @@ static sctp_disposition_t sctp_sf_do_dupcook_b(const struct sctp_endpoint *ep, * D) IMPLEMENTATION NOTE: An implementation may choose to * send the Communication Up notification to the SCTP user * upon reception of a valid COOKIE ECHO chunk. + * + * Sadly, this needs to be implemented as a side-effect, because + * we are not guaranteed to have set the association id of the real + * association and so these notifications need to be delayed until + * the association id is allocated. */ - ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP, 0, - new_asoc->c.sinit_num_ostreams, - new_asoc->c.sinit_max_instreams, - GFP_ATOMIC); - if (!ev) - goto nomem_ev; - sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_CHANGE, SCTP_U8(SCTP_COMM_UP)); /* Sockets API Draft Section 5.3.1.6 * When a peer sends a Adaptation Layer Indication parameter , SCTP * delivers this notification to inform the application that of the * peers requested adaptation layer. + * + * This also needs to be done as a side effect for the same reason as + * above. 
*/ - if (asoc->peer.adaptation_ind) { - ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); - if (!ev) - goto nomem_ev; - - sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, - SCTP_ULPEVENT(ev)); - } + if (asoc->peer.adaptation_ind) + sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL()); return SCTP_DISPOSITION_CONSUME; -nomem_ev: - sctp_chunk_free(repl); nomem: return SCTP_DISPOSITION_NOMEM; } @@ -1786,7 +1779,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_d(const struct sctp_endpoint *ep, SCTP_COMM_UP, 0, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, - GFP_ATOMIC); + NULL, GFP_ATOMIC); if (!ev) goto nomem; @@ -1904,7 +1897,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep, case -SCTP_IERROR_BAD_SIG: default: return sctp_sf_pdiscard(ep, asoc, type, arg, commands); - }; + } } /* Compare the tie_tag in cookie with the verification tag of @@ -1936,7 +1929,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep, default: /* Discard packet for all others. */ retval = sctp_sf_pdiscard(ep, asoc, type, arg, commands); break; - }; + } /* Delete the tempory new association. */ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); @@ -3035,7 +3028,7 @@ sctp_disposition_t sctp_sf_do_9_2_final(const struct sctp_endpoint *ep, * notification is passed to the upper layer. */ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, - 0, 0, 0, GFP_ATOMIC); + 0, 0, 0, NULL, GFP_ATOMIC); if (!ev) goto nomem; @@ -3115,7 +3108,7 @@ sctp_disposition_t sctp_sf_ootb(const struct sctp_endpoint *ep, break; ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - if (ch_end > skb->tail) + if (ch_end > skb_tail_pointer(skb)) break; if (SCTP_CID_SHUTDOWN_ACK == ch->type) @@ -3130,7 +3123,7 @@ sctp_disposition_t sctp_sf_ootb(const struct sctp_endpoint *ep, return sctp_sf_pdiscard(ep, asoc, type, arg, commands); ch = (sctp_chunkhdr_t *) ch_end; - } while (ch_end < skb->tail); + } while (ch_end < skb_tail_pointer(skb)); if (ootb_shut_ack) sctp_sf_shut_8_4_5(ep, asoc, type, arg, commands); @@ -4816,7 +4809,7 @@ sctp_disposition_t sctp_sf_t2_timer_expire(const struct sctp_endpoint *ep, default: BUG(); break; - }; + } if (!reply) goto nomem; @@ -5286,7 +5279,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, chunk->ecn_ce_done = 1; af = sctp_get_af_specific( - ipver2af(chunk->skb->nh.iph->version)); + ipver2af(ip_hdr(chunk->skb)->version)); if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { /* Do real work as sideffect. */ diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 5e54b17377f..523071c7902 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -101,7 +101,7 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, default: /* Yikes! We got an illegal event type. 
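sctp_sf_ootb() above gets the same skb_tail_pointer() conversion as sctp_rcv_ootb() earlier in this diff; both walk the chunk TLVs with an identical bounds guard. The loop shape, with the walk's starting point shown only illustratively:

sctp_chunkhdr_t *ch = (sctp_chunkhdr_t *)skb->data;	/* illustrative start */
__u8 *ch_end;

do {
	/* Stop if the advertised length cannot cover a header, or if the
	 * padded chunk would run past the end of the linear data. */
	if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
		break;
	ch_end = (__u8 *)ch + WORD_ROUND(ntohs(ch->length));
	if (ch_end > skb_tail_pointer(skb))
		break;

	/* per-chunk-type handling goes here */

	ch = (sctp_chunkhdr_t *)ch_end;
} while (ch_end < skb_tail_pointer(skb));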
*/ return &bug; - }; + } } #define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a1d026f12b0..9f1a908776d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -941,7 +941,7 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk, default: err = -EINVAL; break; - }; + } out: kfree(kaddrs); @@ -972,6 +972,7 @@ static int __sctp_connect(struct sock* sk, int walk_size = 0; union sctp_addr *sa_addr; void *addr_buf; + unsigned short port; sp = sctp_sk(sk); ep = sp->ep; @@ -992,6 +993,7 @@ static int __sctp_connect(struct sock* sk, while (walk_size < addrs_size) { sa_addr = (union sctp_addr *)addr_buf; af = sctp_get_af_specific(sa_addr->sa.sa_family); + port = ntohs(sa_addr->v4.sin_port); /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. @@ -1005,6 +1007,12 @@ static int __sctp_connect(struct sock* sk, if (err) goto out_free; + /* Make sure the destination port is correctly set + * in all addresses. + */ + if (asoc && asoc->peer.port && asoc->peer.port != port) + goto out_free; + memcpy(&to, sa_addr, af->sockaddr_len); /* Check if there already is a matching association on the @@ -2039,6 +2047,10 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * SPP_HB_DEMAND - Request a user initiated heartbeat * to be made immediately. * + * SPP_HB_TIME_IS_ZERO - Specify's that the time for + * heartbeat delayis to be set to the value of 0 + * milliseconds. + * * SPP_PMTUD_ENABLE - This field will enable PMTU * discovery upon the specified address. Note that * if the address feild is empty then all addresses @@ -2081,13 +2093,30 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, return error; } - if (params->spp_hbinterval) { - if (trans) { - trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); - } else if (asoc) { - asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); - } else { - sp->hbinterval = params->spp_hbinterval; + /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of + * this field is ignored. Note also that a value of zero indicates + * the current setting should be left unchanged. + */ + if (params->spp_flags & SPP_HB_ENABLE) { + + /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is + * set. This lets us use 0 value when this flag + * is set. + */ + if (params->spp_flags & SPP_HB_TIME_IS_ZERO) + params->spp_hbinterval = 0; + + if (params->spp_hbinterval || + (params->spp_flags & SPP_HB_TIME_IS_ZERO)) { + if (trans) { + trans->hbinterval = + msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = + msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } } } @@ -2104,7 +2133,12 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } - if (params->spp_pathmtu) { + /* When Path MTU discovery is disabled the value specified here will + * be the "fixed" path mtu (i.e. the value of the spp_flags field must + * include the flag SPP_PMTUD_DISABLE for this field to have any + * effect). + */ + if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { if (trans) { trans->pathmtu = params->spp_pathmtu; sctp_assoc_sync_pmtu(asoc); @@ -2135,7 +2169,11 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } - if (params->spp_sackdelay) { + /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the + * value of this field is ignored. 
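The rewritten sctp_apply_peer_addr_params() stops treating any non-zero field as a request and instead honours the spp_flags enable bits, with SPP_HB_TIME_IS_ZERO covering the case a zero interval used to make inexpressible. From user space the option is driven roughly like this sketch (lksctp-tools headers and field names assumed; error handling reduced to the return value):

#include <string.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

static int set_hb_interval(int fd, sctp_assoc_t assoc_id, unsigned int ms)
{
	struct sctp_paddrparams p;

	memset(&p, 0, sizeof(p));	/* zeroed fields stay unchanged */
	p.spp_assoc_id = assoc_id;
	p.spp_hbinterval = ms;		/* milliseconds */
	p.spp_flags = ms ? SPP_HB_ENABLE
			 : (SPP_HB_ENABLE | SPP_HB_TIME_IS_ZERO);

	return setsockopt(fd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
			  &p, sizeof(p));
}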
Note also that a value of zero + * indicates the current setting should be left unchanged. + */ + if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) { if (trans) { trans->sackdelay = msecs_to_jiffies(params->spp_sackdelay); @@ -2163,7 +2201,11 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } - if (params->spp_pathmaxrxt) { + /* Note that unless the spp_flag is set to SPP_PMTUD_ENABLE the value + * of this field is ignored. Note also that a value of zero + * indicates the current setting should be left unchanged. + */ + if ((params->spp_flags & SPP_PMTUD_ENABLE) && params->spp_pathmaxrxt) { if (trans) { trans->pathmaxrxt = params->spp_pathmaxrxt; } else if (asoc) { @@ -2255,7 +2297,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, return 0; } -/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) +/* 7.1.23. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) * * This options will get or set the delayed ack timer. The time is set * in milliseconds. If the assoc_id is 0, then this sets or gets the @@ -2792,6 +2834,102 @@ static int sctp_setsockopt_context(struct sock *sk, char __user *optval, return 0; } +/* + * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) + * + * This options will at a minimum specify if the implementation is doing + * fragmented interleave. Fragmented interleave, for a one to many + * socket, is when subsequent calls to receive a message may return + * parts of messages from different associations. Some implementations + * may allow you to turn this value on or off. If so, when turned off, + * no fragment interleave will occur (which will cause a head of line + * blocking amongst multiple associations sharing the same one to many + * socket). When this option is turned on, then each receive call may + * come from a different association (thus the user must receive data + * with the extended calls (e.g. sctp_recvmsg) to keep track of which + * association each receive belongs to. + * + * This option takes a boolean value. A non-zero value indicates that + * fragmented interleave is on. A value of zero indicates that + * fragmented interleave is off. + * + * Note that it is important that an implementation that allows this + * option to be turned on, have it off by default. Otherwise an unaware + * application using the one to many model may become confused and act + * incorrectly. + */ +static int sctp_setsockopt_fragment_interleave(struct sock *sk, + char __user *optval, + int optlen) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + + return 0; +} + +/* + * 7.1.25. Set or Get the sctp partial delivery point + * (SCTP_PARTIAL_DELIVERY_POINT) + * This option will set or get the SCTP partial delivery point. This + * point is the size of a message where the partial delivery API will be + * invoked to help free up rwnd space for the peer. Setting this to a + * lower value will cause partial delivery's to happen more often. The + * calls argument is an integer that sets or gets the partial delivery + * point. + */ +static int sctp_setsockopt_partial_delivery_point(struct sock *sk, + char __user *optval, + int optlen) +{ + u32 val; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->pd_point = val; + + return 0; /* is this the right error code? */ +} + +/* + * 7.1.28. 
Set or Get the maximum burst (SCTP_MAX_BURST) + * + * This option will allow a user to change the maximum burst of packets + * that can be emitted by this association. Note that the default value + * is 4, and some implementations may restrict this setting so that it + * can only be lowered. + * + * NOTE: This text doesn't seem right. Do this on a socket basis with + * future associations inheriting the socket value. + */ +static int sctp_setsockopt_maxburst(struct sock *sk, + char __user *optval, + int optlen) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + if (val < 0) + return -EINVAL; + + sctp_sk(sk)->max_burst = val; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -2871,6 +3009,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_DELAYED_ACK_TIME: retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); break; + case SCTP_PARTIAL_DELIVERY_POINT: + retval = sctp_setsockopt_partial_delivery_point(sk, optval, optlen); + break; case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); @@ -2906,11 +3047,16 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_CONTEXT: retval = sctp_setsockopt_context(sk, optval, optlen); break; - + case SCTP_FRAGMENT_INTERLEAVE: + retval = sctp_setsockopt_fragment_interleave(sk, optval, optlen); + break; + case SCTP_MAX_BURST: + retval = sctp_setsockopt_maxburst(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; - }; + } sctp_release_sock(sk); @@ -3066,6 +3212,7 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) sp->default_timetolive = 0; sp->default_rcv_context = 0; + sp->max_burst = sctp_max_burst; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or @@ -3134,8 +3281,9 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) sp->pf = sctp_get_pf_specific(sk->sk_family); /* Control variables for partial data delivery. */ - sp->pd_mode = 0; + atomic_set(&sp->pd_mode, 0); skb_queue_head_init(&sp->pd_lobby); + sp->frag_interleave = 0; /* Create a per socket endpoint structure. Even if we * change the data structure relationships, this may still @@ -3642,7 +3790,7 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, return 0; } -/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) +/* 7.1.23. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) * * This options will get or set the delayed ack timer. The time is set * in milliseconds. If the assoc_id is 0, then this sets or gets the @@ -3847,7 +3995,7 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, memcpy(&temp, &from->ipaddr, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; - if(space_left < addrlen) + if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) return -EFAULT; @@ -3936,8 +4084,9 @@ done: /* Helper function that copies local addresses to user and returns the number * of addresses copied. 
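 + * (With this patch the helpers no longer touch user memory themselves: they + * fill a caller-supplied kernel buffer and report the byte count through + * 'bytes_copied'; the callers do a single copy_to_user() afterwards.)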
 */ -static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_addrs, - void __user *to) +static int sctp_copy_laddrs_old(struct sock *sk, __u16 port, + int max_addrs, void *to, + int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -3954,10 +4103,10 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) - return -EFAULT; + memcpy(to, &temp, addrlen); to += addrlen; + *bytes_copied += addrlen; cnt ++; if (cnt >= max_addrs) break; } @@ -3965,8 +4114,8 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add return cnt; } -static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void __user **to, size_t space_left) +static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, + size_t space_left, int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -3983,14 +4132,14 @@ static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_left<addrlen) + if (space_left < addrlen) return -ENOMEM; - if (copy_to_user(*to, &temp, addrlen)) - return -EFAULT; + memcpy(to, &temp, addrlen); - *to += addrlen; + to += addrlen; cnt ++; space_left -= addrlen; + *bytes_copied += addrlen; } return cnt; } @@ -4014,6 +4163,8 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, int addrlen; rwlock_t *addr_lock; int err = 0; + void *addrs; + int bytes_copied = 0; if (len != sizeof(struct sctp_getaddrs_old)) return -EINVAL; @@ -4041,6 +4192,15 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, to = getaddrs.addrs; + /* Allocate space for a local instance of packed array to hold all + * the data. We store addresses here first and then write them + * to the user in one shot. 
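+ * Staging the addresses in a kernel buffer also means copy_to_user(), + * which may fault and sleep, is no longer called while the address + * list read lock is held.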
+ */ + addrs = kmalloc(sizeof(union sctp_addr) * getaddrs.addr_num, + GFP_KERNEL); + if (!addrs) + return -ENOMEM; + sctp_read_lock(addr_lock); /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid @@ -4050,13 +4210,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user_old(sk, bp->port, - getaddrs.addr_num, - to); - if (cnt < 0) { - err = cnt; - goto unlock; - } + cnt = sctp_copy_laddrs_old(sk, bp->port, + getaddrs.addr_num, + addrs, &bytes_copied); goto copy_getaddrs; } } @@ -4066,22 +4222,29 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; - } - to += addrlen; + memcpy(addrs + bytes_copied, &temp, addrlen); + bytes_copied += addrlen; cnt ++; if (cnt >= getaddrs.addr_num) break; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + /* copy the entire address list into the user provided space */ + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } + + /* copy the leading structure back to user */ getaddrs.addr_num = cnt; if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs_old))) err = -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } @@ -4101,7 +4264,8 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, rwlock_t *addr_lock; int err = 0; size_t space_left; - int bytes_copied; + int bytes_copied = 0; + void *addrs; if (len <= sizeof(struct sctp_getaddrs)) return -EINVAL; @@ -4129,6 +4293,9 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs,addrs); space_left = len - sizeof(struct sctp_getaddrs) - offsetof(struct sctp_getaddrs,addrs); + addrs = kmalloc(space_left, GFP_KERNEL); + if (!addrs) + return -ENOMEM; sctp_read_lock(addr_lock); @@ -4139,11 +4306,11 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user(sk, bp->port, - &to, space_left); + cnt = sctp_copy_laddrs(sk, bp->port, addrs, + space_left, &bytes_copied); if (cnt < 0) { err = cnt; - goto unlock; + goto error; } goto copy_getaddrs; } @@ -4154,26 +4321,31 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_left < addrlen) - return -ENOMEM; /*fixme: right error?*/ - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; + if (space_left < addrlen) { + err = -ENOMEM; /*fixme: right error?*/ + goto error; } - to += addrlen; + memcpy(addrs + bytes_copied, &temp, addrlen); + bytes_copied += addrlen; cnt ++; space_left -= addrlen; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) return -EFAULT; - bytes_copied = ((char __user *)to) - optval; if (put_user(bytes_copied, optlen)) return -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } @@ -4536,6 +4708,77 @@ static int 
sctp_getsockopt_maxseg(struct sock *sk, int len, return 0; } +/* + * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) + * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave()) + */ +static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + + val = sctp_sk(sk)->frag_interleave; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.25. Set or Get the sctp partial delivery point + * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point()) + */ +static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + u32 val; + + if (len < sizeof(u32)) + return -EINVAL; + + len = sizeof(u32); + + val = sctp_sk(sk)->pd_point; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) + * (chapter and verse is quoted at sctp_setsockopt_maxburst()) + */ +static int sctp_getsockopt_maxburst(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + + val = sctp_sk(sk)->max_burst; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -4648,10 +4891,21 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_CONTEXT: retval = sctp_getsockopt_context(sk, len, optval, optlen); break; + case SCTP_FRAGMENT_INTERLEAVE: + retval = sctp_getsockopt_fragment_interleave(sk, len, optval, + optlen); + break; + case SCTP_PARTIAL_DELIVERY_POINT: + retval = sctp_getsockopt_partial_delivery_point(sk, len, optval, + optlen); + break; + case SCTP_MAX_BURST: + retval = sctp_getsockopt_maxburst(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; - }; + } sctp_release_sock(sk); return retval; @@ -4766,7 +5020,8 @@ pp_found: struct hlist_node *node; SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n"); - if (pp->fastreuse && sk->sk_reuse) + if (pp->fastreuse && sk->sk_reuse && + sk->sk_state != SCTP_SS_LISTENING) goto success; /* Run through the list of sockets bound to the port @@ -4783,7 +5038,8 @@ pp_found: struct sctp_endpoint *ep2; ep2 = sctp_sk(sk2)->ep; - if (reuse && sk2->sk_reuse) + if (reuse && sk2->sk_reuse && + sk2->sk_state != SCTP_SS_LISTENING) continue; if (sctp_bind_addr_match(&ep2->base.bind_addr, addr, @@ -4804,9 +5060,13 @@ pp_not_found: * if sk->sk_reuse is too (that is, if the caller requested * SO_REUSEADDR on this socket -sk-). */ - if (hlist_empty(&pp->owner)) - pp->fastreuse = sk->sk_reuse ? 1 : 0; - else if (pp->fastreuse && !sk->sk_reuse) + if (hlist_empty(&pp->owner)) { + if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + pp->fastreuse = 1; + else + pp->fastreuse = 0; + } else if (pp->fastreuse && + (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; /* We are set, so fill up all the data in the hash table * sockets FIXME: Blurry, NPI (ipg). 
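 + * A port owned by a listening socket must never be marked fastreuse, + * or a second SO_REUSEADDR socket could bind right over an endpoint + * that is already accepting associations.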
 */ success: - inet_sk(sk)->num = snum; if (!sctp_sk(sk)->bind_hash) { + inet_sk(sk)->num = snum; sk_add_bind_node(sk, &pp->owner); sctp_sk(sk)->bind_hash = pp; } @@ -4888,12 +5148,16 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) * This is not currently spelled out in the SCTP sockets * extensions draft, but follows the practice as seen in TCP * sockets. + * + * Additionally, turn off the fastreuse flag since we are now listening */ + sk->sk_state = SCTP_SS_LISTENING; if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; - } - sk->sk_state = SCTP_SS_LISTENING; + } else + sctp_sk(sk)->bind_hash->fastreuse = 0; + sctp_hash_endpoint(ep); return 0; } @@ -4931,11 +5195,13 @@ SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog) * extensions draft, but follows the practice as seen in TCP * sockets. */ + sk->sk_state = SCTP_SS_LISTENING; if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; - } - sk->sk_state = SCTP_SS_LISTENING; + } else + sctp_sk(sk)->bind_hash->fastreuse = 0; + sk->sk_max_ack_backlog = backlog; sctp_hash_endpoint(ep); return 0; @@ -4976,7 +5242,8 @@ int sctp_inet_listen(struct socket *sock, int backlog) break; default: break; - }; + } + if (err) goto cleanup; @@ -5239,7 +5506,7 @@ SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg, default: return -EINVAL; - }; + } } return 0; } @@ -5742,9 +6009,9 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue. */ skb_queue_head_init(&newsp->pd_lobby); - sctp_sk(newsk)->pd_mode = assoc->ulpq.pd_mode; + atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode); - if (sctp_sk(oldsk)->pd_mode) { + if (atomic_read(&sctp_sk(oldsk)->pd_mode)) { struct sk_buff_head *queue; /* Decide which queue to move pd_lobby skbs to. */ @@ -5770,7 +6037,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * delivery to finish. */ if (assoc->ulpq.pd_mode) - sctp_clear_pd(oldsk); + sctp_clear_pd(oldsk, NULL); } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4d8c2ab864f..961df275d5b 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -507,7 +507,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, transport->cwnd = max(transport->cwnd/2, 4*transport->asoc->pathmtu); break; - }; + } transport->partial_bytes_acked = 0; SCTP_DEBUG_PRINTK("%s: transport: %p reason: %d cwnd: " diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 2e11bc8d5d3..661ea2dd78b 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -131,19 +131,54 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( const struct sctp_association *asoc, __u16 flags, __u16 state, __u16 error, __u16 outbound, - __u16 inbound, gfp_t gfp) + __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_assoc_change *sac; struct sk_buff *skb; - event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), + /* If the lower layer passed in the chunk, it will be + * an ABORT, so we need to include it in the sac_info. + */ + if (chunk) { + /* sctp_inq_pop() has already pulled off the chunk + * header. We need to put it back temporarily + */ + skb_push(chunk->skb, sizeof(sctp_chunkhdr_t)); + + /* Copy the chunk data to a new skb and reserve enough + * head room to use as notification. 
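+ * (skb_copy_expand() takes the extra headroom to reserve as its second + * argument, so the sctp_assoc_change header can simply be skb_push()'ed + * in front of the copied chunk data below.)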
+ */ + skb = skb_copy_expand(chunk->skb, + sizeof(struct sctp_assoc_change), 0, gfp); + + if (!skb) + goto fail; + + /* pull the chunk header back off now that we have a copy */ + skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + /* Include the notification structure */ + sac = (struct sctp_assoc_change *) + skb_push(skb, sizeof(struct sctp_assoc_change)); + + /* Trim the buffer to the right length. */ + skb_trim(skb, sizeof(struct sctp_assoc_change) + + ntohs(chunk->chunk_hdr->length)); + } else { + event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), MSG_NOTIFICATION, gfp); - if (!event) - goto fail; - skb = sctp_event2skb(event); - sac = (struct sctp_assoc_change *) - skb_put(skb, sizeof(struct sctp_assoc_change)); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sac = (struct sctp_assoc_change *) skb_put(skb, + sizeof(struct sctp_assoc_change)); + } /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index b29e3e4b72c..34eb977a204 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -138,26 +138,59 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, /* Clear the partial delivery mode for this socket. Note: This * assumes that no association is currently in partial delivery mode. */ -int sctp_clear_pd(struct sock *sk) +int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) { struct sctp_sock *sp = sctp_sk(sk); - sp->pd_mode = 0; - if (!skb_queue_empty(&sp->pd_lobby)) { - struct list_head *list; - sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue); - list = (struct list_head *)&sctp_sk(sk)->pd_lobby; - INIT_LIST_HEAD(list); - return 1; + if (atomic_dec_and_test(&sp->pd_mode)) { + /* This means there are no other associations in PD, so + * we can go ahead and clear out the lobby in one shot + */ + if (!skb_queue_empty(&sp->pd_lobby)) { + struct list_head *list; + sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue); + list = (struct list_head *)&sctp_sk(sk)->pd_lobby; + INIT_LIST_HEAD(list); + return 1; + } + } else { + /* There are other associations in PD, so we only need to + * pull stuff out of the lobby that belongs to the + * association that is exiting PD (all of its notifications + * are posted here). + */ + if (!skb_queue_empty(&sp->pd_lobby) && asoc) { + struct sk_buff *skb, *tmp; + struct sctp_ulpevent *event; + + sctp_skb_for_each(skb, &sp->pd_lobby, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == asoc) { + __skb_unlink(skb, &sp->pd_lobby); + __skb_queue_tail(&sk->sk_receive_queue, + skb); + } + } + } } + return 0; } +/* Set the pd_mode on the socket and ulpq */ +static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq) +{ + struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk); + + atomic_inc(&sp->pd_mode); + ulpq->pd_mode = 1; +} + /* Clear the pd_mode and restart any pending messages waiting for delivery. */ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) { ulpq->pd_mode = 0; - return sctp_clear_pd(ulpq->asoc->base.sk); + return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } /* If the SKB of 'event' is on a list, it is the first such member @@ -187,25 +220,35 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * the association the cause of the partial delivery. 
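+ * Note that pd_mode on the socket is now a counter rather than a + * boolean, so several associations may be in partial delivery at once + * when fragment interleave is enabled.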
 */ - if (!sctp_sk(sk)->pd_mode) { + if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) { queue = &sk->sk_receive_queue; - } else if (ulpq->pd_mode) { - /* If the association is in partial delivery, we - * need to finish delivering the partially processed - * packet before passing any other data. This is - * because we don't truly support stream interleaving. - */ - if ((event->msg_flags & MSG_NOTIFICATION) || - (SCTP_DATA_NOT_FRAG == - (event->msg_flags & SCTP_DATA_FRAG_MASK))) - queue = &sctp_sk(sk)->pd_lobby; - else { - clear_pd = event->msg_flags & MSG_EOR; - queue = &sk->sk_receive_queue; + } else { + if (ulpq->pd_mode) { + /* If the association is in partial delivery, we + * need to finish delivering the partially processed + * packet before passing any other data. This is + * because we don't truly support stream interleaving. + */ + if ((event->msg_flags & MSG_NOTIFICATION) || + (SCTP_DATA_NOT_FRAG == + (event->msg_flags & SCTP_DATA_FRAG_MASK))) + queue = &sctp_sk(sk)->pd_lobby; + else { + clear_pd = event->msg_flags & MSG_EOR; + queue = &sk->sk_receive_queue; + } + } else { + /* + * If fragment interleave is enabled, we + * can queue this to the receive queue instead + * of the lobby. + */ + if (sctp_sk(sk)->frag_interleave) + queue = &sk->sk_receive_queue; + else + queue = &sctp_sk(sk)->pd_lobby; } - } else - queue = &sctp_sk(sk)->pd_lobby; - + } /* If we are harvesting multiple skbs they will be * collected on a list. @@ -348,7 +391,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *qu break; pos->next = pnext; pos = pnext; - }; + } event = sctp_skb2event(f_frag); SCTP_INC_STATS(SCTP_MIB_REASMUSRMSGS); @@ -367,6 +410,11 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u struct sk_buff *first_frag = NULL; __u32 ctsn, next_tsn; struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + size_t pd_len = 0; + struct sctp_association *asoc; + u32 pd_point; /* Initialized to 0 just to avoid compiler warning message. Will * never be used with this value. It is referenced only after it @@ -382,6 +430,10 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u * we expect to find the remaining middle fragments and the last * fragment in order. If not, first_frag is reset to NULL and we * start the next pass when we find another first fragment. + * + * There is a potential to do partial delivery if the user sets the + * SCTP_PARTIAL_DELIVERY_POINT option. Let's count some things here + * to see if we can do PD. */ skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); @@ -389,14 +441,32 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: + /* If this "FIRST_FRAG" is the first + * element in the queue, then count it towards + * possible PD. 
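+ * For example (hypothetical numbers): with pd_point set to 4096 + * and three in-sequence fragments of 2000 bytes each at the head + * of the queue, pd_len reaches 6000 by the third fragment, so + * partial delivery may be triggered below before a LAST_FRAG has + * even arrived.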
+ */ + if (pos == ulpq->reasm.next) { + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } else { + pd_first = NULL; + pd_last = NULL; + pd_len = 0; + } + first_frag = pos; next_tsn = ctsn + 1; break; case SCTP_DATA_MIDDLE_FRAG: - if ((first_frag) && (ctsn == next_tsn)) + if ((first_frag) && (ctsn == next_tsn)) { next_tsn++; - else + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else first_frag = NULL; break; @@ -406,8 +476,29 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u else first_frag = NULL; break; - }; + } + } + asoc = ulpq->asoc; + if (pd_first) { + /* Make sure we can enter partial delivery. + * We can trigger partial delivery only if fragment + * interleave is set, or the socket is not already + * in partial delivery. + */ + if (!sctp_sk(asoc->base.sk)->frag_interleave && + atomic_read(&sctp_sk(asoc->base.sk)->pd_mode)) + goto done; + + cevent = sctp_skb2event(pd_first); + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(&ulpq->reasm, + pd_first, + pd_last); + if (retval) + sctp_ulpq_set_pd(ulpq); + } } done: return retval; @@ -465,7 +556,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq goto done; default: return NULL; - }; + } } /* We have the reassembled event. There is no need to look @@ -557,7 +648,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u break; default: return NULL; - }; + } } /* We have the reassembled event. There is no need to look @@ -826,19 +917,29 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, { struct sctp_ulpevent *event; struct sctp_association *asoc; + struct sctp_sock *sp; asoc = ulpq->asoc; + sp = sctp_sk(asoc->base.sk); - /* Are we already in partial delivery mode? */ - if (!sctp_sk(asoc->base.sk)->pd_mode) { + /* If the association is already in Partial Delivery mode + * we have nothing to do. + */ + if (ulpq->pd_mode) + return; + /* If the user enabled the fragment interleave socket option, + * multiple associations can enter partial delivery. + * Otherwise, we can only enter partial delivery if the + * socket is not in partial delivery mode. + */ + if (sp->frag_interleave || atomic_read(&sp->pd_mode) == 0) { /* Is partial delivery possible? */ event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { sctp_ulpq_tail_event(ulpq, event); - sctp_sk(asoc->base.sk)->pd_mode = 1; - ulpq->pd_mode = 1; + sctp_ulpq_set_pd(ulpq); return; } } diff --git a/net/socket.c b/net/socket.c index ea8f81abc45..1ad62c08377 100644 --- a/net/socket.c +++ b/net/socket.c @@ -585,6 +585,37 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, return result; } +/* + * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) + */ +void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + ktime_t kt = skb->tstamp; + + if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { + struct timeval tv; + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. */ + if (kt.tv64 == 0) + kt = ktime_get_real(); + skb->tstamp = kt; + tv = ktime_to_timeval(kt); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); + } else { + struct timespec ts; + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. 
*/ + if (kt.tv64 == 0) + kt = ktime_get_real(); + skb->tstamp = kt; + ts = ktime_to_timespec(kt); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); + } +} + +EXPORT_SYMBOL_GPL(__sock_recv_timestamp); + static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -1292,7 +1323,7 @@ asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); - if(sock) { + if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, address); if (err >= 0) { err = security_socket_bind(sock, diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index cdcab9ca4c6..8ebfc4db7f5 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ - pmap_clnt.o timer.o xdr.o \ + rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 104cbf4f769..d158635de6c 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -123,9 +123,6 @@ spkm3_make_token(struct spkm3_ctx *ctx, return GSS_S_COMPLETE; out_err: - if (md5cksum.data) - kfree(md5cksum.data); - token->data = NULL; token->len = 0; return GSS_S_FAILURE; @@ -152,7 +149,7 @@ make_spkm3_checksum(s32 cksumtype, struct xdr_netobj *key, char *header, switch (cksumtype) { case CKSUMTYPE_HMAC_MD5: - cksumname = "md5"; + cksumname = "hmac(md5)"; break; default: dprintk("RPC: spkm3_make_checksum:" @@ -172,8 +169,12 @@ make_spkm3_checksum(s32 cksumtype, struct xdr_netobj *key, char *header, if (err) goto out; + err = crypto_hash_init(&desc); + if (err) + goto out; + sg_set_buf(sg, header, hdrlen); - crypto_hash_update(&desc, sg, 1); + crypto_hash_update(&desc, sg, sg->length); xdr_process_buf(body, body_offset, body->len - body_offset, spkm3_checksummer, &desc); @@ -184,5 +185,3 @@ out: return err ? 
GSS_S_FAILURE : 0; } - -EXPORT_SYMBOL(make_spkm3_checksum); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f02f24ae946..543b085ae2c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1237,20 +1237,12 @@ static int content_open(struct inode *inode, struct file *file) return res; } -static int content_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct handle *han = m->private; - kfree(han); - m->private = NULL; - return seq_release(inode, file); -} static const struct file_operations content_file_operations = { .open = content_open, .read = seq_read, .llseek = seq_lseek, - .release = content_release, + .release = seq_release_private, }; static ssize_t read_flush(struct file *file, char __user *buf, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 396cdbe249d..d8fbee40a19 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -36,8 +36,6 @@ #include <linux/sunrpc/metrics.h> -#define RPC_SLACK_SPACE (1024) /* total overkill */ - #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_CALL #endif @@ -747,21 +745,38 @@ call_reserveresult(struct rpc_task *task) static void call_allocate(struct rpc_task *task) { + unsigned int slack = task->tk_auth->au_cslack; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = task->tk_xprt; - unsigned int bufsiz; + struct rpc_procinfo *proc = task->tk_msg.rpc_proc; dprint_status(task); + task->tk_status = 0; task->tk_action = call_bind; + if (req->rq_buffer) return; - /* FIXME: compute buffer requirements more exactly using - * auth->au_wslack */ - bufsiz = task->tk_msg.rpc_proc->p_bufsiz + RPC_SLACK_SPACE; + if (proc->p_proc != 0) { + BUG_ON(proc->p_arglen == 0); + if (proc->p_decode != NULL) + BUG_ON(proc->p_replen == 0); + } - if (xprt->ops->buf_alloc(task, bufsiz << 1) != NULL) + /* + * Calculate the size (in quads) of the RPC call + * and reply headers, and convert both values + * to byte sizes. + */ + req->rq_callsize = RPC_CALLHDRSIZE + (slack << 1) + proc->p_arglen; + req->rq_callsize <<= 2; + req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; + req->rq_rcvsize <<= 2; + + req->rq_buffer = xprt->ops->buf_alloc(task, + req->rq_callsize + req->rq_rcvsize); + if (req->rq_buffer != NULL) return; dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); @@ -788,6 +803,17 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_snd_buf.len = 0; } +static inline void +rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) +{ + buf->head[0].iov_base = start; + buf->head[0].iov_len = len; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->len = 0; + buf->buflen = len; +} + /* * 3. 
Encode arguments of an RPC call */ @@ -795,28 +821,17 @@ static void call_encode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - struct xdr_buf *sndbuf = &req->rq_snd_buf; - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - unsigned int bufsiz; kxdrproc_t encode; __be32 *p; dprint_status(task); - /* Default buffer setup */ - bufsiz = req->rq_bufsize >> 1; - sndbuf->head[0].iov_base = (void *)req->rq_buffer; - sndbuf->head[0].iov_len = bufsiz; - sndbuf->tail[0].iov_len = 0; - sndbuf->page_len = 0; - sndbuf->len = 0; - sndbuf->buflen = bufsiz; - rcvbuf->head[0].iov_base = (void *)((char *)req->rq_buffer + bufsiz); - rcvbuf->head[0].iov_len = bufsiz; - rcvbuf->tail[0].iov_len = 0; - rcvbuf->page_len = 0; - rcvbuf->len = 0; - rcvbuf->buflen = bufsiz; + rpc_xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + rpc_xdr_buf_init(&req->rq_rcv_buf, + (char *)req->rq_buffer + req->rq_callsize, + req->rq_rcvsize); /* Encode header and provided arguments */ encode = task->tk_msg.rpc_proc->p_encode; @@ -887,9 +902,11 @@ call_bind_status(struct rpc_task *task) task->tk_pid); break; case -EPROTONOSUPPORT: - dprintk("RPC: %5u remote rpcbind version 2 unavailable\n", + dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", task->tk_pid); - break; + task->tk_status = 0; + task->tk_action = call_bind; + return; default: dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", task->tk_pid, -task->tk_status); diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c deleted file mode 100644 index d9f76534458..00000000000 --- a/net/sunrpc/pmap_clnt.c +++ /dev/null @@ -1,383 +0,0 @@ -/* - * linux/net/sunrpc/pmap_clnt.c - * - * In-kernel RPC portmapper client. - * - * Portmapper supports version 2 of the rpcbind protocol (RFC 1833). 
- * - * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> - */ - -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/uio.h> -#include <linux/in.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/sched.h> - -#ifdef RPC_DEBUG -# define RPCDBG_FACILITY RPCDBG_PMAP -#endif - -#define PMAP_SET 1 -#define PMAP_UNSET 2 -#define PMAP_GETPORT 3 - -struct portmap_args { - u32 pm_prog; - u32 pm_vers; - u32 pm_prot; - unsigned short pm_port; - struct rpc_xprt * pm_xprt; -}; - -static struct rpc_procinfo pmap_procedures[]; -static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int, int); -static void pmap_getport_done(struct rpc_task *, void *); -static struct rpc_program pmap_program; - -static void pmap_getport_prepare(struct rpc_task *task, void *calldata) -{ - struct portmap_args *map = calldata; - struct rpc_message msg = { - .rpc_proc = &pmap_procedures[PMAP_GETPORT], - .rpc_argp = map, - .rpc_resp = &map->pm_port, - }; - - rpc_call_setup(task, &msg, 0); -} - -static inline struct portmap_args *pmap_map_alloc(void) -{ - return kmalloc(sizeof(struct portmap_args), GFP_NOFS); -} - -static inline void pmap_map_free(struct portmap_args *map) -{ - kfree(map); -} - -static void pmap_map_release(void *data) -{ - struct portmap_args *map = data; - - xprt_put(map->pm_xprt); - pmap_map_free(map); -} - -static const struct rpc_call_ops pmap_getport_ops = { - .rpc_call_prepare = pmap_getport_prepare, - .rpc_call_done = pmap_getport_done, - .rpc_release = pmap_map_release, -}; - -static inline void pmap_wake_portmap_waiters(struct rpc_xprt *xprt, int status) -{ - xprt_clear_binding(xprt); - rpc_wake_up_status(&xprt->binding, status); -} - -/** - * rpc_getport - obtain the port for a given RPC service on a given host - * @task: task that is waiting for portmapper request - * - * This one can be called for an ongoing RPC request, and can be used in - * an async (rpciod) context. 
- */ -void rpc_getport(struct rpc_task *task) -{ - struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = task->tk_xprt; - struct sockaddr_in addr; - struct portmap_args *map; - struct rpc_clnt *pmap_clnt; - struct rpc_task *child; - int status; - - dprintk("RPC: %5u rpc_getport(%s, %u, %u, %d)\n", - task->tk_pid, clnt->cl_server, - clnt->cl_prog, clnt->cl_vers, xprt->prot); - - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - - status = -EACCES; /* tell caller to check again */ - if (xprt_test_and_set_binding(xprt)) - goto bailout_nowake; - - /* Put self on queue before sending rpcbind request, in case - * pmap_getport_done completes before we return from rpc_run_task */ - rpc_sleep_on(&xprt->binding, task, NULL, NULL); - - /* Someone else may have bound if we slept */ - status = 0; - if (xprt_bound(xprt)) - goto bailout_nofree; - - status = -ENOMEM; - map = pmap_map_alloc(); - if (!map) - goto bailout_nofree; - map->pm_prog = clnt->cl_prog; - map->pm_vers = clnt->cl_vers; - map->pm_prot = xprt->prot; - map->pm_port = 0; - map->pm_xprt = xprt_get(xprt); - - rpc_peeraddr(clnt, (struct sockaddr *) &addr, sizeof(addr)); - pmap_clnt = pmap_create(clnt->cl_server, &addr, map->pm_prot, 0); - status = PTR_ERR(pmap_clnt); - if (IS_ERR(pmap_clnt)) - goto bailout; - - status = -EIO; - child = rpc_run_task(pmap_clnt, RPC_TASK_ASYNC, &pmap_getport_ops, map); - if (IS_ERR(child)) - goto bailout_nofree; - rpc_put_task(child); - - task->tk_xprt->stat.bind_count++; - return; - -bailout: - pmap_map_free(map); - xprt_put(xprt); -bailout_nofree: - pmap_wake_portmap_waiters(xprt, status); -bailout_nowake: - task->tk_status = status; -} - -#ifdef CONFIG_ROOT_NFS -/** - * rpc_getport_external - obtain the port for a given RPC service on a given host - * @sin: address of remote peer - * @prog: RPC program number to bind - * @vers: RPC version number to bind - * @prot: transport protocol to use to make this request - * - * This one is called from outside the RPC client in a synchronous task context. - */ -int rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) -{ - struct portmap_args map = { - .pm_prog = prog, - .pm_vers = vers, - .pm_prot = prot, - .pm_port = 0 - }; - struct rpc_message msg = { - .rpc_proc = &pmap_procedures[PMAP_GETPORT], - .rpc_argp = &map, - .rpc_resp = &map.pm_port, - }; - struct rpc_clnt *pmap_clnt; - char hostname[32]; - int status; - - dprintk("RPC: rpc_getport_external(%u.%u.%u.%u, %u, %u, %d)\n", - NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); - - sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); - pmap_clnt = pmap_create(hostname, sin, prot, 0); - if (IS_ERR(pmap_clnt)) - return PTR_ERR(pmap_clnt); - - /* Setup the call info struct */ - status = rpc_call_sync(pmap_clnt, &msg, 0); - - if (status >= 0) { - if (map.pm_port != 0) - return map.pm_port; - status = -EACCES; - } - return status; -} -#endif - -/* - * Portmapper child task invokes this callback via tk_exit. 
- */ -static void pmap_getport_done(struct rpc_task *child, void *data) -{ - struct portmap_args *map = data; - struct rpc_xprt *xprt = map->pm_xprt; - int status = child->tk_status; - - if (status < 0) { - /* Portmapper not available */ - xprt->ops->set_port(xprt, 0); - } else if (map->pm_port == 0) { - /* Requested RPC service wasn't registered */ - xprt->ops->set_port(xprt, 0); - status = -EACCES; - } else { - /* Succeeded */ - xprt->ops->set_port(xprt, map->pm_port); - xprt_set_bound(xprt); - status = 0; - } - - dprintk("RPC: %5u pmap_getport_done(status %d, port %u)\n", - child->tk_pid, status, map->pm_port); - - pmap_wake_portmap_waiters(xprt, status); -} - -/** - * rpc_register - set or unset a port registration with the local portmapper - * @prog: RPC program number to bind - * @vers: RPC version number to bind - * @prot: transport protocol to use to make this request - * @port: port value to register - * @okay: result code - * - * port == 0 means unregister, port != 0 means register. - */ -int rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; - struct portmap_args map = { - .pm_prog = prog, - .pm_vers = vers, - .pm_prot = prot, - .pm_port = port, - }; - struct rpc_message msg = { - .rpc_proc = &pmap_procedures[port ? PMAP_SET : PMAP_UNSET], - .rpc_argp = &map, - .rpc_resp = okay, - }; - struct rpc_clnt *pmap_clnt; - int error = 0; - - dprintk("RPC: registering (%u, %u, %d, %u) with portmapper.\n", - prog, vers, prot, port); - - pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1); - if (IS_ERR(pmap_clnt)) { - error = PTR_ERR(pmap_clnt); - dprintk("RPC: couldn't create pmap client. Error = %d\n", - error); - return error; - } - - error = rpc_call_sync(pmap_clnt, &msg, 0); - - if (error < 0) { - printk(KERN_WARNING - "RPC: failed to contact portmap (errno %d).\n", - error); - } - dprintk("RPC: registration status %d/%d\n", error, *okay); - - /* Client deleted automatically because cl_oneshot == 1 */ - return error; -} - -static struct rpc_clnt *pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileged) -{ - struct rpc_create_args args = { - .protocol = proto, - .address = (struct sockaddr *)srvaddr, - .addrsize = sizeof(*srvaddr), - .servername = hostname, - .program = &pmap_program, - .version = RPC_PMAP_VERSION, - .authflavor = RPC_AUTH_UNIX, - .flags = (RPC_CLNT_CREATE_ONESHOT | - RPC_CLNT_CREATE_NOPING), - }; - - srvaddr->sin_port = htons(RPC_PMAP_PORT); - if (!privileged) - args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; - return rpc_create(&args); -} - -/* - * XDR encode/decode functions for PMAP - */ -static int xdr_encode_mapping(struct rpc_rqst *req, __be32 *p, struct portmap_args *map) -{ - dprintk("RPC: xdr_encode_mapping(%u, %u, %u, %u)\n", - map->pm_prog, map->pm_vers, - map->pm_prot, map->pm_port); - *p++ = htonl(map->pm_prog); - *p++ = htonl(map->pm_vers); - *p++ = htonl(map->pm_prot); - *p++ = htonl(map->pm_port); - - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int xdr_decode_port(struct rpc_rqst *req, __be32 *p, unsigned short *portp) -{ - *portp = (unsigned short) ntohl(*p++); - return 0; -} - -static int xdr_decode_bool(struct rpc_rqst *req, __be32 *p, unsigned int *boolp) -{ - *boolp = (unsigned int) ntohl(*p++); - return 0; -} - -static struct rpc_procinfo pmap_procedures[] = { -[PMAP_SET] = { - .p_proc = PMAP_SET, - .p_encode = (kxdrproc_t) xdr_encode_mapping, - .p_decode = 
(kxdrproc_t) xdr_decode_bool, - .p_bufsiz = 4, - .p_count = 1, - .p_statidx = PMAP_SET, - .p_name = "SET", - }, -[PMAP_UNSET] = { - .p_proc = PMAP_UNSET, - .p_encode = (kxdrproc_t) xdr_encode_mapping, - .p_decode = (kxdrproc_t) xdr_decode_bool, - .p_bufsiz = 4, - .p_count = 1, - .p_statidx = PMAP_UNSET, - .p_name = "UNSET", - }, -[PMAP_GETPORT] = { - .p_proc = PMAP_GETPORT, - .p_encode = (kxdrproc_t) xdr_encode_mapping, - .p_decode = (kxdrproc_t) xdr_decode_port, - .p_bufsiz = 4, - .p_count = 1, - .p_statidx = PMAP_GETPORT, - .p_name = "GETPORT", - }, -}; - -static struct rpc_version pmap_version2 = { - .number = 2, - .nrprocs = 4, - .procs = pmap_procedures -}; - -static struct rpc_version * pmap_version[] = { - NULL, - NULL, - &pmap_version2 -}; - -static struct rpc_stat pmap_stats; - -static struct rpc_program pmap_program = { - .name = "portmap", - .number = RPC_PMAP_PROGRAM, - .nrvers = ARRAY_SIZE(pmap_version), - .version = pmap_version, - .stats = &pmap_stats, -}; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c new file mode 100644 index 00000000000..6c7aa8a1f0c --- /dev/null +++ b/net/sunrpc/rpcb_clnt.c @@ -0,0 +1,625 @@ +/* + * In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind + * protocol + * + * Based on RFC 1833: "Binding Protocols for ONC RPC Version 2" and + * RFC 3530: "Network File System (NFS) version 4 Protocol" + * + * Original: Gilles Quillard, Bull Open Source, 2005 <gilles.quillard@bull.net> + * Updated: Chuck Lever, Oracle Corporation, 2007 <chuck.lever@oracle.com> + * + * Descended from net/sunrpc/pmap_clnt.c, + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/sched.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_BIND +#endif + +#define RPCBIND_PROGRAM (100000u) +#define RPCBIND_PORT (111u) + +enum { + RPCBPROC_NULL, + RPCBPROC_SET, + RPCBPROC_UNSET, + RPCBPROC_GETPORT, + RPCBPROC_GETADDR = 3, /* alias for GETPORT */ + RPCBPROC_DUMP, + RPCBPROC_CALLIT, + RPCBPROC_BCAST = 5, /* alias for CALLIT */ + RPCBPROC_GETTIME, + RPCBPROC_UADDR2TADDR, + RPCBPROC_TADDR2UADDR, + RPCBPROC_GETVERSADDR, + RPCBPROC_INDIRECT, + RPCBPROC_GETADDRLIST, + RPCBPROC_GETSTAT, +}; + +#define RPCB_HIGHPROC_2 RPCBPROC_CALLIT +#define RPCB_HIGHPROC_3 RPCBPROC_TADDR2UADDR +#define RPCB_HIGHPROC_4 RPCBPROC_GETSTAT + +/* + * r_addr + * + * Quoting RFC 3530, section 2.2: + * + * For TCP over IPv4 and for UDP over IPv4, the format of r_addr is the + * US-ASCII string: + * + * h1.h2.h3.h4.p1.p2 + * + * The prefix, "h1.h2.h3.h4", is the standard textual form for + * representing an IPv4 address, which is always four octets long. + * Assuming big-endian ordering, h1, h2, h3, and h4, are respectively, + * the first through fourth octets each converted to ASCII-decimal. + * Assuming big-endian ordering, p1 and p2 are, respectively, the first + * and second octets each converted to ASCII-decimal. For example, if a + * host, in big-endian order, has an address of 0x0A010307 and there is + * a service listening on, in big endian order, port 0x020F (decimal + * 527), then the complete universal address is "10.1.3.7.2.15". + * + * ... 
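+ * + * (On the client side only the trailing "p1.p2" port digits are ever + * parsed back out of this string; see rpcb_decode_getaddr() below.)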
+ * + * For TCP over IPv6 and for UDP over IPv6, the format of r_addr is the + * US-ASCII string: + * + * x1:x2:x3:x4:x5:x6:x7:x8.p1.p2 + * + * The suffix "p1.p2" is the service port, and is computed the same way + * as with universal addresses for TCP and UDP over IPv4. The prefix, + * "x1:x2:x3:x4:x5:x6:x7:x8", is the standard textual form for + * representing an IPv6 address as defined in Section 2.2 of [RFC2373]. + * Additionally, the two alternative forms specified in Section 2.2 of + * [RFC2373] are also acceptable. + * + * XXX: Currently this implementation does not explicitly convert the + * stored address to US-ASCII on non-ASCII systems. + */ +#define RPCB_MAXADDRLEN (128u) + +/* + * r_netid + * + * Quoting RFC 3530, section 2.2: + * + * For TCP over IPv4 the value of r_netid is the string "tcp". For UDP + * over IPv4 the value of r_netid is the string "udp". + * + * ... + * + * For TCP over IPv6 the value of r_netid is the string "tcp6". For UDP + * over IPv6 the value of r_netid is the string "udp6". + */ +#define RPCB_NETID_UDP "\165\144\160" /* "udp" */ +#define RPCB_NETID_TCP "\164\143\160" /* "tcp" */ +#define RPCB_NETID_UDP6 "\165\144\160\066" /* "udp6" */ +#define RPCB_NETID_TCP6 "\164\143\160\066" /* "tcp6" */ + +#define RPCB_MAXNETIDLEN (4u) + +/* + * r_owner + * + * The "owner" is allowed to unset a service in the rpcbind database. + * We always use the following (arbitrary) fixed string. + */ +#define RPCB_OWNER_STRING "rpcb" +#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) + +static void rpcb_getport_done(struct rpc_task *, void *); +extern struct rpc_program rpcb_program; + +struct rpcbind_args { + struct rpc_xprt * r_xprt; + + u32 r_prog; + u32 r_vers; + u32 r_prot; + unsigned short r_port; + char * r_netid; + char r_addr[RPCB_MAXADDRLEN]; + char * r_owner; +}; + +static struct rpc_procinfo rpcb_procedures2[]; +static struct rpc_procinfo rpcb_procedures3[]; + +static struct rpcb_info { + int rpc_vers; + struct rpc_procinfo * rpc_proc; +} rpcb_next_version[]; + +static void rpcb_getport_prepare(struct rpc_task *task, void *calldata) +{ + struct rpcbind_args *map = calldata; + struct rpc_xprt *xprt = map->r_xprt; + struct rpc_message msg = { + .rpc_proc = rpcb_next_version[xprt->bind_index].rpc_proc, + .rpc_argp = map, + .rpc_resp = &map->r_port, + }; + + rpc_call_setup(task, &msg, 0); +} + +static void rpcb_map_release(void *data) +{ + struct rpcbind_args *map = data; + + xprt_put(map->r_xprt); + kfree(map); +} + +static const struct rpc_call_ops rpcb_getport_ops = { + .rpc_call_prepare = rpcb_getport_prepare, + .rpc_call_done = rpcb_getport_done, + .rpc_release = rpcb_map_release, +}; + +static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status) +{ + xprt_clear_binding(xprt); + rpc_wake_up_status(&xprt->binding, status); +} + +static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, + int proto, int version, int privileged) +{ + struct rpc_create_args args = { + .protocol = proto, + .address = srvaddr, + .addrsize = sizeof(struct sockaddr_in), + .servername = hostname, + .program = &rpcb_program, + .version = version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_ONESHOT | + RPC_CLNT_CREATE_NOPING), + }; + + ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); + if (!privileged) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + return rpc_create(&args); +} + +/** + * rpcb_register - set or unset a port registration with the local rpcbind svc + * @prog: RPC program number to bind + * @vers: RPC version 
number to bind + * @prot: transport protocol to use to make this request + * @port: port value to register + * @okay: result code + * + * port == 0 means unregister, port != 0 means register. + * + * This routine supports only rpcbind version 2. + */ +int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpcbind_args map = { + .r_prog = prog, + .r_vers = vers, + .r_prot = prot, + .r_port = port, + }; + struct rpc_message msg = { + .rpc_proc = &rpcb_procedures2[port ? + RPCBPROC_SET : RPCBPROC_UNSET], + .rpc_argp = &map, + .rpc_resp = okay, + }; + struct rpc_clnt *rpcb_clnt; + int error = 0; + + dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " + "rpcbind\n", (port ? "" : "un"), + prog, vers, prot, port); + + rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, + IPPROTO_UDP, 2, 1); + if (IS_ERR(rpcb_clnt)) + return PTR_ERR(rpcb_clnt); + + error = rpc_call_sync(rpcb_clnt, &msg, 0); + + if (error < 0) + printk(KERN_WARNING "RPC: failed to contact local rpcbind " + "server (errno %d).\n", -error); + dprintk("RPC: registration status %d/%d\n", error, *okay); + + return error; +} + +#ifdef CONFIG_ROOT_NFS +/** + * rpcb_getport_external - obtain the port for an RPC service on a given host + * @sin: address of remote peer + * @prog: RPC program number to bind + * @vers: RPC version number to bind + * @prot: transport protocol to use to make this request + * + * Called from outside the RPC client in a synchronous task context. + * + * For now, this supports only version 2 queries, but is used only by + * mount_clnt for NFS_ROOT. + */ +int rpcb_getport_external(struct sockaddr_in *sin, __u32 prog, + __u32 vers, int prot) +{ + struct rpcbind_args map = { + .r_prog = prog, + .r_vers = vers, + .r_prot = prot, + .r_port = 0, + }; + struct rpc_message msg = { + .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], + .rpc_argp = &map, + .rpc_resp = &map.r_port, + }; + struct rpc_clnt *rpcb_clnt; + char hostname[40]; + int status; + + dprintk("RPC: rpcb_getport_external(%u.%u.%u.%u, %u, %u, %d)\n", + NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); + + sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); + rpcb_clnt = rpcb_create(hostname, (struct sockaddr *)sin, prot, 2, 0); + if (IS_ERR(rpcb_clnt)) + return PTR_ERR(rpcb_clnt); + + status = rpc_call_sync(rpcb_clnt, &msg, 0); + + if (status >= 0) { + if (map.r_port != 0) + return map.r_port; + status = -EACCES; + } + return status; +} +#endif + +/** + * rpcb_getport - obtain the port for a given RPC service on a given host + * @task: task that is waiting for portmapper request + * + * This one can be called for an ongoing RPC request, and can be used in + * an async (rpciod) context. 
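+ * + * Each pass tries the next entry in rpcb_next_version[]; when the remote + * service answers -EPROTONOSUPPORT, rpcb_getport_done() advances + * xprt->bind_index and call_bind_status() retries, so a v2-only + * portmapper is still reached after any v4 and v3 queries fail.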
+ */ +void rpcb_getport(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + int bind_version; + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_clnt *rpcb_clnt; + struct rpcbind_args *map; + struct rpc_task *child; + struct sockaddr addr; + int status; + + dprintk("RPC: %5u rpcb_getport(%s, %u, %u, %d)\n", + task->tk_pid, clnt->cl_server, + clnt->cl_prog, clnt->cl_vers, xprt->prot); + + /* Autobind on cloned rpc clients is discouraged */ + BUG_ON(clnt->cl_parent != clnt); + + if (xprt_test_and_set_binding(xprt)) { + status = -EACCES; /* tell caller to check again */ + dprintk("RPC: %5u rpcb_getport waiting for another binder\n", + task->tk_pid); + goto bailout_nowake; + } + + /* Put self on queue before sending rpcbind request, in case + * rpcb_getport_done completes before we return from rpc_run_task */ + rpc_sleep_on(&xprt->binding, task, NULL, NULL); + + /* Someone else may have bound if we slept */ + if (xprt_bound(xprt)) { + status = 0; + dprintk("RPC: %5u rpcb_getport already bound\n", task->tk_pid); + goto bailout_nofree; + } + + if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) { + xprt->bind_index = 0; + status = -EACCES; /* tell caller to try again later */ + dprintk("RPC: %5u rpcb_getport no more getport versions " + "available\n", task->tk_pid); + goto bailout_nofree; + } + bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; + + dprintk("RPC: %5u rpcb_getport trying rpcbind version %u\n", + task->tk_pid, bind_version); + + map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); + if (!map) { + status = -ENOMEM; + dprintk("RPC: %5u rpcb_getport no memory available\n", + task->tk_pid); + goto bailout_nofree; + } + map->r_prog = clnt->cl_prog; + map->r_vers = clnt->cl_vers; + map->r_prot = xprt->prot; + map->r_port = 0; + map->r_xprt = xprt_get(xprt); + map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP : + RPCB_NETID_UDP; + memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR), + sizeof(map->r_addr)); + map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ + + rpc_peeraddr(clnt, (void *)&addr, sizeof(addr)); + rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0); + if (IS_ERR(rpcb_clnt)) { + status = PTR_ERR(rpcb_clnt); + dprintk("RPC: %5u rpcb_getport rpcb_create failed, error %ld\n", + task->tk_pid, PTR_ERR(rpcb_clnt)); + goto bailout; + } + + child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map); + if (IS_ERR(child)) { + status = -EIO; + dprintk("RPC: %5u rpcb_getport rpc_run_task failed\n", + task->tk_pid); + goto bailout; + } + rpc_put_task(child); + + task->tk_xprt->stat.bind_count++; + return; + +bailout: + kfree(map); + xprt_put(xprt); +bailout_nofree: + rpcb_wake_rpcbind_waiters(xprt, status); +bailout_nowake: + task->tk_status = status; +} + +/* + * Rpcbind child task calls this callback via tk_exit. + */ +static void rpcb_getport_done(struct rpc_task *child, void *data) +{ + struct rpcbind_args *map = data; + struct rpc_xprt *xprt = map->r_xprt; + int status = child->tk_status; + + /* rpcbind server doesn't support this rpcbind protocol version */ + if (status == -EPROTONOSUPPORT) + xprt->bind_index++; + + if (status < 0) { + /* rpcbind server not available on remote host? 
*/ + xprt->ops->set_port(xprt, 0); + } else if (map->r_port == 0) { + /* Requested RPC service wasn't registered on remote host */ + xprt->ops->set_port(xprt, 0); + status = -EACCES; + } else { + /* Succeeded */ + xprt->ops->set_port(xprt, map->r_port); + xprt_set_bound(xprt); + status = 0; + } + + dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n", + child->tk_pid, status, map->r_port); + + rpcb_wake_rpcbind_waiters(xprt, status); +} + +static int rpcb_encode_mapping(struct rpc_rqst *req, __be32 *p, + struct rpcbind_args *rpcb) +{ + dprintk("RPC: rpcb_encode_mapping(%u, %u, %d, %u)\n", + rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); + *p++ = htonl(rpcb->r_prog); + *p++ = htonl(rpcb->r_vers); + *p++ = htonl(rpcb->r_prot); + *p++ = htonl(rpcb->r_port); + + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + return 0; +} + +static int rpcb_decode_getport(struct rpc_rqst *req, __be32 *p, + unsigned short *portp) +{ + *portp = (unsigned short) ntohl(*p++); + dprintk("RPC: rpcb_decode_getport result %u\n", + *portp); + return 0; +} + +static int rpcb_decode_set(struct rpc_rqst *req, __be32 *p, + unsigned int *boolp) +{ + *boolp = (unsigned int) ntohl(*p++); + dprintk("RPC: rpcb_decode_set result %u\n", + *boolp); + return 0; +} + +static int rpcb_encode_getaddr(struct rpc_rqst *req, __be32 *p, + struct rpcbind_args *rpcb) +{ + dprintk("RPC: rpcb_encode_getaddr(%u, %u, %s)\n", + rpcb->r_prog, rpcb->r_vers, rpcb->r_addr); + *p++ = htonl(rpcb->r_prog); + *p++ = htonl(rpcb->r_vers); + + p = xdr_encode_string(p, rpcb->r_netid); + p = xdr_encode_string(p, rpcb->r_addr); + p = xdr_encode_string(p, rpcb->r_owner); + + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + + return 0; +} + +static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, + unsigned short *portp) +{ + char *addr; + int addr_len, c, i, f, first, val; + + *portp = 0; + addr_len = (unsigned int) ntohl(*p++); + if (addr_len > RPCB_MAXADDRLEN) /* sanity */ + return -EINVAL; + + dprintk("RPC: rpcb_decode_getaddr returned string: '%s'\n", + (char *) p); + + addr = (char *)p; + val = 0; + first = 1; + f = 1; + for (i = addr_len - 1; i > 0; i--) { + c = addr[i]; + if (c >= '0' && c <= '9') { + val += (c - '0') * f; + f *= 10; + } else if (c == '.') { + if (first) { + *portp = val; + val = first = 0; + f = 1; + } else { + *portp |= (val << 8); + break; + } + } + } + + dprintk("RPC: rpcb_decode_getaddr port=%u\n", *portp); + return 0; +} + +#define RPCB_program_sz (1u) +#define RPCB_version_sz (1u) +#define RPCB_protocol_sz (1u) +#define RPCB_port_sz (1u) +#define RPCB_boolean_sz (1u) + +#define RPCB_netid_sz (1+XDR_QUADLEN(RPCB_MAXNETIDLEN)) +#define RPCB_addr_sz (1+XDR_QUADLEN(RPCB_MAXADDRLEN)) +#define RPCB_ownerstring_sz (1+XDR_QUADLEN(RPCB_MAXOWNERLEN)) + +#define RPCB_mappingargs_sz RPCB_program_sz+RPCB_version_sz+ \ + RPCB_protocol_sz+RPCB_port_sz +#define RPCB_getaddrargs_sz RPCB_program_sz+RPCB_version_sz+ \ + RPCB_netid_sz+RPCB_addr_sz+ \ + RPCB_ownerstring_sz + +#define RPCB_setres_sz RPCB_boolean_sz +#define RPCB_getportres_sz RPCB_port_sz + +/* + * Note that RFC 1833 does not put any size restrictions on the + * address string returned by the remote rpcbind database. 
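+ * We nevertheless accept at most RPCB_MAXADDRLEN bytes here; + * rpcb_decode_getaddr() fails the reply with -EINVAL beyond that.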
+ */ +#define RPCB_getaddrres_sz RPCB_addr_sz + +#define PROC(proc, argtype, restype) \ + [RPCBPROC_##proc] = { \ + .p_proc = RPCBPROC_##proc, \ + .p_encode = (kxdrproc_t) rpcb_encode_##argtype, \ + .p_decode = (kxdrproc_t) rpcb_decode_##restype, \ + .p_arglen = RPCB_##argtype##args_sz, \ + .p_replen = RPCB_##restype##res_sz, \ + .p_statidx = RPCBPROC_##proc, \ + .p_timer = 0, \ + .p_name = #proc, \ + } + +/* + * Not all rpcbind procedures described in RFC 1833 are implemented + * since the Linux kernel RPC code requires only these. + */ +static struct rpc_procinfo rpcb_procedures2[] = { + PROC(SET, mapping, set), + PROC(UNSET, mapping, set), + PROC(GETADDR, mapping, getport), +}; + +static struct rpc_procinfo rpcb_procedures3[] = { + PROC(SET, mapping, set), + PROC(UNSET, mapping, set), + PROC(GETADDR, getaddr, getaddr), +}; + +static struct rpc_procinfo rpcb_procedures4[] = { + PROC(SET, mapping, set), + PROC(UNSET, mapping, set), + PROC(GETVERSADDR, getaddr, getaddr), +}; + +static struct rpcb_info rpcb_next_version[] = { +#ifdef CONFIG_SUNRPC_BIND34 + { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] }, + { 3, &rpcb_procedures3[RPCBPROC_GETADDR] }, +#endif + { 2, &rpcb_procedures2[RPCBPROC_GETPORT] }, + { 0, NULL }, +}; + +static struct rpc_version rpcb_version2 = { + .number = 2, + .nrprocs = RPCB_HIGHPROC_2, + .procs = rpcb_procedures2 +}; + +static struct rpc_version rpcb_version3 = { + .number = 3, + .nrprocs = RPCB_HIGHPROC_3, + .procs = rpcb_procedures3 +}; + +static struct rpc_version rpcb_version4 = { + .number = 4, + .nrprocs = RPCB_HIGHPROC_4, + .procs = rpcb_procedures4 +}; + +static struct rpc_version *rpcb_version[] = { + NULL, + NULL, + &rpcb_version2, + &rpcb_version3, + &rpcb_version4 +}; + +static struct rpc_stat rpcb_stats; + +struct rpc_program rpcb_program = { + .name = "rpcbind", + .number = RPCBIND_PROGRAM, + .nrvers = ARRAY_SIZE(rpcb_version), + .version = rpcb_version, + .stats = &rpcb_stats, +}; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6d87320074b..4a53e94f813 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -741,50 +741,53 @@ static void rpc_async_schedule(struct work_struct *work) * @task: RPC task that will use this buffer * @size: requested byte size * - * We try to ensure that some NFS reads and writes can always proceed - * by using a mempool when allocating 'small' buffers. + * To prevent rpciod from hanging, this allocator never sleeps, + * returning NULL if the request cannot be serviced immediately. + * The caller can arrange to sleep in a way that is safe for rpciod. + * + * Most requests are 'small' (under 2KiB) and can be serviced from a + * mempool, ensuring that NFS reads and writes can always proceed, + * and that there is good locality of reference for these buffers. + * * In order to avoid memory starvation triggering more writebacks of - * NFS requests, we use GFP_NOFS rather than GFP_KERNEL. + * NFS requests, we avoid using GFP_KERNEL. */ -void * rpc_malloc(struct rpc_task *task, size_t size) +void *rpc_malloc(struct rpc_task *task, size_t size) { - struct rpc_rqst *req = task->tk_rqstp; - gfp_t gfp; + size_t *buf; + gfp_t gfp = RPC_IS_SWAPPER(task) ? 
GFP_ATOMIC : GFP_NOWAIT; - if (task->tk_flags & RPC_TASK_SWAPPER) - gfp = GFP_ATOMIC; + size += sizeof(size_t); + if (size <= RPC_BUFFER_MAXSIZE) + buf = mempool_alloc(rpc_buffer_mempool, gfp); else - gfp = GFP_NOFS; - - if (size > RPC_BUFFER_MAXSIZE) { - req->rq_buffer = kmalloc(size, gfp); - if (req->rq_buffer) - req->rq_bufsize = size; - } else { - req->rq_buffer = mempool_alloc(rpc_buffer_mempool, gfp); - if (req->rq_buffer) - req->rq_bufsize = RPC_BUFFER_MAXSIZE; - } - return req->rq_buffer; + buf = kmalloc(size, gfp); + *buf = size; + dprintk("RPC: %5u allocated buffer of size %u at %p\n", + task->tk_pid, size, buf); + return (void *) ++buf; } /** * rpc_free - free buffer allocated via rpc_malloc - * @task: RPC task with a buffer to be freed + * @buffer: buffer to free * */ -void rpc_free(struct rpc_task *task) +void rpc_free(void *buffer) { - struct rpc_rqst *req = task->tk_rqstp; + size_t size, *buf = (size_t *) buffer; - if (req->rq_buffer) { - if (req->rq_bufsize == RPC_BUFFER_MAXSIZE) - mempool_free(req->rq_buffer, rpc_buffer_mempool); - else - kfree(req->rq_buffer); - req->rq_buffer = NULL; - req->rq_bufsize = 0; - } + if (!buffer) + return; + size = *buf; + buf--; + + dprintk("RPC: freeing buffer of size %u at %p\n", + size, buf); + if (size <= RPC_BUFFER_MAXSIZE) + mempool_free(buf, rpc_buffer_mempool); + else + kfree(buf); } /* diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 634885b0c04..1d377d1ab7f 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -154,7 +154,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) desc.offset = sizeof(struct udphdr); desc.count = skb->len - desc.offset; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) goto no_checksum; desc.csum = csum_partial(skb->data, desc.offset, skb->csum); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b4db53ff143..b7503c103ae 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -757,7 +757,7 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) if (progp->pg_vers[i]->vs_hidden) continue; - error = rpc_register(progp->pg_prog, i, proto, port, &dummy); + error = rpcb_register(progp->pg_prog, i, proto, port, &dummy); if (error < 0) break; if (port && !dummy) { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2772fee9388..22f61aee482 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -798,16 +798,12 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: recvfrom returned error %d\n", -err); } rqstp->rq_addrlen = sizeof(rqstp->rq_addr); - if (skb->tstamp.off_sec == 0) { - struct timeval tv; - - tv.tv_sec = xtime.tv_sec; - tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; - skb_set_timestamp(skb, &tv); + if (skb->tstamp.tv64 == 0) { + skb->tstamp = ktime_get_real(); /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); + svsk->sk_sk->sk_stamp = skb->tstamp; set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... 
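
The reworked rpc_malloc()/rpc_free() above drop the rq_buffer/rq_bufsize bookkeeping from struct rpc_rqst by hiding a size_t header just below the pointer handed to the caller. A minimal user-space sketch of the same idiom, with malloc()/free() standing in for the kernel's mempool/kmalloc split; note that the sketch checks the allocation before writing the header, whereas the kernel code as shown stores *buf unconditionally even though GFP_NOWAIT/GFP_ATOMIC allocations can fail:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Allocate a buffer that remembers its own length in a hidden
	 * size_t header placed just before the returned pointer. */
	static void *sized_malloc(size_t size)
	{
		size_t *buf;

		size += sizeof(size_t);		/* room for the header */
		buf = malloc(size);
		if (buf == NULL)
			return NULL;
		*buf = size;			/* record the total size */
		return buf + 1;			/* hand out memory past it */
	}

	static void sized_free(void *buffer)
	{
		size_t *buf = buffer;

		if (buffer == NULL)
			return;
		buf--;				/* step back onto the header */
		printf("freeing %zu bytes\n", *buf);
		free(buf);
	}

	int main(void)
	{
		char *p = sized_malloc(100);

		if (p != NULL)
			strcpy(p, "looks like an ordinary buffer");
		sized_free(p);
		return 0;
	}
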
*/ /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 456a1451030..5b05b73e4c1 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -823,7 +823,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; - req->rq_bufsize = 0; req->rq_xid = xprt_alloc_xid(xprt); req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); @@ -855,7 +854,7 @@ void xprt_release(struct rpc_task *task) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); - xprt->ops->buf_free(task); + xprt->ops->buf_free(req->rq_buffer); task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); @@ -928,6 +927,7 @@ struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t si xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; + xprt->bind_index = 0; rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a5a32029e72..cc33c5880ab 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1476,7 +1476,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, - .rpcbind = rpc_getport, + .rpcbind = rpcb_getport, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, @@ -1493,7 +1493,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, - .rpcbind = rpc_getport, + .rpcbind = rpcb_getport, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, diff --git a/net/tipc/config.c b/net/tipc/config.c index 14789a82de5..c71337a22d3 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -89,7 +89,7 @@ struct sk_buff *tipc_cfg_reply_alloc(int payload_size) int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, void *tlv_data, int tlv_data_size) { - struct tlv_desc *tlv = (struct tlv_desc *)buf->tail; + struct tlv_desc *tlv = (struct tlv_desc *)skb_tail_pointer(buf); int new_tlv_space = TLV_SPACE(tlv_data_size); if (skb_tailroom(buf) < new_tlv_space) { diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 9be4839e32c..0ee6ded18f3 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -73,7 +73,7 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, clone = skb_clone(buf, GFP_ATOMIC); if (clone) { - clone->nh.raw = clone->data; + skb_reset_network_header(clone); dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; clone->dev = dev; dev->hard_header(clone, dev, ETH_P_TIPC, @@ -99,8 +99,8 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, if (likely(eb_ptr->bearer)) { if (likely(!dev->promiscuity) || - !memcmp(buf->mac.raw,dev->dev_addr,ETH_ALEN) || - !memcmp(buf->mac.raw,dev->broadcast,ETH_ALEN)) { + !memcmp(skb_mac_header(buf), dev->dev_addr, ETH_ALEN) || + !memcmp(skb_mac_header(buf), dev->broadcast, ETH_ALEN)) { size = msg_size((struct tipc_msg *)buf->data); skb_trim(buf, size); if (likely(buf->len == size)) { @@ -120,16 +120,18 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, static int enable_bearer(struct tipc_bearer *tb_ptr) { - struct net_device *dev = dev_base; + struct net_device *dev, *pdev; struct eth_bearer *eb_ptr = ð_bearers[0]; struct eth_bearer 
*stop = ð_bearers[MAX_ETH_BEARERS]; char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; /* Find device with specified name */ - - while (dev && dev->name && strncmp(dev->name, driver_name, IFNAMSIZ)) { - dev = dev->next; - } + dev = NULL; + for_each_netdev(pdev) + if (!strncmp(dev->name, driver_name, IFNAMSIZ)) { + dev = pdev; + break; + } if (!dev) return -ENODEV; @@ -140,7 +142,7 @@ static int enable_bearer(struct tipc_bearer *tb_ptr) return -EDQUOT; if (!eb_ptr->dev) { eb_ptr->dev = dev; - eb_ptr->tipc_packet_type.type = __constant_htons(ETH_P_TIPC); + eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); eb_ptr->tipc_packet_type.dev = dev; eb_ptr->tipc_packet_type.func = recv_msg; eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; diff --git a/net/tipc/link.c b/net/tipc/link.c index 71c2f2fd405..2124f32ef29 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1001,7 +1001,7 @@ static int link_bundle_buf(struct link *l_ptr, return 0; skb_put(bundler, pad + size); - memcpy(bundler->data + to_pos, buf->data, size); + skb_copy_to_linear_data_offset(bundler, to_pos, buf->data, size); msg_set_size(bundler_msg, to_pos + size); msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1); dbg("Packed msg # %u(%u octets) into pos %u in buf(#%u)\n", @@ -1109,8 +1109,8 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf) if (bundler) { msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr); - memcpy(bundler->data, (unchar *)&bundler_hdr, - INT_H_SIZE); + skb_copy_to_linear_data(bundler, &bundler_hdr, + INT_H_SIZE); skb_trim(bundler, INT_H_SIZE); link_bundle_buf(l_ptr, bundler, buf); buf = bundler; @@ -1383,9 +1383,9 @@ again: if (!buf) return -ENOMEM; buf->next = NULL; - memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE); + skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); hsz = msg_hdr_sz(hdr); - memcpy(buf->data + INT_H_SIZE, (unchar *)hdr, hsz); + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, hdr, hsz); msg_dbg(buf_msg(buf), ">BUILD>"); /* Chop up message: */ @@ -1416,8 +1416,8 @@ error: return -EFAULT; } } else - memcpy(buf->data + fragm_crs, sect_crs, sz); - + skb_copy_to_linear_data_offset(buf, fragm_crs, + sect_crs, sz); sect_crs += sz; sect_rest -= sz; fragm_crs += sz; @@ -1442,7 +1442,7 @@ error: buf->next = NULL; prev->next = buf; - memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE); + skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); fragm_crs = INT_H_SIZE; fragm_rest = fragm_sz; msg_dbg(buf_msg(buf)," >BUILD>"); @@ -2130,7 +2130,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, buf = l_ptr->proto_msg_queue; if (!buf) return; - memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg)); + skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); return; } msg_set_timestamp(msg, jiffies_to_msecs(jiffies)); @@ -2143,7 +2143,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, if (!buf) return; - memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg)); + skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); msg_set_size(buf_msg(buf), msg_size); if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { @@ -2319,8 +2319,8 @@ void tipc_link_tunnel(struct link *l_ptr, "unable to send tunnel msg\n"); return; } - memcpy(buf->data, (unchar *)tunnel_hdr, INT_H_SIZE); - memcpy(buf->data + INT_H_SIZE, (unchar *)msg, length); + skb_copy_to_linear_data(buf, tunnel_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, msg, length); dbg("%c->%c:", 
l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane); msg_dbg(buf_msg(buf), ">SEND>"); tipc_link_send_buf(tunnel, buf); @@ -2361,7 +2361,7 @@ void tipc_link_changeover(struct link *l_ptr) buf = buf_acquire(INT_H_SIZE); if (buf) { - memcpy(buf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); + skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE); msg_set_size(&tunnel_hdr, INT_H_SIZE); dbg("%c->%c:", l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane); @@ -2426,8 +2426,9 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel) "unable to send duplicate msg\n"); return; } - memcpy(outbuf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); - memcpy(outbuf->data + INT_H_SIZE, iter->data, length); + skb_copy_to_linear_data(outbuf, &tunnel_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(outbuf, INT_H_SIZE, iter->data, + length); dbg("%c->%c:", l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane); msg_dbg(buf_msg(outbuf), ">SEND>"); @@ -2457,7 +2458,7 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) eb = buf_acquire(size); if (eb) - memcpy(eb->data, (unchar *)msg, size); + skb_copy_to_linear_data(eb, msg, size); return eb; } @@ -2569,7 +2570,7 @@ void tipc_link_recv_bundle(struct sk_buff *buf) if (obuf == NULL) { warn("Link unable to unbundle message(s)\n"); break; - }; + } pos += align(msg_size(buf_msg(obuf))); msg_dbg(buf_msg(obuf), " /"); tipc_net_route_msg(obuf); @@ -2631,9 +2632,9 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) goto exit; } msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); - memcpy(fragm->data, (unchar *)&fragm_hdr, INT_H_SIZE); - memcpy(fragm->data + INT_H_SIZE, crs, fragm_sz); - + skb_copy_to_linear_data(fragm, &fragm_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(fragm, INT_H_SIZE, crs, + fragm_sz); /* Send queued messages first, if any: */ l_ptr->stats.sent_fragments++; @@ -2733,8 +2734,8 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, if (pbuf != NULL) { pbuf->next = *pending; *pending = pbuf; - memcpy(pbuf->data, (unchar *)imsg, msg_data_sz(fragm)); - + skb_copy_to_linear_data(pbuf, imsg, + msg_data_sz(fragm)); /* Prepare buffer for subsequent fragments. */ set_long_msg_seqno(pbuf, long_msg_seq_no); @@ -2750,7 +2751,8 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, u32 fsz = get_fragm_size(pbuf); u32 crs = ((msg_fragm_no(fragm) - 1) * fsz); u32 exp_frags = get_expected_frags(pbuf) - 1; - memcpy(pbuf->data + crs, msg_data(fragm), dsz); + skb_copy_to_linear_data_offset(pbuf, crs, + msg_data(fragm), dsz); buf_discard(fbuf); /* Is message complete? */ diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 62d54906360..35d5ba1d4f4 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,8 +1,8 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems + * Copyright (c) 2000-2007, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems * All rights reserved. 
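
The memcpy()-to-skb conversions running through net/tipc/link.c above (and through port.c, msg.h and wanmain.c below) are purely mechanical, because the new helpers are thin wrappers around memcpy() into the skb's linear data area. Essentially what they expand to, lightly trimmed from include/linux/skbuff.h as introduced in this same series, so it is clear no behaviour changes and only the skb internals become private to the helper:

	#include <linux/skbuff.h>
	#include <linux/string.h>

	static inline void skb_copy_to_linear_data(struct sk_buff *skb,
						   const void *from,
						   const unsigned int len)
	{
		memcpy(skb->data, from, len);
	}

	static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
							  const int offset,
							  const void *from,
							  const unsigned int len)
	{
		memcpy(skb->data + offset, from, len);
	}
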
* * Redistribution and use in source and binary forms, with or without @@ -71,8 +71,11 @@ static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { - u32 word = msg_word(m,w) & ~(mask << pos); - msg_set_word(m, w, (word |= (val << pos))); + val = (val & mask) << pos; + val = htonl(val); + mask = htonl(mask << pos); + m->hdr[w] &= ~mask; + m->hdr[w] |= val; } /* @@ -786,15 +789,16 @@ static inline int msg_build(struct tipc_msg *hdr, *buf = buf_acquire(sz); if (!(*buf)) return -ENOMEM; - memcpy((*buf)->data, (unchar *)hdr, hsz); + skb_copy_to_linear_data(*buf, hdr, hsz); for (res = 1, cnt = 0; res && (cnt < num_sect); cnt++) { if (likely(usrmem)) res = !copy_from_user((*buf)->data + pos, msg_sect[cnt].iov_base, msg_sect[cnt].iov_len); else - memcpy((*buf)->data + pos, msg_sect[cnt].iov_base, - msg_sect[cnt].iov_len); + skb_copy_to_linear_data_offset(*buf, pos, + msg_sect[cnt].iov_base, + msg_sect[cnt].iov_len); pos += msg_sect[cnt].iov_len; } if (likely(res)) diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index b8e1edc2bad..4cdafa2d1d4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -57,7 +57,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) if (rep_buf) { skb_push(rep_buf, hdr_space); - rep_nlh = (struct nlmsghdr *)rep_buf->data; + rep_nlh = nlmsg_hdr(rep_buf); memcpy(rep_nlh, req_nlh, hdr_space); rep_nlh->nlmsg_len = rep_buf->len; genlmsg_unicast(rep_buf, req_nlh->nlmsg_pid); diff --git a/net/tipc/port.c b/net/tipc/port.c index 5f8217d4b45..bcd5da00737 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -464,7 +464,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err) msg_set_size(rmsg, data_sz + hdr_sz); msg_set_nametype(rmsg, msg_nametype(msg)); msg_set_nameinst(rmsg, msg_nameinst(msg)); - memcpy(rbuf->data + hdr_sz, msg_data(msg), data_sz); + skb_copy_to_linear_data_offset(rbuf, hdr_sz, msg_data(msg), data_sz); /* send self-abort message when rejecting on a connected port */ if (msg_connected(msg)) { @@ -1419,7 +1419,7 @@ int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz) return -ENOMEM; skb_push(buf, hsz); - memcpy(buf->data, (unchar *)msg, hsz); + skb_copy_to_linear_data(buf, msg, hsz); destnode = msg_destnode(msg); p_ptr->publ.congested = 1; if (!tipc_port_congested(p_ptr)) { @@ -1555,7 +1555,7 @@ int tipc_forward_buf2name(u32 ref, if (skb_cow(buf, LONG_H_SIZE)) return -ENOMEM; skb_push(buf, LONG_H_SIZE); - memcpy(buf->data, (unchar *)msg, LONG_H_SIZE); + skb_copy_to_linear_data(buf, msg, LONG_H_SIZE); msg_dbg(buf_msg(buf),"PREP:"); if (likely(destport || destnode)) { p_ptr->sent++; @@ -1679,7 +1679,7 @@ int tipc_forward_buf2port(u32 ref, return -ENOMEM; skb_push(buf, DIR_MSG_H_SIZE); - memcpy(buf->data, (unchar *)msg, DIR_MSG_H_SIZE); + skb_copy_to_linear_data(buf, msg, DIR_MSG_H_SIZE); msg_dbg(msg, "buf2port: "); p_ptr->sent++; if (dest->node == tipc_own_addr) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b71739fbe2c..45832fb75ea 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1020,7 +1020,7 @@ restart: if (!err) { buf_crs = (unsigned char *)(TIPC_SKB_CB(buf)->handle); - sz = buf->tail - buf_crs; + sz = skb_tail_pointer(buf) - buf_crs; needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? 
sz : needed; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 606971645b3..aec8cf165e1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1319,7 +1319,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, unix_attach_fds(siocb->scm, skb); unix_get_secdata(siocb->scm, skb); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); if (err) goto out_free; diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 5d2d93dc083..7a19e0ede28 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -277,8 +277,8 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev, skb_push(skb, 7); skb->data[0] = 0; skb->data[1] = NLPID_SNAP; - memcpy(&skb->data[2], wanrouter_oui_ether, - sizeof(wanrouter_oui_ether)); + skb_copy_to_linear_data_offset(skb, 2, wanrouter_oui_ether, + sizeof(wanrouter_oui_ether)); *((unsigned short*)&skb->data[5]) = htons(type); break; @@ -339,7 +339,7 @@ __be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) skb->protocol = ethertype; skb->pkt_type = PACKET_HOST; /* Physically point to point */ skb_pull(skb, cnt); - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); return ethertype; } diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig new file mode 100644 index 00000000000..a228d56a91b --- /dev/null +++ b/net/wireless/Kconfig @@ -0,0 +1,16 @@ +config CFG80211 + tristate "Improved wireless configuration API" + +config WIRELESS_EXT + bool "Wireless extensions" + default n + ---help--- + This option enables the legacy wireless extensions + (wireless network interface configuration via ioctls.) + + Wireless extensions will be replaced by cfg80211 and + will be required only by legacy drivers that implement + wireless extension handlers. + + Say N (if you can) unless you know you need wireless + extensions for external modules. diff --git a/net/wireless/Makefile b/net/wireless/Makefile new file mode 100644 index 00000000000..3a96ae60271 --- /dev/null +++ b/net/wireless/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_WIRELESS_EXT) += wext.o +obj-$(CONFIG_CFG80211) += cfg80211.o + +cfg80211-y += core.o sysfs.o diff --git a/net/wireless/core.c b/net/wireless/core.c new file mode 100644 index 00000000000..7eabd55417a --- /dev/null +++ b/net/wireless/core.c @@ -0,0 +1,224 @@ +/* + * This is the linux wireless configuration interface. 
+ * + * Copyright 2006, 2007 Johannes Berg <johannes@sipsolutions.net> + */ + +#include <linux/if.h> +#include <linux/module.h> +#include <linux/err.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/nl80211.h> +#include <linux/debugfs.h> +#include <linux/notifier.h> +#include <linux/device.h> +#include <net/genetlink.h> +#include <net/cfg80211.h> +#include <net/wireless.h> +#include "core.h" +#include "sysfs.h" + +/* name for sysfs, %d is appended */ +#define PHY_NAME "phy" + +MODULE_AUTHOR("Johannes Berg"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("wireless configuration support"); + +/* RCU might be appropriate here since we usually + * only read the list, and that can happen quite + * often because we need to do it for each command */ +LIST_HEAD(cfg80211_drv_list); +DEFINE_MUTEX(cfg80211_drv_mutex); +static int wiphy_counter; + +/* for debugfs */ +static struct dentry *ieee80211_debugfs_dir; + +/* exported functions */ + +struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) +{ + struct cfg80211_registered_device *drv; + int alloc_size; + + alloc_size = sizeof(*drv) + sizeof_priv; + + drv = kzalloc(alloc_size, GFP_KERNEL); + if (!drv) + return NULL; + + drv->ops = ops; + + mutex_lock(&cfg80211_drv_mutex); + + drv->idx = wiphy_counter; + + /* now increase counter for the next device unless + * it has wrapped previously */ + if (wiphy_counter >= 0) + wiphy_counter++; + + mutex_unlock(&cfg80211_drv_mutex); + + if (unlikely(drv->idx < 0)) { + /* ugh, wrapped! */ + kfree(drv); + return NULL; + } + + /* give it a proper name */ + snprintf(drv->wiphy.dev.bus_id, BUS_ID_SIZE, + PHY_NAME "%d", drv->idx); + + mutex_init(&drv->mtx); + mutex_init(&drv->devlist_mtx); + INIT_LIST_HEAD(&drv->netdev_list); + + device_initialize(&drv->wiphy.dev); + drv->wiphy.dev.class = &ieee80211_class; + drv->wiphy.dev.platform_data = drv; + + return &drv->wiphy; +} +EXPORT_SYMBOL(wiphy_new); + +int wiphy_register(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy); + int res; + + mutex_lock(&cfg80211_drv_mutex); + + res = device_add(&drv->wiphy.dev); + if (res) + goto out_unlock; + + list_add(&drv->list, &cfg80211_drv_list); + + /* add to debugfs */ + drv->wiphy.debugfsdir = + debugfs_create_dir(wiphy_name(&drv->wiphy), + ieee80211_debugfs_dir); + + res = 0; +out_unlock: + mutex_unlock(&cfg80211_drv_mutex); + return res; +} +EXPORT_SYMBOL(wiphy_register); + +void wiphy_unregister(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy); + + /* protect the device list */ + mutex_lock(&cfg80211_drv_mutex); + + BUG_ON(!list_empty(&drv->netdev_list)); + + /* + * Try to grab drv->mtx. If a command is still in progress, + * hopefully the driver will refuse it since it's tearing + * down the device already. We wait for this command to complete + * before unlinking the item from the list. + * Note: as codified by the BUG_ON above we cannot get here if + * a virtual interface is still associated. Hence, we can only + * get to lock contention here if userspace issues a command + * that identified the hardware by wiphy index. 
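
For a driver author, the registration machinery above reduces to a simple pattern: wiphy_new() then wiphy_register() at probe time, wiphy_unregister() then wiphy_free() at teardown. A hypothetical skeleton (all mydrv_* names are invented for illustration); the sizeof_priv bytes requested from wiphy_new() are the per-device area later reachable through wiphy_priv():

	#include <linux/errno.h>
	#include <net/cfg80211.h>

	/* Per-device driver state, co-allocated behind the wiphy. */
	struct mydrv_priv {
		int dummy;
	};

	static struct cfg80211_ops mydrv_cfg_ops = {
		/* no operations are mandatory at this stage of the API */
	};

	static struct wiphy *mydrv_wiphy;

	static int mydrv_probe(void)
	{
		int err;

		/* allocates the wiphy plus sizeof(struct mydrv_priv) bytes */
		mydrv_wiphy = wiphy_new(&mydrv_cfg_ops,
					sizeof(struct mydrv_priv));
		if (!mydrv_wiphy)
			return -ENOMEM;

		err = wiphy_register(mydrv_wiphy);  /* device_add + debugfs */
		if (err) {
			wiphy_free(mydrv_wiphy);    /* drops the device ref */
			return err;
		}
		return 0;
	}

	static void mydrv_remove(void)
	{
		wiphy_unregister(mydrv_wiphy);	/* waits out in-flight ops */
		wiphy_free(mydrv_wiphy);
	}
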
+ */ + mutex_lock(&drv->mtx); + /* unlock again before freeing */ + mutex_unlock(&drv->mtx); + + list_del(&drv->list); + device_del(&drv->wiphy.dev); + debugfs_remove(drv->wiphy.debugfsdir); + + mutex_unlock(&cfg80211_drv_mutex); +} +EXPORT_SYMBOL(wiphy_unregister); + +void cfg80211_dev_free(struct cfg80211_registered_device *drv) +{ + mutex_destroy(&drv->mtx); + mutex_destroy(&drv->devlist_mtx); + kfree(drv); +} + +void wiphy_free(struct wiphy *wiphy) +{ + put_device(&wiphy->dev); +} +EXPORT_SYMBOL(wiphy_free); + +static int cfg80211_netdev_notifier_call(struct notifier_block * nb, + unsigned long state, + void *ndev) +{ + struct net_device *dev = ndev; + struct cfg80211_registered_device *rdev; + + if (!dev->ieee80211_ptr) + return 0; + + rdev = wiphy_to_dev(dev->ieee80211_ptr->wiphy); + + switch (state) { + case NETDEV_REGISTER: + mutex_lock(&rdev->devlist_mtx); + list_add(&dev->ieee80211_ptr->list, &rdev->netdev_list); + if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, + "phy80211")) { + printk(KERN_ERR "wireless: failed to add phy80211 " + "symlink to netdev!\n"); + } + dev->ieee80211_ptr->netdev = dev; + mutex_unlock(&rdev->devlist_mtx); + break; + case NETDEV_UNREGISTER: + mutex_lock(&rdev->devlist_mtx); + if (!list_empty(&dev->ieee80211_ptr->list)) { + sysfs_remove_link(&dev->dev.kobj, "phy80211"); + list_del_init(&dev->ieee80211_ptr->list); + } + mutex_unlock(&rdev->devlist_mtx); + break; + } + + return 0; +} + +static struct notifier_block cfg80211_netdev_notifier = { + .notifier_call = cfg80211_netdev_notifier_call, +}; + +static int cfg80211_init(void) +{ + int err = wiphy_sysfs_init(); + if (err) + goto out_fail_sysfs; + + err = register_netdevice_notifier(&cfg80211_netdev_notifier); + if (err) + goto out_fail_notifier; + + ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); + + return 0; + +out_fail_notifier: + wiphy_sysfs_exit(); +out_fail_sysfs: + return err; +} +module_init(cfg80211_init); + +static void cfg80211_exit(void) +{ + debugfs_remove(ieee80211_debugfs_dir); + unregister_netdevice_notifier(&cfg80211_netdev_notifier); + wiphy_sysfs_exit(); +} +module_exit(cfg80211_exit); diff --git a/net/wireless/core.h b/net/wireless/core.h new file mode 100644 index 00000000000..158db1edb92 --- /dev/null +++ b/net/wireless/core.h @@ -0,0 +1,49 @@ +/* + * Wireless configuration interface internals. 
+ * + * Copyright 2006, 2007 Johannes Berg <johannes@sipsolutions.net> + */ +#ifndef __NET_WIRELESS_CORE_H +#define __NET_WIRELESS_CORE_H +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <net/genetlink.h> +#include <net/wireless.h> +#include <net/cfg80211.h> + +struct cfg80211_registered_device { + struct cfg80211_ops *ops; + struct list_head list; + /* we hold this mutex during any call so that + * we cannot do multiple calls at once, and also + * to avoid the deregister call to proceed while + * any call is in progress */ + struct mutex mtx; + + /* wiphy index, internal only */ + int idx; + + /* associate netdev list */ + struct mutex devlist_mtx; + struct list_head netdev_list; + + /* must be last because of the way we do wiphy_priv(), + * and it should at least be aligned to NETDEV_ALIGN */ + struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN))); +}; + +static inline +struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) +{ + BUG_ON(!wiphy); + return container_of(wiphy, struct cfg80211_registered_device, wiphy); +} + +extern struct mutex cfg80211_drv_mutex; +extern struct list_head cfg80211_drv_list; + +/* free object */ +extern void cfg80211_dev_free(struct cfg80211_registered_device *drv); + +#endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c new file mode 100644 index 00000000000..3ebae144296 --- /dev/null +++ b/net/wireless/sysfs.c @@ -0,0 +1,80 @@ +/* + * This file provides /sys/class/ieee80211/<wiphy name>/ + * and some default attributes. + * + * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> + * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> + * + * This file is GPLv2 as found in COPYING. + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/nl80211.h> +#include <linux/rtnetlink.h> +#include <net/cfg80211.h> +#include "sysfs.h" +#include "core.h" + +static inline struct cfg80211_registered_device *dev_to_rdev( + struct device *dev) +{ + return container_of(dev, struct cfg80211_registered_device, wiphy.dev); +} + +static ssize_t _show_index(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", dev_to_rdev(dev)->idx); +} + +static ssize_t _show_permaddr(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + char *addr = dev_to_rdev(dev)->wiphy.perm_addr; + + return sprintf(buf, "%.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); +} + +static struct device_attribute ieee80211_dev_attrs[] = { + __ATTR(index, S_IRUGO, _show_index, NULL), + __ATTR(macaddress, S_IRUGO, _show_permaddr, NULL), + {} +}; + +static void wiphy_dev_release(struct device *dev) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + + cfg80211_dev_free(rdev); +} + +static int wiphy_uevent(struct device *dev, char **envp, + int num_envp, char *buf, int size) +{ + /* TODO, we probably need stuff here */ + return 0; +} + +struct class ieee80211_class = { + .name = "ieee80211", + .owner = THIS_MODULE, + .dev_release = wiphy_dev_release, + .dev_attrs = ieee80211_dev_attrs, +#ifdef CONFIG_HOTPLUG + .dev_uevent = wiphy_uevent, +#endif +}; + +int wiphy_sysfs_init(void) +{ + return class_register(&ieee80211_class); +} + +void wiphy_sysfs_exit(void) +{ + class_unregister(&ieee80211_class); +} diff --git a/net/wireless/sysfs.h b/net/wireless/sysfs.h new file mode 100644 index 00000000000..65acbebd371 --- /dev/null +++ b/net/wireless/sysfs.h @@ 
-0,0 +1,9 @@ +#ifndef __WIRELESS_SYSFS_H +#define __WIRELESS_SYSFS_H + +extern int wiphy_sysfs_init(void); +extern void wiphy_sysfs_exit(void); + +extern struct class ieee80211_class; + +#endif /* __WIRELESS_SYSFS_H */ diff --git a/net/core/wireless.c b/net/wireless/wext.c index b07fe270a50..d6aaf65192e 100644 --- a/net/core/wireless.c +++ b/net/wireless/wext.c @@ -97,22 +97,10 @@ #include <linux/wireless.h> /* Pretty obvious */ #include <net/iw_handler.h> /* New driver API */ #include <net/netlink.h> +#include <net/wext.h> #include <asm/uaccess.h> /* copy_to_user() */ -/**************************** CONSTANTS ****************************/ - -/* Debugging stuff */ -#undef WE_IOCTL_DEBUG /* Debug IOCTL API */ -#undef WE_RTNETLINK_DEBUG /* Debug RtNetlink API */ -#undef WE_EVENT_DEBUG /* Debug Event dispatcher */ -#undef WE_SPY_DEBUG /* Debug enhanced spy support */ - -/* Options */ -//CONFIG_NET_WIRELESS_RTNETLINK /* Wireless requests over RtNetlink */ -#define WE_EVENT_RTNETLINK /* Propagate events using RtNetlink */ -#define WE_SET_EVENT /* Generate an event on some set commands */ - /************************* GLOBAL VARIABLES *************************/ /* * You should not use global variables, because of re-entrancy. @@ -349,8 +337,7 @@ static const struct iw_ioctl_description standard_ioctl[] = { .max_tokens = sizeof(struct iw_pmksa), }, }; -static const unsigned standard_ioctl_num = (sizeof(standard_ioctl) / - sizeof(struct iw_ioctl_description)); +static const unsigned standard_ioctl_num = ARRAY_SIZE(standard_ioctl); /* * Meta-data about all the additional standard Wireless Extension events @@ -400,8 +387,7 @@ static const struct iw_ioctl_description standard_event[] = { .max_tokens = sizeof(struct iw_pmkid_cand), }, }; -static const unsigned standard_event_num = (sizeof(standard_event) / - sizeof(struct iw_ioctl_description)); +static const unsigned standard_event_num = ARRAY_SIZE(standard_event); /* Size (in bytes) of the various private data types */ static const char iw_priv_type_size[] = { @@ -454,26 +440,24 @@ static const int event_type_pk_size[] = { /* ---------------------------------------------------------------- */ /* * Return the driver handler associated with a specific Wireless Extension. - * Called from various place, so make sure it remains efficient. 
*/ -static inline iw_handler get_handler(struct net_device *dev, - unsigned int cmd) +static iw_handler get_handler(struct net_device *dev, unsigned int cmd) { /* Don't "optimise" the following variable, it will crash */ unsigned int index; /* *MUST* be unsigned */ /* Check if we have some wireless handlers defined */ - if(dev->wireless_handlers == NULL) + if (dev->wireless_handlers == NULL) return NULL; /* Try as a standard command */ index = cmd - SIOCIWFIRST; - if(index < dev->wireless_handlers->num_standard) + if (index < dev->wireless_handlers->num_standard) return dev->wireless_handlers->standard[index]; /* Try as a private command */ index = cmd - SIOCIWFIRSTPRIV; - if(index < dev->wireless_handlers->num_private) + if (index < dev->wireless_handlers->num_private) return dev->wireless_handlers->private[index]; /* Not found */ @@ -484,15 +468,15 @@ static inline iw_handler get_handler(struct net_device *dev, /* * Get statistics out of the driver */ -static inline struct iw_statistics *get_wireless_stats(struct net_device *dev) +static struct iw_statistics *get_wireless_stats(struct net_device *dev) { /* New location */ - if((dev->wireless_handlers != NULL) && + if ((dev->wireless_handlers != NULL) && (dev->wireless_handlers->get_wireless_stats != NULL)) return dev->wireless_handlers->get_wireless_stats(dev); /* Not found */ - return (struct iw_statistics *) NULL; + return NULL; } /* ---------------------------------------------------------------- */ @@ -514,14 +498,14 @@ static inline struct iw_statistics *get_wireless_stats(struct net_device *dev) * netif_running(dev) test. I'm open on that one... * Hopefully, the driver will remember to do a commit in "open()" ;-) */ -static inline int call_commit_handler(struct net_device * dev) +static int call_commit_handler(struct net_device *dev) { - if((netif_running(dev)) && - (dev->wireless_handlers->standard[0] != NULL)) { + if ((netif_running(dev)) && + (dev->wireless_handlers->standard[0] != NULL)) /* Call the commit handler on the driver */ return dev->wireless_handlers->standard[0](dev, NULL, NULL, NULL); - } else + else return 0; /* Command completed successfully */ } @@ -570,14 +554,13 @@ static int iw_handler_get_iwstats(struct net_device * dev, struct iw_statistics *stats; stats = get_wireless_stats(dev); - if (stats != (struct iw_statistics *) NULL) { - + if (stats) { /* Copy statistics to extra */ memcpy(extra, stats, sizeof(struct iw_statistics)); wrqu->data.length = sizeof(struct iw_statistics); /* Check if we need to clear the updated flag */ - if(wrqu->data.flags != 0) + if (wrqu->data.flags != 0) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; return 0; } else @@ -596,12 +579,12 @@ static int iw_handler_get_private(struct net_device * dev, char * extra) { /* Check if the driver has something to export */ - if((dev->wireless_handlers->num_private_args == 0) || + if ((dev->wireless_handlers->num_private_args == 0) || (dev->wireless_handlers->private_args == NULL)) return -EOPNOTSUPP; /* Check if there is enough buffer up there */ - if(wrqu->data.length < dev->wireless_handlers->num_private_args) { + if (wrqu->data.length < dev->wireless_handlers->num_private_args) { /* User space can't know in advance how large the buffer * needs to be. Give it a hint, so that we can support * any size buffer we want somewhat efficiently... 
*/ @@ -636,8 +619,8 @@ static int iw_handler_get_private(struct net_device * dev, /* * Print one entry (line) of /proc/net/wireless */ -static __inline__ void wireless_seq_printf_stats(struct seq_file *seq, - struct net_device *dev) +static void wireless_seq_printf_stats(struct seq_file *seq, + struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = get_wireless_stats(dev); @@ -680,7 +663,7 @@ static int wireless_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations wireless_seq_ops = { +static const struct seq_operations wireless_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, @@ -700,7 +683,7 @@ static const struct file_operations wireless_seq_fops = { .release = seq_release, }; -int __init wireless_proc_init(void) +int __init wext_proc_init(void) { /* Create /proc/net/wireless entry */ if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops)) @@ -735,32 +718,24 @@ static int ioctl_standard_call(struct net_device * dev, int ret = -EINVAL; /* Get the description of the IOCTL */ - if((cmd - SIOCIWFIRST) >= standard_ioctl_num) + if ((cmd - SIOCIWFIRST) >= standard_ioctl_num) return -EOPNOTSUPP; descr = &(standard_ioctl[cmd - SIOCIWFIRST]); -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n", - ifr->ifr_name, cmd); - printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); -#endif /* WE_IOCTL_DEBUG */ - /* Prepare the call */ info.cmd = cmd; info.flags = 0; /* Check if we have a pointer to user space data or not */ - if(descr->header_type != IW_HEADER_TYPE_POINT) { + if (descr->header_type != IW_HEADER_TYPE_POINT) { /* No extra arguments. Trivial to handle */ ret = handler(dev, &info, &(iwr->u), NULL); -#ifdef WE_SET_EVENT /* Generate an event to notify listeners of the change */ - if((descr->flags & IW_DESCR_FLAG_EVENT) && + if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((ret == 0) || (ret == -EIWCOMMIT))) wireless_send_event(dev, cmd, &(iwr->u), NULL); -#endif /* WE_SET_EVENT */ } else { char * extra; int extra_size; @@ -800,19 +775,19 @@ static int ioctl_standard_call(struct net_device * dev, iwr->u.data.length -= essid_compat; /* Check what user space is giving us */ - if(IW_IS_SET(cmd)) { + if (IW_IS_SET(cmd)) { /* Check NULL pointer */ - if((iwr->u.data.pointer == NULL) && + if ((iwr->u.data.pointer == NULL) && (iwr->u.data.length != 0)) return -EFAULT; /* Check if number of token fits within bounds */ - if(iwr->u.data.length > descr->max_tokens) + if (iwr->u.data.length > descr->max_tokens) return -E2BIG; - if(iwr->u.data.length < descr->min_tokens) + if (iwr->u.data.length < descr->min_tokens) return -EINVAL; } else { /* Check NULL pointer */ - if(iwr->u.data.pointer == NULL) + if (iwr->u.data.pointer == NULL) return -EFAULT; /* Save user space buffer size for checking */ user_length = iwr->u.data.length; @@ -822,7 +797,7 @@ static int ioctl_standard_call(struct net_device * dev, * implied by the test at the end. */ /* Support for very large requests */ - if((descr->flags & IW_DESCR_FLAG_NOMAX) && + if ((descr->flags & IW_DESCR_FLAG_NOMAX) && (user_length > descr->max_tokens)) { /* Allow userspace to GET more than max so * we can support any size GET requests. 
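
Seen from user space, everything ioctl_standard_call() is defending against starts life as an ordinary socket ioctl carrying a struct iwreq. A minimal sketch (the interface name "wlan0" is an assumption); SIOCGIWNAME carries no pointer payload, so it exercises the trivial non-IW_HEADER_TYPE_POINT branch shown above:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/wireless.h>

	int main(void)
	{
		struct iwreq wrq;
		int fd = socket(AF_INET, SOCK_DGRAM, 0); /* any socket works */

		if (fd < 0)
			return 1;

		memset(&wrq, 0, sizeof(wrq));
		strncpy(wrq.ifr_name, "wlan0", IFNAMSIZ - 1); /* assumed name */

		/* dev_ioctl() routes this to the wireless dispatcher because
		 * SIOCGIWNAME lies in [SIOCIWFIRST, SIOCIWLAST]. */
		if (ioctl(fd, SIOCGIWNAME, &wrq) == 0)
			printf("%s speaks %s\n", wrq.ifr_name, wrq.u.name);
		else
			perror("SIOCGIWNAME");

		close(fd);
		return 0;
	}
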
@@ -835,20 +810,14 @@ static int ioctl_standard_call(struct net_device * dev, } } -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", - dev->name, extra_size); -#endif /* WE_IOCTL_DEBUG */ - /* Create the kernel buffer */ /* kzalloc ensures NULL-termination for essid_compat */ extra = kzalloc(extra_size, GFP_KERNEL); - if (extra == NULL) { + if (extra == NULL) return -ENOMEM; - } /* If it is a SET, get all the extra data in here */ - if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + if (IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { err = copy_from_user(extra, iwr->u.data.pointer, iwr->u.data.length * descr->token_size); @@ -856,11 +825,6 @@ static int ioctl_standard_call(struct net_device * dev, kfree(extra); return -EFAULT; } -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Got %d bytes\n", - dev->name, - iwr->u.data.length * descr->token_size); -#endif /* WE_IOCTL_DEBUG */ } /* Call the handler */ @@ -871,7 +835,7 @@ static int ioctl_standard_call(struct net_device * dev, /* If we have something to return to the user */ if (!ret && IW_IS_GET(cmd)) { /* Check if there is enough buffer up there */ - if(user_length < iwr->u.data.length) { + if (user_length < iwr->u.data.length) { kfree(extra); return -E2BIG; } @@ -881,18 +845,12 @@ static int ioctl_standard_call(struct net_device * dev, descr->token_size); if (err) ret = -EFAULT; -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n", - dev->name, - iwr->u.data.length * descr->token_size); -#endif /* WE_IOCTL_DEBUG */ } -#ifdef WE_SET_EVENT /* Generate an event to notify listeners of the change */ - if((descr->flags & IW_DESCR_FLAG_EVENT) && + if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((ret == 0) || (ret == -EIWCOMMIT))) { - if(descr->flags & IW_DESCR_FLAG_RESTRICT) + if (descr->flags & IW_DESCR_FLAG_RESTRICT) /* If the event is restricted, don't * export the payload */ wireless_send_event(dev, cmd, &(iwr->u), NULL); @@ -900,14 +858,13 @@ static int ioctl_standard_call(struct net_device * dev, wireless_send_event(dev, cmd, &(iwr->u), extra); } -#endif /* WE_SET_EVENT */ /* Cleanup - I told you it wasn't that long ;-) */ kfree(extra); } /* Call commit handler if needed and defined */ - if(ret == -EIWCOMMIT) + if (ret == -EIWCOMMIT) ret = call_commit_handler(dev); /* Here, we will generate the appropriate event if needed */ @@ -931,10 +888,8 @@ static int ioctl_standard_call(struct net_device * dev, * a iw_handler but process it in your ioctl handler (i.e. use the * old driver API). 
*/ -static inline int ioctl_private_call(struct net_device * dev, - struct ifreq * ifr, - unsigned int cmd, - iw_handler handler) +static int ioctl_private_call(struct net_device *dev, struct ifreq *ifr, + unsigned int cmd, iw_handler handler) { struct iwreq * iwr = (struct iwreq *) ifr; const struct iw_priv_args * descr = NULL; @@ -944,28 +899,18 @@ static inline int ioctl_private_call(struct net_device * dev, int ret = -EINVAL; /* Get the description of the IOCTL */ - for(i = 0; i < dev->wireless_handlers->num_private_args; i++) - if(cmd == dev->wireless_handlers->private_args[i].cmd) { + for (i = 0; i < dev->wireless_handlers->num_private_args; i++) + if (cmd == dev->wireless_handlers->private_args[i].cmd) { descr = &(dev->wireless_handlers->private_args[i]); break; } -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n", - ifr->ifr_name, cmd); - if(descr) { - printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n", - dev->name, descr->name, - descr->set_args, descr->get_args); - } -#endif /* WE_IOCTL_DEBUG */ - /* Compute the size of the set/get arguments */ - if(descr != NULL) { - if(IW_IS_SET(cmd)) { + if (descr != NULL) { + if (IW_IS_SET(cmd)) { int offset = 0; /* For sub-ioctls */ /* Check for sub-ioctl handler */ - if(descr->name[0] == '\0') + if (descr->name[0] == '\0') /* Reserve one int for sub-ioctl index */ offset = sizeof(__u32); @@ -973,7 +918,7 @@ static inline int ioctl_private_call(struct net_device * dev, extra_size = get_priv_size(descr->set_args); /* Does it fits in iwr ? */ - if((descr->set_args & IW_PRIV_SIZE_FIXED) && + if ((descr->set_args & IW_PRIV_SIZE_FIXED) && ((extra_size + offset) <= IFNAMSIZ)) extra_size = 0; } else { @@ -981,7 +926,7 @@ static inline int ioctl_private_call(struct net_device * dev, extra_size = get_priv_size(descr->get_args); /* Does it fits in iwr ? */ - if((descr->get_args & IW_PRIV_SIZE_FIXED) && + if ((descr->get_args & IW_PRIV_SIZE_FIXED) && (extra_size <= IFNAMSIZ)) extra_size = 0; } @@ -992,7 +937,7 @@ static inline int ioctl_private_call(struct net_device * dev, info.flags = 0; /* Check if we have a pointer to user space data or not. */ - if(extra_size == 0) { + if (extra_size == 0) { /* No extra arguments. Trivial to handle */ ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u)); } else { @@ -1000,46 +945,33 @@ static inline int ioctl_private_call(struct net_device * dev, int err; /* Check what user space is giving us */ - if(IW_IS_SET(cmd)) { + if (IW_IS_SET(cmd)) { /* Check NULL pointer */ - if((iwr->u.data.pointer == NULL) && + if ((iwr->u.data.pointer == NULL) && (iwr->u.data.length != 0)) return -EFAULT; /* Does it fits within bounds ? */ - if(iwr->u.data.length > (descr->set_args & + if (iwr->u.data.length > (descr->set_args & IW_PRIV_SIZE_MASK)) return -E2BIG; - } else { - /* Check NULL pointer */ - if(iwr->u.data.pointer == NULL) - return -EFAULT; - } - -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", - dev->name, extra_size); -#endif /* WE_IOCTL_DEBUG */ + } else if (iwr->u.data.pointer == NULL) + return -EFAULT; /* Always allocate for max space. Easier, and won't last * long... 
*/ extra = kmalloc(extra_size, GFP_KERNEL); - if (extra == NULL) { + if (extra == NULL) return -ENOMEM; - } /* If it is a SET, get all the extra data in here */ - if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + if (IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { err = copy_from_user(extra, iwr->u.data.pointer, extra_size); if (err) { kfree(extra); return -EFAULT; } -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Got %d elem\n", - dev->name, iwr->u.data.length); -#endif /* WE_IOCTL_DEBUG */ } /* Call the handler */ @@ -1059,10 +991,6 @@ static inline int ioctl_private_call(struct net_device * dev, extra_size); if (err) ret = -EFAULT; -#ifdef WE_IOCTL_DEBUG - printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n", - dev->name, iwr->u.data.length); -#endif /* WE_IOCTL_DEBUG */ } /* Cleanup - I told you it wasn't that long ;-) */ @@ -1071,7 +999,7 @@ static inline int ioctl_private_call(struct net_device * dev, /* Call commit handler if needed and defined */ - if(ret == -EIWCOMMIT) + if (ret == -EIWCOMMIT) ret = call_commit_handler(dev); return ret; @@ -1079,11 +1007,10 @@ static inline int ioctl_private_call(struct net_device * dev, /* ---------------------------------------------------------------- */ /* - * Main IOCTl dispatcher. Called from the main networking code - * (dev_ioctl() in net/core/dev.c). + * Main IOCTl dispatcher. * Check the type of IOCTL and call the appropriate wrapper... */ -int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) +static int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) { struct net_device *dev; iw_handler handler; @@ -1098,789 +1025,54 @@ int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) /* A bunch of special cases, then the generic case... * Note that 'cmd' is already filtered in dev_ioctl() with * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ - switch(cmd) - { - case SIOCGIWSTATS: - /* Get Wireless Stats */ - return ioctl_standard_call(dev, - ifr, - cmd, - &iw_handler_get_iwstats); - - case SIOCGIWPRIV: - /* Check if we have some wireless handlers defined */ - if(dev->wireless_handlers != NULL) { - /* We export to user space the definition of - * the private handler ourselves */ - return ioctl_standard_call(dev, - ifr, - cmd, - &iw_handler_get_private); - } - // ## Fall-through for old API ## - default: - /* Generic IOCTL */ - /* Basic check */ - if (!netif_device_present(dev)) - return -ENODEV; - /* New driver API : try to find the handler */ - handler = get_handler(dev, cmd); - if(handler != NULL) { - /* Standard and private are not the same */ - if(cmd < SIOCIWFIRSTPRIV) - return ioctl_standard_call(dev, - ifr, - cmd, - handler); - else - return ioctl_private_call(dev, - ifr, - cmd, - handler); - } - /* Old driver API : call driver ioctl handler */ - if (dev->do_ioctl) { - return dev->do_ioctl(dev, ifr, cmd); - } - return -EOPNOTSUPP; - } - /* Not reached */ - return -EINVAL; -} - -/********************** RTNETLINK REQUEST API **********************/ -/* - * The alternate user space API to configure all those Wireless Extensions - * is through RtNetlink. - * This API support only the new driver API (iw_handler). - * - * This RtNetlink API use the same query/reply model as the ioctl API. - * Maximum effort has been done to fit in the RtNetlink model, and - * we support both RtNetlink Set and RtNelink Get operations. 
- * On the other hand, we don't offer Dump operations because of the - * following reasons : - * o Large number of parameters, most optional - * o Large size of some parameters (> 100 bytes) - * o Each parameters need to be extracted from hardware - * o Scan requests can take seconds and disable network activity. - * Because of this high cost/overhead, we want to return only the - * parameters the user application is really interested in. - * We could offer partial Dump using the IW_DESCR_FLAG_DUMP flag. - * - * The API uses the standard RtNetlink socket. When the RtNetlink code - * find a IFLA_WIRELESS field in a RtNetlink SET_LINK request, - * it calls here. - */ - -#ifdef CONFIG_NET_WIRELESS_RTNETLINK -/* ---------------------------------------------------------------- */ -/* - * Wrapper to call a standard Wireless Extension GET handler. - * We do various checks and call the handler with the proper args. - */ -static int rtnetlink_standard_get(struct net_device * dev, - struct iw_event * request, - int request_len, - iw_handler handler, - char ** p_buf, - int * p_len) -{ - const struct iw_ioctl_description * descr = NULL; - unsigned int cmd; - union iwreq_data * wrqu; - int hdr_len; - struct iw_request_info info; - char * buffer = NULL; - int buffer_size = 0; - int ret = -EINVAL; - - /* Get the description of the Request */ - cmd = request->cmd; - if((cmd - SIOCIWFIRST) >= standard_ioctl_num) - return -EOPNOTSUPP; - descr = &(standard_ioctl[cmd - SIOCIWFIRST]); - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Found standard handler for 0x%04X\n", - dev->name, cmd); - printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Check if wrqu is complete */ - hdr_len = event_type_size[descr->header_type]; - if(request_len < hdr_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG - "%s (WE.r) : Wireless request too short (%d)\n", - dev->name, request_len); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - - /* Prepare the call */ - info.cmd = cmd; - info.flags = 0; - - /* Check if we have extra data in the reply or not */ - if(descr->header_type != IW_HEADER_TYPE_POINT) { - - /* Create the kernel buffer that we will return. - * It's at an offset to match the TYPE_POINT case... */ - buffer_size = request_len + IW_EV_POINT_OFF; - buffer = kmalloc(buffer_size, GFP_KERNEL); - if (buffer == NULL) { - return -ENOMEM; - } - /* Copy event data */ - memcpy(buffer + IW_EV_POINT_OFF, request, request_len); - /* Use our own copy of wrqu */ - wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF - + IW_EV_LCP_PK_LEN); - - /* No extra arguments. Trivial to handle */ - ret = handler(dev, &info, wrqu, NULL); - - } else { - union iwreq_data wrqu_point; - char * extra = NULL; - int extra_size = 0; + if (cmd == SIOCGIWSTATS) + return ioctl_standard_call(dev, ifr, cmd, + &iw_handler_get_iwstats); - /* Get a temp copy of wrqu (skip pointer) */ - memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF, - ((char *) request) + IW_EV_LCP_PK_LEN, - IW_EV_POINT_LEN - IW_EV_LCP_PK_LEN); - - /* Calculate space needed by arguments. Always allocate - * for max space. Easier, and won't last long... 
*/ - extra_size = descr->max_tokens * descr->token_size; - /* Support for very large requests */ - if((descr->flags & IW_DESCR_FLAG_NOMAX) && - (wrqu_point.data.length > descr->max_tokens)) - extra_size = (wrqu_point.data.length - * descr->token_size); - buffer_size = extra_size + IW_EV_POINT_PK_LEN + IW_EV_POINT_OFF; -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n", - dev->name, extra_size, buffer_size); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Create the kernel buffer that we will return */ - buffer = kmalloc(buffer_size, GFP_KERNEL); - if (buffer == NULL) { - return -ENOMEM; - } - - /* Put wrqu in the right place (just before extra). - * Leave space for IWE header and dummy pointer... - * Note that IW_EV_LCP_PK_LEN==4 bytes, so it's still aligned. - */ - memcpy(buffer + IW_EV_LCP_PK_LEN + IW_EV_POINT_OFF, - ((char *) &wrqu_point) + IW_EV_POINT_OFF, - IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); - wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_PK_LEN); - - /* Extra comes logically after that. Offset +12 bytes. */ - extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_PK_LEN; - - /* Call the handler */ - ret = handler(dev, &info, wrqu, extra); - - /* Calculate real returned length */ - extra_size = (wrqu->data.length * descr->token_size); - /* Re-adjust reply size */ - request->len = extra_size + IW_EV_POINT_PK_LEN; - - /* Put the iwe header where it should, i.e. scrap the - * dummy pointer. */ - memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_PK_LEN); - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Check if there is enough buffer up there */ - if(wrqu_point.data.length < wrqu->data.length) - ret = -E2BIG; - } - - /* Return the buffer to the caller */ - if (!ret) { - *p_buf = buffer; - *p_len = request->len; - } else { - /* Cleanup */ - if(buffer) - kfree(buffer); - } - - return ret; -} - -/* ---------------------------------------------------------------- */ -/* - * Wrapper to call a standard Wireless Extension SET handler. - * We do various checks and call the handler with the proper args. - */ -static inline int rtnetlink_standard_set(struct net_device * dev, - struct iw_event * request, - int request_len, - iw_handler handler) -{ - const struct iw_ioctl_description * descr = NULL; - unsigned int cmd; - union iwreq_data * wrqu; - union iwreq_data wrqu_point; - int hdr_len; - char * extra = NULL; - int extra_size = 0; - struct iw_request_info info; - int ret = -EINVAL; - - /* Get the description of the Request */ - cmd = request->cmd; - if((cmd - SIOCIWFIRST) >= standard_ioctl_num) - return -EOPNOTSUPP; - descr = &(standard_ioctl[cmd - SIOCIWFIRST]); - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Found standard SET handler for 0x%04X\n", - dev->name, cmd); - printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Extract fixed header from request. This is properly aligned. 
*/ - wrqu = (union iwreq_data *) (((char *) request) + IW_EV_LCP_PK_LEN); - - /* Check if wrqu is complete */ - hdr_len = event_type_pk_size[descr->header_type]; - if(request_len < hdr_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG - "%s (WE.r) : Wireless request too short (%d)\n", - dev->name, request_len); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - - /* Prepare the call */ - info.cmd = cmd; - info.flags = 0; - - /* Check if we have extra data in the request or not */ - if(descr->header_type != IW_HEADER_TYPE_POINT) { - - /* No extra arguments. Trivial to handle */ - ret = handler(dev, &info, wrqu, NULL); - - } else { - int extra_len; - - /* Put wrqu in the right place (skip pointer) */ - memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF, - wrqu, IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); - /* Don't forget about the event code... */ - wrqu = &wrqu_point; - - /* Check if number of token fits within bounds */ - if(wrqu_point.data.length > descr->max_tokens) - return -E2BIG; - if(wrqu_point.data.length < descr->min_tokens) - return -EINVAL; - - /* Real length of payload */ - extra_len = wrqu_point.data.length * descr->token_size; - - /* Check if request is self consistent */ - if((request_len - hdr_len) < extra_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n", - dev->name, extra_size); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n", - dev->name, extra_size); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Always allocate for max space. Easier, and won't last - * long... */ - extra_size = descr->max_tokens * descr->token_size; - extra = kmalloc(extra_size, GFP_KERNEL); - if (extra == NULL) - return -ENOMEM; - - /* Copy extra in aligned buffer */ - memcpy(extra, ((char *) request) + hdr_len, extra_len); - - /* Call the handler */ - ret = handler(dev, &info, &wrqu_point, extra); - } - -#ifdef WE_SET_EVENT - /* Generate an event to notify listeners of the change */ - if((descr->flags & IW_DESCR_FLAG_EVENT) && - ((ret == 0) || (ret == -EIWCOMMIT))) { - if(descr->flags & IW_DESCR_FLAG_RESTRICT) - /* If the event is restricted, don't - * export the payload */ - wireless_send_event(dev, cmd, wrqu, NULL); - else - wireless_send_event(dev, cmd, wrqu, extra); - } -#endif /* WE_SET_EVENT */ - - /* Cleanup - I told you it wasn't that long ;-) */ - if(extra) - kfree(extra); - - /* Call commit handler if needed and defined */ - if(ret == -EIWCOMMIT) - ret = call_commit_handler(dev); - - return ret; -} - -/* ---------------------------------------------------------------- */ -/* - * Wrapper to call a private Wireless Extension GET handler. - * Same as above... - * It's not as nice and slimline as the standard wrapper. The cause - * is struct iw_priv_args, which was not really designed for the - * job we are going here. - * - * IMPORTANT : This function prevent to set and get data on the same - * IOCTL and enforce the SET/GET convention. Not doing it would be - * far too hairy... - * If you need to set and get data at the same time, please don't use - * a iw_handler but process it in your ioctl handler (i.e. use the - * old driver API). 
- */ -static inline int rtnetlink_private_get(struct net_device * dev, - struct iw_event * request, - int request_len, - iw_handler handler, - char ** p_buf, - int * p_len) -{ - const struct iw_priv_args * descr = NULL; - unsigned int cmd; - union iwreq_data * wrqu; - int hdr_len; - struct iw_request_info info; - int extra_size = 0; - int i; - char * buffer = NULL; - int buffer_size = 0; - int ret = -EINVAL; - - /* Get the description of the Request */ - cmd = request->cmd; - for(i = 0; i < dev->wireless_handlers->num_private_args; i++) - if(cmd == dev->wireless_handlers->private_args[i].cmd) { - descr = &(dev->wireless_handlers->private_args[i]); - break; - } - if(descr == NULL) - return -EOPNOTSUPP; - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n", - dev->name, cmd); - printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n", - dev->name, descr->name, descr->set_args, descr->get_args); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Compute the max size of the get arguments */ - extra_size = get_priv_size(descr->get_args); - - /* Does it fits in wrqu ? */ - if((descr->get_args & IW_PRIV_SIZE_FIXED) && - (extra_size <= IFNAMSIZ)) { - hdr_len = extra_size; - extra_size = 0; - } else { - hdr_len = IW_EV_POINT_PK_LEN; - } - - /* Check if wrqu is complete */ - if(request_len < hdr_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG - "%s (WE.r) : Wireless request too short (%d)\n", - dev->name, request_len); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - - /* Prepare the call */ - info.cmd = cmd; - info.flags = 0; - - /* Check if we have a pointer to user space data or not. */ - if(extra_size == 0) { - - /* Create the kernel buffer that we will return. - * It's at an offset to match the TYPE_POINT case... */ - buffer_size = request_len + IW_EV_POINT_OFF; - buffer = kmalloc(buffer_size, GFP_KERNEL); - if (buffer == NULL) { - return -ENOMEM; - } - /* Copy event data */ - memcpy(buffer + IW_EV_POINT_OFF, request, request_len); - /* Use our own copy of wrqu */ - wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF - + IW_EV_LCP_PK_LEN); - - /* No extra arguments. Trivial to handle */ - ret = handler(dev, &info, wrqu, (char *) wrqu); - - } else { - char * extra; - - /* Buffer for full reply */ - buffer_size = extra_size + IW_EV_POINT_PK_LEN + IW_EV_POINT_OFF; - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n", - dev->name, extra_size, buffer_size); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Create the kernel buffer that we will return */ - buffer = kmalloc(buffer_size, GFP_KERNEL); - if (buffer == NULL) { - return -ENOMEM; - } - - /* Put wrqu in the right place (just before extra). - * Leave space for IWE header and dummy pointer... - * Note that IW_EV_LCP_PK_LEN==4 bytes, so it's still aligned. - */ - memcpy(buffer + IW_EV_LCP_PK_LEN + IW_EV_POINT_OFF, - ((char *) request) + IW_EV_LCP_PK_LEN, - IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); - wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_PK_LEN); - - /* Extra comes logically after that. Offset +12 bytes. */ - extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_PK_LEN; - - /* Call the handler */ - ret = handler(dev, &info, wrqu, extra); - - /* Adjust for the actual length if it's variable, - * avoid leaking kernel bits outside. */ - if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) - extra_size = adjust_priv_size(descr->get_args, wrqu); - /* Re-adjust reply size */ - request->len = extra_size + IW_EV_POINT_PK_LEN; - - /* Put the iwe header where it should, i.e. 
scrap the - * dummy pointer. */ - memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_PK_LEN); - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size); -#endif /* WE_RTNETLINK_DEBUG */ - } - - /* Return the buffer to the caller */ - if (!ret) { - *p_buf = buffer; - *p_len = request->len; - } else { - /* Cleanup */ - if(buffer) - kfree(buffer); - } - - return ret; -} - -/* ---------------------------------------------------------------- */ -/* - * Wrapper to call a private Wireless Extension SET handler. - * Same as above... - * It's not as nice and slimline as the standard wrapper. The cause - * is struct iw_priv_args, which was not really designed for the - * job we are going here. - * - * IMPORTANT : This function prevent to set and get data on the same - * IOCTL and enforce the SET/GET convention. Not doing it would be - * far too hairy... - * If you need to set and get data at the same time, please don't use - * a iw_handler but process it in your ioctl handler (i.e. use the - * old driver API). - */ -static inline int rtnetlink_private_set(struct net_device * dev, - struct iw_event * request, - int request_len, - iw_handler handler) -{ - const struct iw_priv_args * descr = NULL; - unsigned int cmd; - union iwreq_data * wrqu; - union iwreq_data wrqu_point; - int hdr_len; - char * extra = NULL; - int extra_size = 0; - int offset = 0; /* For sub-ioctls */ - struct iw_request_info info; - int i; - int ret = -EINVAL; - - /* Get the description of the Request */ - cmd = request->cmd; - for(i = 0; i < dev->wireless_handlers->num_private_args; i++) - if(cmd == dev->wireless_handlers->private_args[i].cmd) { - descr = &(dev->wireless_handlers->private_args[i]); - break; - } - if(descr == NULL) - return -EOPNOTSUPP; - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n", - ifr->ifr_name, cmd); - printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n", - dev->name, descr->name, descr->set_args, descr->get_args); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Compute the size of the set arguments */ - /* Check for sub-ioctl handler */ - if(descr->name[0] == '\0') - /* Reserve one int for sub-ioctl index */ - offset = sizeof(__u32); - - /* Size of set arguments */ - extra_size = get_priv_size(descr->set_args); - - /* Does it fits in wrqu ? */ - if((descr->set_args & IW_PRIV_SIZE_FIXED) && - (extra_size <= IFNAMSIZ)) { - hdr_len = IW_EV_LCP_PK_LEN + extra_size; - extra_size = 0; - } else { - hdr_len = IW_EV_POINT_PK_LEN; - } - - /* Extract fixed header from request. This is properly aligned. */ - wrqu = (union iwreq_data *) (((char *) request) + IW_EV_LCP_PK_LEN); - - /* Check if wrqu is complete */ - if(request_len < hdr_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG - "%s (WE.r) : Wireless request too short (%d)\n", - dev->name, request_len); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - - /* Prepare the call */ - info.cmd = cmd; - info.flags = 0; - - /* Check if we have a pointer to user space data or not. */ - if(extra_size == 0) { - - /* No extra arguments. Trivial to handle */ - ret = handler(dev, &info, wrqu, (char *) wrqu); - - } else { - int extra_len; - - /* Put wrqu in the right place (skip pointer) */ - memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF, - wrqu, IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); - - /* Does it fits within bounds ? 
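The size computations in these private wrappers decode the packed iw_priv_args format; get_priv_size() itself is not shown in this hunk, but it is most likely along these lines (reconstructed from the IW_PRIV_* masks, so treat it as a sketch):

    static const char iw_priv_type_size[] = {
            0,                              /* IW_PRIV_TYPE_NONE */
            1,                              /* IW_PRIV_TYPE_BYTE */
            1,                              /* IW_PRIV_TYPE_CHAR */
            0,                              /* not defined */
            sizeof(__u32),                  /* IW_PRIV_TYPE_INT */
            sizeof(struct iw_freq),         /* IW_PRIV_TYPE_FLOAT */
            sizeof(struct sockaddr),        /* IW_PRIV_TYPE_ADDR */
            0,                              /* not defined */
    };

    static inline int get_priv_size(__u16 args)
    {
            int num  = args & IW_PRIV_SIZE_MASK;
            int type = (args & IW_PRIV_TYPE_MASK) >> 12;

            return num * iw_priv_type_size[type];
    }

The bounds test that follows applies the same IW_PRIV_SIZE_MASK encoding to the caller-supplied length.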
*/ - if(wrqu_point.data.length > (descr->set_args & - IW_PRIV_SIZE_MASK)) - return -E2BIG; - - /* Real length of payload */ - extra_len = adjust_priv_size(descr->set_args, &wrqu_point); - - /* Check if request is self consistent */ - if((request_len - hdr_len) < extra_len) { -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n", - dev->name, extra_size); -#endif /* WE_RTNETLINK_DEBUG */ - return -EINVAL; - } - -#ifdef WE_RTNETLINK_DEBUG - printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n", - dev->name, extra_size); -#endif /* WE_RTNETLINK_DEBUG */ - - /* Always allocate for max space. Easier, and won't last - * long... */ - extra = kmalloc(extra_size, GFP_KERNEL); - if (extra == NULL) - return -ENOMEM; - - /* Copy extra in aligned buffer */ - memcpy(extra, ((char *) request) + hdr_len, extra_len); - - /* Call the handler */ - ret = handler(dev, &info, &wrqu_point, extra); - - /* Cleanup - I told you it wasn't that long ;-) */ - kfree(extra); - } - - /* Call commit handler if needed and defined */ - if(ret == -EIWCOMMIT) - ret = call_commit_handler(dev); - - return ret; -} - -/* ---------------------------------------------------------------- */ -/* - * Main RtNetlink dispatcher. Called from the main networking code - * (do_getlink() in net/core/rtnetlink.c). - * Check the type of Request and call the appropriate wrapper... - */ -int wireless_rtnetlink_get(struct net_device * dev, - char * data, - int len, - char ** p_buf, - int * p_len) -{ - struct iw_event * request = (struct iw_event *) data; - iw_handler handler; - - /* Check length */ - if(len < IW_EV_LCP_PK_LEN) { - printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n", - dev->name, len); - return -EINVAL; - } - - /* ReCheck length (len may have padding) */ - if(request->len > len) { - printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n", - dev->name, request->len, len); - return -EINVAL; - } - - /* Only accept GET requests in here */ - if(!IW_IS_GET(request->cmd)) - return -EOPNOTSUPP; - - /* If command is `get the encoding parameters', check if - * the user has the right to do it */ - if (request->cmd == SIOCGIWENCODE || - request->cmd == SIOCGIWENCODEEXT) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - } - - /* Special cases */ - if(request->cmd == SIOCGIWSTATS) - /* Get Wireless Stats */ - return rtnetlink_standard_get(dev, - request, - request->len, - &iw_handler_get_iwstats, - p_buf, p_len); - if(request->cmd == SIOCGIWPRIV) { - /* Check if we have some wireless handlers defined */ - if(dev->wireless_handlers == NULL) - return -EOPNOTSUPP; - /* Get Wireless Stats */ - return rtnetlink_standard_get(dev, - request, - request->len, - &iw_handler_get_private, - p_buf, p_len); - } + if (cmd == SIOCGIWPRIV && dev->wireless_handlers) + return ioctl_standard_call(dev, ifr, cmd, + &iw_handler_get_private); /* Basic check */ if (!netif_device_present(dev)) return -ENODEV; - /* Try to find the handler */ - handler = get_handler(dev, request->cmd); - if(handler != NULL) { + /* New driver API : try to find the handler */ + handler = get_handler(dev, cmd); + if (handler) { /* Standard and private are not the same */ - if(request->cmd < SIOCIWFIRSTPRIV) - return rtnetlink_standard_get(dev, - request, - request->len, - handler, - p_buf, p_len); + if (cmd < SIOCIWFIRSTPRIV) + return ioctl_standard_call(dev, ifr, cmd, handler); else - return rtnetlink_private_get(dev, - request, - request->len, - handler, - p_buf, p_len); + return ioctl_private_call(dev, ifr, cmd, 
handler); } - + /* Old driver API : call driver ioctl handler */ + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); return -EOPNOTSUPP; } -/* ---------------------------------------------------------------- */ -/* - * Main RtNetlink dispatcher. Called from the main networking code - * (do_setlink() in net/core/rtnetlink.c). - * Check the type of Request and call the appropriate wrapper... - */ -int wireless_rtnetlink_set(struct net_device * dev, - char * data, - int len) +/* entry point from dev ioctl */ +int wext_handle_ioctl(struct ifreq *ifr, unsigned int cmd, + void __user *arg) { - struct iw_event * request = (struct iw_event *) data; - iw_handler handler; - - /* Check length */ - if(len < IW_EV_LCP_PK_LEN) { - printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n", - dev->name, len); - return -EINVAL; - } - - /* ReCheck length (len may have padding) */ - if(request->len > len) { - printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n", - dev->name, request->len, len); - return -EINVAL; - } - - /* Only accept SET requests in here */ - if(!IW_IS_SET(request->cmd)) - return -EOPNOTSUPP; - - /* Basic check */ - if (!netif_device_present(dev)) - return -ENODEV; + int ret; - /* New driver API : try to find the handler */ - handler = get_handler(dev, request->cmd); - if(handler != NULL) { - /* Standard and private are not the same */ - if(request->cmd < SIOCIWFIRSTPRIV) - return rtnetlink_standard_set(dev, - request, - request->len, - handler); - else - return rtnetlink_private_set(dev, - request, - request->len, - handler); - } - - return -EOPNOTSUPP; + /* If command is `set a parameter', or + * `get the encoding parameters', check if + * the user has the right to do it */ + if ((IW_IS_SET(cmd) || cmd == SIOCGIWENCODE || cmd == SIOCGIWENCODEEXT) + && !capable(CAP_NET_ADMIN)) + return -EPERM; + + dev_load(ifr->ifr_name); + rtnl_lock(); + ret = wireless_process_ioctl(ifr, cmd); + rtnl_unlock(); + if (IW_IS_GET(cmd) && copy_to_user(arg, ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; } -#endif /* CONFIG_NET_WIRELESS_RTNETLINK */ - /************************* EVENT PROCESSING *************************/ /* @@ -1888,7 +1080,6 @@ int wireless_rtnetlink_set(struct net_device * dev, * Most often, the event will be propagated through rtnetlink */ -#ifdef WE_EVENT_RTNETLINK /* ---------------------------------------------------------------- */ /* * Locking... @@ -1933,15 +1124,12 @@ static DECLARE_TASKLET(wireless_nlevent_tasklet, wireless_nlevent_process, 0); * current wireless config. Dumping the wireless config is far too * expensive (for each parameter, the driver need to query the hardware). 
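With the rtnetlink wrappers gone, wext_handle_ioctl() above is the single entry point, reached from the generic dev_ioctl() path: dev_load() pulls in the interface's module and everything runs under rtnl_lock. From user space it remains an ordinary interface ioctl (standalone sketch; the device name is an assumption):

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/wireless.h>

    int main(void)
    {
            struct iwreq wrq;
            int fd = socket(AF_INET, SOCK_DGRAM, 0);

            if (fd < 0)
                    return 1;
            memset(&wrq, 0, sizeof(wrq));
            strncpy(wrq.ifr_name, "eth1", IFNAMSIZ);  /* assumed device */

            /* a GET: passes the capability check without CAP_NET_ADMIN */
            if (ioctl(fd, SIOCGIWNAME, &wrq) == 0)
                    printf("%s: %s\n", wrq.ifr_name, wrq.u.name);
            return 0;
    }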
*/ -static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb, - struct net_device * dev, - int type, - char * event, - int event_len) +static int rtnetlink_fill_iwinfo(struct sk_buff *skb, struct net_device *dev, + int type, char *event, int event_len) { struct ifinfomsg *r; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); r = NLMSG_DATA(nlh); @@ -1955,12 +1143,12 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb, /* Add the wireless events in the netlink packet */ RTA_PUT(skb, IFLA_WIRELESS, event_len, event); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1971,9 +1159,7 @@ rtattr_failure: * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field * within a RTM_NEWLINK event. */ -static inline void rtmsg_iwinfo(struct net_device * dev, - char * event, - int event_len) +static void rtmsg_iwinfo(struct net_device *dev, char *event, int event_len) { struct sk_buff *skb; int size = NLMSG_GOODSIZE; @@ -1992,8 +1178,6 @@ static inline void rtmsg_iwinfo(struct net_device * dev, tasklet_schedule(&wireless_nlevent_tasklet); } -#endif /* WE_EVENT_RTNETLINK */ - /* ---------------------------------------------------------------- */ /* * Main event dispatcher. Called from other parts and drivers. @@ -2015,17 +1199,17 @@ void wireless_send_event(struct net_device * dev, unsigned cmd_index; /* *MUST* be unsigned */ /* Get the description of the Event */ - if(cmd <= SIOCIWLAST) { + if (cmd <= SIOCIWLAST) { cmd_index = cmd - SIOCIWFIRST; - if(cmd_index < standard_ioctl_num) + if (cmd_index < standard_ioctl_num) descr = &(standard_ioctl[cmd_index]); } else { cmd_index = cmd - IWEVFIRST; - if(cmd_index < standard_event_num) + if (cmd_index < standard_event_num) descr = &(standard_event[cmd_index]); } /* Don't accept unknown events */ - if(descr == NULL) { + if (descr == NULL) { /* Note : we don't return an error to the driver, because * the driver would not know what to do about it. 
It can't * return an error to the user, because the event is not @@ -2037,63 +1221,50 @@ void wireless_send_event(struct net_device * dev, dev->name, cmd); return; } -#ifdef WE_EVENT_DEBUG - printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n", - dev->name, cmd); - printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); -#endif /* WE_EVENT_DEBUG */ /* Check extra parameters and set extra_len */ - if(descr->header_type == IW_HEADER_TYPE_POINT) { + if (descr->header_type == IW_HEADER_TYPE_POINT) { /* Check if number of token fits within bounds */ - if(wrqu->data.length > descr->max_tokens) { + if (wrqu->data.length > descr->max_tokens) { printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length); return; } - if(wrqu->data.length < descr->min_tokens) { + if (wrqu->data.length < descr->min_tokens) { printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length); return; } /* Calculate extra_len - extra is NULL for restricted events */ - if(extra != NULL) + if (extra != NULL) extra_len = wrqu->data.length * descr->token_size; /* Always at an offset in wrqu */ wrqu_off = IW_EV_POINT_OFF; -#ifdef WE_EVENT_DEBUG - printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len); -#endif /* WE_EVENT_DEBUG */ } /* Total length of the event */ hdr_len = event_type_size[descr->header_type]; event_len = hdr_len + extra_len; -#ifdef WE_EVENT_DEBUG - printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len); -#endif /* WE_EVENT_DEBUG */ - /* Create temporary buffer to hold the event */ event = kmalloc(event_len, GFP_ATOMIC); - if(event == NULL) + if (event == NULL) return; /* Fill event */ event->len = event_len; event->cmd = cmd; memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN); - if(extra != NULL) + if (extra) memcpy(((char *) event) + hdr_len, extra, extra_len); -#ifdef WE_EVENT_RTNETLINK /* Send via the RtNetlink event channel */ rtmsg_iwinfo(dev, (char *) event, event_len); -#endif /* WE_EVENT_RTNETLINK */ /* Cleanup */ kfree(event); return; /* Always success, I guess ;-) */ } +EXPORT_SYMBOL(wireless_send_event); /********************** ENHANCED IWSPY SUPPORT **********************/ /* @@ -2113,11 +1284,11 @@ void wireless_send_event(struct net_device * dev, * Because this is called on the Rx path via wireless_spy_update(), * we want it to be efficient... */ -static inline struct iw_spy_data * get_spydata(struct net_device *dev) +static inline struct iw_spy_data *get_spydata(struct net_device *dev) { /* This is the new way */ - if(dev->wireless_data) - return(dev->wireless_data->spy_data); + if (dev->wireless_data) + return dev->wireless_data->spy_data; return NULL; } @@ -2134,7 +1305,7 @@ int iw_handler_set_spy(struct net_device * dev, struct sockaddr * address = (struct sockaddr *) extra; /* Make sure driver is not buggy or using the old API */ - if(!spydata) + if (!spydata) return -EOPNOTSUPP; /* Disable spy collection while we copy the addresses. @@ -2151,29 +1322,16 @@ int iw_handler_set_spy(struct net_device * dev, smp_wmb(); /* Are there are addresses to copy? 
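wireless_send_event(), now exported right at its definition above, is what drivers call from RX or interrupt context (hence the GFP_ATOMIC allocation and the tasklet on the delivery side). The canonical association notification, for instance, looks like this (driver fragment; bssid, the new AP address, is an assumption of the sketch):

    union iwreq_data wrqu;

    memset(&wrqu, 0, sizeof(wrqu));
    wrqu.ap_addr.sa_family = ARPHRD_ETHER;
    memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
    wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);  /* header-only event */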
*/ - if(wrqu->data.length > 0) { + if (wrqu->data.length > 0) { int i; /* Copy addresses */ - for(i = 0; i < wrqu->data.length; i++) + for (i = 0; i < wrqu->data.length; i++) memcpy(spydata->spy_address[i], address[i].sa_data, ETH_ALEN); /* Reset stats */ memset(spydata->spy_stat, 0, sizeof(struct iw_quality) * IW_MAX_SPY); - -#ifdef WE_SPY_DEBUG - printk(KERN_DEBUG "iw_handler_set_spy() : wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length); - for (i = 0; i < wrqu->data.length; i++) - printk(KERN_DEBUG - "%02X:%02X:%02X:%02X:%02X:%02X \n", - spydata->spy_address[i][0], - spydata->spy_address[i][1], - spydata->spy_address[i][2], - spydata->spy_address[i][3], - spydata->spy_address[i][4], - spydata->spy_address[i][5]); -#endif /* WE_SPY_DEBUG */ } /* Make sure above is updated before re-enabling */ @@ -2184,6 +1342,7 @@ int iw_handler_set_spy(struct net_device * dev, return 0; } +EXPORT_SYMBOL(iw_handler_set_spy); /*------------------------------------------------------------------*/ /* @@ -2199,26 +1358,27 @@ int iw_handler_get_spy(struct net_device * dev, int i; /* Make sure driver is not buggy or using the old API */ - if(!spydata) + if (!spydata) return -EOPNOTSUPP; wrqu->data.length = spydata->spy_number; /* Copy addresses. */ - for(i = 0; i < spydata->spy_number; i++) { + for (i = 0; i < spydata->spy_number; i++) { memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); address[i].sa_family = AF_UNIX; } /* Copy stats to the user buffer (just after). */ - if(spydata->spy_number > 0) + if (spydata->spy_number > 0) memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), spydata->spy_stat, sizeof(struct iw_quality) * spydata->spy_number); /* Reset updated flags. */ - for(i = 0; i < spydata->spy_number; i++) + for (i = 0; i < spydata->spy_number; i++) spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; return 0; } +EXPORT_SYMBOL(iw_handler_get_spy); /*------------------------------------------------------------------*/ /* @@ -2233,7 +1393,7 @@ int iw_handler_set_thrspy(struct net_device * dev, struct iw_thrspy * threshold = (struct iw_thrspy *) extra; /* Make sure driver is not buggy or using the old API */ - if(!spydata) + if (!spydata) return -EOPNOTSUPP; /* Just do it */ @@ -2243,12 +1403,9 @@ int iw_handler_set_thrspy(struct net_device * dev, /* Clear flag */ memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); -#ifdef WE_SPY_DEBUG - printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level); -#endif /* WE_SPY_DEBUG */ - return 0; } +EXPORT_SYMBOL(iw_handler_set_thrspy); /*------------------------------------------------------------------*/ /* @@ -2263,7 +1420,7 @@ int iw_handler_get_thrspy(struct net_device * dev, struct iw_thrspy * threshold = (struct iw_thrspy *) extra; /* Make sure driver is not buggy or using the old API */ - if(!spydata) + if (!spydata) return -EOPNOTSUPP; /* Just do it */ @@ -2272,6 +1429,7 @@ int iw_handler_get_thrspy(struct net_device * dev, return 0; } +EXPORT_SYMBOL(iw_handler_get_thrspy); /*------------------------------------------------------------------*/ /* @@ -2297,16 +1455,6 @@ static void iw_send_thrspy_event(struct net_device * dev, memcpy(&(threshold.low), &(spydata->spy_thr_low), 2 * sizeof(struct iw_quality)); -#ifdef WE_SPY_DEBUG - printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n", - threshold.addr.sa_data[0], - threshold.addr.sa_data[1], - 
threshold.addr.sa_data[2], - threshold.addr.sa_data[3], - threshold.addr.sa_data[4], - threshold.addr.sa_data[5], threshold.qual.level); -#endif /* WE_SPY_DEBUG */ - /* Send event to user space */ wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); } @@ -2327,16 +1475,12 @@ void wireless_spy_update(struct net_device * dev, int match = -1; /* Make sure driver is not buggy or using the old API */ - if(!spydata) + if (!spydata) return; -#ifdef WE_SPY_DEBUG - printk(KERN_DEBUG "wireless_spy_update() : wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]); -#endif /* WE_SPY_DEBUG */ - /* Update all records that match */ - for(i = 0; i < spydata->spy_number; i++) - if(!compare_ether_addr(address, spydata->spy_address[i])) { + for (i = 0; i < spydata->spy_number; i++) + if (!compare_ether_addr(address, spydata->spy_address[i])) { memcpy(&(spydata->spy_stat[i]), wstats, sizeof(struct iw_quality)); match = i; @@ -2346,15 +1490,15 @@ void wireless_spy_update(struct net_device * dev, * To avoid event storms, we have a simple hysteresis : we generate * event only when we go under the low threshold or above the * high threshold. */ - if(match >= 0) { - if(spydata->spy_thr_under[match]) { - if(wstats->level > spydata->spy_thr_high.level) { + if (match >= 0) { + if (spydata->spy_thr_under[match]) { + if (wstats->level > spydata->spy_thr_high.level) { spydata->spy_thr_under[match] = 0; iw_send_thrspy_event(dev, spydata, address, wstats); } } else { - if(wstats->level < spydata->spy_thr_low.level) { + if (wstats->level < spydata->spy_thr_low.level) { spydata->spy_thr_under[match] = 1; iw_send_thrspy_event(dev, spydata, address, wstats); @@ -2362,10 +1506,4 @@ void wireless_spy_update(struct net_device * dev, } } } - -EXPORT_SYMBOL(iw_handler_get_spy); -EXPORT_SYMBOL(iw_handler_get_thrspy); -EXPORT_SYMBOL(iw_handler_set_spy); -EXPORT_SYMBOL(iw_handler_set_thrspy); -EXPORT_SYMBOL(wireless_send_event); EXPORT_SYMBOL(wireless_spy_update); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index e62ba41b05c..0d6002fc77b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -951,7 +951,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, * Incoming Call User Data. 
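Before moving on to the X.25 changes: the hysteresis above only fires when a tracked station crosses spy_thr_low downward or spy_thr_high upward, so drivers can feed every received frame into the spy machinery without causing an event storm. A hypothetical driver RX path (sketch; src_mac and the signal fields are assumptions):

    struct iw_quality wstats;

    wstats.level   = rx_signal;             /* driver units */
    wstats.noise   = rx_noise;
    wstats.qual    = rx_quality;
    wstats.updated = IW_QUAL_ALL_UPDATED;
    wireless_spy_update(dev, src_mac, &wstats);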
*/ if (skb->len >= 0) { - memcpy(makex25->calluserdata.cuddata, skb->data, skb->len); + skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; } @@ -1058,9 +1058,10 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, */ SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n"); - asmptr = skb->h.raw = skb_put(skb, len); + skb_reset_transport_header(skb); + skb_put(skb, len); - rc = memcpy_fromiovec(asmptr, msg->msg_iov, len); + rc = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); if (rc) goto out_kfree_skb; @@ -1210,8 +1211,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, } } - skb->h.raw = skb->data; - + skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { @@ -1280,6 +1280,12 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = sock_get_timestamp(sk, (struct timeval __user *)argp); break; + case SIOCGSTAMPNS: + rc = -EINVAL; + if (sk) + rc = sock_get_timestampns(sk, + (struct timespec __user *)argp); + break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1521,6 +1527,12 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, rc = compat_sock_get_timestamp(sk, (struct timeval __user*)argp); break; + case SIOCGSTAMPNS: + rc = -EINVAL; + if (sk) + rc = compat_sock_get_timestampns(sk, + (struct timespec __user*)argp); + break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index c7221de98a9..848a6b6f90a 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -48,7 +48,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) if ((sk = x25_find_socket(lci, nb)) != NULL) { int queued = 1; - skb->h.raw = skb->data; + skb_reset_transport_header(skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); @@ -191,7 +191,7 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) { unsigned char *dptr; - skb->nh.raw = skb->data; + skb_reset_network_header(skb); switch (nb->dev->type) { case ARPHRD_X25: diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index c5239fcdefa..1c88762c279 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -53,17 +53,20 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) skb_queue_tail(&x25->fragment_queue, skb); - skbn->h.raw = skbn->data; + skb_reset_transport_header(skbn); skbo = skb_dequeue(&x25->fragment_queue); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + skb_copy_from_linear_data(skbo, skb_put(skbn, skbo->len), + skbo->len); kfree_skb(skbo); while ((skbo = skb_dequeue(&x25->fragment_queue)) != NULL) { skb_pull(skbo, (x25->neighbour->extended) ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); kfree_skb(skbo); } @@ -112,8 +115,9 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp * Copy any Call User Data. 
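The skb_copy_from_linear_data() calls substituted throughout the X.25 code are thin, self-documenting wrappers over memcpy(); in this release they are defined in include/linux/skbuff.h essentially as:

    static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                                 void *to,
                                                 const unsigned int len)
    {
            memcpy(to, skb->data, len);
    }

    static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                               const void *from,
                                               const unsigned int len)
    {
            memcpy(skb->data, from, len);
    }

Behavior is unchanged for linear skbs; the gain is that direct skb->data dereferences become greppable.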
*/ if (skb->len >= 0) { - memcpy(x25->calluserdata.cuddata, skb->data, - skb->len); + skb_copy_from_linear_data(skb, + x25->calluserdata.cuddata, + skb->len); x25->calluserdata.cudlength = skb->len; } if (!sock_flag(sk, SOCK_DEAD)) diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c index 6f573785391..2b96b52114d 100644 --- a/net/x25/x25_out.c +++ b/net/x25/x25_out.c @@ -61,7 +61,7 @@ int x25_output(struct sock *sk, struct sk_buff *skb) if (skb->len - header_len > max_len) { /* Save a copy of the Header */ - memcpy(header, skb->data, header_len); + skb_copy_from_linear_data(skb, header, header_len); skb_pull(skb, header_len); frontlen = skb_headroom(skb); @@ -84,12 +84,12 @@ int x25_output(struct sock *sk, struct sk_buff *skb) len = max_len > skb->len ? skb->len : max_len; /* Copy the user data */ - memcpy(skb_put(skbn, len), skb->data, len); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); skb_pull(skb, len); /* Duplicate the Header */ skb_push(skbn, header_len); - memcpy(skbn->data, header, header_len); + skb_copy_to_linear_data(skbn, header, header_len); if (skb->len > 0) { if (x25->neighbour->extended) diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index f373a8a7d9c..6249a9405bb 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -612,175 +612,6 @@ EXPORT_SYMBOL_GPL(skb_icv_walk); #if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) -/* Looking generic it is not used in another places. */ - -int -skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int elt = 0; - - if (copy > 0) { - if (copy > len) - copy = len; - sg[elt].page = virt_to_page(skb->data + offset); - sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; - sg[elt].length = copy; - elt++; - if ((len -= copy) == 0) - return elt; - offset += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - sg[elt].page = frag->page; - sg[elt].offset = frag->page_offset+offset-start; - sg[elt].length = copy; - elt++; - if (!(len -= copy)) - return elt; - offset += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - elt += skb_to_sgvec(list, sg+elt, offset - start, copy); - if ((len -= copy) == 0) - return elt; - offset += copy; - } - start = end; - } - } - BUG_ON(len); - return elt; -} -EXPORT_SYMBOL_GPL(skb_to_sgvec); - -/* Check that skb data bits are writable. If they are not, copy data - * to newly created private area. If "tailbits" is given, make sure that - * tailbits bytes beyond current end of skb are writable. - * - * Returns amount of elements of scatterlist to load for subsequent - * transformations and pointer to writable trailer skb. 
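skb_to_sgvec() and skb_cow_data() are only being evicted from xfrm_algo.c here, not dropped: they move to the generic skb code (net/core/skbuff.c) in this same release so that non-IPsec users can share them, and callers keep driving them the same way. An ESP-style output path uses the pair roughly like this (sketch; trailer_len and the error label are assumptions):

    struct scatterlist *sg;
    struct sk_buff *trailer;
    int nfrags;

    /* make the payload writable and guarantee trailer_len tail bytes */
    nfrags = skb_cow_data(skb, trailer_len, &trailer);
    if (nfrags < 0)
            goto error;
    pskb_put(skb, trailer, trailer_len);    /* claim the trailer bytes */

    sg = kmalloc(nfrags * sizeof(*sg), GFP_ATOMIC);
    if (!sg)
            goto error;
    skb_to_sgvec(skb, sg, 0, skb->len);     /* map head + frags for crypto */
    /* ... run the transform over sg[0..nfrags-1] ... */
    kfree(sg);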
- */ - -int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) -{ - int copyflag; - int elt; - struct sk_buff *skb1, **skb_p; - - /* If skb is cloned or its head is paged, reallocate - * head pulling out all the pages (pages are considered not writable - * at the moment even if they are anonymous). - */ - if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) - return -ENOMEM; - - /* Easy case. Most of packets will go this way. */ - if (!skb_shinfo(skb)->frag_list) { - /* A little of trouble, not enough of space for trailer. - * This should not happen, when stack is tuned to generate - * good frames. OK, on miss we reallocate and reserve even more - * space, 128 bytes is fair. */ - - if (skb_tailroom(skb) < tailbits && - pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) - return -ENOMEM; - - /* Voila! */ - *trailer = skb; - return 1; - } - - /* Misery. We are in troubles, going to mincer fragments... */ - - elt = 1; - skb_p = &skb_shinfo(skb)->frag_list; - copyflag = 0; - - while ((skb1 = *skb_p) != NULL) { - int ntail = 0; - - /* The fragment is partially pulled by someone, - * this can happen on input. Copy it and everything - * after it. */ - - if (skb_shared(skb1)) - copyflag = 1; - - /* If the skb is the last, worry about trailer. */ - - if (skb1->next == NULL && tailbits) { - if (skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list || - skb_tailroom(skb1) < tailbits) - ntail = tailbits + 128; - } - - if (copyflag || - skb_cloned(skb1) || - ntail || - skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list) { - struct sk_buff *skb2; - - /* Fuck, we are miserable poor guys... */ - if (ntail == 0) - skb2 = skb_copy(skb1, GFP_ATOMIC); - else - skb2 = skb_copy_expand(skb1, - skb_headroom(skb1), - ntail, - GFP_ATOMIC); - if (unlikely(skb2 == NULL)) - return -ENOMEM; - - if (skb1->sk) - skb_set_owner_w(skb2, skb1->sk); - - /* Looking around. Are we still alive? 
- * OK, link new skb, drop old one */ - - skb2->next = skb1->next; - *skb_p = skb2; - kfree_skb(skb1); - skb1 = skb2; - } - elt++; - *trailer = skb1; - skb_p = &skb1->next; - } - - return elt; -} -EXPORT_SYMBOL_GPL(skb_cow_data); - void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) { if (tail != skb) { diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index ee15bdae141..5c4695840c5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -62,7 +62,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) case IPPROTO_COMP: if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) return -EINVAL; - *spi = htonl(ntohs(*(__be16*)(skb->h.raw + 2))); + *spi = htonl(ntohs(*(__be16*)(skb_transport_header(skb) + 2))); *seq = 0; return 0; default: @@ -72,8 +72,8 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) if (!pskb_may_pull(skb, 16)) return -EINVAL; - *spi = *(__be32*)(skb->h.raw + offset); - *seq = *(__be32*)(skb->h.raw + offset_seq); + *spi = *(__be32*)(skb_transport_header(skb) + offset); + *seq = *(__be32*)(skb_transport_header(skb) + offset_seq); return 0; } EXPORT_SYMBOL(xfrm_parse_spi); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 785c3e39f06..95271e8426a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -268,7 +268,7 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_policy_timer(unsigned long data) { struct xfrm_policy *xp = (struct xfrm_policy*)data; - unsigned long now = (unsigned long)xtime.tv_sec; + unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; int dir; @@ -579,8 +579,22 @@ static inline int xfrm_byidx_should_resize(int total) return 0; } -static DEFINE_MUTEX(hash_resize_mutex); +void xfrm_spd_getinfo(struct xfrmk_spdinfo *si) +{ + read_lock_bh(&xfrm_policy_lock); + si->incnt = xfrm_policy_count[XFRM_POLICY_IN]; + si->outcnt = xfrm_policy_count[XFRM_POLICY_OUT]; + si->fwdcnt = xfrm_policy_count[XFRM_POLICY_FWD]; + si->inscnt = xfrm_policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; + si->outscnt = xfrm_policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; + si->fwdscnt = xfrm_policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; + si->spdhcnt = xfrm_idx_hmask; + si->spdhmcnt = xfrm_policy_hashmax; + read_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_spd_getinfo); +static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *__unused) { int dir, total; @@ -690,7 +704,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } policy->index = delpol ? 
delpol->index : xfrm_gen_index(policy->type, dir); hlist_add_head(&policy->byidx, xfrm_policy_byidx+idx_hash(policy->index)); - policy->curlft.add_time = (unsigned long)xtime.tv_sec; + policy->curlft.add_time = get_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); @@ -1049,7 +1063,7 @@ static inline int policy_to_flow_dir(int dir) return FLOW_DIR_OUT; case XFRM_POLICY_FWD: return FLOW_DIR_FWD; - }; + } } static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) @@ -1133,7 +1147,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) old_pol = sk->sk_policy[dir]; sk->sk_policy[dir] = pol; if (pol) { - pol->curlft.add_time = (unsigned long)xtime.tv_sec; + pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(pol->type, XFRM_POLICY_MAX+dir); __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); } @@ -1330,6 +1344,40 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } +static int inline +xfrm_dst_alloc_copy(void **target, void *src, int size) +{ + if (!*target) { + *target = kmalloc(size, GFP_ATOMIC); + if (!*target) + return -ENOMEM; + } + memcpy(*target, src, size); + return 0; +} + +static int inline +xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->partner), + sel, sizeof(*sel)); +#else + return 0; +#endif +} + +static int inline +xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); +#else + return 0; +#endif +} static int stale_bundle(struct dst_entry *dst); @@ -1386,7 +1434,7 @@ restart: return 0; family = dst_orig->ops->family; - policy->curlft.use_time = (unsigned long)xtime.tv_sec; + policy->curlft.use_time = get_seconds(); pols[0] = policy; npols ++; xfrm_nr += pols[0]->xfrm_nr; @@ -1518,6 +1566,18 @@ restart: err = -EHOSTUNREACH; goto error; } + + if (npols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + write_unlock_bh(&policy->lock); + if (dst) + dst_free(dst); + goto error; + } + dst->next = policy->bundles; policy->bundles = dst; dst_hold(dst); @@ -1682,7 +1742,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = (unsigned long)xtime.tv_sec; + pol->curlft.use_time = get_seconds(); pols[0] = pol; npols ++; @@ -1694,7 +1754,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[1]) { if (IS_ERR(pols[1])) return 0; - pols[1]->curlft.use_time = (unsigned long)xtime.tv_sec; + pols[1]->curlft.use_time = get_seconds(); npols ++; } } @@ -1933,6 +1993,15 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; +#ifdef CONFIG_XFRM_SUB_POLICY + if (fl) { + if (first->origin && !flow_cache_uli_match(first->origin, fl)) + return 0; + if (first->partner && + !xfrm_selector_match(first->partner, fl, family)) + return 0; + } +#endif last = NULL; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index e3a0bcfa5df..9955ff4da0a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -233,7 
+233,7 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_timer_handler(unsigned long data) { struct xfrm_state *x = (struct xfrm_state*)data; - unsigned long now = (unsigned long)xtime.tv_sec; + unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; int err = 0; @@ -326,7 +326,7 @@ struct xfrm_state *xfrm_state_alloc(void) init_timer(&x->rtimer); x->rtimer.function = xfrm_replay_timer_handler; x->rtimer.data = (unsigned long)x; - x->curlft.add_time = (unsigned long)xtime.tv_sec; + x->curlft.add_time = get_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; @@ -421,6 +421,16 @@ restart: } EXPORT_SYMBOL(xfrm_state_flush); +void xfrm_sad_getinfo(struct xfrmk_sadinfo *si) +{ + spin_lock_bh(&xfrm_state_lock); + si->sadcnt = xfrm_state_num; + si->sadhcnt = xfrm_state_hmask; + si->sadhmcnt = xfrm_state_hashmax; + spin_unlock_bh(&xfrm_state_lock); +} +EXPORT_SYMBOL(xfrm_sad_getinfo); + static int xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -458,7 +468,7 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, __be32 spi, x->id.daddr.a6)) continue; break; - }; + } xfrm_state_hold(x); return x; @@ -493,7 +503,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm x->props.saddr.a6)) continue; break; - }; + } xfrm_state_hold(x); return x; @@ -722,7 +732,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re (struct in6_addr *)saddr)) continue; break; - }; + } xfrm_state_hold(x); return x; @@ -755,7 +765,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6, (struct in6_addr *)daddr); break; - }; + } x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; @@ -1051,7 +1061,7 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { if (!x->curlft.use_time) - x->curlft.use_time = (unsigned long)xtime.tv_sec; + x->curlft.use_time = get_seconds(); if (x->km.state != XFRM_STATE_VALID) return -EINVAL; @@ -1667,37 +1677,17 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -/* - * This function is NOT optimal. For example, with ESP it will give an - * MTU that's usually two bytes short of being optimal. However, it will - * usually give an answer that's a multiple of 4 provided the input is - * also a multiple of 4. 
- */ int xfrm_state_mtu(struct xfrm_state *x, int mtu) { - int res = mtu; - - res -= x->props.header_len; - - for (;;) { - int m = res; - - if (m < 68) - return 68; - - spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_VALID && - x->type && x->type->get_max_size) - m = x->type->get_max_size(x, m); - else - m += x->props.header_len; - spin_unlock_bh(&x->lock); - - if (m <= mtu) - break; - res -= (m - mtu); - } + int res; + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_mtu) + res = x->type->get_mtu(x, mtu); + else + res = mtu; + spin_unlock_bh(&x->lock); return res; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 816e3690b60..b14c7e590c3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -71,7 +71,7 @@ static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) default: return -EINVAL; - }; + } algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; return 0; @@ -152,7 +152,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, default: goto out; - }; + } err = -EINVAL; switch (p->id.proto) { @@ -192,7 +192,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, default: goto out; - }; + } if ((err = verify_one_alg(xfrma, XFRMA_ALG_AUTH))) goto out; @@ -217,7 +217,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, default: goto out; - }; + } err = 0; @@ -576,7 +576,7 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) struct sk_buff *skb = sp->out_skb; struct xfrm_usersa_info *p; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); if (sp->this_idx < sp->start_idx) goto out; @@ -621,14 +621,14 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) if (x->lastused) RTA_PUT(skb, XFRMA_LASTUSED, sizeof(x->lastused), &x->lastused); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; out: sp->this_idx++; return 0; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -672,6 +672,113 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, return skb; } +static int build_spdinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) +{ + struct xfrmk_spdinfo si; + struct xfrmu_spdinfo spc; + struct xfrmu_spdhinfo sph; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); + if (nlh == NULL) /* shouldnt really happen ... 
*/ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_spd_getinfo(&si); + spc.incnt = si.incnt; + spc.outcnt = si.outcnt; + spc.fwdcnt = si.fwdcnt; + spc.inscnt = si.inscnt; + spc.outscnt = si.outscnt; + spc.fwdscnt = si.fwdscnt; + sph.spdhcnt = si.spdhcnt; + sph.spdhmcnt = si.spdhmcnt; + + NLA_PUT(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); + NLA_PUT(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtattr **xfrma) +{ + struct sk_buff *r_skb; + u32 *flags = NLMSG_DATA(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + int len = NLMSG_LENGTH(sizeof(u32)); + + len += RTA_SPACE(sizeof(struct xfrmu_spdinfo)); + len += RTA_SPACE(sizeof(struct xfrmu_spdhinfo)); + + r_skb = alloc_skb(len, GFP_ATOMIC); + if (r_skb == NULL) + return -ENOMEM; + + if (build_spdinfo(r_skb, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(xfrm_nl, r_skb, spid); +} + +static int build_sadinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) +{ + struct xfrmk_sadinfo si; + struct xfrmu_sadhinfo sh; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0); + if (nlh == NULL) /* shouldnt really happen ... */ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_sad_getinfo(&si); + + sh.sadhmcnt = si.sadhmcnt; + sh.sadhcnt = si.sadhcnt; + + NLA_PUT_U32(skb, XFRMA_SAD_CNT, si.sadcnt); + NLA_PUT(skb, XFRMA_SAD_HINFO, sizeof(sh), &sh); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtattr **xfrma) +{ + struct sk_buff *r_skb; + u32 *flags = NLMSG_DATA(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + int len = NLMSG_LENGTH(sizeof(u32)); + + len += RTA_SPACE(sizeof(struct xfrmu_sadhinfo)); + len += RTA_SPACE(sizeof(u32)); + + r_skb = alloc_skb(len, GFP_ATOMIC); + + if (r_skb == NULL) + return -ENOMEM; + + if (build_sadinfo(r_skb, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(xfrm_nl, r_skb, spid); +} + static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct rtattr **xfrma) { @@ -711,7 +818,7 @@ static int verify_userspi_info(struct xfrm_userspi_info *p) default: return -EINVAL; - }; + } if (p->min > p->max) return -EINVAL; @@ -789,7 +896,7 @@ static int verify_policy_dir(u8 dir) default: return -EINVAL; - }; + } return 0; } @@ -805,7 +912,7 @@ static int verify_policy_type(u8 type) default: return -EINVAL; - }; + } return 0; } @@ -821,7 +928,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) default: return -EINVAL; - }; + } switch (p->action) { case XFRM_POLICY_ALLOW: @@ -830,7 +937,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) default: return -EINVAL; - }; + } switch (p->sel.family) { case AF_INET: @@ -845,7 +952,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) default: return -EINVAL; - }; + } return verify_policy_dir(p->dir); } @@ -912,7 +1019,7 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) #endif default: return -EINVAL; - }; + } } return 0; @@ -1157,7 +1264,7 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned 
char *b = skb_tail_pointer(skb); if (sp->this_idx < sp->start_idx) goto out; @@ -1176,13 +1283,13 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr if (copy_to_user_policy_type(xp->type, skb) < 0) goto nlmsg_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; out: sp->this_idx++; return 0; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1330,7 +1437,7 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, struct km_eve struct xfrm_aevent_id *id; struct nlmsghdr *nlh; struct xfrm_lifetime_cur ltime; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_NEWAE, sizeof(*id)); id = NLMSG_DATA(nlh); @@ -1362,12 +1469,12 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, struct km_eve RTA_PUT(skb,XFRMA_ETIMER_THRESH,sizeof(u32),&etimer); } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; rtattr_failure: nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1744,7 +1851,7 @@ static int build_migrate(struct sk_buff *skb, struct xfrm_migrate *m, struct xfrm_migrate *mp; struct xfrm_userpolicy_id *pol_id; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); int i; nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*pol_id)); @@ -1764,10 +1871,10 @@ static int build_migrate(struct sk_buff *skb, struct xfrm_migrate *m, goto nlmsg_failure; } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1823,6 +1930,8 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), }; #undef XMSGSIZE @@ -1850,55 +1959,40 @@ static struct xfrm_link { [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae }, [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, + [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; -static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtattr *xfrma[XFRMA_MAX]; struct xfrm_link *link; int type, min_len; - if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) - return 0; - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < XFRM_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ if (type > XFRM_MSG_MAX) - goto err_einval; + return -EINVAL; type -= XFRM_MSG_BASE; link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) { - *errp = -EPERM; - return -1; - } + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { if (link->dump == NULL) - goto err_einval; - - if ((*errp = netlink_dump_start(xfrm_nl, skb, 
nlh, - link->dump, NULL)) != 0) { - return -1; - } + return -EINVAL; - netlink_queue_skip(nlh, skb); - return -1; + return netlink_dump_start(xfrm_nl, skb, nlh, link->dump, NULL); } memset(xfrma, 0, sizeof(xfrma)); if (nlh->nlmsg_len < (min_len = xfrm_msg_min[type])) - goto err_einval; + return -EINVAL; if (nlh->nlmsg_len > min_len) { int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); @@ -1908,7 +2002,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err unsigned short flavor = attr->rta_type; if (flavor) { if (flavor > XFRMA_MAX) - goto err_einval; + return -EINVAL; xfrma[flavor - 1] = attr; } attr = RTA_NEXT(attr, attrlen); @@ -1916,14 +2010,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err } if (link->doit == NULL) - goto err_einval; - *errp = link->doit(skb, nlh, xfrma); - - return *errp; + return -EINVAL; -err_einval: - *errp = -EINVAL; - return -1; + return link->doit(skb, nlh, xfrma); } static void xfrm_netlink_rcv(struct sock *sk, int len) @@ -1942,7 +2031,7 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, struct km_eve { struct xfrm_user_expire *ue; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_PUT(skb, c->pid, 0, XFRM_MSG_EXPIRE, sizeof(*ue)); @@ -1952,11 +2041,11 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, struct km_eve copy_to_user_state(x, &ue->state); ue->hard = (c->data.hard != 0) ? 1 : 0; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -1999,7 +2088,7 @@ static int xfrm_notify_sa_flush(struct km_event *c) struct xfrm_usersa_flush *p; struct nlmsghdr *nlh; struct sk_buff *skb; - unsigned char *b; + sk_buff_data_t b; int len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush)); skb = alloc_skb(len, GFP_ATOMIC); @@ -2045,7 +2134,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c) struct xfrm_usersa_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; - unsigned char *b; + sk_buff_data_t b; int len = xfrm_sa_len(x); int headlen; @@ -2129,7 +2218,7 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, { struct xfrm_user_acquire *ua; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); __u32 seq = xfrm_get_acqseq(); nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_ACQUIRE, @@ -2153,11 +2242,11 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, if (copy_to_user_policy_type(xp->type, skb) < 0) goto nlmsg_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -2249,7 +2338,7 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, struct xfrm_user_polexpire *upe; struct nlmsghdr *nlh; int hard = c->data.hard; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_PUT(skb, c->pid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe)); upe = NLMSG_DATA(nlh); @@ -2264,11 +2353,11 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, goto nlmsg_failure; upe->hard = !!hard; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -2300,7 +2389,7 @@ static int xfrm_notify_policy(struct 
xfrm_policy *xp, int dir, struct km_event * struct xfrm_userpolicy_id *id; struct nlmsghdr *nlh; struct sk_buff *skb; - unsigned char *b; + sk_buff_data_t b; int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); int headlen; @@ -2357,7 +2446,7 @@ static int xfrm_notify_policy_flush(struct km_event *c) { struct nlmsghdr *nlh; struct sk_buff *skb; - unsigned char *b; + sk_buff_data_t b; int len = 0; #ifdef CONFIG_XFRM_SUB_POLICY len += RTA_SPACE(sizeof(struct xfrm_userpolicy_type)); @@ -2410,7 +2499,7 @@ static int build_report(struct sk_buff *skb, u8 proto, { struct xfrm_user_report *ur; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur)); ur = NLMSG_DATA(nlh); @@ -2422,12 +2511,12 @@ static int build_report(struct sk_buff *skb, u8 proto, if (addr) RTA_PUT(skb, XFRMA_COADDR, sizeof(*addr), addr); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: rtattr_failure: - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return -1; } @@ -2466,7 +2555,7 @@ static int __init xfrm_user_init(void) printk(KERN_INFO "Initializing XFRM netlink socket\n"); nlsk = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX, - xfrm_netlink_rcv, THIS_MODULE); + xfrm_netlink_rcv, NULL, THIS_MODULE); if (nlsk == NULL) return -ENOMEM; rcu_assign_pointer(xfrm_nl, nlsk);
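The GETSPDINFO/GETSADINFO handlers added above give user space the SPD/SAD entry counts and hash parameters without paying for a full dump. A minimal query from user space (standalone sketch; needs CAP_NET_ADMIN, reply parsing elided, and the flags word is currently just echoed back by the kernel):

    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/xfrm.h>

    int main(void)
    {
            struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
            struct {
                    struct nlmsghdr nlh;
                    __u32 flags;
            } req;
            char buf[256];
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);

            if (fd < 0)
                    return 1;
            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(__u32));
            req.nlh.nlmsg_type  = XFRM_MSG_GETSADINFO;
            req.nlh.nlmsg_flags = NLM_F_REQUEST;
            req.flags = 0;

            sendto(fd, &req, req.nlh.nlmsg_len, 0,
                   (struct sockaddr *)&dst, sizeof(dst));
            /* reply: XFRM_MSG_NEWSADINFO with XFRMA_SAD_CNT/XFRMA_SAD_HINFO */
            recv(fd, buf, sizeof(buf), 0);
            close(fd);
            return 0;
    }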