Diffstat (limited to 'net')
129 files changed, 2621 insertions, 1262 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 01ddb0472f8..889f4ac4459 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -35,12 +35,12 @@ drop: } EXPORT_SYMBOL(__vlan_hwaccel_rx); -int vlan_hwaccel_do_receive(struct sk_buff *skb) +void vlan_hwaccel_do_receive(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct vlan_rx_stats *rx_stats; - skb->dev = vlan_dev_info(dev)->real_dev; + skb->dev = vlan_dev_real_dev(dev); netif_nit_deliver(skb); skb->dev = dev; @@ -69,7 +69,6 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb) break; } u64_stats_update_end(&rx_stats->syncp); - return 0; } struct net_device *vlan_dev_real_dev(const struct net_device *dev) @@ -106,9 +105,12 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, goto drop; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = - p->dev == skb->dev && !compare_ether_header( - skb_mac_header(p), skb_gro_mac_header(skb)); + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 3d59c9bf8fe..3bccdd12a26 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -510,7 +510,8 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); - netif_carrier_on(dev); + if (netif_carrier_ok(real_dev)) + netif_carrier_on(dev); return 0; clear_allmulti: diff --git a/net/Kconfig b/net/Kconfig index e24fa0873f3..e330594d370 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -213,6 +213,7 @@ source "net/phonet/Kconfig" source "net/ieee802154/Kconfig" source "net/sched/Kconfig" source "net/dcb/Kconfig" +source "net/dns_resolver/Kconfig" config RPS boolean diff --git a/net/Makefile b/net/Makefile index 41d420070a3..ea60fbce9b1 100644 --- a/net/Makefile +++ b/net/Makefile @@ -67,3 +67,4 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ diff --git a/net/atm/common.c b/net/atm/common.c index 940404a73b3..1b9c52a02cd 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -792,7 +792,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, default: if (level == SOL_SOCKET) return -EINVAL; - break; + break; } if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL; diff --git a/net/atm/lec.c b/net/atm/lec.c index d98bde1a0ac..181d70c73d7 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) static int lec_open(struct net_device *dev) { netif_start_queue(dev); - memset(&dev->stats, 0, sizeof(struct net_device_stats)); return 0; } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index cfdfd7e2a17..26eaebf4aaa 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1103,7 +1103,7 @@ done: out: release_sock(sk); - return 0; + return err; } /* diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 7805945a5fd..a1690845dc6 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) { ax25_uid_assoc *user; ax25_route *ax25_rt; - int err; + int err = 0; if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) return -EHOSTUNREACH; @@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) put: 
ax25_put_route(ax25_rt); - return 0; + return err; } struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index c03d2c3ff03..89ad25a7620 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -61,30 +61,27 @@ static int port_cost(struct net_device *dev) } -/* - * Check for port carrier transistions. - * Called from work queue to allow for calling functions that - * might sleep (such as speed check), and to debounce. - */ +/* Check for port carrier transistions. */ void br_port_carrier_check(struct net_bridge_port *p) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; - if (netif_carrier_ok(dev)) + if (netif_running(dev) && netif_carrier_ok(dev)) p->path_cost = port_cost(dev); - if (netif_running(br->dev)) { - spin_lock_bh(&br->lock); - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } - spin_unlock_bh(&br->lock); + if (!netif_running(br->dev)) + return; + + spin_lock_bh(&br->lock); + if (netif_running(dev) && netif_carrier_ok(dev)) { + if (p->state == BR_STATE_DISABLED) + br_stp_enable_port(p); + } else { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); } + spin_unlock_bh(&br->lock); } static void release_nbp(struct kobject *kobj) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 826cd522153..6d04cfdf454 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -141,7 +141,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb) const unsigned char *dest = eth_hdr(skb)->h_dest; int (*rhook)(struct sk_buff *skb); - if (skb->pkt_type == PACKET_LOOPBACK) + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return skb; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 01f238ff234..c49a6695793 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -9,7 +9,7 @@ #include <linux/hardirq.h> #include <net/caif/cfpkt.h> -#define PKT_PREFIX 16 +#define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 #define PKT_ERROR(pkt, errmsg) do { \ diff --git a/net/can/bcm.c b/net/can/bcm.c index 9c65e9deb9c..08ffe9e4be2 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -60,6 +60,13 @@ #include <net/sock.h> #include <net/net_namespace.h> +/* + * To send multiple CAN frame content within TX_SETUP or to filter + * CAN messages with multiplex index within RX_SETUP, the number of + * different filters is limited to 256 due to the one byte index value. 
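Stated as code, the limit above implies the bounds checks that the hunks further down add to bcm_tx_setup() and bcm_rx_setup(). A minimal standalone sketch, using a stub type rather than the kernel's struct bcm_msg_head:

#include <errno.h>
#include <stdint.h>

#define MAX_NFRAMES 256

struct bcm_msg_head_stub {
	uint32_t nframes;	/* one-byte mux index => at most 256 frames */
};

static int check_tx_nframes(const struct bcm_msg_head_stub *head)
{
	/* TX_SETUP: need at least one frame, at most MAX_NFRAMES */
	if (head->nframes < 1 || head->nframes > MAX_NFRAMES)
		return -EINVAL;
	return 0;
}

static int check_rx_nframes(const struct bcm_msg_head_stub *head)
{
	/* RX_SETUP: slot 0 holds the mux mask, hence MAX_NFRAMES + 1 */
	if (head->nframes > MAX_NFRAMES + 1)
		return -EINVAL;
	return 0;
}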
+ */ +#define MAX_NFRAMES 256 + /* use of last_frames[index].can_dlc */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ @@ -89,16 +96,16 @@ struct bcm_op { struct list_head list; int ifindex; canid_t can_id; - int flags; + u32 flags; unsigned long frames_abs, frames_filtered; struct timeval ival1, ival2; struct hrtimer timer, thrtimer; struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; - int count; - int nframes; - int currframe; + u32 count; + u32 nframes; + u32 currframe; struct can_frame *frames; struct can_frame *last_frames; struct can_frame sframe; @@ -175,7 +182,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(ifname, op->ifindex)); - seq_printf(m, "[%d]%c ", op->nframes, + seq_printf(m, "[%u]%c ", op->nframes, (op->flags & RX_CHECK_DLC)?'d':' '); if (op->kt_ival1.tv64) seq_printf(m, "timeo=%lld ", @@ -198,7 +205,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) list_for_each_entry(op, &bo->tx_ops, list) { - seq_printf(m, "tx_op: %03X %s [%d] ", + seq_printf(m, "tx_op: %03X %s [%u] ", op->can_id, bcm_proc_getifname(ifname, op->ifindex), op->nframes); @@ -283,7 +290,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct can_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; - int datalen = head->nframes * CFSIZ; + unsigned int datalen = head->nframes * CFSIZ; int err; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); @@ -468,7 +475,7 @@ rx_changed_settime: * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ -static void bcm_rx_cmp_to_index(struct bcm_op *op, int index, +static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct can_frame *rxdata) { /* @@ -554,7 +561,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ -static inline int bcm_rx_do_flush(struct bcm_op *op, int update, int index) +static inline int bcm_rx_do_flush(struct bcm_op *op, int update, + unsigned int index) { if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) { if (update) @@ -575,7 +583,7 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update) int updated = 0; if (op->nframes > 1) { - int i; + unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) @@ -624,7 +632,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct can_frame *rxframe = (struct can_frame *)skb->data; - int i; + unsigned int i; /* disable timeout */ hrtimer_cancel(&op->timer); @@ -822,14 +830,15 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; - int i, err; + unsigned int i; + int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; - /* we need at least one can_frame */ - if (msg_head->nframes < 1) + /* check nframes boundaries - we need at least one can_frame */ + if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check the given can_id */ @@ -993,6 +1002,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, msg_head->nframes = 0; } + /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ + if 
(msg_head->nframes > MAX_NFRAMES + 1) + return -EINVAL; + if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) diff --git a/net/can/raw.c b/net/can/raw.c index a10e3338f08..7d77e67e57a 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -647,12 +647,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err < 0) goto free_skb; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto free_skb; /* to be able to check the received tx sock reference in raw_rcv() */ - skb_tx(skb)->prevent_sk_orphan = 1; + skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF; skb->dev = dev; skb->sk = sk; diff --git a/net/core/dev.c b/net/core/dev.c index 1ae65439144..d8c43e73f0b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1902,14 +1902,14 @@ static int dev_gso_segment(struct sk_buff *skb) /* * Try to orphan skb early, right before transmission by the device. - * We cannot orphan skb if tx timestamp is requested, since - * drivers need to call skb_tstamp_tx() to send the timestamp. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c */ static inline void skb_orphan_try(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (sk && !skb_tx(skb)->flags) { + if (sk && !skb_shinfo(skb)->tx_flags) { /* skb_tx_hash() wont be able to get sk. * We copy sk_hash into skb->rxhash */ @@ -1930,7 +1930,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { return skb_is_nonlinear(skb) && - ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -2259,69 +2259,44 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -#ifdef CONFIG_RPS - -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); - /* - * get_rps_cpu is called from netif_receive_skb and returns the target - * CPU from the RPS map of the receiving queue for a given skb. - * rcu_read_lock must be held on entry. + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. 
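A rough userspace analogue of this symmetric flow hash; mix32() is a toy stand-in for the kernel's jhash_3words() with hashrnd, not the real mixer:

#include <stdint.h>

static uint32_t mix32(uint32_t a, uint32_t b, uint32_t c)
{
	/* cheap stand-in mixer, not jhash_3words() */
	a ^= b * 0x9e3779b9u;
	a = (a << 13) | (a >> 19);
	return (a + c) * 0x85ebca6bu;
}

static uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
			  uint16_t sport, uint16_t dport)
{
	uint32_t h, tmp;

	if (daddr < saddr) {	/* same value on both flow directions */
		tmp = saddr; saddr = daddr; daddr = tmp;
	}
	if (dport < sport) {
		uint16_t t = sport; sport = dport; dport = t;
	}
	h = mix32(saddr, daddr, ((uint32_t)sport << 16) | dport);
	return h ? h : 1;	/* 0 is reserved for "no hash computed" */
}

Sorting the address pair and the port pair is what makes the hash direction-independent, so both halves of a connection land in the same bucket.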
*/ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow **rflowp) +__u32 __skb_get_rxhash(struct sk_buff *skb) { + int nhoff, hash = 0, poff; struct ipv6hdr *ip6; struct iphdr *ip; - struct netdev_rx_queue *rxqueue; - struct rps_map *map; - struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; - int cpu = -1; u8 ip_proto; - u16 tcpu; u32 addr1, addr2, ihl; union { u32 v32; u16 v16[2]; } ports; - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); - goto done; - } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; - - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) - goto done; - - if (skb->rxhash) - goto got_hash; /* Skip hash computation on packet header */ + nhoff = skb_network_offset(skb); switch (skb->protocol) { case __constant_htons(ETH_P_IP): - if (!pskb_may_pull(skb, sizeof(*ip))) + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data; - ip_proto = ip->protocol; + ip = (struct iphdr *) (skb->data + nhoff); + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + else + ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): - if (!pskb_may_pull(skb, sizeof(*ip6))) + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data; + ip6 = (struct ipv6hdr *) (skb->data + nhoff); ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; @@ -2330,33 +2305,71 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, default: goto done; } - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) { - ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); + + ports.v32 = 0; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += ihl * 4 + poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); - break; } - default: - ports.v32 = 0; - break; } /* get a consistent hash (same value on both flow directions) */ if (addr2 < addr1) swap(addr1, addr2); - skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); - if (!skb->rxhash) - skb->rxhash = 1; -got_hash: + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. 
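Once the hash is known, picking a target CPU from an RPS map reduces to a scaled index into the map's CPU array. A simplified model with an assumed map layout (the real struct rps_map lives in the netdevice headers):

#include <stdint.h>

struct rps_map_stub {
	unsigned int len;	/* number of CPUs in the map */
	uint16_t cpus[64];	/* candidate CPU ids */
};

/* Pick a CPU for a flow hash: the multiply-shift avoids a division
 * while spreading hashes evenly over map->len buckets. */
static int pick_rps_cpu(const struct rps_map_stub *map, uint32_t hash)
{
	if (!map || map->len == 0)
		return -1;
	return map->cpus[((uint64_t)hash * map->len) >> 32];
}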
+ */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->num_rx_queues)) { + WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " + "on queue %u, but number of RX queues is %u\n", + dev->name, index, dev->num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + goto done; + + skb_reset_network_header(skb); + if (!skb_get_rxhash(skb)) + goto done; + flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { @@ -2828,8 +2841,8 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) - return NET_RX_SUCCESS; + if (vlan_tx_tag_present(skb)) + vlan_hwaccel_do_receive(skb); /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) @@ -3050,7 +3063,7 @@ out: return netif_receive_skb(skb); } -static void napi_gro_flush(struct napi_struct *napi) +inline void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -3063,6 +3076,7 @@ static void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } +EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { @@ -3077,7 +3091,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) goto normal; - if (skb_is_gso(skb) || skb_has_frags(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; rcu_read_lock(); @@ -3143,7 +3157,7 @@ pull: put_page(skb_shinfo(skb)->frags[0].page); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags); + --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); } } @@ -3156,16 +3170,18 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static gro_result_t +static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = - (p->dev == skb->dev) && - !compare_ether_header(skb_mac_header(p), + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 7a85367b3c2..970eb9817bb 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -205,18 +205,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; - if (!ops->get_drvinfo) - return -EOPNOTSUPP; - memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; - ops->get_drvinfo(dev, &info); + if (ops && ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); 
+ } else { + return -EOPNOTSUPP; + } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ - if (ops->get_sset_count) { + if (ops && ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -229,9 +235,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) + if (ops && ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); - if (ops->get_eeprom_len) + if (ops && ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) @@ -1402,14 +1408,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (!dev || !netif_device_present(dev)) return -ENODEV; - if (!dev->ethtool_ops) - return -EOPNOTSUPP; - if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (!dev->ethtool_ops) { + /* ETHTOOL_GDRVINFO does not require any driver support. + * It is also unprivileged and does not change anything, + * so we can take a shortcut to it. */ + if (ethcmd == ETHTOOL_GDRVINFO) + return ethtool_get_drvinfo(dev, useraddr); + else + return -EOPNOTSUPP; + } + /* Allow some commands to be done by anyone */ switch (ethcmd) { + case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GCOALESCE: diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7d748542d97..76485a3f910 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, return attribute->store(queue, attribute, buf, count); } -static struct sysfs_ops rx_queue_sysfs_ops = { +static const struct sysfs_ops rx_queue_sysfs_ops = { .show = rx_queue_attr_show, .store = rx_queue_attr_store, }; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 10a1ea72010..386c2283f14 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3907,8 +3907,6 @@ static void __exit pg_cleanup(void) { struct pktgen_thread *t; struct list_head *q, *n; - wait_queue_head_t queue; - init_waitqueue_head(&queue); /* Stop all interfaces & threads */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f78d821bd93..b2a718dfd72 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -612,36 +612,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) { - struct rtnl_link_stats64 a; - - a.rx_packets = b->rx_packets; - a.tx_packets = b->tx_packets; - a.rx_bytes = b->rx_bytes; - a.tx_bytes = b->tx_bytes; - a.rx_errors = b->rx_errors; - a.tx_errors = b->tx_errors; - a.rx_dropped = b->rx_dropped; - a.tx_dropped = b->tx_dropped; - - a.multicast = b->multicast; - a.collisions = b->collisions; - - a.rx_length_errors = b->rx_length_errors; - a.rx_over_errors = b->rx_over_errors; - a.rx_crc_errors = b->rx_crc_errors; - a.rx_frame_errors = b->rx_frame_errors; - a.rx_fifo_errors = b->rx_fifo_errors; - a.rx_missed_errors = b->rx_missed_errors; - - a.tx_aborted_errors = b->tx_aborted_errors; - a.tx_carrier_errors = b->tx_carrier_errors; - a.tx_fifo_errors = b->tx_fifo_errors; - a.tx_heartbeat_errors = b->tx_heartbeat_errors; - a.tx_window_errors = b->tx_window_errors; - - a.rx_compressed = b->rx_compressed; - a.tx_compressed = b->tx_compressed; - memcpy(v, &a, sizeof(a)); + memcpy(v, b, sizeof(*b)); } /* All VF info */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3a2513f0d0c..231dff0dde2 100644 --- a/net/core/skbuff.c +++
b/net/core/skbuff.c @@ -340,7 +340,7 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); kfree(skb->head); @@ -685,16 +685,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end + skb->data_len, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); -#endif + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) return NULL; @@ -726,20 +720,14 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head, gfp_mask); -#endif + unsigned int size = skb_end_pointer(skb) - skb->head; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); + skb_reserve(n, skb_headroom(skb)); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -759,7 +747,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(n)->nr_frags = i; } - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } @@ -791,11 +779,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, { int i; u8 *data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - int size = nhead + skb->end + ntail; -#else - int size = nhead + (skb->end - skb->head) + ntail; -#endif + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; long off; BUG_ON(nhead < 0); @@ -810,19 +794,18 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, goto nodata; /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - memcpy(data + nhead, skb->head, skb->tail); -#else - memcpy(data + nhead, skb->head, skb->tail - skb->head); -#endif - memcpy(data + size, skb_end_pointer(skb), + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb); @@ -1099,7 +1082,7 @@ drop_pages: for (; i < nfrags; i++) put_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } @@ -1194,7 +1177,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ - if (!skb_has_frags(skb)) + if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. 
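The skb_copy(), pskb_copy() and pskb_expand_head() rewrites above work because skb_end_pointer() hides whether skb->end is stored as an offset or as a pointer. A stripped-down model of that accessor pattern, using a toy struct rather than struct sk_buff:

#include <stddef.h>

#define DATA_USES_OFFSET 1	/* flip to 0 for the pointer variant */

struct buf {
	unsigned char *head;
#if DATA_USES_OFFSET
	unsigned int end;	/* offset from head */
#else
	unsigned char *end;	/* direct pointer */
#endif
};

static unsigned char *buf_end_pointer(const struct buf *b)
{
#if DATA_USES_OFFSET
	return b->head + b->end;
#else
	return b->end;
#endif
}

/* One size expression for both representations, as in the patched
 * skb_copy()/pskb_copy()/pskb_expand_head(). */
static size_t buf_size(const struct buf *b)
{
	return (size_t)(buf_end_pointer(b) - b->head);
}

The accessor lets the #ifdef live in one place instead of being repeated at every allocation site, which is exactly what the hunks above remove.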
*/ @@ -2323,7 +2306,7 @@ next_skb: st->frag_data = NULL; } - if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; @@ -2889,7 +2872,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; /* Easy case. Most of packets will go this way. */ - if (!skb_has_frags(skb)) { + if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more @@ -2924,7 +2907,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1) || + skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } @@ -2933,7 +2916,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1)) { + skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ @@ -3016,7 +2999,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } else { /* * no hardware time stamps available, - * so keep the skb_shared_tx and only + * so keep the shared tx_flags and only * store software time stamp */ skb->tstamp = ktime_get_real(); diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 8408398cd44..0581143cb80 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -config IP_DCCP_CCID3_RTO - int "Use higher bound for nofeedback timer" - default 100 - depends on IP_DCCP_CCID3 && EXPERIMENTAL - ---help--- - Use higher lower bound for nofeedback timer expiration. - - The TFRC nofeedback timer normally expires after the maximum of 4 - RTTs and twice the current send interval (RFC 3448, 4.3). On LANs - with a small RTT this can mean a high processing load and reduced - performance, since then the nofeedback timer is triggered very - frequently. - - This option enables to set a higher lower bound for the nofeedback - value. Values in units of milliseconds can be set here. - - A value of 0 disables this feature by enforcing the value specified - in RFC 3448. The following values have been suggested as bounds for - experimental use: - * 16-20ms to match the typical multimedia inter-frame interval - * 100ms as a reasonable compromise [default] - * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) - - The default of 100ms is a compromise between a large value for - efficient DCCP implementations, and a small value to avoid disrupting - the network in times of congestion. - - The purpose of the nofeedback timer is to slow DCCP down when there - is serious network congestion: experimenting with larger values should - therefore not be performed on WANs. - config IP_DCCP_TFRC_LIB def_bool y if IP_DCCP_CCID3 diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9b3ae9922be..dc18172b1e5 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,59 +25,14 @@ */ #include <linux/slab.h> #include "../feat.h" -#include "../ccid.h" -#include "../dccp.h" #include "ccid2.h" #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) 
DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hc->tx_seqh; - - /* there is data in the chain */ - if (seqp != hc->tx_seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hc->tx_seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hc->tx_pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hc->tx_seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) -#define ccid2_hc_tx_check_sanity(hc) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) @@ -156,19 +111,10 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val) -{ - ccid2_pr_debug("change SRTT to %ld\n", val); - hc->tx_srtt = val; -} - -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -178,23 +124,19 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hc); - /* back-off timer */ hc->tx_rto <<= 1; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; - s = hc->tx_rto / HZ; - if (s > 60) - hc->tx_rto = 60 * HZ; - - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; /* clear state about stuff we sent */ hc->tx_seqt = hc->tx_seqh; @@ -204,22 +146,11 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hc); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); - - BUG_ON(timer_pending(&hc->tx_rtotimer)); - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -} - static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); @@ -230,7 +161,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hc->tx_seqh->ccid2s_seq = dp->dccps_gss; hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = jiffies; + hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ @@ -296,23 +227,20 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } #endif - /* setup RTO timer */ - if (!timer_pending(&hc->tx_rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { struct ccid2_seq *seqp = hc->tx_seqt; while (seqp != 
hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hc); #endif } @@ -378,17 +306,87 @@ out_invalid_option: return -1; } -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. + */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; - sk_stop_timer(sk, &hc->tx_rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); + hc->tx_rttvar = hc->tx_mdev_max; + + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; + + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = tcp_rto_min(sk); + } + } + + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". 
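Condensed into standalone C, the update reads as below: the same fixed-point scaling (SRTT by 2^3, deviation terms by 2^2), but with the per-flight RTTVAR decay dropped and the tcp_rto_min() floor replaced by an assumed constant:

#include <stdint.h>

#define RTO_MIN		50	/* assumed floor, in timestamp clock units */
#define RTO_MAX		64000	/* assumed cap, cf. DCCP_RTO_MAX */

struct rtt_est {
	uint32_t srtt;		/* smoothed RTT, scaled by 2^3 */
	uint32_t mdev;		/* mean deviation, scaled by 2^2 */
	uint32_t rttvar;	/* running max of mdev (no decay here) */
	uint32_t rto;
};

/* Feed one RTT sample m (RFC 2988 style, fixed point as in the patch). */
static void rtt_sample(struct rtt_est *e, long m)
{
	if (m <= 0)
		m = 1;			/* clock granularity */
	if (e->srtt == 0) {
		/* first measurement */
		e->srtt = (uint32_t)m << 3;
		e->mdev = (uint32_t)m << 1;
		e->rttvar = e->mdev > RTO_MIN ? e->mdev : RTO_MIN;
	} else {
		m -= (long)(e->srtt >> 3);	/* error term */
		e->srtt += (uint32_t)m;		/* srtt += 1/8 * error */
		if (m < 0)
			m = -m;
		m -= (long)(e->mdev >> 2);
		e->mdev += (uint32_t)m;		/* mdev += 1/4 * (|err| - mdev) */
		if (e->mdev > e->rttvar)
			e->rttvar = e->mdev;
	}
	/* RTO = SRTT + 4 * RTTVAR, capped as in the patch */
	e->rto = (e->srtt >> 3) + e->rttvar;
	if (e->rto > RTO_MAX)
		e->rto = RTO_MAX;
}

On the first sample this yields RTO = 3 * RTT, the classic initial value.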
+ */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; + + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); @@ -402,93 +400,27 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_cwnd += 1; hc->tx_packets_acked = 0; } - - /* update RTO */ - if (hc->tx_srtt == -1 || - time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hc->tx_srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hc, r); - hc->tx_rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hc->tx_srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hc->tx_rttvar *= 3; - hc->tx_rttvar >>= 2; - hc->tx_rttvar += tmp; - - /* SRTT */ - srtt = hc->tx_srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hc, srtt); - } - s = hc->tx_rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hc->tx_rto = hc->tx_srtt + s; - - /* must be at least a second */ - s = hc->tx_rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hc->tx_rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hc->tx_rto = HZ * 60; - - hc->tx_lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hc->tx_srtt, hc->tx_rttvar, - hc->tx_rto, HZ, r); - } - - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe == 0) - DCCP_BUG("pipe == 0"); - else - hc->tx_pipe--; - - if (hc->tx_pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -510,7 +442,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hc); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -620,7 +551,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) { done = 1; @@ -677,7 +608,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) break; @@ -695,7 +626,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } - ccid2_hc_tx_check_sanity(hc); + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -707,12 +642,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hc->tx_ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); @@ -723,15 +654,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; - hc->tx_rto = 3 * HZ; - ccid2_change_srtt(hc, -1); - hc->tx_rttvar = -1; + hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - - ccid2_hc_tx_check_sanity(hc); return 0; } @@ -740,7 +667,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hc->tx_rtotimer); for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 1ec6a30103b..9731c2dc148 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -18,18 +18,23 @@ #ifndef _DCCP_CCID2_H_ #define _DCCP_CCID2_H_ -#include <linux/dccp.h> #include <linux/timer.h> #include <linux/types.h> #include "../ccid.h" +#include "../dccp.h" + +/* + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +#define ccid2_time_stamp tcp_time_stamp + /* NUMDUPACK parameter from RFC 4341, p. 
6 */ #define NUMDUPACK 3 -struct sock; - struct ccid2_seq { u64 ccid2s_seq; - unsigned long ccid2s_sent; + u32 ccid2s_sent; int ccid2s_acked; struct ccid2_seq *ccid2s_prev; struct ccid2_seq *ccid2s_next; @@ -42,7 +47,12 @@ struct ccid2_seq { * struct ccid2_hc_tx_sock - CCID2 TX half connection * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_lastrtt: time RTT was last measured + * @tx_srtt: smoothed RTT estimate, scaled by 2^3 + * @tx_mdev: smoothed RTT variation, scaled by 2^2 + * @tx_mdev_max: maximum of @mdev during one flight + * @tx_rttvar: moving average/maximum of @mdev_max + * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq */ @@ -55,14 +65,19 @@ struct ccid2_hc_tx_sock { int tx_seqbufc; struct ccid2_seq *tx_seqh; struct ccid2_seq *tx_seqt; - long tx_rto; - long tx_srtt; - long tx_rttvar; - unsigned long tx_lastrtt; + + /* RTT measurement: variables/principles are the same as in TCP */ + u32 tx_srtt, + tx_mdev, + tx_mdev_max, + tx_rttvar, + tx_rto; + u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; + u64 tx_rpseq; int tx_rpdupack; - unsigned long tx_last_cong; + u32 tx_last_cong; u64 tx_high_ack; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 95f75298649..278e1706932 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -218,9 +218,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. */ - if (hc->tx_t_rto == 0 || /* no feedback received yet */ - hc->tx_p == 0) { + if (hc->tx_t_rto == 0 || hc->tx_p == 0) { /* halve send rate directly */ hc->tx_x = max(hc->tx_x / 2, @@ -256,7 +256,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ + if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); @@ -372,7 +372,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -386,7 +386,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - opt_recv = &hc->tx_options_received; now = ktime_get_real(); /* Estimate RTT from history if ACK number is valid */ @@ -461,13 +460,12 @@ done_computing_x: sk->sk_write_space(sk); /* - * Update timeout interval for the nofeedback timer. - * We use a configuration option to increase the lower bound. - * This can help avoid triggering the nofeedback timer too - * often ('spinning') on LANs with small RTTs. + * Update timeout interval for the nofeedback timer. In order to control + * rate halving on networks with very low RTTs (<= 1 ms), use per-route + * tunable RTAX_RTO_MIN value as the lower bound. 
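Numerically the new bound is t_RTO = max(4 * RTT, RTO_MIN), and the nofeedback timer is then armed for max(t_RTO, 2 * t_ipi). A tiny sketch in assumed microsecond units; the 200 ms floor is an assumption standing in for the per-route tunable:

#include <stdint.h>

#define ASSUMED_RTO_MIN_US	200000	/* stand-in for RTAX_RTO_MIN */

static uint32_t max_u32(uint32_t a, uint32_t b)
{
	return a > b ? a : b;
}

/* Lower-bounded t_RTO, then the nofeedback expiry as in the hunk above. */
static uint32_t nofeedback_timeout(uint32_t rtt_us, uint32_t t_ipi_us)
{
	uint32_t t_rto = max_u32(4 * rtt_us, ASSUMED_RTO_MIN_US);

	return max_u32(t_rto, 2 * t_ipi_us);
}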
*/ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, + USEC_PER_SEC/HZ * tcp_rto_min(sk)); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) @@ -489,11 +487,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, int rc = 0; const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - opt_recv = &hc->tx_options_received; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { opt_recv->ccid3or_seqno = dp->dccps_gsr; opt_recv->ccid3or_loss_event_rate = ~0; @@ -567,34 +563,30 @@ static void ccid3_hc_tx_exit(struct sock *sk) static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - struct ccid3_hc_tx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hc->tx_t_rto; - info->tcpi_rtt = hc->tx_rtt; + info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hc; + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + struct tfrc_tx_info tfrc; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hc->tx_tfrc)) + if (len < sizeof(tfrc)) return -EINVAL; - len = sizeof(hc->tx_tfrc); - val = &hc->tx_tfrc; + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; + tfrc.tfrctx_rtt = hc->tx_rtt; + tfrc.tfrctx_p = hc->tx_p; + tfrc.tfrctx_rto = hc->tx_t_rto; + tfrc.tfrctx_ipi = hc->tx_t_ipi; + len = sizeof(tfrc); + val = &tfrc; break; default: return -ENOPROTOOPT; @@ -701,14 +693,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - hc = ccid3_hc_rx_sk(sk); - if (dccp_packet_without_ack(skb)) return 0; @@ -749,10 +739,11 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); - if ((x_recv = hc->rx_x_recv) == 0) { + if (hc->rx_x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } + x_recv = hc->rx_x_recv; } fval = scaled_div(hc->rx_s, hc->rx_rtt); @@ -870,30 +861,18 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - const struct ccid3_hc_rx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hc->rx_state; + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hc->rx_rtt; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 
__user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 03263577665..b7e569c22f3 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -42,7 +42,7 @@ #include "lib/tfrc.h" #include "../ccid.h" -/* Two seconds as per RFC 3448 4.2 */ +/* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ @@ -95,14 +95,13 @@ enum ccid3_hc_tx_states { * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - struct tfrc_tx_info tx_tfrc; -#define tx_x tx_tfrc.tfrctx_x -#define tx_x_recv tx_tfrc.tfrctx_x_recv -#define tx_x_calc tx_tfrc.tfrctx_x_calc -#define tx_rtt tx_tfrc.tfrctx_rtt -#define tx_p tx_tfrc.tfrctx_p -#define tx_t_rto tx_tfrc.tfrctx_rto -#define tx_t_ipi tx_tfrc.tfrctx_ipi + u64 tx_x; + u64 tx_x_recv; + u32 tx_x_calc; + u32 tx_rtt; + u32 tx_p; + u32 tx_t_rto; + u32 tx_t_ipi; u16 tx_s; enum ccid3_hc_tx_states tx_state:8; u8 tx_last_win_count; @@ -131,16 +130,12 @@ enum ccid3_hc_rx_states { /** * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3) - * @rx_rtt: Receiver estimate of rtt (non-standard) - * @rx_p: Current loss event rate (RFC 3448 5.4) * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) * @rx_state: Receiver state, one of %ccid3_hc_rx_states * @rx_bytes_recv: Total sum of DCCP payload bytes * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 
4.3) * @rx_rtt: Receiver estimate of RTT * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_tstamp_last_ack: Time at which last feedback was sent * @rx_hist: Packet history (loss detection + RTT sampling) * @rx_li_hist: Loss Interval database * @rx_s: Received packet size in bytes diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index baeb1eaf011..2ef115277be 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) aux = scp->accessdata.acc_userl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); aux = scp->accessdata.acc_passl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); aux = scp->accessdata.acc_accl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux); + memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux); scp->persist = dn_nsp_persist(sk); scp->persist_fxn = dn_nsp_retrans_conninit; diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig new file mode 100644 index 00000000000..50d49f7e047 --- /dev/null +++ b/net/dns_resolver/Kconfig @@ -0,0 +1,27 @@ +# +# Configuration for DNS Resolver +# +config DNS_RESOLVER + tristate "DNS Resolver support" + depends on NET && KEYS + help + Saying Y here will include support for the DNS Resolver key type + which can be used to make upcalls to perform DNS lookups in + userspace. + + DNS Resolver is used to query DNS server for information. Examples + being resolving a UNC hostname element to an IP address for CIFS or + performing a DNS query for AFSDB records so that AFS can locate a + cell's volume location database servers. + + DNS Resolver is used by the CIFS and AFS modules, and would support + SMB2 later. DNS Resolver is supported by the userspace upcall + helper "/sbin/dns.resolver" via /etc/request-key.conf. + + See <file:Documentation/networking/dns_resolver.txt> for further + information. + + To compile this as a module, choose M here: the module will be called + dnsresolver. + + If unsure, say N. diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile new file mode 100644 index 00000000000..c0ef4e71dc4 --- /dev/null +++ b/net/dns_resolver/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux DNS Resolver. 
+# + +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o + +dns_resolver-objs := dns_key.o dns_query.o diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c new file mode 100644 index 00000000000..739435a6af3 --- /dev/null +++ b/net/dns_resolver/dns_key.c @@ -0,0 +1,293 @@ +/* Key type used to cache DNS lookups made by the kernel + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/keyctl.h> +#include <linux/err.h> +#include <linux/seq_file.h> +#include <keys/dns_resolver-type.h> +#include <keys/user-type.h> +#include "internal.h" + +MODULE_DESCRIPTION("DNS Resolver"); +MODULE_AUTHOR("Wang Lei"); +MODULE_LICENSE("GPL"); + +unsigned dns_resolver_debug; +module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "DNS Resolver debugging mask"); + +const struct cred *dns_resolver_cache; + +#define DNS_ERRORNO_OPTION "dnserror" + +/* + * Instantiate a user defined key for dns_resolver. + * + * The data must be a NUL-terminated string, with the NUL char accounted in + * datalen. + * + * If the data contains a '#' characters, then we take the clause after each + * one to be an option of the form 'key=value'. The actual data of interest is + * the string leading up to the first '#'. 
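A rough userspace model of that split (result before the first '#', one option per following '#'), without the dnserror handling the real instantiation routine below performs:

#include <stdio.h>
#include <string.h>

/* Split "ip1,ip2#opt1=v1#opt2=v2" into the result part and each option. */
static void parse_payload(const char *data)
{
	const char *end = data + strlen(data);
	const char *opt = memchr(data, '#', end - data);
	int result_len = opt ? (int)(opt - data) : (int)(end - data);

	printf("result: '%.*s'\n", result_len, data);
	while (opt && ++opt < end) {
		const char *next = memchr(opt, '#', end - opt);
		int len = next ? (int)(next - opt) : (int)(end - opt);

		printf("option: '%.*s'\n", len, opt);
		opt = next;
	}
}

For example, parse_payload("192.168.1.1,10.0.0.1#dnserror=11") prints the address list as the result and "dnserror=11" as the single option.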
For instance: + * + * "ip1,ip2,...#foo=bar" + */ +static int +dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) +{ + struct user_key_payload *upayload; + unsigned long derrno; + int ret; + size_t result_len = 0; + const char *data = _data, *end, *opt; + + kenter("%%%d,%s,'%s',%zu", + key->serial, key->description, data, datalen); + + if (datalen <= 1 || !data || data[datalen - 1] != '\0') + return -EINVAL; + datalen--; + + /* deal with any options embedded in the data */ + end = data + datalen; + opt = memchr(data, '#', datalen); + if (!opt) { + /* no options: the entire data is the result */ + kdebug("no options"); + result_len = datalen; + } else { + const char *next_opt; + + result_len = opt - data; + opt++; + kdebug("options: '%s'", opt); + do { + const char *eq; + int opt_len, opt_nlen, opt_vlen, tmp; + + next_opt = memchr(opt, '#', end - opt) ?: end; + opt_len = next_opt - opt; + if (!opt_len) { + printk(KERN_WARNING + "Empty option to dns_resolver key %d\n", + key->serial); + return -EINVAL; + } + + eq = memchr(opt, '=', opt_len) ?: end; + opt_nlen = eq - opt; + eq++; + opt_vlen = next_opt - eq; /* will be -1 if no value */ + + tmp = opt_vlen >= 0 ? opt_vlen : 0; + kdebug("option '%*.*s' val '%*.*s'", + opt_nlen, opt_nlen, opt, tmp, tmp, eq); + + /* see if it's an error number representing a DNS error + * that's to be recorded as the result in this key */ + if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && + memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { + kdebug("dns error number option"); + if (opt_vlen <= 0) + goto bad_option_value; + + ret = strict_strtoul(eq, 10, &derrno); + if (ret < 0) + goto bad_option_value; + + if (derrno < 1 || derrno > 511) + goto bad_option_value; + + kdebug("dns error no. = %lu", derrno); + key->type_data.x[0] = -derrno; + continue; + } + + bad_option_value: + printk(KERN_WARNING + "Option '%*.*s' to dns_resolver key %d:" + " bad/missing value\n", + opt_nlen, opt_nlen, opt, key->serial); + return -EINVAL; + } while (opt = next_opt + 1, opt < end); + } + + /* don't cache the result if we're caching an error saying there's no + * result */ + if (key->type_data.x[0]) { + kleave(" = 0 [h_error %ld]", key->type_data.x[0]); + return 0; + } + + kdebug("store result"); + ret = key_payload_reserve(key, result_len); + if (ret < 0) + return -EINVAL; + + upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL); + if (!upayload) { + kleave(" = -ENOMEM"); + return -ENOMEM; + } + + upayload->datalen = result_len; + memcpy(upayload->data, data, result_len); + upayload->data[result_len] = '\0'; + rcu_assign_pointer(key->payload.data, upayload); + + kleave(" = 0"); + return 0; +} + +/* + * The description is of the form "[<type>:]<domain_name>" + * + * The domain name may be a simple name or an absolute domain name (which + * should end with a period). The domain name is case-independent. 
+ */ +static int +dns_resolver_match(const struct key *key, const void *description) +{ + int slen, dlen, ret = 0; + const char *src = key->description, *dsp = description; + + kenter("%s,%s", src, dsp); + + if (!src || !dsp) + goto no_match; + + if (strcasecmp(src, dsp) == 0) + goto matched; + + slen = strlen(src); + dlen = strlen(dsp); + if (slen <= 0 || dlen <= 0) + goto no_match; + if (src[slen - 1] == '.') + slen--; + if (dsp[dlen - 1] == '.') + dlen--; + if (slen != dlen || strncasecmp(src, dsp, slen) != 0) + goto no_match; + +matched: + ret = 1; +no_match: + kleave(" = %d", ret); + return ret; +} + +/* + * Describe a DNS key + */ +static void dns_resolver_describe(const struct key *key, struct seq_file *m) +{ + int err = key->type_data.x[0]; + + seq_puts(m, key->description); + if (err) + seq_printf(m, ": %d", err); + else + seq_printf(m, ": %u", key->datalen); +} + +struct key_type key_type_dns_resolver = { + .name = "dns_resolver", + .instantiate = dns_resolver_instantiate, + .match = dns_resolver_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = dns_resolver_describe, + .read = user_read, +}; + +static int __init init_dns_resolver(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + printk(KERN_NOTICE "Registering the %s key type\n", + key_type_dns_resolver.name); + + /* create an override credential set with a special thread keyring in + * which DNS requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_dns_resolver); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + dns_resolver_cache = cred; + + kdebug("DNS resolver keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +static void __exit exit_dns_resolver(void) +{ + key_revoke(dns_resolver_cache->thread_keyring); + unregister_key_type(&key_type_dns_resolver); + put_cred(dns_resolver_cache); + printk(KERN_NOTICE "Unregistered %s key type\n", + key_type_dns_resolver.name); +} + +module_init(init_dns_resolver) +module_exit(exit_dns_resolver) +MODULE_LICENSE("GPL"); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c new file mode 100644 index 00000000000..c32be292c7e --- /dev/null +++ b/net/dns_resolver/dns_query.c @@ -0,0 +1,165 @@ +/* Upcall routine, designed to work as a key type and working through + * /sbin/request-key to contact userspace when handling DNS queries. + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * The upcall wrapper used to make an arbitrary DNS query. 
+ * + * This function requires the appropriate userspace tool dns.upcall to be + * installed and something like the following lines should be added to the + * /etc/request-key.conf file: + * + * create dns_resolver * * /sbin/dns.upcall %k + * + * For example to use this module to query AFSDB RR: + * + * create dns_resolver afsdb:* * /sbin/dns.afsdb %k + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/dns_resolver.h> +#include <linux/err.h> +#include <keys/dns_resolver-type.h> +#include <keys/user-type.h> + +#include "internal.h" + +/** + * dns_query - Query the DNS + * @type: Query type (or NULL for straight host->IP lookup) + * @name: Name to look up + * @namelen: Length of name + * @options: Request options (or NULL if no options) + * @_result: Where to place the returned data. + * @_expiry: Where to store the result expiry time (or NULL) + * + * The data will be returned in the pointer at *result, and the caller is + * responsible for freeing it. + * + * The description should be of the form "[<query_type>:]<domain_name>", and + * the options need to be appropriate for the query type requested. If no + * query_type is given, then the query is a straight hostname to IP address + * lookup. + * + * The DNS resolution lookup is performed by upcalling to userspace by way of + * requesting a key of type dns_resolver. + * + * Returns the size of the result on success, -ve error code otherwise. 
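+ *
+ * A minimal caller sketch (illustrative only; the host name is hypothetical
+ * and error handling is elided):
+ *
+ *	char *ip;
+ *	int len = dns_query(NULL, "example.com", 11, NULL, &ip, NULL);
+ *	if (len >= 0)
+ *		kfree(ip);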
+ */
+int dns_query(const char *type, const char *name, size_t namelen,
+	      const char *options, char **_result, time_t *_expiry)
+{
+	struct key *rkey;
+	struct user_key_payload *upayload;
+	const struct cred *saved_cred;
+	size_t typelen, desclen;
+	char *desc, *cp;
+	int ret, len;
+
+	kenter("%s,%*.*s,%zu,%s",
+	       type, (int)namelen, (int)namelen, name, namelen, options);
+
+	if (!name || namelen == 0 || !_result)
+		return -EINVAL;
+
+	/* construct the query key description as "[<type>:]<name>" */
+	typelen = 0;
+	desclen = 0;
+	if (type) {
+		typelen = strlen(type);
+		if (typelen < 1)
+			return -EINVAL;
+		desclen += typelen + 1;
+	}
+
+	if (!namelen)
+		namelen = strlen(name);
+	if (namelen < 3)
+		return -EINVAL;
+	desclen += namelen + 1;
+
+	desc = kmalloc(desclen, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	cp = desc;
+	if (type) {
+		memcpy(cp, type, typelen);
+		cp += typelen;
+		*cp++ = ':';
+	}
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+
+	if (!options)
+		options = "";
+	kdebug("call request_key(,%s,%s)", desc, options);
+
+	/* make the upcall, using special credentials to prevent the use of
+	 * add_key() to preinstall malicious redirections
+	 */
+	saved_cred = override_creds(dns_resolver_cache);
+	rkey = request_key(&key_type_dns_resolver, desc, options);
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	down_read(&rkey->sem);
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto put;
+
+	/* If the DNS server gave an error, return that to the caller */
+	ret = rkey->type_data.x[0];
+	if (ret)
+		goto put;
+
+	upayload = rcu_dereference_protected(rkey->payload.data,
+					     lockdep_is_held(&rkey->sem));
+	len = upayload->datalen;
+
+	ret = -ENOMEM;
+	*_result = kmalloc(len + 1, GFP_KERNEL);
+	if (!*_result)
+		goto put;
+
+	memcpy(*_result, upayload->data, len + 1);
+	if (_expiry)
+		*_expiry = rkey->expiry;
+
+	ret = len;
+put:
+	up_read(&rkey->sem);
+	key_put(rkey);
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 00000000000..189ca9e9b78
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 Wang Lei
+ * Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ * Internal DNS resolver stuff
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing
+ */
+extern unsigned dns_resolver_debug;
+
+#define kdebug(FMT, ...)				\
+do {							\
+	if (unlikely(dns_resolver_debug))		\
+		printk(KERN_DEBUG "[%-6.6s] "FMT"\n",	\
+		       current->comm, ##__VA_ARGS__);	\
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 11201784d29..87bb5f4de0e 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,7 +1,7 @@
 menuconfig NET_DSA
 	bool "Distributed Switch Architecture support"
 	default n
-	depends on EXPERIMENTAL && NET_ETHERNET && !S390
+	depends on EXPERIMENTAL && NETDEVICES && !S390
 	select PHYLIB
 	---help---
 	  This allows you to use hardware switch chips that use
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index dc54bd0d083..baa98fb8355 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -1009,7 +1009,6 @@ static int __init aun_udp_initialise(void)
 	struct sockaddr_in sin;
 
 	skb_queue_head_init(&aun_queue);
-	spin_lock_init(&aun_queue_lock);
 	setup_timer(&ab_cleanup_timer, ab_cleanup, 0);
 	ab_cleanup_timer.expires = jiffies + (HZ*2);
 	add_timer(&ab_cleanup_timer);
@@ -1167,7 +1166,6 @@ static int __init econet_proto_init(void)
 		goto out;
 	sock_register(&econet_family_ops);
 #ifdef CONFIG_ECONET_AUNUDP
-	spin_lock_init(&aun_queue_lock);
 	aun_udp_initialise();
 #endif
 #ifdef CONFIG_ECONET_NATIVE
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 215c83986a9..85e7b455132 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -367,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
 EXPORT_SYMBOL(alloc_etherdev_mq);
 
 static size_t _format_mac_addr(char *buf, int buflen,
-			       const unsigned char *addr, int len)
+				const unsigned char *addr, int len)
 {
 	int i;
 	char *cp = buf;
@@ -376,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen,
 		cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]);
 		if (i == len - 1)
 			break;
-		cp += strlcpy(cp, ":", buflen - (cp - buf));
+		cp += scnprintf(cp, buflen - (cp - buf), ":");
 	}
 	return cp - buf;
 }
@@ -386,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 	size_t l;
 
 	l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
-	l += strlcpy(buf + l, "\n", PAGE_SIZE - l);
+	l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
 	return ((ssize_t) l);
 }
 EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d19124..7458bdae7e9 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -215,8 +215,15 @@ config NET_IPIP
 	  be inserted in and removed from the running kernel whenever you
 	  want). Most people won't need this and can say N.
 
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	  This is a helper module that demultiplexes GRE packets based on
+	  the GRE version field. It is required by the ip_gre and pptp
+	  modules.
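(A hedged sketch, not part of the patch: a GRE version-1 consumer such as pptp could hook into this demultiplexer roughly as follows, assuming the gre_add_protocol() API and the GREPROTO_PPTP constant that this series introduces in net/gre.h; pptp_gre_rcv() is a hypothetical handler.)

	#include <linux/skbuff.h>
	#include <net/gre.h>

	static int pptp_gre_rcv(struct sk_buff *skb);	/* hypothetical consumer */

	static const struct gre_protocol pptp_gre_protocol = {
		.handler	= pptp_gre_rcv,
	};

	static int __init pptp_gre_register(void)
	{
		/* GRE version 1 is used by PPTP; version 0 by ip_gre below. */
		return gre_add_protocol(&pptp_gre_protocol, GREPROTO_PPTP);
	}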
+ config NET_IPGRE tristate "IP: GRE tunnels over IP" + depends on NET_IPGRE_DEMUX help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87ce43a..4978d22f9a7 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6a1100c25a9..f581f77d109 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret); /* * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock */ void build_ehash_secret(void) { u32 rnd; + do { get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); + + cmpxchg(&inet_ehash_secret, 0, rnd); } EXPORT_SYMBOL(build_ehash_secret); diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 00000000000..b546736da2e --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,151 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (gre_proto[version])
+		goto err_out_unlock;
+
+	rcu_assign_pointer(gre_proto[version], proto);
+	spin_unlock(&gre_proto_lock);
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (gre_proto[version] != proto)
+		goto err_out_unlock;
+	rcu_assign_pointer(gre_proto[version], NULL);
+	spin_unlock(&gre_proto_lock);
+	synchronize_rcu();
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+	int ret;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (!proto || !proto->handler)
+		goto drop_unlock;
+	ret = proto->handler(skb);
+	rcu_read_unlock();
+	return ret;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (!proto || !proto->err_handler)
+		goto drop_unlock;
+	proto->err_handler(skb, info);
+	rcu_read_unlock();
+	return;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+}
+
+static const struct net_protocol net_gre_protocol = {
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
+	.netns_ok    = 1,
+};
+
+static int __init gre_init(void)
+{
+	pr_info("GRE over IPv4 demultiplexer driver\n");
+
+	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+		pr_err("gre: can't add protocol\n");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void __exit gre_exit(void)
+{
+	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D.
Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a0d847c7cba..96bc7f9475a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; { struct flowi fl = { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b7c41654dde..f4dc879e258 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 945b20a5ad5..85176895495 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/gre.h> #ifdef CONFIG_IPV6 #include <net/ipv6.h> @@ -1278,10 +1279,9 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) } -static const struct net_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .netns_ok = 1, +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) @@ -1663,7 +1663,7 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); goto add_proto_failed; @@ -1683,7 +1683,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_net_ops); goto out; @@ -1693,7 +1693,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_pernet_device(&ipgre_net_ops); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04b69896df5..e42762023c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. 
*/ - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { struct sk_buff *frag; int first_len = skb_pagelen(skb); int truesizes = 0; @@ -837,10 +837,9 @@ int ip_append_data(struct sock *sk, inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } + exthdrlen = rt->dst.header_len; + length += exthdrlen; + transhdrlen += exthdrlen; } else { rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) @@ -953,7 +952,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->shtx.flags = 0; + ipc->tx_flags = 0; } if (skb == NULL) goto error; @@ -964,7 +963,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - *skb_tx(skb) = ipc->shtx; + skb_shinfo(skb)->tx_flags = ipc->tx_flags; /* * Find where to start putting bytes. @@ -1384,7 +1383,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec036731a70..3c6f8f3968a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -744,7 +744,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) ipn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6bccba31d13..51d6c316797 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -735,6 +735,7 @@ static void get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + local_bh_disable(); xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { ADD_COUNTER(counters[i], iter->counters.bcnt, @@ -742,6 +743,7 @@ static void get_counters(const struct xt_table_info *t, ++i; } xt_info_wrunlock(cpu); + local_bh_enable(); } put_cpu(); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c439721b165..97b64b22c41 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -909,6 +909,7 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + local_bh_disable(); xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { ADD_COUNTER(counters[i], iter->counters.bcnt, @@ -916,6 +917,7 @@ get_counters(const struct xt_table_info *t, ++i; /* macro does multi eval of i */ } xt_info_wrunlock(cpu); + local_bh_enable(); } put_cpu(); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a43cf36db8..1e26a489765 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/checksum.h> +#include <net/ip.h> #define CLUSTERIP_VERSION "0.8" @@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; - u_int16_t sport, dport; - const u_int16_t *ports; - - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ICMP: - ports = (const void *)iph+iph->ihl*4; - sport = ports[0]; - dport = ports[1]; - break; - 
default: + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { if (net_ratelimit()) pr_info("unknown protocol %u\n", iph->protocol); - sport = dport = 0; } switch (config->hash_mode) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 009a7b2aa1e..1f85ef28989 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3f56b6e6c6a..85a67c9d598 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1268,18 +1268,11 @@ skip_hashing: void rt_bind_peer(struct rtable *rt, int create) { - static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11aaea7..cf325452875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif - + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; default: err = -ENOPROTOOPT; break; @@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: val = tp->thin_dupack; break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e663b78a2ef..1bc87a05c73 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 
3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 020766292bb..a0232f3a358 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) return tcp_gro_receive(head, skb); } -EXPORT_SYMBOL(tcp4_gro_receive); int tcp4_gro_complete(struct sk_buff *skb) { @@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb) return tcp_gro_complete(skb); } -EXPORT_SYMBOL(tcp4_gro_complete); struct proto tcp_prot = { .name = "TCP", diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd845858..ea09d2fd50c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ + /* Set initial window to value enough for senders, following RFC5681. */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; + int init_cwnd = rfc3390_bytes_to_packets(mss); + /* when initializing use the value from init_rcv_wnd * rather than the default from above */ @@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk) tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb920c9f..11569deccbe 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -138,10 +138,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * retransmissions with an initial RTO of TCP_RTO_MIN. 
*/ static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary) + unsigned int boundary, + unsigned int timeout) { - unsigned int timeout, linear_backoff_thresh; - unsigned int start_ts; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -151,14 +151,15 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; - else - timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + else + timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } return (tcp_time_stamp - start_ts) >= timeout; } @@ -174,7 +175,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -187,14 +188,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until)) { + if (retransmits_timed_out(sk, retry_until, + (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 : + icsk->icsk_user_timeout)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -436,7 +439,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; @@ -556,7 +559,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if (icsk->icsk_probes_out >= keepalive_probes(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. 
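+	 * For example, a TCP_USER_TIMEOUT of 30000 ms (an illustrative
+	 * value, not from the patch) resets the connection once at
+	 * least one keepalive probe has gone unanswered and 30 seconds
+	 * have elapsed since data was last received.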
+ */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 59186ca7808..df59d16337f 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,8 +14,8 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers; -static struct xfrm_tunnel *tunnel64_handlers; +static struct xfrm_tunnel *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel **fam_handlers(unsigned short family) @@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) } EXPORT_SYMBOL(xfrm4_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; @@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; @@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; @@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) break; } @@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) break; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32e0bef60d0..86e757e162e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (up->pending) { /* @@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(msg, sk, &ipc.shtx); + err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982d208..82806455e85 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) return -ENOENT; } -static struct xfrm_tunnel xfrm_tunnel_handler = { +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct xfrm_tunnel xfrm64_tunnel_handler = { +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler 
= xfrm_tunnel_err, .priority = 2, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d40b330c0ee..1838927a224 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -637,7 +637,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); int truesizes = 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0fd027f3f47..29f99dd75bc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1372,13 +1372,13 @@ static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) ip6n->tnls_wc[0] = t; } -static struct xfrm6_tunnel ip4ip6_handler = { +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; -static struct xfrm6_tunnel ip6ip6_handler = { +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 5359ef4daac..29a7bca29e3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -922,6 +922,7 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + local_bh_disable(); xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { ADD_COUNTER(counters[i], iter->counters.bcnt, @@ -929,6 +930,7 @@ get_counters(const struct xt_table_info *t, ++i; } xt_info_wrunlock(cpu); + local_bh_enable(); } put_cpu(); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 13ef5bc05cf..089c598773c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -413,7 +413,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 545c4141b75..8aea3f3f18d 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -499,7 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8f2d0400cf8..d126365ac04 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2580,7 +2580,7 @@ ctl_table ipv6_route_table_template[] = { .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", @@ -2594,7 +2594,7 @@ ctl_table ipv6_route_table_template[] = { .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4699cd3c311..86618eb3033 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1132,7 +1132,7 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) sitn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel sit_handler = { +static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index fc3c86a4745..3177fe0459e 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -30,8 +30,8 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers; -static struct xfrm6_tunnel *tunnel46_handlers; +static struct xfrm6_tunnel *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) @@ -88,6 +88,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) EXPORT_SYMBOL(xfrm6_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -95,7 +100,7 @@ static int tunnel6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; @@ -113,7 +118,7 @@ static int tunnel46_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel46_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; @@ -129,7 +134,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { struct xfrm6_tunnel *handler; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) break; } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 2ce3a8278f2..ac7584b946a 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -317,13 +317,13 @@ static const struct xfrm_type xfrm6_tunnel_type = { .output = xfrm6_tunnel_output, }; -static struct xfrm6_tunnel xfrm6_tunnel_handler = { +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; -static struct xfrm6_tunnel xfrm46_tunnel_handler = { +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { 
.handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 9616c32d107..8ee1ff6c742 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -45,13 +45,11 @@ static int irlan_eth_close(struct net_device *dev); static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); static void irlan_eth_set_multicast_list( struct net_device *dev); -static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); static const struct net_device_ops irlan_eth_netdev_ops = { .ndo_open = irlan_eth_open, .ndo_stop = irlan_eth_close, .ndo_start_xmit = irlan_eth_xmit, - .ndo_get_stats = irlan_eth_get_stats, .ndo_set_multicast_list = irlan_eth_set_multicast_list, .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, @@ -169,6 +167,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, { struct irlan_cb *self = netdev_priv(dev); int ret; + unsigned int len; /* skb headroom large enough to contain all IrDA-headers? */ if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) { @@ -188,6 +187,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, dev->trans_start = jiffies; + len = skb->len; /* Now queue the packet in the transport layer */ if (self->use_udata) ret = irttp_udata_request(self->tsap_data, skb); @@ -206,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, * tried :-) DB */ /* irttp_data_request already free the packet */ - self->stats.tx_dropped++; + dev->stats.tx_dropped++; } else { - self->stats.tx_packets++; - self->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; } return NETDEV_TX_OK; @@ -224,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) { struct irlan_cb *self = instance; + struct net_device *dev = self->dev; if (skb == NULL) { - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; return 0; } if (skb->len < ETH_HLEN) { IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", __func__, skb->len); - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; dev_kfree_skb(skb); return 0; } @@ -242,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) * might have been previously set by the low level IrDA network * device driver */ - skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */ + skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */ - self->stats.rx_packets++; - self->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; netif_rx(skb); /* Eat it! 
*/ @@ -346,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev) else irlan_set_broadcast_filter(self, FALSE); } - -/* - * Function irlan_get_stats (dev) - * - * Get the current statistics for this device - * - */ -static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev) -{ - struct irlan_cb *self = netdev_priv(dev); - - return &self->stats; -} diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index f77a45625c0..4f772de2f21 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -103,6 +103,7 @@ ieee80211_rate_control_ops_get(const char *name) struct rate_control_ops *ops; const char *alg_name; + kparam_block_sysfs_write(ieee80211_default_rc_algo); if (!name) alg_name = ieee80211_default_rc_algo; else @@ -120,6 +121,7 @@ ieee80211_rate_control_ops_get(const char *name) /* try built-in one if specific alg requested but not found */ if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); + kparam_unblock_sysfs_write(ieee80211_default_rc_algo); return ops; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f8ddba4801..edbfb96b935 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -40,6 +40,7 @@ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ #include <net/route.h> +#include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -637,10 +638,12 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0f0c079c422..ca8ec8c4f31 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock); static DEFINE_RWLOCK(__ip_vs_rs_lock); /* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); +static DEFINE_SPINLOCK(ip_vs_securetcp_lock); /* lock for drop entry handling */ static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); @@ -204,7 +204,7 @@ static void update_defense_level(void) spin_unlock(&__ip_vs_droppacket_lock); /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); + spin_lock(&ip_vs_securetcp_lock); switch (sysctl_ip_vs_secure_tcp) { case 0: if (old_secure_tcp >= 2) @@ -238,7 +238,7 @@ static void update_defense_level(void) old_secure_tcp = sysctl_ip_vs_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); + spin_unlock(&ip_vs_securetcp_lock); local_bh_enable(); } @@ -843,7 +843,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, return -EINVAL; } - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) { pr_err("%s(): no memory.\n", __func__); return -ENOMEM; @@ -1177,7 +1177,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } #endif - svc = 
kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; @@ -2155,7 +2155,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; - goto out_unlock; + goto out_drop_service; } switch (cmd) { @@ -2189,6 +2189,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; } +out_drop_service: if (svc) ip_vs_service_put(svc); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index bbc1ac79595..727e45b6695 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,7 +35,7 @@ static LIST_HEAD(ip_vs_schedulers); /* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); +static DEFINE_SPINLOCK(ip_vs_sched_lock); /* @@ -108,7 +108,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - read_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -122,14 +122,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return sched; } if (sched->module) module_put(sched->module); } - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return NULL; } @@ -184,10 +184,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -200,7 +200,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -211,7 +211,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -229,9 +229,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_err("%s(): [%s] scheduler is not in the list. 
failed\n", __func__, scheduler->name); return -EINVAL; @@ -241,7 +241,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index b46a8390896..9228ee0dc11 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, { __be16 _ports[2], *ports; u8 nexthdr; + int poff; memset(dst, 0, sizeof(*dst)); @@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; } - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - ports = skb_header_pointer(skb, protoff, sizeof(_ports), + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), &_ports); - break; - default: + } else { _ports[0] = _ports[1] = 0; ports = _ports; - break; } if (!ports) return -1; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2cbf380377d..980fe4ad001 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1406,7 +1406,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, struct netlink_sock *nlk = nlk_sk(sk); int noblock = flags&MSG_DONTWAIT; size_t copied; - struct sk_buff *skb; + struct sk_buff *skb, *data_skb; int err; if (flags&MSG_OOB) @@ -1418,59 +1418,35 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, if (skb == NULL) goto out; + data_skb = skb; + #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { - bool need_compat = !!(flags & MSG_CMSG_COMPAT); - /* - * If this skb has a frag_list, then here that means that - * we will have to use the frag_list skb for compat tasks - * and the regular skb for non-compat tasks. + * If this skb has a frag_list, then here that means that we + * will have to use the frag_list skb's data for compat tasks + * and the regular skb's data for normal (non-compat) tasks. * - * The skb might (and likely will) be cloned, so we can't - * just reset frag_list and go on with things -- we need to - * keep that. For the compat case that's easy -- simply get - * a reference to the compat skb and free the regular one - * including the frag. For the non-compat case, we need to - * avoid sending the frag to the user -- so assign NULL but - * restore it below before freeing the skb. + * If we need to send the compat skb, assign it to the + * 'data_skb' variable so that it will be used below for data + * copying. We keep 'skb' for everything else, including + * freeing both later. 
*/ - if (need_compat) { - struct sk_buff *compskb = skb_shinfo(skb)->frag_list; - skb_get(compskb); - kfree_skb(skb); - skb = compskb; - } else { - /* - * Before setting frag_list to NULL, we must get a - * private copy of skb if shared (because of MSG_PEEK) - */ - if (skb_shared(skb)) { - struct sk_buff *nskb; - - nskb = pskb_copy(skb, GFP_KERNEL); - kfree_skb(skb); - skb = nskb; - err = -ENOMEM; - if (!skb) - goto out; - } - kfree_skb(skb_shinfo(skb)->frag_list); - skb_shinfo(skb)->frag_list = NULL; - } + if (flags & MSG_CMSG_COMPAT) + data_skb = skb_shinfo(skb)->frag_list; } #endif msg->msg_namelen = 0; - copied = skb->len; + copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } - skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_reset_transport_header(data_skb); + err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); if (msg->msg_name) { struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; @@ -1490,7 +1466,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } siocb->scm->creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) - copied = skb->len; + copied = data_skb->len; skb_free_datagram(sk, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9a17f28b125..3616f27b9d4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -488,7 +488,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_unlock; @@ -1209,7 +1209,7 @@ static int packet_snd(struct socket *sock, err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_free; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index b2a3ae6cad7..04e34196c9d 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -834,6 +834,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; + int err; if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { @@ -852,7 +853,10 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; - return pn_skb_send(sk, skb, &pipe_srv); + err = pn_skb_send(sk, skb, &pipe_srv); + if (err && pn_flow_safe(pn->tx_fc)) + atomic_inc(&pn->tx_credits); + return err; } static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, @@ -872,7 +876,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, flags & MSG_DONTWAIT, &err); if (!skb) - return -ENOBUFS; + return err; skb_reserve(skb, MAX_PHONET_HEADER + 3); err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index b18e48fae97..d0a42945937 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -292,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev) if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ synchronize_rcu(); - for (i = find_first_bit(deleted, 64); i < 64; - i = find_next_bit(deleted, 64, i + 1)) { + for_each_set_bit(i, deleted, 64) { rtm_phonet_notify(RTM_DELROUTE, dev, i); dev_put(dev); } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 
6e9848bf037..7c91f739f13 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -281,7 +281,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock, if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return POLLHUP; - if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits)) + if (sk->sk_state == TCP_ESTABLISHED && + atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && + atomic_read(&pn->tx_credits)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; diff --git a/net/rds/recv.c b/net/rds/recv.c index 795a00b7f2c..c93588c2d55 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -297,7 +297,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7043b294bb6..8e22bd345e7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -597,12 +597,6 @@ extern unsigned rxrpc_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) -/* make sure we maintain the format strings, even when debugging is disabled */ -static inline __attribute__((format(printf,1,2))) -void _dbprintk(const char *fmt, ...) -{ -} - #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) @@ -655,11 +649,11 @@ do { \ } while (0) #else -#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) -#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) -#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) -#define _proto(FMT,...) _dbprintk("### "FMT ,##__VA_ARGS__) -#define _net(FMT,...) _dbprintk("@@@ "FMT ,##__VA_ARGS__) +#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) +#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__) +#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__) #endif /* diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2f691fb180d..a36270a994d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT To compile this code as a module, choose M here: the module will be called act_skbedit. +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. 
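The csum action added below recomputes checksums in place after packet data has been altered. As a minimal sketch of the underlying pattern (illustrative only, not code from this patch; the helper name is invented), the IPv4 header case reduces to the zero-then-refold step that ip_send_check() already performs:

/* Illustrative sketch, not part of the patch: recompute an IPv4
 * header checksum in place after the header bytes have been edited.
 * Assumes iph points at a linear, writable header of iph->ihl words.
 */
#include <linux/ip.h>
#include <net/checksum.h>

static void example_ipv4_fix_csum(struct iphdr *iph)
{
	iph->check = 0;					/* checksum field counts as zero while summing */
	iph->check = ip_fast_csum(iph, iph->ihl);	/* fold iph->ihl 32-bit words */
}

The per-protocol helpers in act_csum.c below follow the same shape, but only after pskb_may_pull() and skb_clone_writable()/pskb_expand_head() guards establish that the headers are present and actually safe to rewrite.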
+ config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index f14e71bfa58..960f5dba630 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 00000000000..67dc7ce9b63 --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,595 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include <linux/netlink.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> + +#include <linux/skbuff.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/igmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> + +#include <net/act_api.h> + +#include <linux/tc_act/tc_csum.h> +#include <net/tc_act/tc_csum.h> + +#define CSUM_TAB_MASK 15 +static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; +static u32 csum_idx_gen; +static DEFINE_RWLOCK(csum_lock); + +static struct tcf_hashinfo csum_hash_info = { + .htab = tcf_csum_ht, + .hmask = CSUM_TAB_MASK, + .lock = &csum_lock, +}; + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_common *pc; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &csum_idx_gen, &csum_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_tcf_csum(pc); + ret = ACT_P_CREATED; + } else { + p = to_tcf_csum(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &csum_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &csum_hash_info); + + return ret; +} + +static int tcf_csum_cleanup(struct tc_action *a, int bind) +{ + struct tcf_csum *p = a->priv; + return tcf_hash_release(&p->common, bind, &csum_hash_info); +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the 
specified sk_buff. + * Return the next layer pointer on success, NULL otherwise. + */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms. Don't use + * udph->len to get the real length without any protocol check; + * UDPLITE uses udph->len for something else. + * Use iph->tot_len, or just ipl.
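+	 * Also note: a UDP checksum that computes to zero must go out + * as all ones (the CSUM_MANGLED_0 fixup below), since zero on + * the wire means the checksum was not set.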
+ */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms. Don't use + * udph->len to get the real length without any protocol check; + * UDPLITE uses udph->len for something else. + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ?
0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(iph); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + struct tc_action *a, struct tcf_result 
*res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + p->tcf_bstats.bytes += qdisc_pkt_len(skb); + p->tcf_bstats.packets++; + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .hinfo = &csum_hash_info, + .type = TCA_ACT_CSUM, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .cleanup = tcf_csum_cleanup, + .lookup = tcf_hash_search, + .init = tcf_csum_init, + .walk = tcf_generic_walker +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 8406c665499..c2ed90a4c0b 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -152,21 +152,24 @@ static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tc_gact opt; struct tcf_gact *gact = a->priv; + struct tc_gact opt = { + .index = gact->tcf_index, + .refcnt = gact->tcf_refcnt - ref, + .bindcnt = gact->tcf_bindcnt - bind, + .action = gact->tcf_action, + }; struct tcf_t t; - opt.index = gact->tcf_index; - opt.refcnt = gact->tcf_refcnt - ref; - opt.bindcnt = gact->tcf_bindcnt - bind; - opt.action = gact->tcf_action; NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); #ifdef CONFIG_GACT_PROB if (gact->tcfg_ptype) { - struct tc_gact_p p_opt; - p_opt.paction = gact->tcfg_paction; - p_opt.pval = gact->tcfg_pval; - p_opt.ptype = gact->tcfg_ptype; + struct tc_gact_p p_opt = { + .paction = gact->tcfg_paction, + .pval = gact->tcfg_pval, + .ptype = gact->tcfg_ptype, + }; + NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); } #endif diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 11f195af2da..0c311be9282 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -219,15 +219,16 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i { unsigned char *b = 
skb_tail_pointer(skb); struct tcf_mirred *m = a->priv; - struct tc_mirred opt; + struct tc_mirred opt = { + .index = m->tcf_index, + .action = m->tcf_action, + .refcnt = m->tcf_refcnt - ref, + .bindcnt = m->tcf_bindcnt - bind, + .eaction = m->tcfm_eaction, + .ifindex = m->tcfm_ifindex, + }; struct tcf_t t; - opt.index = m->tcf_index; - opt.action = m->tcf_action; - opt.refcnt = m->tcf_refcnt - ref; - opt.bindcnt = m->tcf_bindcnt - bind; - opt.eaction = m->tcfm_eaction; - opt.ifindex = m->tcfm_ifindex; NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 509a2d53a99..186eb837e60 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -272,19 +272,19 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_nat *p = a->priv; - struct tc_nat opt; + struct tc_nat opt = { + .old_addr = p->old_addr, + .new_addr = p->new_addr, + .mask = p->mask, + .flags = p->flags, + + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; struct tcf_t t; - opt.old_addr = p->old_addr; - opt.new_addr = p->new_addr; - opt.mask = p->mask; - opt.flags = p->flags; - - opt.index = p->tcf_index; - opt.action = p->tcf_action; - opt.refcnt = p->tcf_refcnt - ref; - opt.bindcnt = p->tcf_bindcnt - bind; - NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 4a1d640b0cf..97e84f3ee77 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -164,13 +164,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = a->priv; - struct tc_defact opt; + struct tc_defact opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; struct tcf_t t; - opt.index = d->tcf_index; - opt.refcnt = d->tcf_refcnt - ref; - opt.bindcnt = d->tcf_bindcnt - bind; - opt.action = d->tcf_action; NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e9607fe55b5..66cbf4eb885 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -159,13 +159,14 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = a->priv; - struct tc_skbedit opt; + struct tc_skbedit opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; struct tcf_t t; - opt.index = d->tcf_index; - opt.refcnt = d->tcf_refcnt - ref; - opt.bindcnt = d->tcf_bindcnt - bind; - opt.action = d->tcf_action; NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); if (d->flags & SKBEDIT_F_PRIORITY) NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index e17096e3913..5b271a18bc3 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -111,44 +111,41 @@ static u32 flow_get_proto(struct sk_buff *skb) } } -static int has_ports(u8 
protocol) -{ - switch (protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - return 1; - default: - return 0; - } -} - static u32 flow_get_proto_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 2)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 2)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)&iph[1]); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff)); + } break; } } @@ -161,24 +158,36 @@ static u32 flow_get_proto_dst(struct sk_buff *skb) switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + 2 + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 4)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)((void *)&iph[1] + 2)); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff + 2)); + } break; } } @@ -297,6 +306,11 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) return tag & VLAN_VID_MASK; } +static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_rxhash(skb); +} + static u32 flow_key_get(struct sk_buff *skb, int key) { switch (key) { @@ -334,6 +348,8 @@ static u32 flow_key_get(struct sk_buff *skb, int key) return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); default: WARN_ON(1); return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3bcac8aa333..34da5e29ea1 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen) dst->value = skb->mac_len; } +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_rxhash(skb); +} + /************************************************************************** * Netfilter **************************************************************************/ @@ -541,6 +546,7 @@ static struct meta_ops 
__meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b9e8c3b7d40..6fb3d41c0e4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -150,22 +150,34 @@ int register_qdisc(struct Qdisc_ops *qops) if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; if (qops->peek == NULL) { - if (qops->dequeue == NULL) { + if (qops->dequeue == NULL) qops->peek = noop_qdisc_ops.peek; - } else { - rc = -EINVAL; - goto out; - } + else + goto out_einval; } if (qops->dequeue == NULL) qops->dequeue = noop_qdisc_ops.dequeue; + if (qops->cl_ops) { + const struct Qdisc_class_ops *cops = qops->cl_ops; + + if (!(cops->get && cops->put && cops->walk && cops->leaf)) + goto out_einval; + + if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + goto out_einval; + } + qops->next = NULL; *qp = qops; rc = 0; out: write_unlock(&qdisc_mod_lock); return rc; + +out_einval: + rc = -EINVAL; + goto out; } EXPORT_SYMBOL(register_qdisc); @@ -348,7 +360,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } - if (!s || tsize != s->tsize || (!tab && tsize > 0)) + if (tsize != s->tsize || (!tab && tsize > 0)) return ERR_PTR(-EINVAL); spin_lock(&qdisc_stab_lock); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e114f23d5ea..34066278952 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -418,7 +418,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) } ret = qdisc_enqueue(skb, flow->q); - if (ret != 0) { + if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { sch->qstats.drops++; @@ -442,7 +442,7 @@ drop: __maybe_unused */ if (flow == &p->link) { sch->q.qlen++; - return 0; + return NET_XMIT_SUCCESS; } tasklet_schedule(&p->task); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 534f33231c1..3cf478d012d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -123,40 +123,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) case htons(ETH_P_IP): { const struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ip_hdr(skb); h = (__force u32)iph->daddr; h2 = (__force u32)iph->saddr ^ iph->protocol; - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_UDPLITE || - iph->protocol == IPPROTO_SCTP || - iph->protocol == IPPROTO_DCCP || - iph->protocol == IPPROTO_ESP) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - h2 ^= *(((u32*)iph) + iph->ihl); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ipv6_hdr(skb); h = (__force u32)iph->daddr.s6_addr32[3]; h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; - if ((iph->nexthdr == IPPROTO_TCP || - iph->nexthdr == IPPROTO_UDP || - iph->nexthdr == IPPROTO_UDPLITE || - iph->nexthdr == IPPROTO_SCTP || - iph->nexthdr == 
IPPROTO_DCCP || - iph->nexthdr == IPPROTO_ESP) && - pskb_network_may_pull(skb, sizeof(*iph) + 4)) - h2 ^= *(u32*)&iph[1]; + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { + iph = ipv6_hdr(skb); + h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff); + } break; } default: @@ -334,7 +333,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= q->limit) { sch->bstats.bytes += qdisc_pkt_len(skb); sch->bstats.packets++; - return 0; + return NET_XMIT_SUCCESS; } sfq_drop(sch); @@ -508,6 +507,11 @@ nla_put_failure: return -1; } +static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + static unsigned long sfq_get(struct Qdisc *sch, u32 classid) { return 0; @@ -519,6 +523,10 @@ static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, return 0; } +static void sfq_put(struct Qdisc *q, unsigned long cl) +{ +} + static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); @@ -571,9 +579,12 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static const struct Qdisc_class_ops sfq_class_ops = { + .leaf = sfq_leaf, .get = sfq_get, + .put = sfq_put, .tcf_chain = sfq_find_tcf, .bind_tcf = sfq_bind, + .unbind_tcf = sfq_put, .dump = sfq_dump_class, .dump_stats = sfq_dump_class_stats, .walk = sfq_walk, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 0991c640cd3..641a30d6463 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -127,7 +127,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) return qdisc_reshape_fail(skb, sch); ret = qdisc_enqueue(skb, q->qdisc); - if (ret != 0) { + if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) sch->qstats.drops++; return ret; @@ -136,7 +136,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) sch->q.qlen++; sch->bstats.bytes += qdisc_pkt_len(skb); sch->bstats.packets++; - return 0; + return NET_XMIT_SUCCESS; } static unsigned int tbf_drop(struct Qdisc* sch) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 807643bdcba..feaabc103ce 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -85,7 +85,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) __skb_queue_tail(&q->q, skb); sch->bstats.bytes += qdisc_pkt_len(skb); sch->bstats.packets++; - return 0; + return NET_XMIT_SUCCESS; } kfree_skb(skb); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0b85e525643..5f1fb8bd862 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 476caaf100e..6c8556459a7 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -37,6 +37,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index ccb6dc48d15..397296fb156 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -43,6 +43,8 @@ * be incorporated into the next SCTP release. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 732689140fb..95e0c8eda1a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk, memcpy(saddr, baddr, sizeof(union sctp_addr)); SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr); } else { - printk(KERN_ERR "%s: asoc:%p Could not find a valid source " + pr_err("%s: asoc:%p Could not find a valid source " "address for the dest:%pI6\n", __func__, asoc, &daddr->v6.sin6_addr); } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index f73ec0ea93b..8ef8e7d9eb6 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -38,6 +38,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <net/sctp/sctp.h> @@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void) ent = proc_create("sctp_dbg_objcnt", 0, proc_net_sctp, &sctp_objcnt_ops); if (!ent) - printk(KERN_WARNING - "sctp_dbg_objcnt: Unable to create /proc entry.\n"); + pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); } /* Cleanup the objcount entry in the proc filesystem. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index a646681f5ac..901764b17ae 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -41,6 +41,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c04b2eb5918..8c6d379b4bb 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> @@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* Display the end of the * current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); } /* Start a new range. */ - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_ack_tsn = tsn; break; case 1: /* The last TSN was NOT ACKed. */ if (dbg_last_kept_tsn != dbg_kept_tsn) { /* Display the end of current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); } - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... */ default: @@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q, break; if (dbg_last_kept_tsn != dbg_kept_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_kept_tsn = tsn; break; case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... 
*/ default: @@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q, switch (dbg_prt_state) { case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } break; case 1: if (dbg_last_kept_tsn != dbg_kept_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } } #endif /* SCTP_DEBUG */ diff --git a/net/sctp/probe.c b/net/sctp/probe.c index db3a42b8b34..2e63e9dc010 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -22,6 +22,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> @@ -192,7 +194,7 @@ static __init int sctpprobe_init(void) if (ret) goto remove_proc; - pr_info("SCTP probe registered (port=%d)\n", port); + pr_info("probe registered (port=%d)\n", port); return 0; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5027b83f1cc..f774e657641 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> @@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void) &init_net); if (err < 0) { - printk(KERN_ERR - "SCTP: Failed to create the SCTP control socket.\n"); + pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; @@ -1206,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_assoc_hashtable && --order > 0); if (!sctp_assoc_hashtable) { - printk(KERN_ERR "SCTP: Failed association hash alloc.\n"); + pr_err("Failed association hash alloc\n"); status = -ENOMEM; goto err_ahash_alloc; } @@ -1220,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void) sctp_ep_hashtable = (struct sctp_hashbucket *) kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { - printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n"); + pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } @@ -1239,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { - printk(KERN_ERR "SCTP: Failed bind hash alloc."); + pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } @@ -1248,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - printk(KERN_INFO "SCTP: Hash tables configured " - "(established %d bind %d)\n", + pr_info("Hash tables configured (established %d bind %d)\n", sctp_assoc_hashsize, sctp_port_hashsize); /* Disable ADDIP by default. */ @@ -1290,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize the control inode/socket for handling OOTB packets. 
*/ if ((status = sctp_ctl_sock_init())) { - printk (KERN_ERR - "SCTP: Failed to initialize the SCTP control sock.\n"); + pr_err("Failed to initialize the SCTP control sock\n"); goto err_ctl_sock_init; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 246f9292465..2cc46f0962c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f5e5e27cac5..b21b218d564 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/skbuff.h> #include <linux/types.h> #include <linux/socket.h> @@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, case SCTP_DISPOSITION_VIOLATION: if (net_ratelimit()) - printk(KERN_ERR "sctp protocol violation state %d " - "chunkid %d\n", state, subtype.chunk); + pr_err("protocol violation state %d chunkid %d\n", + state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: - printk(KERN_WARNING "sctp unimplemented feature in state %d, " - "event_type %d, event_id %d\n", - state, event_type, subtype.chunk); + pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", + state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: - printk(KERN_ERR "sctp bug in state %d, " - "event_type %d, event_id %d\n", + pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: - printk(KERN_ERR "sctp impossible disposition %d " - "in state %d, event_type %d, event_id %d\n", + pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); BUG(); break; @@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_send_asconf(asoc); break; default: - printk(KERN_WARNING "Impossible command: %u, %p\n", - cmd->verb, cmd->obj.ptr); + pr_warn("Impossible command: %u, %p\n", + cmd->verb, cmd->obj.ptr); break; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 24b2cd55563..8b284436be6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, if (unlikely(!link)) { if (from_addr.sa.sa_family == AF_INET6) { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI6\n", - __func__, - asoc, - &from_addr.v6.sin6_addr); + pr_warn("%s association %p could not find address %pI6\n", + __func__, + asoc, + &from_addr.v6.sin6_addr); } else { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI4\n", - __func__, - asoc, - &from_addr.v4.sin_addr.s_addr); + pr_warn("%s association %p could not find address %pI4\n", + __func__, + asoc, + &from_addr.v4.sin_addr.s_addr); } return SCTP_DISPOSITION_DISCARD; } diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 6d9b3aafcc5..546d4387fb3 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> @@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = { .name = "sctp_sf_bug" }; -#define DO_LOOKUP(_max, _type, _table) \ - if ((event_subtype._type > (_max))) { \ - printk(KERN_WARNING \ - "sctp table %p possible attack:" \ - " event %d exceeds max %d\n", \ - _table, event_subtype._type, _max); \ - return &bug; \ - } \ - return &_table[event_subtype._type][(int)state]; +#define DO_LOOKUP(_max, _type, _table) \ +({ \ + const sctp_sm_table_entry_t *rtn; \ + \ + if ((event_subtype._type > (_max))) { \ + pr_warn("table %p possible attack: event %d exceeds max %d\n", \ + _table, event_subtype._type, _max); \ + rtn = &bug; \ + } else \ + rtn = &_table[event_subtype._type][(int)state]; \ + \ + rtn; \ +}) const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, sctp_state_t state, @@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(event_subtype.chunk, state); - break; case SCTP_EVENT_T_TIMEOUT: - DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, - timeout_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, + timeout_event_table); case SCTP_EVENT_T_OTHER: - DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, + other_event_table); case SCTP_EVENT_T_PRIMITIVE: - DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, - primitive_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, + primitive_event_table); default: /* Yikes! We got an illegal event type. */ return &bug; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ca44917872d..f4bec277235 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -57,6 +57,8 @@ * be incorporated into the next SCTP release. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> @@ -2458,9 +2460,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, if (params.sack_delay == 0 && params.sack_freq == 0) return 0; } else if (optlen == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; @@ -2868,10 +2869,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int val; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; params.assoc_id = 0; @@ -3121,10 +3120,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int assoc_id = 0; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; } else if (optlen == sizeof(struct sctp_assoc_value)) { @@ -4281,9 +4278,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else @@ -4929,10 +4925,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5023,10 +5017,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5586,8 +5578,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { if (net_ratelimit()) { - printk(KERN_INFO - "SCTP: failed to load 
transform for %s: %ld\n", + pr_info("failed to load transform for %s: %ld\n", sctp_hmac_alg, PTR_ERR(tfm)); } return -ENOSYS; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 132046cb82f..d3ae493d234 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/random.h> @@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) struct dst_entry *dst; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", - __func__, pmtu, - SCTP_DEFAULT_MINSEGMENT); + pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, + SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment size and disable * pmtu discovery on this transport. */ diff --git a/net/socket.c b/net/socket.c index 2270b941bcc..7848d12f5e4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -535,14 +535,13 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -int sock_tx_timestamp(struct msghdr *msg, struct sock *sk, - union skb_shared_tx *shtx) +int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) { - shtx->flags = 0; + *tx_flags = 0; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - shtx->hardware = 1; + *tx_flags |= SKBTX_HW_TSTAMP; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - shtx->software = 1; + *tx_flags |= SKBTX_SW_TSTAMP; return 0; } EXPORT_SYMBOL(sock_tx_timestamp); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8dc47f1d000..36cb66022a2 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -19,6 +19,15 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif +#define RPC_CREDCACHE_DEFAULT_HASHBITS (4) +struct rpc_cred_cache { + struct hlist_head *hashtable; + unsigned int hashbits; + spinlock_t lock; +}; + +static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; + static DEFINE_SPINLOCK(rpc_authflavor_lock); static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { &authnull_ops, /* AUTH_NULL */ @@ -29,6 +38,47 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; +#define MAX_HASHTABLE_BITS (10) +static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + unsigned int nbits; + int ret; + + if (!val) + goto out_inval; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL) + goto out_inval; + nbits = fls(num); + if (num > (1U << nbits)) + nbits++; + if (nbits > MAX_HASHTABLE_BITS || nbits < 2) + goto out_inval; + *(unsigned int *)kp->arg = nbits; + return 0; +out_inval: + return -EINVAL; +} + +static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) +{ + unsigned int nbits; + + nbits = *(unsigned int *)kp->arg; + return sprintf(buffer, "%u", 1U << nbits); +} + +#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); + +static struct kernel_param_ops param_ops_hashtbl_sz = { + .set = param_set_hashtbl_sz, + .get = param_get_hashtbl_sz, +}; + +module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); +MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); + static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor >= RPC_AUTH_MAXFLAVOR) @@ -145,16 +195,23 @@ int rpcauth_init_credcache(struct rpc_auth *auth) { struct 
rpc_cred_cache *new; - int i; + unsigned int hashsize; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) - return -ENOMEM; - for (i = 0; i < RPC_CREDCACHE_NR; i++) - INIT_HLIST_HEAD(&new->hashtable[i]); + goto out_nocache; + new->hashbits = auth_hashbits; + hashsize = 1U << new->hashbits; + new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); + if (!new->hashtable) + goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; +out_nohashtbl: + kfree(new); +out_nocache: + return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); @@ -183,11 +240,12 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; + unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); - for (i = 0; i < RPC_CREDCACHE_NR; i++) { + for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); @@ -216,6 +274,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth) if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); + kfree(cache->hashtable); kfree(cache); } } @@ -297,7 +356,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, *entry, *new; unsigned int nr; - nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS); + nr = hash_long(acred->uid, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { @@ -390,16 +449,16 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -void +struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) { - task->tk_msg.rpc_cred = get_rpccred(cred); dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + return get_rpccred(cred); } EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); -static void +static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; @@ -407,45 +466,43 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) .uid = 0, .gid = 0, }; - struct rpc_cred *ret; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); - ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } -static void +static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; - struct rpc_cred *ret; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, auth->au_ops->au_name); - ret = rpcauth_lookupcred(auth, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return rpcauth_lookupcred(auth, lookupflags); } -void +static int rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) { + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *new; int lookupflags = 0; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW; if (cred != NULL) - cred->cr_ops->crbind(task, cred, lookupflags); + new = cred->cr_ops->crbind(task, cred, lookupflags); else if (flags & RPC_TASK_ROOTCREDS) - rpcauth_bind_root_cred(task, lookupflags); + new = rpcauth_bind_root_cred(task, lookupflags); else - 
rpcauth_bind_new_cred(task, lookupflags); + new = rpcauth_bind_new_cred(task, lookupflags); + if (IS_ERR(new)) + return PTR_ERR(new); + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); + req->rq_cred = new; + return 0; } void @@ -484,22 +541,10 @@ out_nodestroy: } EXPORT_SYMBOL_GPL(put_rpccred); -void -rpcauth_unbindcred(struct rpc_task *task) -{ - struct rpc_cred *cred = task->tk_msg.rpc_cred; - - dprintk("RPC: %5u releasing %s cred %p\n", - task->tk_pid, cred->cr_auth->au_ops->au_name, cred); - - put_rpccred(cred); - task->tk_msg.rpc_cred = NULL; -} - __be32 * rpcauth_marshcred(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u marshaling %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -510,7 +555,7 @@ rpcauth_marshcred(struct rpc_task *task, __be32 *p) __be32 * rpcauth_checkverf(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u validating %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -522,7 +567,7 @@ int rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *data, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u using %s cred %p to wrap rpc data\n", task->tk_pid, cred->cr_ops->cr_name, cred); @@ -536,7 +581,7 @@ int rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *data, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n", task->tk_pid, cred->cr_ops->cr_name, cred); @@ -550,13 +595,21 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, int rpcauth_refreshcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; int err; + cred = task->tk_rqstp->rq_cred; + if (cred == NULL) { + err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); + if (err < 0) + goto out; + cred = task->tk_rqstp->rq_cred; + }; dprintk("RPC: %5u refreshing %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); err = cred->cr_ops->crrefresh(task); +out: if (err < 0) task->tk_status = err; return err; @@ -565,7 +618,7 @@ rpcauth_refreshcred(struct rpc_task *task) void rpcauth_invalcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u invalidating %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -576,7 +629,7 @@ rpcauth_invalcred(struct rpc_task *task) int rpcauth_uptodatecred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; @@ -587,14 +640,27 @@ static struct shrinker rpc_cred_shrinker = { .seeks = DEFAULT_SEEKS, }; -void __init rpcauth_init_module(void) +int __init rpcauth_init_module(void) { - rpc_init_authunix(); - rpc_init_generic_auth(); + int err; + + err = rpc_init_authunix(); + if (err < 0) + goto out1; + err = rpc_init_generic_auth(); + if (err < 0) + goto out2; register_shrinker(&rpc_cred_shrinker); + return 0; +out2: + rpc_destroy_authunix(); +out1: + return err; } void __exit rpcauth_remove_module(void) { + 
rpc_destroy_authunix(); + rpc_destroy_generic_auth(); unregister_shrinker(&rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 8f623b0f03d..43162bb3b78 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -27,7 +27,6 @@ struct generic_cred { }; static struct rpc_auth generic_auth; -static struct rpc_cred_cache generic_cred_cache; static const struct rpc_credops generic_credops; /* @@ -55,18 +54,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); -static void -generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) +static struct rpc_cred *generic_bind_cred(struct rpc_task *task, + struct rpc_cred *cred, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - struct rpc_cred *ret; - ret = auth->au_ops->lookup_cred(auth, acred, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return auth->au_ops->lookup_cred(auth, acred, lookupflags); } /* @@ -159,20 +153,16 @@ out_nomatch: return 0; } -void __init rpc_init_generic_auth(void) +int __init rpc_init_generic_auth(void) { - spin_lock_init(&generic_cred_cache.lock); + return rpcauth_init_credcache(&generic_auth); } void __exit rpc_destroy_generic_auth(void) { - rpcauth_clear_credcache(&generic_cred_cache); + rpcauth_destroy_credcache(&generic_auth); } -static struct rpc_cred_cache generic_cred_cache = { - {{ NULL, },}, -}; - static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, .au_name = "Generic", @@ -183,7 +173,6 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, .au_count = ATOMIC_INIT(0), - .au_credcache = &generic_cred_cache, }; static const struct rpc_credops generic_credops = { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 8da2a0e6857..dcfc66bab2b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -373,7 +373,7 @@ gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss static void gss_upcall_callback(struct rpc_task *task) { - struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, + struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct inode *inode = &gss_msg->inode->vfs_inode; @@ -502,7 +502,7 @@ static void warn_gssd(void) static inline int gss_refresh_upcall(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, @@ -928,6 +928,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx) { dprintk("RPC: gss_free_ctx\n"); + gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); kfree(ctx); } @@ -942,13 +943,7 @@ gss_free_ctx_callback(struct rcu_head *head) static void gss_free_ctx(struct gss_cl_ctx *ctx) { - struct gss_ctx *gc_gss_ctx; - - gc_gss_ctx = rcu_dereference(ctx->gc_gss_ctx); - rcu_assign_pointer(ctx->gc_gss_ctx, NULL); call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); - if (gc_gss_ctx) - gss_delete_sec_context(&gc_gss_ctx); } static void @@ -1064,12 +1059,12 @@ out: static __be32 * gss_marshal(struct rpc_task *task, __be32 *p) { - struct rpc_cred 
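generic_bind_cred() and the GSS code repeatedly recover the enclosing generic_cred or gss_cred from an embedded rpc_cred with container_of(). A self-contained userspace rendition of that macro, using hypothetical wrapper/base types in place of gss_cred and its gc_base member:

    #include <stddef.h>
    #include <stdio.h>

    /* Userspace equivalent of the kernel's container_of(). */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct base { int id; };

    struct wrapper {
        char name[16];
        struct base b;              /* embedded, like gc_base */
    };

    int main(void)
    {
        struct wrapper w = { "demo", { 42 } };
        struct base *bp = &w.b;     /* only the embedded part is passed around */
        struct wrapper *back = container_of(bp, struct wrapper, b);

        printf("%s %d\n", back->name, back->b.id);  /* prints: demo 42 */
        return 0;
    }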
*cred = task->tk_msg.rpc_cred; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *cred_len; - struct rpc_rqst *req = task->tk_rqstp; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; @@ -1119,7 +1114,7 @@ out_put_ctx: static int gss_renew_cred(struct rpc_task *task) { - struct rpc_cred *oldcred = task->tk_msg.rpc_cred; + struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); @@ -1133,7 +1128,7 @@ static int gss_renew_cred(struct rpc_task *task) new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); - task->tk_msg.rpc_cred = new; + task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } @@ -1161,7 +1156,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred) static int gss_refresh(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) @@ -1172,7 +1167,7 @@ gss_refresh(struct rpc_task *task) ret = gss_renew_cred(task); if (ret < 0) goto out; - cred = task->tk_msg.rpc_cred; + cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) @@ -1191,7 +1186,7 @@ gss_refresh_null(struct rpc_task *task) static __be32 * gss_validate(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 seq; struct kvec iov; @@ -1400,7 +1395,7 @@ static int gss_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *p, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); @@ -1503,7 +1498,7 @@ static int gss_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *p, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 1db618f56ec..a5c36c01707 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -75,7 +75,7 @@ nul_marshal(struct rpc_task *task, __be32 *p) static int nul_refresh(struct rpc_task *task) { - set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags); + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index aac2f8b4ee2..4cb70dc6e7a 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -29,7 +29,6 @@ struct unx_cred { #endif static struct rpc_auth unix_auth; -static struct rpc_cred_cache unix_cred_cache; static const struct rpc_credops unix_credops; static struct rpc_auth * @@ -141,7 +140,7 @@ static __be32 * unx_marshal(struct rpc_task *task, __be32 *p) { struct rpc_clnt *clnt = task->tk_client; - struct unx_cred *cred = container_of(task->tk_msg.rpc_cred, struct unx_cred, uc_base); + struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); __be32 *base, *hold; int i; @@ -174,7 +173,7 @@ unx_marshal(struct rpc_task *task, 
__be32 *p) static int unx_refresh(struct rpc_task *task) { - set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags); + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } @@ -197,15 +196,20 @@ unx_validate(struct rpc_task *task, __be32 *p) printk("RPC: giant verf size: %u\n", size); return NULL; } - task->tk_msg.rpc_cred->cr_auth->au_rslack = (size >> 2) + 2; + task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2; p += (size >> 2); return p; } -void __init rpc_init_authunix(void) +int __init rpc_init_authunix(void) { - spin_lock_init(&unix_cred_cache.lock); + return rpcauth_init_credcache(&unix_auth); +} + +void rpc_destroy_authunix(void) +{ + rpcauth_destroy_credcache(&unix_auth); } const struct rpc_authops authunix_ops = { @@ -219,17 +223,12 @@ const struct rpc_authops authunix_ops = { }; static -struct rpc_cred_cache unix_cred_cache = { -}; - -static struct rpc_auth unix_auth = { .au_cslack = UNX_WRITESLACK, .au_rslack = 2, /* assume AUTH_NULL verf */ .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), - .au_credcache = &unix_cred_cache, }; static diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 58de76c8540..2b06410e584 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -34,7 +34,6 @@ #include <linux/sunrpc/cache.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/rpc_pipe_fs.h> -#include <linux/smp_lock.h> #define RPCDBG_FACILITY RPCDBG_CACHE @@ -320,7 +319,7 @@ static struct cache_detail *current_detail; static int current_index; static void do_cache_clean(struct work_struct *work); -static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); +static struct delayed_work cache_cleaner; static void sunrpc_init_cache_detail(struct cache_detail *cd) { @@ -1504,6 +1503,11 @@ static int create_cache_proc_entries(struct cache_detail *cd) } #endif +void __init cache_initialize(void) +{ + INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean); +} + int cache_register(struct cache_detail *cd) { int ret; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 756fc324db9..2388d83b68f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -414,6 +414,35 @@ out_no_clnt: EXPORT_SYMBOL_GPL(rpc_clone_client); /* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? + */ +void rpc_killall_tasks(struct rpc_clnt *clnt) +{ + struct rpc_task *rovr; + + + if (list_empty(&clnt->cl_tasks)) + return; + dprintk("RPC: killing all tasks for client %p\n", clnt); + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(rovr)) + continue; + if (!(rovr->tk_flags & RPC_TASK_KILLED)) { + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); + } + } + spin_unlock(&clnt->cl_lock); +} +EXPORT_SYMBOL_GPL(rpc_killall_tasks); + +/* * Properly shut down an RPC client, terminating all outstanding * requests. 
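The relocated rpc_killall_tasks() walks the client's task list under cl_lock, marking every live task RPC_TASK_KILLED and terminating it with -EIO. A pthread-based sketch of the same walk-and-flag pattern, with simplified task/client types that are not the kernel's:

    #include <pthread.h>

    #define TASK_KILLED 0x1

    struct task {
        struct task *next;
        int flags;
        int status;
    };

    struct client {
        pthread_mutex_t lock;       /* stands in for clnt->cl_lock */
        struct task *tasks;         /* live tasks, like cl_tasks */
    };

    static void killall_tasks(struct client *clnt)
    {
        struct task *t;

        pthread_mutex_lock(&clnt->lock);
        for (t = clnt->tasks; t != NULL; t = t->next) {
            if (!(t->flags & TASK_KILLED)) {
                t->flags |= TASK_KILLED;
                t->status = -5;     /* -EIO, as in rpc_exit(rovr, -EIO) */
            }
        }
        pthread_mutex_unlock(&clnt->lock);
    }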
*/ @@ -538,6 +567,49 @@ out: } EXPORT_SYMBOL_GPL(rpc_bind_new_program); +void rpc_task_release_client(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (clnt != NULL) { + /* Remove from client task list */ + spin_lock(&clnt->cl_lock); + list_del(&task->tk_task); + spin_unlock(&clnt->cl_lock); + task->tk_client = NULL; + + rpc_release_client(clnt); + } +} + +static +void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + if (clnt != NULL) { + rpc_task_release_client(task); + task->tk_client = clnt; + kref_get(&clnt->cl_kref); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + /* Add to the client's list of all tasks */ + spin_lock(&clnt->cl_lock); + list_add_tail(&task->tk_task, &clnt->cl_tasks); + spin_unlock(&clnt->cl_lock); + } +} + +static void +rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) +{ + if (msg != NULL) { + task->tk_msg.rpc_proc = msg->rpc_proc; + task->tk_msg.rpc_argp = msg->rpc_argp; + task->tk_msg.rpc_resp = msg->rpc_resp; + if (msg->rpc_cred != NULL) + task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred); + } +} + /* * Default callback for async RPC calls */ @@ -562,6 +634,18 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) if (IS_ERR(task)) goto out; + rpc_task_set_client(task, task_setup_data->rpc_client); + rpc_task_set_rpc_message(task, task_setup_data->rpc_message); + + if (task->tk_status != 0) { + int ret = task->tk_status; + rpc_put_task(task); + return ERR_PTR(ret); + } + + if (task->tk_action == NULL) + rpc_call_start(task); + atomic_inc(&task->tk_count); rpc_execute(task); out: @@ -756,12 +840,13 @@ EXPORT_SYMBOL_GPL(rpc_force_rebind); * Restart an (async) RPC call from the call_prepare state. * Usually called from within the exit handler. */ -void +int rpc_restart_call_prepare(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) - return; + return 0; task->tk_action = rpc_prepare_task; + return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); @@ -769,13 +854,13 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); * Restart an (async) RPC call. Usually called from within the * exit handler. */ -void +int rpc_restart_call(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) - return; - + return 0; task->tk_action = call_start; + return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call); @@ -824,11 +909,6 @@ call_reserve(struct rpc_task *task) { dprint_status(task); - if (!rpcauth_uptodatecred(task)) { - task->tk_action = call_refresh; - return; - } - task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); @@ -892,7 +972,7 @@ call_reserveresult(struct rpc_task *task) static void call_allocate(struct rpc_task *task) { - unsigned int slack = task->tk_msg.rpc_cred->cr_auth->au_cslack; + unsigned int slack = task->tk_client->cl_auth->au_cslack; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = task->tk_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; @@ -900,7 +980,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_refresh; if (req->rq_buffer) return; @@ -937,6 +1017,47 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } +/* + * 2a. Bind and/or refresh the credentials + */ +static void +call_refresh(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_action = call_refreshresult; + task->tk_status = 0; + task->tk_client->cl_stats->rpcauthrefresh++; + rpcauth_refreshcred(task); +} + +/* + * 2b. 
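call_allocate() now chains to call_refresh before call_bind, following the usual SUNRPC convention that every state handler stores the next handler in tk_action and returns. A runnable miniature of that function-pointer state machine (the step names are illustrative; the real driving loop is __rpc_execute()):

    #include <stdio.h>

    struct task;
    typedef void (*action_t)(struct task *);

    struct task {
        action_t action;            /* like tk_action; NULL ends the task */
        int status;
    };

    static void step_bind(struct task *t);

    static void step_refresh(struct task *t)
    {
        printf("refresh credentials\n");
        t->action = step_bind;
    }

    static void step_allocate(struct task *t)
    {
        printf("allocate buffers\n");
        t->action = step_refresh;   /* the patch's allocate -> refresh hop */
    }

    static void step_bind(struct task *t)
    {
        printf("bind to server\n");
        t->action = NULL;
    }

    int main(void)
    {
        struct task t = { step_allocate, 0 };

        while (t.action)            /* __rpc_execute() in miniature */
            t.action(&t);
        return 0;
    }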
Process the results of a credential refresh + */ +static void +call_refreshresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_bind; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; + switch (status) { + case -EACCES: + rpc_exit(task, -EACCES); + return; + case -ENOMEM: + rpc_exit(task, -ENOMEM); + return; + case -ETIMEDOUT: + rpc_delay(task, 3*HZ); + } + task->tk_action = call_refresh; +} + static inline int rpc_task_need_encode(struct rpc_task *task) { @@ -1472,43 +1593,6 @@ out_retry: } } -/* - * 8. Refresh the credentials if rejected by the server - */ -static void -call_refresh(struct rpc_task *task) -{ - dprint_status(task); - - task->tk_action = call_refreshresult; - task->tk_status = 0; - task->tk_client->cl_stats->rpcauthrefresh++; - rpcauth_refreshcred(task); -} - -/* - * 8a. Process the results of a credential refresh - */ -static void -call_refreshresult(struct rpc_task *task) -{ - int status = task->tk_status; - - dprint_status(task); - - task->tk_status = 0; - task->tk_action = call_reserve; - if (status >= 0 && rpcauth_uptodatecred(task)) - return; - if (status == -EACCES) { - rpc_exit(task, -EACCES); - return; - } - task->tk_action = call_refresh; - if (status != -ETIMEDOUT) - rpc_delay(task, 3*HZ); -} - static __be32 * rpc_encode_header(struct rpc_task *task) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 4a843b883b8..cace6049e4a 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -246,17 +246,8 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task) static void rpc_set_active(struct rpc_task *task) { - struct rpc_clnt *clnt; - if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0) - return; rpc_task_set_debuginfo(task); - /* Add to global list of all tasks */ - clnt = task->tk_client; - if (clnt != NULL) { - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); - } + set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); } /* @@ -319,11 +310,6 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", task->tk_pid, rpc_qname(q), jiffies); - if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) { - printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n"); - return; - } - __rpc_add_wait_queue(q, task); BUG_ON(task->tk_callback != NULL); @@ -334,8 +320,8 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { - /* Mark the task as being activated if so needed */ - rpc_set_active(task); + /* We shouldn't ever put an inactive task to sleep */ + BUG_ON(!RPC_IS_ACTIVATED(task)); /* * Protect the queue operations. @@ -406,14 +392,6 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); /* - * Wake up the specified task - */ -static void rpc_wake_up_task(struct rpc_task *task) -{ - rpc_wake_up_queued_task(task->tk_waitqueue, task); -} - -/* * Wake up the next task on a priority queue. 
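The new call_refreshresult() sorts refresh outcomes into four buckets: success with an up-to-date credential proceeds to call_bind, -EACCES and -ENOMEM are fatal, -ETIMEDOUT sleeps before retrying, and anything else retries immediately. A compact sketch of that dispatch, with hard-coded errno values for illustration:

    enum outcome { DONE, FATAL, RETRY, RETRY_AFTER_DELAY };

    static enum outcome classify(int status, int cred_uptodate)
    {
        if (status >= 0 && cred_uptodate)
            return DONE;            /* fall through to call_bind */
        switch (status) {
        case -13:                   /* -EACCES: rpc_exit() */
        case -12:                   /* -ENOMEM: rpc_exit() */
            return FATAL;
        case -110:                  /* -ETIMEDOUT: rpc_delay(3*HZ) first */
            return RETRY_AFTER_DELAY;
        default:
            return RETRY;           /* back to call_refresh */
        }
    }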
*/ static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue) @@ -600,7 +578,15 @@ void rpc_exit_task(struct rpc_task *task) } } } -EXPORT_SYMBOL_GPL(rpc_exit_task); + +void rpc_exit(struct rpc_task *task, int status) +{ + task->tk_status = status; + task->tk_action = rpc_exit_task; + if (RPC_IS_QUEUED(task)) + rpc_wake_up_queued_task(task->tk_waitqueue, task); +} +EXPORT_SYMBOL_GPL(rpc_exit); void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) { @@ -690,7 +676,6 @@ static void __rpc_execute(struct rpc_task *task) dprintk("RPC: %5u got signal\n", task->tk_pid); task->tk_flags |= RPC_TASK_KILLED; rpc_exit(task, -ERESTARTSYS); - rpc_wake_up_task(task); } rpc_set_running(task); dprintk("RPC: %5u sync task resuming\n", task->tk_pid); @@ -714,8 +699,9 @@ static void __rpc_execute(struct rpc_task *task) void rpc_execute(struct rpc_task *task) { rpc_set_active(task); - rpc_set_running(task); - __rpc_execute(task); + rpc_make_runnable(task); + if (!RPC_IS_ASYNC(task)) + __rpc_execute(task); } static void rpc_async_schedule(struct work_struct *work) @@ -808,26 +794,9 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize workqueue for async tasks */ task->tk_workqueue = task_setup_data->workqueue; - task->tk_client = task_setup_data->rpc_client; - if (task->tk_client != NULL) { - kref_get(&task->tk_client->cl_kref); - if (task->tk_client->cl_softrtry) - task->tk_flags |= RPC_TASK_SOFT; - } - if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; - if (task_setup_data->rpc_message != NULL) { - task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc; - task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp; - task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp; - /* Bind the user cred */ - rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags); - if (task->tk_action == NULL) - rpc_call_start(task); - } - /* starting timestamp */ task->tk_start = ktime_get(); @@ -896,11 +865,8 @@ void rpc_put_task(struct rpc_task *task) if (task->tk_rqstp) xprt_release(task); if (task->tk_msg.rpc_cred) - rpcauth_unbindcred(task); - if (task->tk_client) { - rpc_release_client(task->tk_client); - task->tk_client = NULL; - } + put_rpccred(task->tk_msg.rpc_cred); + rpc_task_release_client(task); if (task->tk_workqueue != NULL) { INIT_WORK(&task->u.tk_work, rpc_async_release); queue_work(task->tk_workqueue, &task->u.tk_work); @@ -913,13 +879,6 @@ static void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %5u release task\n", task->tk_pid); - if (!list_empty(&task->tk_task)) { - struct rpc_clnt *clnt = task->tk_client; - /* Remove from client task list */ - spin_lock(&clnt->cl_lock); - list_del(&task->tk_task); - spin_unlock(&clnt->cl_lock); - } BUG_ON (RPC_IS_QUEUED(task)); /* Wake up anyone who is waiting for task completion */ @@ -928,35 +887,6 @@ static void rpc_release_task(struct rpc_task *task) rpc_put_task(task); } -/* - * Kill all tasks for the given client. - * XXX: kill their descendants as well? - */ -void rpc_killall_tasks(struct rpc_clnt *clnt) -{ - struct rpc_task *rovr; - - - if (list_empty(&clnt->cl_tasks)) - return; - dprintk("RPC: killing all tasks for client %p\n", clnt); - /* - * Spin lock all_tasks to prevent changes... - */ - spin_lock(&clnt->cl_lock); - list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { - if (! 
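rpc_execute() is reworked so that every task is marked active and made runnable up front, but only a synchronous caller spins the state-machine loop itself; an asynchronous task is left for the rpciod workqueue. A sketch of that split, reusing the same illustrative task shape as the state-machine example above (redeclared so the snippet stands alone):

    struct task {
        int active;
        void (*action)(struct task *);  /* NULL terminates the loop */
    };

    static void execute(struct task *t, int is_async)
    {
        t->active = 1;              /* rpc_set_active() + rpc_make_runnable() */
        if (is_async)
            return;                 /* a worker thread runs the loop instead */
        while (t->action)           /* __rpc_execute(), sync case only */
            t->action(t);
    }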
RPC_IS_ACTIVATED(rovr)) - continue; - if (!(rovr->tk_flags & RPC_TASK_KILLED)) { - rovr->tk_flags |= RPC_TASK_KILLED; - rpc_exit(rovr, -EIO); - rpc_wake_up_task(rovr); - } - } - spin_unlock(&clnt->cl_lock); -} -EXPORT_SYMBOL_GPL(rpc_killall_tasks); - int rpciod_up(void) { return try_module_get(THIS_MODULE) ? 0 : -EINVAL; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f438347d817..c0d085013a2 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -33,21 +33,27 @@ init_sunrpc(void) if (err) goto out; err = rpc_init_mempool(); - if (err) { - unregister_rpc_pipefs(); - goto out; - } + if (err) + goto out2; + err = rpcauth_init_module(); + if (err) + goto out3; #ifdef RPC_DEBUG rpc_register_sysctl(); #endif #ifdef CONFIG_PROC_FS rpc_proc_init(); #endif + cache_initialize(); cache_register(&ip_map_cache); cache_register(&unix_gid_cache); svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ - rpcauth_init_module(); + return 0; +out3: + rpc_destroy_mempool(); +out2: + unregister_rpc_pipefs(); out: return err; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index dcd0132396b..970fb00f388 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1032,6 +1032,8 @@ void xprt_release(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(req->rq_buffer); + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); @@ -1129,6 +1131,7 @@ static void xprt_destroy(struct kref *kref) rpc_destroy_wait_queue(&xprt->sending); rpc_destroy_wait_queue(&xprt->resend); rpc_destroy_wait_queue(&xprt->backlog); + cancel_work_sync(&xprt->task_cleanup); /* * Tear down transport state and free the rpc_xprt */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7ca65c7005e..49a62f0c4b8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2577,7 +2577,8 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, struct kernel_param *kp, +static int param_set_uint_minmax(const char *val, + const struct kernel_param *kp, unsigned int min, unsigned int max) { unsigned long num; @@ -2592,34 +2593,37 @@ static int param_set_uint_minmax(const char *val, struct kernel_param *kp, return 0; } -static int param_set_portnr(const char *val, struct kernel_param *kp) +static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } -static int param_get_portnr(char *buffer, struct kernel_param *kp) -{ - return param_get_uint(buffer, kp); -} +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; + #define param_check_portnr(name, p) \ __param_check(name, p, unsigned int); module_param_named(min_resvport, xprt_min_resvport, portnr, 0644); module_param_named(max_resvport, xprt_max_resvport, portnr, 0644); -static int param_set_slot_table_size(const char *val, struct kernel_param *kp) +static int param_set_slot_table_size(const char *val, + const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE); } -static int param_get_slot_table_size(char *buffer, struct kernel_param *kp) -{ - return param_get_uint(buffer, kp); -} +static struct kernel_param_ops param_ops_slot_table_size = { + .set = param_set_slot_table_size, + .get = param_get_uint, 
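In xprtsock.c the separate set/get module-parameter handlers collapse into kernel_param_ops tables whose setters share one bounded parser and whose getters reuse param_get_uint directly. A userspace sketch of the same shape; the ops struct and the 665..1023 bounds are illustrative stand-ins, not the kernel definitions:

    #include <stdio.h>
    #include <stdlib.h>

    struct param_ops {
        int (*set)(const char *val, unsigned int *out);
        int (*get)(char *buf, size_t len, unsigned int val);
    };

    static int set_uint_minmax(const char *val, unsigned int *out,
                               unsigned int min, unsigned int max)
    {
        char *end;
        unsigned long num = strtoul(val, &end, 0);

        if (*val == '\0' || *end != '\0' || num < min || num > max)
            return -1;              /* -EINVAL in the kernel version */
        *out = (unsigned int)num;
        return 0;
    }

    static int set_portnr(const char *val, unsigned int *out)
    {
        return set_uint_minmax(val, out, 665, 1023);
    }

    static int get_uint(char *buf, size_t len, unsigned int val)
    {
        return snprintf(buf, len, "%u", val);
    }

    static const struct param_ops param_ops_portnr = {
        .set = set_portnr,
        .get = get_uint,
    };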
+}; + #define param_check_slot_table_size(name, p) \ __param_check(name, p, unsigned int); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a008c668930..b11248c2d78 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -143,6 +143,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf) } +static void bclink_set_last_sent(void) +{ + if (bcl->next_out) + bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); + else + bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); +} + +u32 tipc_bclink_get_last_sent(void) +{ + return bcl->fsm_msg_cnt; +} + /** * bclink_set_gap - set gap according to contents of current deferred pkt queue * @@ -237,8 +250,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) /* Try resolving broadcast link congestion, if necessary */ - if (unlikely(bcl->next_out)) + if (unlikely(bcl->next_out)) { tipc_link_push_queue(bcl); + bclink_set_last_sent(); + } if (unlikely(released && !list_empty(&bcl->waiting_ports))) tipc_link_wakeup_ports(bcl, 0); spin_unlock_bh(&bc_lock); @@ -395,7 +410,7 @@ int tipc_bclink_send_msg(struct sk_buff *buf) if (unlikely(res == -ELINKCONG)) buf_discard(buf); else - bcl->stats.sent_info++; + bclink_set_last_sent(); if (bcl->out_queue_size > bcl->stats.max_queue_sz) bcl->stats.max_queue_sz = bcl->out_queue_size; @@ -529,15 +544,6 @@ receive: tipc_node_unlock(node); } -u32 tipc_bclink_get_last_sent(void) -{ - u32 last_sent = mod(bcl->next_out_no - 1); - - if (bcl->next_out) - last_sent = mod(buf_seqno(bcl->next_out) - 1); - return last_sent; -} - u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) { return (n_ptr->bclink.supported && @@ -570,6 +576,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, msg = buf_msg(buf); msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tipc_net_id); + bcl->stats.sent_info++; } /* Send buffer over bearers until all targets reached */ @@ -609,11 +616,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf, bcbearer->remains = bcbearer->remains_new; } - /* Unable to reach all targets */ + /* + * Unable to reach all targets (indicate success, since currently + * there isn't code in place to properly block & unblock the + * pseudo-bearer used by the broadcast link) + */ - bcbearer->bearer.publ.blocked = 1; - bcl->stats.bearer_congs++; - return 1; + return TIPC_OK; } /** diff --git a/net/tipc/core.c b/net/tipc/core.c index 69646811798..466b861dab9 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -169,6 +169,7 @@ void tipc_core_stop(void) tipc_nametbl_stop(); tipc_ref_table_stop(); tipc_socket_stop(); + tipc_log_resize(0); } /** @@ -203,7 +204,9 @@ static int __init tipc_init(void) { int res; - tipc_log_resize(CONFIG_TIPC_LOG); + if (tipc_log_resize(CONFIG_TIPC_LOG) != 0) + warn("Unable to create log buffer\n"); + info("Activated (version " TIPC_MOD_VER " compiled " __DATE__ " " __TIME__ ")\n"); @@ -230,7 +233,6 @@ static void __exit tipc_exit(void) tipc_core_stop_net(); tipc_core_stop(); info("Deactivated\n"); - tipc_log_resize(0); } module_init(tipc_init); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index fc1fcf5e6b5..f28d1ae9312 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -203,6 +203,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr) return; } spin_lock_bh(&n_ptr->lock); + + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + spin_unlock_bh(&n_ptr->lock); + return; + } + link = n_ptr->links[b_ptr->identity]; if (!link) { dbg("creating link\n"); diff --git a/net/tipc/link.c b/net/tipc/link.c index 
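The broadcast-link change in bcast.c keeps fsm_msg_cnt as "last sent", computed as one before the next sequence number with TIPC's wrapping mod() helper. A tiny sketch of that arithmetic, assuming TIPC's 16-bit link sequence space (the 0xffff mask reflects how mod() is defined in the TIPC headers of this era):

    /* Wrapping 16-bit sequence arithmetic. */
    static unsigned int mod(unsigned int x)
    {
        return x & 0xffff;
    }

    /* "One before the next to send", as in bclink_set_last_sent();
     * mod(0 - 1) wraps to 0xffff rather than going negative. */
    static unsigned int last_sent(unsigned int next_out_no)
    {
        return mod(next_out_no - 1);
    }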
a3616b99529..a6a3102bb4d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1802,6 +1802,15 @@ static int link_recv_buf_validate(struct sk_buff *buf) return pskb_may_pull(buf, hdr_size); } +/** + * tipc_recv_msg - process TIPC messages arriving from off-node + * @head: pointer to message buffer chain + * @tb_ptr: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. + */ + void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) { read_lock_bh(&tipc_net_lock); @@ -1819,6 +1828,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) head = head->next; + /* Ensure bearer is still enabled */ + + if (unlikely(!b_ptr->active)) + goto cont; + /* Ensure message is well-formed */ if (unlikely(!link_recv_buf_validate(buf))) @@ -1855,13 +1869,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) goto cont; } - /* Locate unicast link endpoint that should handle message */ + /* Locate neighboring node that sent message */ n_ptr = tipc_node_find(msg_prevnode(msg)); if (unlikely(!n_ptr)) goto cont; tipc_node_lock(n_ptr); + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + tipc_node_unlock(n_ptr); + goto cont; + } + + /* Locate unicast link endpoint that should handle message */ + l_ptr = n_ptr->links[b_ptr->identity]; if (unlikely(!l_ptr)) { tipc_node_unlock(n_ptr); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 8ba79620db3..d504e490fd0 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -877,7 +877,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, u32 index) { char portIdStr[27]; - char *scopeStr; + const char *scope_str[] = {"", " zone", " cluster", " node"}; struct publication *publ = sseq->zone_list; tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); @@ -893,15 +893,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, tipc_node(publ->node), publ->ref); tipc_printf(buf, "%-26s ", portIdStr); if (depth > 3) { - if (publ->node != tipc_own_addr) - scopeStr = ""; - else if (publ->scope == TIPC_NODE_SCOPE) - scopeStr = "node"; - else if (publ->scope == TIPC_CLUSTER_SCOPE) - scopeStr = "cluster"; - else - scopeStr = "zone"; - tipc_printf(buf, "%-10u %s", publ->key, scopeStr); + tipc_printf(buf, "%-10u %s", publ->key, + scope_str[publ->scope]); } publ = publ->zone_list_next; @@ -951,24 +944,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, static void nametbl_header(struct print_buf *buf, u32 depth) { - tipc_printf(buf, "Type "); - - if (depth > 1) - tipc_printf(buf, "Lower Upper "); - if (depth > 2) - tipc_printf(buf, "Port Identity "); - if (depth > 3) - tipc_printf(buf, "Publication"); - - tipc_printf(buf, "\n-----------"); - - if (depth > 1) - tipc_printf(buf, "--------------------- "); - if (depth > 2) - tipc_printf(buf, "-------------------------- "); - if (depth > 3) - tipc_printf(buf, "------------------"); - + const char *header[] = { + "Type ", + "Lower Upper ", + "Port Identity ", + "Publication Scope" + }; + + int i; + + if (depth > 4) + depth = 4; + for (i = 0; i < depth; i++) + tipc_printf(buf, header[i]); tipc_printf(buf, "\n"); } diff --git a/net/tipc/node.c b/net/tipc/node.c index b634942caba..b702c7bf580 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -237,8 +237,7 @@ void tipc_node_link_down(struct tipc_node 
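name_table.c replaces an if/else ladder and a chain of conditional printfs with indexed string tables (scope_str, header). The same table-lookup idiom in miniature, with a bounds check added for standalone safety (the kernel code trusts publ->scope):

    #include <stdio.h>

    /* Index values mirror TIPC_ZONE/CLUSTER/NODE_SCOPE (1, 2, 3). */
    static const char *scope_str[] = { "", " zone", " cluster", " node" };

    static void print_scope(unsigned int scope)
    {
        if (scope < sizeof(scope_str) / sizeof(scope_str[0]))
            printf("%s\n", scope_str[scope]);
    }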
*n_ptr, struct link *l_ptr) int tipc_node_has_active_links(struct tipc_node *n_ptr) { - return (n_ptr && - ((n_ptr->active_links[0]) || (n_ptr->active_links[1]))); + return n_ptr->active_links[0] != NULL; } int tipc_node_has_redundant_links(struct tipc_node *n_ptr) @@ -384,6 +383,20 @@ static void node_established_contact(struct tipc_node *n_ptr) tipc_highest_allowed_slave); } +static void node_cleanup_finished(unsigned long node_addr) +{ + struct tipc_node *n_ptr; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(node_addr); + if (n_ptr) { + tipc_node_lock(n_ptr); + n_ptr->cleanup_required = 0; + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); +} + static void node_lost_contact(struct tipc_node *n_ptr) { struct cluster *c_ptr; @@ -458,6 +471,11 @@ static void node_lost_contact(struct tipc_node *n_ptr) tipc_k_signal((Handler)ns->handle_node_down, (unsigned long)ns->usr_handle); } + + /* Prevent re-contact with node until all cleanup is done */ + + n_ptr->cleanup_required = 1; + tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr); } /** diff --git a/net/tipc/node.h b/net/tipc/node.h index 6f990da5d14..45f3db3a595 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -52,6 +52,7 @@ * @active_links: pointers to active links to node * @links: pointers to all links to node * @working_links: number of working links to node (both active and standby) + * @cleanup_required: non-zero if cleaning up after a prior loss of contact * @link_cnt: number of links to node * @permit_changeover: non-zero if node has redundant links to this system * @routers: bitmap (used for multicluster communication) @@ -78,6 +79,7 @@ struct tipc_node { struct link *links[MAX_BEARERS]; int link_cnt; int working_links; + int cleanup_required; int permit_changeover; u32 routers[512/32]; int last_router; diff --git a/net/tipc/port.c b/net/tipc/port.c index 0737680e926..ebcbc21d8f9 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -588,19 +588,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf) if (!p_ptr) { err = TIPC_ERR_NO_PORT; } else if (p_ptr->publ.connected) { - if (port_peernode(p_ptr) != msg_orignode(msg)) + if ((port_peernode(p_ptr) != msg_orignode(msg)) || + (port_peerport(p_ptr) != msg_origport(msg))) { err = TIPC_ERR_NO_PORT; - if (port_peerport(p_ptr) != msg_origport(msg)) - err = TIPC_ERR_NO_PORT; - if (!err && msg_routed(msg)) { - u32 seqno = msg_transp_seqno(msg); - u32 myno = ++p_ptr->last_in_seqno; - if (seqno != myno) { - err = TIPC_ERR_NO_PORT; - abort_buf = port_build_self_abort_msg(p_ptr, err); - } - } - if (msg_type(msg) == CONN_ACK) { + } else if (msg_type(msg) == CONN_ACK) { int wakeup = tipc_port_congested(p_ptr) && p_ptr->publ.congested && p_ptr->wakeup; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 66e889ba48f..f7ac94de24f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -64,6 +64,7 @@ struct tipc_sock { struct sock sk; struct tipc_port *p; struct tipc_portid peer_name; + long conn_timeout; }; #define tipc_sk(sk) ((struct tipc_sock *)(sk)) @@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol, sock->state = state; sock_init_data(sock, sk); - sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); sk->sk_backlog_rcv = backlog_rcv; tipc_sk(sk)->p = tp_ptr; + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); spin_unlock_bh(tp_ptr->lock); @@ -429,36 +430,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, * to handle any preventable race conditions, so TIPC will do 
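node_lost_contact() now raises cleanup_required and hands clearing it to the deferred node_cleanup_finished() handler, so the discovery and receive paths refuse to talk to the neighbor until cleanup completes. A simplified sketch of that gate, with a single mutex standing in for the node and net locks:

    #include <pthread.h>

    struct node {
        pthread_mutex_t lock;
        int cleanup_required;       /* as in struct tipc_node */
    };

    static void lost_contact(struct node *n)
    {
        pthread_mutex_lock(&n->lock);
        n->cleanup_required = 1;    /* block re-contact... */
        pthread_mutex_unlock(&n->lock);
        /* ...until a deferred handler clears the flag */
    }

    static int may_talk_to(struct node *n)
    {
        int ok;

        pthread_mutex_lock(&n->lock);
        ok = !n->cleanup_required;
        pthread_mutex_unlock(&n->lock);
        return ok;
    }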
the same ... * * TIPC sets the returned events as follows: - * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty - * or if a connection-oriented socket is does not have an active connection - * (i.e. a read operation will not block). - * b) POLLOUT is set except when a socket's connection has been terminated - * (i.e. a write operation will not block). - * c) POLLHUP is set when a socket's connection has been terminated. - * - * IMPORTANT: The fact that a read or write operation will not block does NOT - * imply that the operation will succeed! + * + * socket state flags set + * ------------ --------- + * unconnected no read flags + * no write flags + * + * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue + * no write flags + * + * connected POLLIN/POLLRDNORM if data in rx queue + * POLLOUT if port is not congested + * + * disconnecting POLLIN/POLLRDNORM/POLLHUP + * no write flags + * + * listening POLLIN if SYN in rx queue + * no write flags + * + * ready POLLIN/POLLRDNORM if data in rx queue + * [connectionless] POLLOUT (since port cannot be congested) + * + * IMPORTANT: The fact that a read or write operation is indicated does NOT + * imply that the operation will succeed, merely that it should be performed + * and will not block. */ static unsigned int poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - u32 mask; + u32 mask = 0; poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sock->state == SS_UNCONNECTED) || - (sock->state == SS_DISCONNECTING)) - mask = (POLLRDNORM | POLLIN); - else - mask = 0; - - if (sock->state == SS_DISCONNECTING) - mask |= POLLHUP; - else - mask |= POLLOUT; + switch ((int)sock->state) { + case SS_READY: + case SS_CONNECTED: + if (!tipc_sk_port(sk)->congested) + mask |= POLLOUT; + /* fall thru' */ + case SS_CONNECTING: + case SS_LISTENING: + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= (POLLIN | POLLRDNORM); + break; + case SS_DISCONNECTING: + mask = (POLLIN | POLLRDNORM | POLLHUP); + break; + } return mask; } @@ -1026,9 +1046,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, struct sk_buff *buf; struct tipc_msg *msg; unsigned int sz; - int sz_to_copy; + int sz_to_copy, target, needed; int sz_copied = 0; - int needed; char __user *crs = m->msg_iov->iov_base; unsigned char *buf_crs; u32 err; @@ -1050,6 +1069,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, goto exit; } + target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); + restart: /* Look for a message in receive queue; wait if necessary */ @@ -1138,7 +1159,7 @@ restart: if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || - (flags & MSG_WAITALL)) && /* and more is ready or required */ + (sz_copied < target)) && /* and more is ready or required */ (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; @@ -1365,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, struct msghdr m = {NULL,}; struct sk_buff *buf; struct tipc_msg *msg; + long timeout; int res; lock_sock(sk); @@ -1379,7 +1401,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, /* For now, TIPC does not support the non-blocking form of connect() */ if (flags & O_NONBLOCK) { - res = -EWOULDBLOCK; + res = -EOPNOTSUPP; goto exit; } @@ -1425,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr 
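The rewritten TIPC poll() derives the event mask from the socket state, letting the writable states (SS_READY, SS_CONNECTED) fall through into the shared readable check. A userspace sketch of that switch shape; rx_nonempty and congested are stand-ins for the receive-queue and port-congestion tests:

    #include <poll.h>

    #ifndef POLLRDNORM
    #define POLLRDNORM 0x040        /* not exposed by all feature-test macros */
    #endif

    enum sock_state { SS_UNCONNECTED, SS_CONNECTING, SS_CONNECTED,
                      SS_DISCONNECTING, SS_LISTENING, SS_READY };

    static unsigned int poll_mask(enum sock_state state,
                                  int rx_nonempty, int congested)
    {
        unsigned int mask = 0;

        switch (state) {
        case SS_READY:
        case SS_CONNECTED:
            if (!congested)
                mask |= POLLOUT;
            /* fall through */
        case SS_CONNECTING:
        case SS_LISTENING:
            if (rx_nonempty)
                mask |= POLLIN | POLLRDNORM;
            break;
        case SS_DISCONNECTING:
            mask = POLLIN | POLLRDNORM | POLLHUP;
            break;
        default:
            break;
        }
        return mask;
    }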
*dest, int destlen, /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ + timeout = tipc_sk(sk)->conn_timeout; release_sock(sk); res = wait_event_interruptible_timeout(*sk_sleep(sk), (!skb_queue_empty(&sk->sk_receive_queue) || (sock->state != SS_CONNECTING)), - sk->sk_rcvtimeo); + timeout ? timeout : MAX_SCHEDULE_TIMEOUT); lock_sock(sk); if (res > 0) { @@ -1692,7 +1715,7 @@ static int setsockopt(struct socket *sock, res = tipc_set_portunreturnable(tport->ref, value); break; case TIPC_CONN_TIMEOUT: - sk->sk_rcvtimeo = msecs_to_jiffies(value); + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value); /* no need to set "res", since already 0 at this point */ break; default: @@ -1747,7 +1770,7 @@ static int getsockopt(struct socket *sock, res = tipc_portunreturnable(tport->ref, &value); break; case TIPC_CONN_TIMEOUT: - value = jiffies_to_msecs(sk->sk_rcvtimeo); + value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout); /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ba59983aaff..b14ed4b1f27 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2504,7 +2504,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, if (p->dir > XFRM_POLICY_OUT) return NULL; - xp = xfrm_policy_alloc(net, GFP_KERNEL); + xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL;