Diffstat (limited to 'net')
-rw-r--r--  net/802/psnap.c | 13
-rw-r--r--  net/802/tr.c | 4
-rw-r--r--  net/8021q/vlan.c | 2
-rw-r--r--  net/8021q/vlan_core.c | 12
-rw-r--r--  net/8021q/vlan_dev.c | 3
-rw-r--r--  net/9p/trans_fd.c | 2
-rw-r--r--  net/Kconfig | 15
-rw-r--r--  net/Makefile | 1
-rw-r--r--  net/appletalk/ddp.c | 6
-rw-r--r--  net/atm/clip.c | 9
-rw-r--r--  net/atm/lec.c | 2
-rw-r--r--  net/atm/mpc.c | 32
-rw-r--r--  net/atm/mpc.h | 5
-rw-r--r--  net/ax25/af_ax25.c | 19
-rw-r--r--  net/bluetooth/af_bluetooth.c | 17
-rw-r--r--  net/bluetooth/cmtp/core.c | 3
-rw-r--r--  net/bluetooth/hci_conn.c | 64
-rw-r--r--  net/bluetooth/hci_core.c | 3
-rw-r--r--  net/bluetooth/hci_event.c | 26
-rw-r--r--  net/bluetooth/l2cap.c | 602
-rw-r--r--  net/bluetooth/rfcomm/core.c | 179
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 189
-rw-r--r--  net/bluetooth/sco.c | 57
-rw-r--r--  net/bridge/br_netlink.c | 3
-rw-r--r--  net/can/af_can.c | 3
-rw-r--r--  net/core/Makefile | 3
-rw-r--r--  net/core/datagram.c | 2
-rw-r--r--  net/core/dev.c | 110
-rw-r--r--  net/core/drop_monitor.c | 263
-rw-r--r--  net/core/ethtool.c | 58
-rw-r--r--  net/core/fib_rules.c | 3
-rw-r--r--  net/core/neighbour.c | 15
-rw-r--r--  net/core/net-sysfs.c | 6
-rw-r--r--  net/core/net-traces.c | 29
-rw-r--r--  net/core/net_namespace.c | 89
-rw-r--r--  net/core/pktgen.c | 18
-rw-r--r--  net/core/rtnetlink.c | 9
-rw-r--r--  net/core/skbuff.c | 33
-rw-r--r--  net/core/sock.c | 9
-rw-r--r--  net/core/sysctl_net_core.c | 1
-rw-r--r--  net/dccp/ackvec.h | 3
-rw-r--r--  net/dccp/dccp.h | 5
-rw-r--r--  net/dccp/output.c | 37
-rw-r--r--  net/decnet/af_decnet.c | 23
-rw-r--r--  net/decnet/dn_dev.c | 6
-rw-r--r--  net/decnet/dn_route.c | 6
-rw-r--r--  net/decnet/dn_table.c | 3
-rw-r--r--  net/decnet/sysctl_net_decnet.c | 2
-rw-r--r--  net/dsa/Kconfig | 6
-rw-r--r--  net/dsa/dsa.c | 177
-rw-r--r--  net/dsa/dsa_priv.h | 97
-rw-r--r--  net/dsa/mv88e6060.c | 12
-rw-r--r--  net/dsa/mv88e6123_61_65.c | 92
-rw-r--r--  net/dsa/mv88e6131.c | 96
-rw-r--r--  net/dsa/slave.c | 34
-rw-r--r--  net/dsa/tag_dsa.c | 32
-rw-r--r--  net/dsa/tag_edsa.c | 32
-rw-r--r--  net/dsa/tag_trailer.c | 12
-rw-r--r--  net/econet/af_econet.c | 2
-rw-r--r--  net/ipv4/Kconfig | 52
-rw-r--r--  net/ipv4/af_inet.c | 2
-rw-r--r--  net/ipv4/arp.c | 11
-rw-r--r--  net/ipv4/cipso_ipv4.c | 9
-rw-r--r--  net/ipv4/devinet.c | 3
-rw-r--r--  net/ipv4/fib_frontend.c | 2
-rw-r--r--  net/ipv4/fib_semantics.c | 5
-rw-r--r--  net/ipv4/icmp.c | 2
-rw-r--r--  net/ipv4/inet_fragment.c | 1
-rw-r--r--  net/ipv4/ip_fragment.c | 3
-rw-r--r--  net/ipv4/ip_gre.c | 5
-rw-r--r--  net/ipv4/ipip.c | 7
-rw-r--r--  net/ipv4/tcp.c | 62
-rw-r--r--  net/ipv4/tcp_bic.c | 11
-rw-r--r--  net/ipv4/tcp_cong.c | 21
-rw-r--r--  net/ipv4/tcp_cubic.c | 11
-rw-r--r--  net/ipv4/tcp_htcp.c | 3
-rw-r--r--  net/ipv4/tcp_input.c | 207
-rw-r--r--  net/ipv4/tcp_ipv4.c | 11
-rw-r--r--  net/ipv4/tcp_minisocks.c | 9
-rw-r--r--  net/ipv4/tcp_output.c | 95
-rw-r--r--  net/ipv4/tcp_probe.c | 5
-rw-r--r--  net/ipv4/tcp_scalable.c | 12
-rw-r--r--  net/ipv4/tcp_timer.c | 23
-rw-r--r--  net/ipv4/tcp_veno.c | 7
-rw-r--r--  net/ipv4/tcp_yeah.c | 9
-rw-r--r--  net/ipv4/udp.c | 2
-rw-r--r--  net/ipv6/addrconf.c | 90
-rw-r--r--  net/ipv6/af_inet6.c | 26
-rw-r--r--  net/ipv6/inet6_hashtables.c | 4
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 3
-rw-r--r--  net/ipv6/ndisc.c | 6
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 5
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_reasm.c | 8
-rw-r--r--  net/ipv6/reassembly.c | 7
-rw-r--r--  net/ipv6/route.c | 5
-rw-r--r--  net/ipv6/sit.c | 7
-rw-r--r--  net/ipv6/tcp_ipv6.c | 6
-rw-r--r--  net/ipv6/xfrm6_state.c | 2
-rw-r--r--  net/ipx/af_ipx.c | 16
-rw-r--r--  net/irda/irda_device.c | 5
-rw-r--r--  net/irda/irlan/irlan_eth.c | 19
-rw-r--r--  net/irda/irmod.c | 2
-rw-r--r--  net/iucv/af_iucv.c | 3
-rw-r--r--  net/key/af_key.c | 6
-rw-r--r--  net/llc/af_llc.c | 6
-rw-r--r--  net/llc/llc_conn.c | 3
-rw-r--r--  net/llc/llc_core.c | 4
-rw-r--r--  net/mac80211/Makefile | 1
-rw-r--r--  net/mac80211/agg-rx.c | 6
-rw-r--r--  net/mac80211/agg-tx.c | 191
-rw-r--r--  net/mac80211/cfg.c | 52
-rw-r--r--  net/mac80211/debugfs_netdev.c | 48
-rw-r--r--  net/mac80211/ht.c | 19
-rw-r--r--  net/mac80211/ibss.c | 907
-rw-r--r--  net/mac80211/ieee80211_i.h | 141
-rw-r--r--  net/mac80211/iface.c | 91
-rw-r--r--  net/mac80211/key.c | 2
-rw-r--r--  net/mac80211/main.c | 24
-rw-r--r--  net/mac80211/mlme.c | 1709
-rw-r--r--  net/mac80211/rate.h | 12
-rw-r--r--  net/mac80211/rx.c | 37
-rw-r--r--  net/mac80211/scan.c | 66
-rw-r--r--  net/mac80211/spectmgmt.c | 26
-rw-r--r--  net/mac80211/sta_info.c | 15
-rw-r--r--  net/mac80211/sta_info.h | 5
-rw-r--r--  net/mac80211/tx.c | 31
-rw-r--r--  net/mac80211/util.c | 254
-rw-r--r--  net/mac80211/wext.c | 290
-rw-r--r--  net/mac80211/wme.c | 170
-rw-r--r--  net/mac80211/wme.h | 6
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 4
-rw-r--r--  net/netfilter/nfnetlink_log.c | 8
-rw-r--r--  net/netfilter/x_tables.c | 199
-rw-r--r--  net/netfilter/xt_recent.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 55
-rw-r--r--  net/netrom/af_netrom.c | 11
-rw-r--r--  net/packet/af_packet.c | 5
-rw-r--r--  net/phonet/af_phonet.c | 3
-rw-r--r--  net/phonet/pn_netlink.c | 5
-rw-r--r--  net/rds/Kconfig | 14
-rw-r--r--  net/rds/Makefile | 14
-rw-r--r--  net/rds/af_rds.c | 586
-rw-r--r--  net/rds/bind.c | 199
-rw-r--r--  net/rds/cong.c | 404
-rw-r--r--  net/rds/connection.c | 487
-rw-r--r--  net/rds/ib.c | 323
-rw-r--r--  net/rds/ib.h | 367
-rw-r--r--  net/rds/ib_cm.c | 726
-rw-r--r--  net/rds/ib_rdma.c | 641
-rw-r--r--  net/rds/ib_recv.c | 869
-rw-r--r--  net/rds/ib_ring.c | 168
-rw-r--r--  net/rds/ib_send.c | 874
-rw-r--r--  net/rds/ib_stats.c | 95
-rw-r--r--  net/rds/ib_sysctl.c | 137
-rw-r--r--  net/rds/info.c | 241
-rw-r--r--  net/rds/info.h | 30
-rw-r--r--  net/rds/iw.c | 333
-rw-r--r--  net/rds/iw.h | 395
-rw-r--r--  net/rds/iw_cm.c | 750
-rw-r--r--  net/rds/iw_rdma.c | 888
-rw-r--r--  net/rds/iw_recv.c | 869
-rw-r--r--  net/rds/iw_ring.c | 169
-rw-r--r--  net/rds/iw_send.c | 975
-rw-r--r--  net/rds/iw_stats.c | 95
-rw-r--r--  net/rds/iw_sysctl.c | 137
-rw-r--r--  net/rds/loop.c | 188
-rw-r--r--  net/rds/loop.h | 9
-rw-r--r--  net/rds/message.c | 402
-rw-r--r--  net/rds/page.c | 221
-rw-r--r--  net/rds/rdma.c | 679
-rw-r--r--  net/rds/rdma.h | 84
-rw-r--r--  net/rds/rdma_transport.c | 214
-rw-r--r--  net/rds/rdma_transport.h | 28
-rw-r--r--  net/rds/rds.h | 686
-rw-r--r--  net/rds/recv.c | 542
-rw-r--r--  net/rds/send.c | 1003
-rw-r--r--  net/rds/stats.c | 148
-rw-r--r--  net/rds/sysctl.c | 122
-rw-r--r--  net/rds/threads.c | 265
-rw-r--r--  net/rds/transport.c | 117
-rw-r--r--  net/sched/act_police.c | 13
-rw-r--r--  net/sched/sch_cbq.c | 7
-rw-r--r--  net/sched/sch_drr.c | 13
-rw-r--r--  net/sched/sch_hfsc.c | 7
-rw-r--r--  net/sched/sch_htb.c | 7
-rw-r--r--  net/sched/sch_tbf.c | 9
-rw-r--r--  net/sctp/debug.c | 4
-rw-r--r--  net/sctp/endpointola.c | 3
-rw-r--r--  net/sctp/output.c | 5
-rw-r--r--  net/sctp/outqueue.c | 6
-rw-r--r--  net/sctp/protocol.c | 16
-rw-r--r--  net/sctp/sm_make_chunk.c | 33
-rw-r--r--  net/sctp/sm_sideeffect.c | 86
-rw-r--r--  net/sctp/sm_statefuns.c | 22
-rw-r--r--  net/sctp/socket.c | 161
-rw-r--r--  net/sctp/transport.c | 7
-rw-r--r--  net/tipc/bcast.c | 4
-rw-r--r--  net/tipc/bcast.h | 2
-rw-r--r--  net/tipc/dbg.c | 2
-rw-r--r--  net/tipc/node.c | 2
-rw-r--r--  net/unix/af_unix.c | 3
-rw-r--r--  net/wanrouter/wanmain.c | 8
-rw-r--r--  net/wanrouter/wanproc.c | 2
-rw-r--r--  net/wireless/Kconfig | 10
-rw-r--r--  net/wireless/core.c | 116
-rw-r--r--  net/wireless/core.h | 39
-rw-r--r--  net/wireless/lib80211_crypt_ccmp.c | 2
-rw-r--r--  net/wireless/lib80211_crypt_tkip.c | 4
-rw-r--r--  net/wireless/nl80211.c | 148
-rw-r--r--  net/wireless/nl80211.h | 9
-rw-r--r--  net/wireless/reg.c | 1034
-rw-r--r--  net/wireless/reg.h | 36
-rw-r--r--  net/wireless/scan.c | 64
-rw-r--r--  net/wireless/sysfs.c | 9
-rw-r--r--  net/wireless/wext-compat.c | 97
-rw-r--r--  net/x25/af_x25.c | 13
-rw-r--r--  net/xfrm/xfrm_state.c | 90
219 files changed, 21713 insertions(+), 3691 deletions(-)
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 6ed711748f2..6fea0750662 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -29,7 +29,7 @@ static struct llc_sap *snap_sap;
/*
* Find a snap client by matching the 5 bytes.
*/
-static struct datalink_proto *find_snap_client(unsigned char *desc)
+static struct datalink_proto *find_snap_client(const unsigned char *desc)
{
struct datalink_proto *proto = NULL, *p;
@@ -95,15 +95,16 @@ static int snap_request(struct datalink_proto *dl,
EXPORT_SYMBOL(register_snap_client);
EXPORT_SYMBOL(unregister_snap_client);
-static char snap_err_msg[] __initdata =
+static const char snap_err_msg[] __initconst =
KERN_CRIT "SNAP - unable to register with 802.2\n";
static int __init snap_init(void)
{
snap_sap = llc_sap_open(0xAA, snap_rcv);
-
- if (!snap_sap)
+ if (!snap_sap) {
printk(snap_err_msg);
+ return -EBUSY;
+ }
return 0;
}
@@ -121,7 +122,7 @@ module_exit(snap_exit);
/*
* Register SNAP clients. We don't yet use this for IP.
*/
-struct datalink_proto *register_snap_client(unsigned char *desc,
+struct datalink_proto *register_snap_client(const unsigned char *desc,
int (*rcvfunc)(struct sk_buff *,
struct net_device *,
struct packet_type *,
@@ -136,7 +137,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc,
proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
if (proto) {
- memcpy(proto->type, desc,5);
+ memcpy(proto->type, desc, 5);
proto->rcvfunc = rcvfunc;
proto->header_length = 5 + 3; /* snap + 802.2 */
proto->request = snap_request;
diff --git a/net/802/tr.c b/net/802/tr.c
index 158150fee46..e7eb13084d7 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -486,6 +486,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
}
static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&rif_lock)
{
spin_lock_irq(&rif_lock);
@@ -517,6 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void rif_seq_stop(struct seq_file *seq, void *v)
+ __releases(&rif_lock)
{
spin_unlock_irq(&rif_lock);
}
@@ -668,3 +670,5 @@ module_init(rif_init);
EXPORT_SYMBOL(tr_type_trans);
EXPORT_SYMBOL(alloc_trdev);
+
+MODULE_LICENSE("GPL");
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 4163ea65bf4..2b7390e377b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -51,7 +51,7 @@ const char vlan_version[] = DRV_VERSION;
static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>";
-static struct packet_type vlan_packet_type = {
+static struct packet_type vlan_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_8021Q),
.func = vlan_skb_recv, /* VLAN receive method */
};
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 70435af153f..654e45f5719 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -1,12 +1,16 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
#include "vlan.h"
/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
u16 vlan_tci, int polling)
{
+ if (netpoll_rx(skb))
+ return NET_RX_DROP;
+
if (skb_bond_should_drop(skb))
goto drop;
@@ -94,12 +98,15 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
return dev_gro_receive(napi, skb);
drop:
- return 2;
+ return GRO_DROP;
}
int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb)
{
+ if (netpoll_rx_on(skb))
+ return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+
skb_gro_reset_offset(skb);
return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
@@ -114,6 +121,9 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
if (!skb)
return NET_RX_DROP;
+ if (netpoll_rx_on(skb))
+ return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+
return napi_frags_finish(napi, skb,
vlan_gro_common(napi, grp, vlan_tci, skb));
}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4a19acd3a32..1b34135cf99 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
int err = 0;
if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
- err = ops->ndo_neigh_setup(dev, pa);
+ err = ops->ndo_neigh_setup(real_dev, pa);
return err;
}
@@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
dev->netdev_ops = &vlan_netdev_ops;
}
+ netdev_resync_ops(dev);
if (is_vlan_dev(real_dev))
subclass = 1;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 1df0356f242..c613ed08a5e 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -417,7 +417,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len)
oldfs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos);
+ ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
set_fs(oldfs);
if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
diff --git a/net/Kconfig b/net/Kconfig
index a12bae0e3fe..93998a9c39c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -24,9 +24,6 @@ if NET
menu "Networking options"
-config COMPAT_NET_DEV_OPS
- def_bool y
-
source "net/packet/Kconfig"
source "net/unix/Kconfig"
source "net/xfrm/Kconfig"
@@ -171,6 +168,7 @@ endif
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
+source "net/rds/Kconfig"
source "net/tipc/Kconfig"
source "net/atm/Kconfig"
source "net/802/Kconfig"
@@ -221,6 +219,17 @@ config NET_TCPPROBE
To compile this code as a module, choose M here: the
module will be called tcp_probe.
+config NET_DROP_MONITOR
+ boolean "Network packet drop alerting service"
+ depends on INET && EXPERIMENTAL && TRACEPOINTS
+ ---help---
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. If you don't need network drop alerts, or if you are ok
+ just checking the various proc files and other utilities for
+ drop statistics, say N here.
+
endmenu
endmenu
diff --git a/net/Makefile b/net/Makefile
index 0fcce89d716..9e00a55a901 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -49,6 +49,7 @@ obj-y += 8021q/
endif
obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
+obj-$(CONFIG_RDS) += rds/
obj-y += wireless/
obj-$(CONFIG_MAC80211) += mac80211/
obj-$(CONFIG_TIPC) += tipc/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 510a6782da8..3e0671df3a3 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1860,12 +1860,12 @@ static struct notifier_block ddp_notifier = {
.notifier_call = ddp_device_event,
};
-static struct packet_type ltalk_packet_type = {
+static struct packet_type ltalk_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_LOCALTALK),
.func = ltalk_rcv,
};
-static struct packet_type ppptalk_packet_type = {
+static struct packet_type ppptalk_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_PPPTALK),
.func = atalk_rcv,
};
@@ -1877,7 +1877,7 @@ EXPORT_SYMBOL(aarp_send_ddp);
EXPORT_SYMBOL(atrtr_get_dev);
EXPORT_SYMBOL(atalk_find_dev_addr);
-static char atalk_err_snap[] __initdata =
+static const char atalk_err_snap[] __initconst =
KERN_CRIT "Unable to register DDP with SNAP.\n";
/* Called by proto.c on kernel start up */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index da42fd06b61..3dc0a3a42a5 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -552,10 +552,13 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
return error;
}
+static const struct net_device_ops clip_netdev_ops = {
+ .ndo_start_xmit = clip_start_xmit,
+};
+
static void clip_setup(struct net_device *dev)
{
- dev->hard_start_xmit = clip_start_xmit;
- /* sg_xmit ... */
+ dev->netdev_ops = &clip_netdev_ops;
dev->type = ARPHRD_ATM;
dev->hard_header_len = RFC1483LLC_LEN;
dev->mtu = RFC1626_MTU;
@@ -615,7 +618,7 @@ static int clip_device_event(struct notifier_block *this, unsigned long event,
}
/* ignore non-CLIP devices */
- if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit)
+ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
return NOTIFY_DONE;
switch (event) {
diff --git a/net/atm/lec.c b/net/atm/lec.c
index c0cba9a037e..199b6bb79f4 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -502,7 +502,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
priv->lane2_ops = NULL;
if (priv->lane_version > 1)
priv->lane2_ops = &lane2_ops;
- if (dev->change_mtu(dev, mesg->content.config.mtu))
+ if (dev_set_mtu(dev, mesg->content.config.mtu))
printk("%s: change_mtu to %d failed\n", dev->name,
mesg->content.config.mtu);
priv->is_proxy = mesg->content.config.is_proxy;
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 039d5cc72c3..e5bf11453a1 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -286,33 +286,32 @@ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
{
dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name);
- if (dev->hard_start_xmit == NULL) {
- printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n",
- dev->name);
- return;
+ if (!dev->netdev_ops)
+ printk("mpoa: (%s) start_mpc not starting\n", dev->name);
+ else {
+ mpc->old_ops = dev->netdev_ops;
+ mpc->new_ops = *mpc->old_ops;
+ mpc->new_ops.ndo_start_xmit = mpc_send_packet;
+ dev->netdev_ops = &mpc->new_ops;
}
- mpc->old_hard_start_xmit = dev->hard_start_xmit;
- dev->hard_start_xmit = mpc_send_packet;
-
- return;
}
static void stop_mpc(struct mpoa_client *mpc)
{
-
+ struct net_device *dev = mpc->dev;
dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name);
/* Lets not nullify lec device's dev->hard_start_xmit */
- if (mpc->dev->hard_start_xmit != mpc_send_packet) {
+ if (dev->netdev_ops != &mpc->new_ops) {
dprintk(" mpc already stopped, not fatal\n");
return;
}
dprintk("\n");
- mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit;
- mpc->old_hard_start_xmit = NULL;
- /* close_shortcuts(mpc); ??? FIXME */
- return;
+ dev->netdev_ops = mpc->old_ops;
+ mpc->old_ops = NULL;
+
+ /* close_shortcuts(mpc); ??? FIXME */
}
static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
@@ -531,7 +530,6 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
*/
static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
{
- int retval;
struct mpoa_client *mpc;
struct ethhdr *eth;
int i = 0;
@@ -561,9 +559,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
}
non_ip:
- retval = mpc->old_hard_start_xmit(skb,dev);
-
- return retval;
+ return mpc->old_ops->ndo_start_xmit(skb,dev);
}
static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
index 24c386c35f5..0919a88bbc7 100644
--- a/net/atm/mpc.h
+++ b/net/atm/mpc.h
@@ -15,7 +15,7 @@ struct mpoa_client {
struct mpoa_client *next;
struct net_device *dev; /* lec in question */
int dev_num; /* e.g. 2 for lec2 */
- int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev);
+
struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
@@ -31,6 +31,9 @@ struct mpoa_client {
uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
int number_of_mps_macs; /* number of the above MAC addresses */
struct mpc_parameters parameters; /* parameters for this client */
+
+ const struct net_device_ops *old_ops;
+ struct net_device_ops new_ops;
};
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d127fd3ba5c..7da5ebb84e9 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1435,6 +1435,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
size_t size;
int lv, err, addr_len = msg->msg_namelen;
+ /* AX.25 empty data frame has no meaning : don't send */
+ if (len == 0) {
+ return (0);
+ }
+
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
return -EINVAL;
@@ -1529,10 +1534,8 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
dp = ax25->digipeat;
}
- SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n");
-
/* Build a packet */
- SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n");
+ SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. Building packet.\n");
/* Assume the worst case */
size = len + ax25->ax25_dev->dev->hard_header_len;
@@ -1636,6 +1639,13 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
skb_reset_transport_header(skb);
copied = skb->len;
+ /* AX.25 empty data frame has no meaning : ignore it */
+ if (copied == 0) {
+ err = copied;
+ skb_free_datagram(sk, skb);
+ goto out;
+ }
+
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
@@ -1985,9 +1995,8 @@ static const struct proto_ops ax25_proto_ops = {
/*
* Called by socket.c on kernel start up
*/
-static struct packet_type ax25_packet_type = {
+static struct packet_type ax25_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_AX25),
- .dev = NULL, /* All devices */
.func = ax25_kiss_rcv,
};
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 744ed3f07ef..02b9baa1930 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -41,14 +41,13 @@
#include <net/bluetooth/bluetooth.h>
-#define VERSION "2.14"
+#define VERSION "2.15"
/* Bluetooth sockets */
#define BT_MAX_PROTO 8
static struct net_proto_family *bt_proto[BT_MAX_PROTO];
static DEFINE_RWLOCK(bt_proto_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
static const char *bt_key_strings[BT_MAX_PROTO] = {
"sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
@@ -86,11 +85,6 @@ static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
bt_slock_key_strings[proto], &bt_slock_key[proto],
bt_key_strings[proto], &bt_lock_key[proto]);
}
-#else
-static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
-{
-}
-#endif
int bt_sock_register(int proto, struct net_proto_family *ops)
{
@@ -217,7 +211,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
continue;
}
- if (sk->sk_state == BT_CONNECTED || !newsock) {
+ if (sk->sk_state == BT_CONNECTED || !newsock ||
+ bt_sk(parent)->defer_setup) {
bt_accept_unlink(sk);
if (newsock)
sock_graft(sk, newsock);
@@ -232,7 +227,7 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
EXPORT_SYMBOL(bt_accept_dequeue);
int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
+ struct msghdr *msg, size_t len, int flags)
{
int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
@@ -277,7 +272,9 @@ static inline unsigned int bt_accept_poll(struct sock *parent)
list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
- if (sk->sk_state == BT_CONNECTED)
+ if (sk->sk_state == BT_CONNECTED ||
+ (bt_sk(parent)->defer_setup &&
+ sk->sk_state == BT_CONNECT2))
return POLLIN | POLLRDNORM;
}
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index c9cac7719ef..0073ec8495d 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -126,8 +126,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const
session->reassembly[id] = nskb;
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
}
static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index a4a789f24c8..1181db08d9d 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -123,6 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle)
conn->state = BT_CONNECT;
conn->out = 1;
+ conn->attempt++;
+
cp.handle = cpu_to_le16(handle);
cp.pkt_type = cpu_to_le16(conn->pkt_type);
@@ -139,6 +141,8 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
conn->state = BT_CONNECT;
conn->out = 1;
+ conn->attempt++;
+
cp.handle = cpu_to_le16(handle);
cp.pkt_type = cpu_to_le16(conn->pkt_type);
@@ -155,6 +159,7 @@ static void hci_conn_timeout(unsigned long arg)
{
struct hci_conn *conn = (void *) arg;
struct hci_dev *hdev = conn->hdev;
+ __u8 reason;
BT_DBG("conn %p state %d", conn, conn->state);
@@ -173,7 +178,8 @@ static void hci_conn_timeout(unsigned long arg)
break;
case BT_CONFIG:
case BT_CONNECTED:
- hci_acl_disconn(conn, 0x13);
+ reason = hci_proto_disconn_ind(conn);
+ hci_acl_disconn(conn, reason);
break;
default:
conn->state = BT_CLOSED;
@@ -216,12 +222,13 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst)
break;
case SCO_LINK:
if (lmp_esco_capable(hdev))
- conn->pkt_type = hdev->esco_type & SCO_ESCO_MASK;
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
else
conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
break;
case ESCO_LINK:
- conn->pkt_type = hdev->esco_type;
+ conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
break;
}
@@ -280,6 +287,8 @@ int hci_conn_del(struct hci_conn *conn)
skb_queue_purge(&conn->data_q);
+ hci_conn_del_sysfs(conn);
+
return 0;
}
@@ -325,7 +334,7 @@ EXPORT_SYMBOL(hci_get_route);
/* Create SCO or ACL connection.
* Device _must_ be locked */
-struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type)
+struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 sec_level, __u8 auth_type)
{
struct hci_conn *acl;
struct hci_conn *sco;
@@ -340,6 +349,7 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8
hci_conn_hold(acl);
if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+ acl->sec_level = sec_level;
acl->auth_type = auth_type;
hci_acl_connect(acl);
}
@@ -385,51 +395,59 @@ int hci_conn_check_link_mode(struct hci_conn *conn)
EXPORT_SYMBOL(hci_conn_check_link_mode);
/* Authenticate remote device */
-int hci_conn_auth(struct hci_conn *conn)
+static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
BT_DBG("conn %p", conn);
- if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) {
- if (!(conn->auth_type & 0x01)) {
- conn->auth_type |= 0x01;
- conn->link_mode &= ~HCI_LM_AUTH;
- }
- }
-
- if (conn->link_mode & HCI_LM_AUTH)
+ if (sec_level > conn->sec_level)
+ conn->sec_level = sec_level;
+ else if (conn->link_mode & HCI_LM_AUTH)
return 1;
+ conn->auth_type = auth_type;
+
if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
struct hci_cp_auth_requested cp;
cp.handle = cpu_to_le16(conn->handle);
hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
sizeof(cp), &cp);
}
+
return 0;
}
-EXPORT_SYMBOL(hci_conn_auth);
-/* Enable encryption */
-int hci_conn_encrypt(struct hci_conn *conn)
+/* Enable security */
+int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
BT_DBG("conn %p", conn);
+ if (sec_level == BT_SECURITY_SDP)
+ return 1;
+
+ if (sec_level == BT_SECURITY_LOW) {
+ if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0)
+ return hci_conn_auth(conn, sec_level, auth_type);
+ else
+ return 1;
+ }
+
if (conn->link_mode & HCI_LM_ENCRYPT)
- return hci_conn_auth(conn);
+ return hci_conn_auth(conn, sec_level, auth_type);
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
return 0;
- if (hci_conn_auth(conn)) {
+ if (hci_conn_auth(conn, sec_level, auth_type)) {
struct hci_cp_set_conn_encrypt cp;
cp.handle = cpu_to_le16(conn->handle);
cp.encrypt = 1;
hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT,
sizeof(cp), &cp);
}
+
return 0;
}
-EXPORT_SYMBOL(hci_conn_encrypt);
+EXPORT_SYMBOL(hci_conn_security);
/* Change link key */
int hci_conn_change_link_key(struct hci_conn *conn)
@@ -442,12 +460,13 @@ int hci_conn_change_link_key(struct hci_conn *conn)
hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY,
sizeof(cp), &cp);
}
+
return 0;
}
EXPORT_SYMBOL(hci_conn_change_link_key);
/* Switch role */
-int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
+int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
{
BT_DBG("conn %p", conn);
@@ -460,6 +479,7 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
cp.role = role;
hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
}
+
return 0;
}
EXPORT_SYMBOL(hci_conn_switch_role);
@@ -542,9 +562,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev)
c->state = BT_CLOSED;
- hci_conn_del_sysfs(c);
-
- hci_proto_disconn_ind(c, 0x16);
+ hci_proto_disconn_cfm(c, 0x16);
hci_conn_del(c);
}
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ba78cc1eb8d..cd061510b6b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1565,8 +1565,7 @@ static void hci_cmd_task(unsigned long arg)
/* Send queued commands */
if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) {
- if (hdev->sent_cmd)
- kfree_skb(hdev->sent_cmd);
+ kfree_skb(hdev->sent_cmd);
if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) {
atomic_dec(&hdev->cmd_cnt);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f91ba690f5d..55534244c3a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,6 +484,15 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb
if (hdev->features[4] & LMP_EV5)
hdev->esco_type |= (ESCO_EV5);
+ if (hdev->features[5] & LMP_EDR_ESCO_2M)
+ hdev->esco_type |= (ESCO_2EV3);
+
+ if (hdev->features[5] & LMP_EDR_ESCO_3M)
+ hdev->esco_type |= (ESCO_3EV3);
+
+ if (hdev->features[5] & LMP_EDR_3S_ESCO)
+ hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+
BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name,
hdev->features[0], hdev->features[1],
hdev->features[2], hdev->features[3],
@@ -914,7 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s
if (ev->status) {
hci_proto_connect_cfm(conn, ev->status);
hci_conn_del(conn);
- }
+ } else if (ev->link_type != ACL_LINK)
+ hci_proto_connect_cfm(conn, ev->status);
unlock:
hci_dev_unlock(hdev);
@@ -1009,9 +1019,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff
if (conn) {
conn->state = BT_CLOSED;
- hci_conn_del_sysfs(conn);
-
- hci_proto_disconn_ind(conn, ev->reason);
+ hci_proto_disconn_cfm(conn, ev->reason);
hci_conn_del(conn);
}
@@ -1600,7 +1608,8 @@ static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b
if (conn->state == BT_CONFIG) {
if (!ev->status && hdev->ssp_mode > 0 &&
- conn->ssp_mode > 0 && conn->out) {
+ conn->ssp_mode > 0 && conn->out &&
+ conn->sec_level != BT_SECURITY_SDP) {
struct hci_cp_auth_requested cp;
cp.handle = ev->handle;
hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED,
@@ -1637,6 +1646,13 @@ static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_bu
conn->type = SCO_LINK;
}
+ if (conn->out && ev->status == 0x1c && conn->attempt < 2) {
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ hci_setup_sync(conn, conn->link->handle);
+ goto unlock;
+ }
+
if (!ev->status) {
conn->handle = __le16_to_cpu(ev->handle);
conn->state = BT_CONNECTED;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index b93748e224f..ca4d3b40d5c 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -50,9 +50,10 @@
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
-#define VERSION "2.11"
+#define VERSION "2.13"
-static u32 l2cap_feat_mask = 0x0000;
+static u32 l2cap_feat_mask = 0x0080;
+static u8 l2cap_fixed_chan[8] = { 0x02, };
static const struct proto_ops l2cap_sock_ops;
@@ -77,9 +78,10 @@ static void l2cap_sock_timeout(unsigned long arg)
bh_lock_sock(sk);
- if (sk->sk_state == BT_CONNECT &&
- (l2cap_pi(sk)->link_mode & (L2CAP_LM_AUTH |
- L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)))
+ if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONFIG)
+ reason = ECONNREFUSED;
+ else if (sk->sk_state == BT_CONNECT &&
+ l2cap_pi(sk)->sec_level != BT_SECURITY_SDP)
reason = ECONNREFUSED;
else
reason = ETIMEDOUT;
@@ -204,6 +206,8 @@ static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct so
BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid);
+ conn->disc_reason = 0x13;
+
l2cap_pi(sk)->conn = conn;
if (sk->sk_type == SOCK_SEQPACKET) {
@@ -259,18 +263,35 @@ static void l2cap_chan_del(struct sock *sk, int err)
}
/* Service level security */
-static inline int l2cap_check_link_mode(struct sock *sk)
+static inline int l2cap_check_security(struct sock *sk)
{
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
+ __u8 auth_type;
- if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) ||
- (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE))
- return hci_conn_encrypt(conn->hcon);
+ if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
+ auth_type = HCI_AT_NO_BONDING_MITM;
+ else
+ auth_type = HCI_AT_NO_BONDING;
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH)
- return hci_conn_auth(conn->hcon);
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
+ } else {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+ }
- return 1;
+ return hci_conn_security(conn->hcon, l2cap_pi(sk)->sec_level,
+ auth_type);
}
static inline u8 l2cap_get_ident(struct l2cap_conn *conn)
@@ -312,7 +333,10 @@ static void l2cap_do_start(struct sock *sk)
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) {
- if (l2cap_check_link_mode(sk)) {
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
+ return;
+
+ if (l2cap_check_security(sk)) {
struct l2cap_conn_req req;
req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
req.psm = l2cap_pi(sk)->psm;
@@ -356,7 +380,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
}
if (sk->sk_state == BT_CONNECT) {
- if (l2cap_check_link_mode(sk)) {
+ if (l2cap_check_security(sk)) {
struct l2cap_conn_req req;
req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
req.psm = l2cap_pi(sk)->psm;
@@ -371,10 +395,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
- if (l2cap_check_link_mode(sk)) {
- sk->sk_state = BT_CONFIG;
- rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ if (l2cap_check_security(sk)) {
+ if (bt_sk(sk)->defer_setup) {
+ struct sock *parent = bt_sk(sk)->parent;
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
+ parent->sk_data_ready(parent, 0);
+
+ } else {
+ sk->sk_state = BT_CONFIG;
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ }
} else {
rsp.result = cpu_to_le16(L2CAP_CR_PEND);
rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
@@ -426,7 +458,7 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
read_lock(&l->lock);
for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE)
+ if (l2cap_pi(sk)->force_reliable)
sk->sk_err = err;
}
@@ -437,6 +469,7 @@ static void l2cap_info_timeout(unsigned long arg)
{
struct l2cap_conn *conn = (void *) arg;
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
conn->info_ident = 0;
l2cap_conn_start(conn);
@@ -470,6 +503,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status)
spin_lock_init(&conn->lock);
rwlock_init(&conn->chan_list.lock);
+ conn->disc_reason = 0x13;
+
return conn;
}
@@ -483,8 +518,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
- if (conn->rx_skb)
- kfree_skb(conn->rx_skb);
+ kfree_skb(conn->rx_skb);
/* Kill channels */
while ((sk = conn->chan_list.head)) {
@@ -608,7 +642,6 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
case BT_CONNECTED:
case BT_CONFIG:
- case BT_CONNECT2:
if (sk->sk_type == SOCK_SEQPACKET) {
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
struct l2cap_disconn_req req;
@@ -624,6 +657,27 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
l2cap_chan_del(sk, reason);
break;
+ case BT_CONNECT2:
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ struct l2cap_conn *conn = l2cap_pi(sk)->conn;
+ struct l2cap_conn_rsp rsp;
+ __u16 result;
+
+ if (bt_sk(sk)->defer_setup)
+ result = L2CAP_CR_SEC_BLOCK;
+ else
+ result = L2CAP_CR_BAD_PSM;
+
+ rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
+ rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
+ L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+ } else
+ l2cap_chan_del(sk, reason);
+ break;
+
case BT_CONNECT:
case BT_DISCONN:
l2cap_chan_del(sk, reason);
@@ -653,13 +707,19 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
if (parent) {
sk->sk_type = parent->sk_type;
+ bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup;
+
pi->imtu = l2cap_pi(parent)->imtu;
pi->omtu = l2cap_pi(parent)->omtu;
- pi->link_mode = l2cap_pi(parent)->link_mode;
+ pi->sec_level = l2cap_pi(parent)->sec_level;
+ pi->role_switch = l2cap_pi(parent)->role_switch;
+ pi->force_reliable = l2cap_pi(parent)->force_reliable;
} else {
pi->imtu = L2CAP_DEFAULT_MTU;
pi->omtu = 0;
- pi->link_mode = 0;
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
+ pi->force_reliable = 0;
}
/* Default config options */
@@ -723,17 +783,24 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol)
return 0;
}
-static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
struct sock *sk = sock->sk;
- int err = 0;
+ struct sockaddr_l2 la;
+ int len, err = 0;
- BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm);
+ BT_DBG("sk %p", sk);
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid)
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -741,7 +808,7 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
goto done;
}
- if (la->l2_psm && btohs(la->l2_psm) < 0x1001 &&
+ if (la.l2_psm && btohs(la.l2_psm) < 0x1001 &&
!capable(CAP_NET_BIND_SERVICE)) {
err = -EACCES;
goto done;
@@ -749,14 +816,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
write_lock_bh(&l2cap_sk_list.lock);
- if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) {
+ if (la.l2_psm && __l2cap_get_sock_by_addr(la.l2_psm, &la.l2_bdaddr)) {
err = -EADDRINUSE;
} else {
/* Save source address */
- bacpy(&bt_sk(sk)->src, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
- l2cap_pi(sk)->sport = la->l2_psm;
+ bacpy(&bt_sk(sk)->src, &la.l2_bdaddr);
+ l2cap_pi(sk)->psm = la.l2_psm;
+ l2cap_pi(sk)->sport = la.l2_psm;
sk->sk_state = BT_BOUND;
+
+ if (btohs(la.l2_psm) == 0x0001 || btohs(la.l2_psm) == 0x0003)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
}
write_unlock_bh(&l2cap_sk_list.lock);
@@ -776,7 +846,8 @@ static int l2cap_do_connect(struct sock *sk)
__u8 auth_type;
int err = 0;
- BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm);
+ BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst),
+ l2cap_pi(sk)->psm);
if (!(hdev = hci_get_route(dst, src)))
return -EHOSTUNREACH;
@@ -785,21 +856,42 @@ static int l2cap_do_connect(struct sock *sk)
err = -ENOMEM;
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH ||
- l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT ||
- l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) {
- if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001))
+ if (sk->sk_type == SOCK_RAW) {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_DEDICATED_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_DEDICATED_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+ } else if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
auth_type = HCI_AT_NO_BONDING_MITM;
else
- auth_type = HCI_AT_GENERAL_BONDING_MITM;
- } else {
- if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001))
auth_type = HCI_AT_NO_BONDING;
- else
+
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
+ } else {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
}
- hcon = hci_connect(hdev, ACL_LINK, dst, auth_type);
+ hcon = hci_connect(hdev, ACL_LINK, dst,
+ l2cap_pi(sk)->sec_level, auth_type);
if (!hcon)
goto done;
@@ -835,20 +927,25 @@ done:
static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
struct sock *sk = sock->sk;
- int err = 0;
-
- lock_sock(sk);
+ struct sockaddr_l2 la;
+ int len, err = 0;
BT_DBG("sk %p", sk);
- if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) {
- err = -EINVAL;
- goto done;
- }
+ if (!addr || addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid)
+ return -EINVAL;
+
+ lock_sock(sk);
- if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) {
+ if (sk->sk_type == SOCK_SEQPACKET && !la.l2_psm) {
err = -EINVAL;
goto done;
}
@@ -875,8 +972,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
}
/* Set destination address and psm */
- bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
+ bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
+ l2cap_pi(sk)->psm = la.l2_psm;
if ((err = l2cap_do_connect(sk)))
goto done;
@@ -1000,12 +1097,16 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l
addr->sa_family = AF_BLUETOOTH;
*len = sizeof(struct sockaddr_l2);
- if (peer)
+ if (peer) {
+ la->l2_psm = l2cap_pi(sk)->psm;
bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst);
- else
+ la->l2_cid = htobs(l2cap_pi(sk)->dcid);
+ } else {
+ la->l2_psm = l2cap_pi(sk)->sport;
bacpy(&la->l2_bdaddr, &bt_sk(sk)->src);
+ la->l2_cid = htobs(l2cap_pi(sk)->scid);
+ }
- la->l2_psm = l2cap_pi(sk)->psm;
return 0;
}
@@ -1106,11 +1207,38 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
return err;
}
-static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) {
+ struct l2cap_conn_rsp rsp;
+
+ sk->sk_state = BT_CONFIG;
+
+ rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
+ rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident,
+ L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+
+ release_sock(sk);
+ return 0;
+ }
+
+ release_sock(sk);
+
+ return bt_sock_recvmsg(iocb, sock, msg, len, flags);
+}
+
+static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
{
struct sock *sk = sock->sk;
struct l2cap_options opts;
- int err = 0, len;
+ int len, err = 0;
u32 opt;
BT_DBG("sk %p", sk);
@@ -1140,7 +1268,15 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
break;
}
- l2cap_pi(sk)->link_mode = opt;
+ if (opt & L2CAP_LM_AUTH)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & L2CAP_LM_ENCRYPT)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & L2CAP_LM_SECURE)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ l2cap_pi(sk)->role_switch = (opt & L2CAP_LM_MASTER);
+ l2cap_pi(sk)->force_reliable = (opt & L2CAP_LM_RELIABLE);
break;
default:
@@ -1152,12 +1288,77 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
return err;
}
-static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level < BT_SECURITY_LOW ||
+ sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ l2cap_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ bt_sk(sk)->defer_setup = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct l2cap_options opts;
struct l2cap_conninfo cinfo;
int len, err = 0;
+ u32 opt;
BT_DBG("sk %p", sk);
@@ -1180,12 +1381,36 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
break;
case L2CAP_LM:
- if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval))
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = L2CAP_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (l2cap_pi(sk)->role_switch)
+ opt |= L2CAP_LM_MASTER;
+
+ if (l2cap_pi(sk)->force_reliable)
+ opt |= L2CAP_LM_RELIABLE;
+
+ if (put_user(opt, (u32 __user *) optval))
err = -EFAULT;
break;
case L2CAP_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ bt_sk(sk)->defer_setup)) {
err = -ENOTCONN;
break;
}
@@ -1208,6 +1433,60 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
return err;
}
+static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = l2cap_pi(sk)->sec_level;
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int l2cap_sock_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -1270,11 +1549,6 @@ static void l2cap_chan_ready(struct sock *sk)
*/
parent->sk_data_ready(parent, 0);
}
-
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) {
- struct l2cap_conn *conn = l2cap_pi(sk)->conn;
- hci_conn_change_link_key(conn->hcon);
- }
}
/* Copy frame to all raw sockets on that connection */
@@ -1549,8 +1823,11 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd
if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) &&
cmd->ident == conn->info_ident) {
- conn->info_ident = 0;
del_timer(&conn->info_timer);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
l2cap_conn_start(conn);
}
@@ -1580,6 +1857,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
/* Check if the ACL is secure enough (if not SDP) */
if (psm != cpu_to_le16(0x0001) &&
!hci_conn_check_link_mode(conn->hcon)) {
+ conn->disc_reason = 0x05;
result = L2CAP_CR_SEC_BLOCK;
goto response;
}
@@ -1621,11 +1899,18 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
l2cap_pi(sk)->ident = cmd->ident;
- if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) {
- if (l2cap_check_link_mode(sk)) {
- sk->sk_state = BT_CONFIG;
- result = L2CAP_CR_SUCCESS;
- status = L2CAP_CS_NO_INFO;
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
+ if (l2cap_check_security(sk)) {
+ if (bt_sk(sk)->defer_setup) {
+ sk->sk_state = BT_CONNECT2;
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHOR_PEND;
+ parent->sk_data_ready(parent, 0);
+ } else {
+ sk->sk_state = BT_CONFIG;
+ result = L2CAP_CR_SUCCESS;
+ status = L2CAP_CS_NO_INFO;
+ }
} else {
sk->sk_state = BT_CONNECT2;
result = L2CAP_CR_PEND;
@@ -1695,11 +1980,14 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
l2cap_pi(sk)->dcid = dcid;
l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+ l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND;
+
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
l2cap_build_conf_req(sk, req), req);
break;
case L2CAP_CR_PEND:
+ l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND;
break;
default:
@@ -1908,6 +2196,14 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm
put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data);
l2cap_send_cmd(conn, cmd->ident,
L2CAP_INFO_RSP, sizeof(buf), buf);
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ u8 buf[12];
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+ rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ memcpy(buf + 4, l2cap_fixed_chan, 8);
+ l2cap_send_cmd(conn, cmd->ident,
+ L2CAP_INFO_RSP, sizeof(buf), buf);
} else {
struct l2cap_info_rsp rsp;
rsp.type = cpu_to_le16(type);
@@ -1929,14 +2225,31 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm
BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
- conn->info_ident = 0;
-
del_timer(&conn->info_timer);
- if (type == L2CAP_IT_FEAT_MASK)
+ if (type == L2CAP_IT_FEAT_MASK) {
conn->feat_mask = get_unaligned_le32(rsp->data);
- l2cap_conn_start(conn);
+ if (conn->feat_mask & 0x0080) {
+ struct l2cap_info_req req;
+ req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+
+ conn->info_ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, conn->info_ident,
+ L2CAP_INFO_REQ, sizeof(req), &req);
+ } else {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
return 0;
}
@@ -2143,10 +2456,15 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
continue;
if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) {
- lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
+ lm1 |= HCI_LM_ACCEPT;
+ if (l2cap_pi(sk)->role_switch)
+ lm1 |= HCI_LM_MASTER;
exact++;
- } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
- lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
+ } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+ lm2 |= HCI_LM_ACCEPT;
+ if (l2cap_pi(sk)->role_switch)
+ lm2 |= HCI_LM_MASTER;
+ }
}
read_unlock(&l2cap_sk_list.lock);
@@ -2172,89 +2490,48 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
return 0;
}
-static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason)
+static int l2cap_disconn_ind(struct hci_conn *hcon)
{
- BT_DBG("hcon %p reason %d", hcon, reason);
+ struct l2cap_conn *conn = hcon->l2cap_data;
- if (hcon->type != ACL_LINK)
- return 0;
+ BT_DBG("hcon %p", hcon);
- l2cap_conn_del(hcon, bt_err(reason));
+ if (hcon->type != ACL_LINK || !conn)
+ return 0x13;
- return 0;
+ return conn->disc_reason;
}
-static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status)
+static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
{
- struct l2cap_chan_list *l;
- struct l2cap_conn *conn = hcon->l2cap_data;
- struct sock *sk;
+ BT_DBG("hcon %p reason %d", hcon, reason);
- if (!conn)
+ if (hcon->type != ACL_LINK)
return 0;
- l = &conn->chan_list;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
-
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
- bh_lock_sock(sk);
-
- if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) &&
- !(hcon->link_mode & HCI_LM_ENCRYPT) &&
- !status) {
- bh_unlock_sock(sk);
- continue;
- }
-
- if (sk->sk_state == BT_CONNECT) {
- if (!status) {
- struct l2cap_conn_req req;
- req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
- req.psm = l2cap_pi(sk)->psm;
-
- l2cap_pi(sk)->ident = l2cap_get_ident(conn);
-
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_REQ, sizeof(req), &req);
- } else {
- l2cap_sock_clear_timer(sk);
- l2cap_sock_set_timer(sk, HZ / 10);
- }
- } else if (sk->sk_state == BT_CONNECT2) {
- struct l2cap_conn_rsp rsp;
- __u16 result;
+ l2cap_conn_del(hcon, bt_err(reason));
- if (!status) {
- sk->sk_state = BT_CONFIG;
- result = L2CAP_CR_SUCCESS;
- } else {
- sk->sk_state = BT_DISCONN;
- l2cap_sock_set_timer(sk, HZ / 10);
- result = L2CAP_CR_SEC_BLOCK;
- }
+ return 0;
+}
- rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
- rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
- rsp.result = cpu_to_le16(result);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_RSP, sizeof(rsp), &rsp);
- }
+static inline void l2cap_check_encryption(struct sock *sk, u8 encrypt)
+{
+ if (sk->sk_type != SOCK_SEQPACKET)
+ return;
- bh_unlock_sock(sk);
+ if (encrypt == 0x00) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) {
+ l2cap_sock_clear_timer(sk);
+ l2cap_sock_set_timer(sk, HZ * 5);
+ } else if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
+ __l2cap_sock_close(sk, ECONNREFUSED);
+ } else {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM)
+ l2cap_sock_clear_timer(sk);
}
-
- read_unlock(&l->lock);
-
- return 0;
}
-static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
+static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
{
struct l2cap_chan_list *l;
struct l2cap_conn *conn = hcon->l2cap_data;
@@ -2270,15 +2547,16 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
read_lock(&l->lock);
for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
bh_lock_sock(sk);
- if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) &&
- (sk->sk_state == BT_CONNECTED ||
- sk->sk_state == BT_CONFIG) &&
- !status && encrypt == 0x00) {
- __l2cap_sock_close(sk, ECONNREFUSED);
+ if (l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND) {
+ bh_unlock_sock(sk);
+ continue;
+ }
+
+ if (!status && (sk->sk_state == BT_CONNECTED ||
+ sk->sk_state == BT_CONFIG)) {
+ l2cap_check_encryption(sk, encrypt);
bh_unlock_sock(sk);
continue;
}
@@ -2376,7 +2654,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
goto drop;
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
+ skb->len);
conn->rx_len = len - skb->len;
} else {
BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
@@ -2398,7 +2676,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
}
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
+ skb->len);
conn->rx_len -= skb->len;
if (!conn->rx_len) {
@@ -2424,10 +2702,10 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
sk_for_each(sk, node, &l2cap_sk_list.head) {
struct l2cap_pinfo *pi = l2cap_pi(sk);
- str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n",
+ str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid,
- pi->imtu, pi->omtu, pi->link_mode);
+ pi->imtu, pi->omtu, pi->sec_level);
}
read_unlock_bh(&l2cap_sk_list.lock);
@@ -2447,7 +2725,7 @@ static const struct proto_ops l2cap_sock_ops = {
.accept = l2cap_sock_accept,
.getname = l2cap_sock_getname,
.sendmsg = l2cap_sock_sendmsg,
- .recvmsg = bt_sock_recvmsg,
+ .recvmsg = l2cap_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
.mmap = sock_no_mmap,
@@ -2469,8 +2747,8 @@ static struct hci_proto l2cap_hci_proto = {
.connect_ind = l2cap_connect_ind,
.connect_cfm = l2cap_connect_cfm,
.disconn_ind = l2cap_disconn_ind,
- .auth_cfm = l2cap_auth_cfm,
- .encrypt_cfm = l2cap_encrypt_cfm,
+ .disconn_cfm = l2cap_disconn_cfm,
+ .security_cfm = l2cap_security_cfm,
.recv_acldata = l2cap_recv_acldata
};
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index acd84fd524b..1d0fb0f23c6 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -46,7 +46,7 @@
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
-#define VERSION "1.10"
+#define VERSION "1.11"
static int disable_cfc = 0;
static int channel_mtu = -1;
@@ -223,19 +223,25 @@ static int rfcomm_l2sock_create(struct socket **sock)
return err;
}
-static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d)
+static inline int rfcomm_check_security(struct rfcomm_dlc *d)
{
struct sock *sk = d->session->sock->sk;
+ __u8 auth_type;
- if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) {
- if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon))
- return 1;
- } else if (d->link_mode & RFCOMM_LM_AUTH) {
- if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon))
- return 1;
+ switch (d->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
}
- return 0;
+ return hci_conn_security(l2cap_pi(sk)->conn->hcon, d->sec_level,
+ auth_type);
}
/* ---- RFCOMM DLCs ---- */
@@ -388,10 +394,10 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst,
d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
if (s->state == BT_CONNECTED) {
- if (rfcomm_check_link_mode(d))
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- else
+ if (rfcomm_check_security(d))
rfcomm_send_pn(s, 1, d);
+ else
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
}
rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
@@ -421,9 +427,16 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
d, d->state, d->dlci, err, s);
switch (d->state) {
- case BT_CONNECTED:
- case BT_CONFIG:
case BT_CONNECT:
+ case BT_CONFIG:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ break;
+ }
+ /* Fall through */
+
+ case BT_CONNECTED:
d->state = BT_DISCONN;
if (skb_queue_empty(&d->tx_queue)) {
rfcomm_send_disc(s, d->dlci);
@@ -434,6 +447,15 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
}
break;
+ case BT_OPEN:
+ case BT_CONNECT2:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ break;
+ }
+ /* Fall through */
+
default:
rfcomm_dlc_clear_timer(d);
@@ -636,6 +658,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, src);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = 0;
+ addr.l2_cid = 0;
*err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
if (*err < 0)
goto failed;
@@ -657,6 +680,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, dst);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = htobs(RFCOMM_PSM);
+ addr.l2_cid = 0;
*err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
if (*err == 0 || *err == -EINPROGRESS)
return s;
@@ -1162,7 +1186,7 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
return 0;
}
-static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
+void rfcomm_dlc_accept(struct rfcomm_dlc *d)
{
struct sock *sk = d->session->sock->sk;
@@ -1175,12 +1199,31 @@ static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
d->state_change(d, 0);
rfcomm_dlc_unlock(d);
- if (d->link_mode & RFCOMM_LM_MASTER)
+ if (d->role_switch)
hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00);
rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
}
+static void rfcomm_check_accept(struct rfcomm_dlc *d)
+{
+ if (rfcomm_check_security(d)) {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
+}
+
static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
{
struct rfcomm_dlc *d;
@@ -1203,11 +1246,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
if (d) {
if (d->state == BT_OPEN) {
/* DLC was previously opened by PN request */
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
}
return 0;
}
@@ -1219,11 +1258,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
d->addr = __addr(s->initiator, dlci);
rfcomm_dlc_link(s, d);
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
} else {
rfcomm_send_dm(s, dlci);
}
@@ -1637,11 +1672,12 @@ static void rfcomm_process_connect(struct rfcomm_session *s)
d = list_entry(p, struct rfcomm_dlc, list);
if (d->state == BT_CONFIG) {
d->mtu = s->mtu;
- if (rfcomm_check_link_mode(d)) {
+ if (rfcomm_check_security(d)) {
+ rfcomm_send_pn(s, 1, d);
+ } else {
set_bit(RFCOMM_AUTH_PENDING, &d->flags);
rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_send_pn(s, 1, d);
+ }
}
}
}
@@ -1717,11 +1753,17 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
if (d->out) {
rfcomm_send_pn(s, 1, d);
rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
- if (d->link_mode & RFCOMM_LM_SECURE) {
- struct sock *sk = s->sock->sk;
- hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon);
+ } else {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
}
continue;
} else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
@@ -1734,6 +1776,9 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
continue;
}
+ if (test_bit(RFCOMM_SEC_PENDING, &d->flags))
+ continue;
+
if (test_bit(RFCOMM_TX_THROTTLED, &s->flags))
continue;
@@ -1876,6 +1921,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
bacpy(&addr.l2_bdaddr, ba);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = htobs(RFCOMM_PSM);
+ addr.l2_cid = 0;
err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
if (err < 0) {
BT_ERR("Bind failed %d", err);
@@ -1947,42 +1993,7 @@ static int rfcomm_run(void *unused)
return 0;
}
-static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status)
-{
- struct rfcomm_session *s;
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
-
- BT_DBG("conn %p status 0x%02x", conn, status);
-
- s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
- if (!s)
- return;
-
- rfcomm_session_hold(s);
-
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
- if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) &&
- !(conn->link_mode & HCI_LM_ENCRYPT) && !status)
- continue;
-
- if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
- continue;
-
- if (!status)
- set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
- else
- set_bit(RFCOMM_AUTH_REJECT, &d->flags);
- }
-
- rfcomm_session_put(s);
-
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
-}
-
-static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
+static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
{
struct rfcomm_session *s;
struct rfcomm_dlc *d;
@@ -1999,18 +2010,29 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
list_for_each_safe(p, n, &s->dlcs) {
d = list_entry(p, struct rfcomm_dlc, list);
- if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) &&
- (d->state == BT_CONNECTED ||
- d->state == BT_CONFIG) &&
- !status && encrypt == 0x00) {
- __rfcomm_dlc_close(d, ECONNREFUSED);
- continue;
+ if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (status || encrypt == 0x00) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
+ }
+
+ if (d->state == BT_CONNECTED && !status && encrypt == 0x00) {
+ if (d->sec_level == BT_SECURITY_MEDIUM) {
+ set_bit(RFCOMM_SEC_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ continue;
+ } else if (d->sec_level == BT_SECURITY_HIGH) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
}
if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
continue;
- if (!status && encrypt)
+ if (!status)
set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
else
set_bit(RFCOMM_AUTH_REJECT, &d->flags);
@@ -2023,8 +2045,7 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
static struct hci_cb rfcomm_cb = {
.name = "RFCOMM",
- .auth_cfm = rfcomm_auth_cfm,
- .encrypt_cfm = rfcomm_encrypt_cfm
+ .security_cfm = rfcomm_security_cfm
};
static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d3fc6fca38d..7f482784e9f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -261,12 +261,19 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
if (parent) {
sk->sk_type = parent->sk_type;
- pi->link_mode = rfcomm_pi(parent)->link_mode;
+ pi->dlc->defer_setup = bt_sk(parent)->defer_setup;
+
+ pi->sec_level = rfcomm_pi(parent)->sec_level;
+ pi->role_switch = rfcomm_pi(parent)->role_switch;
} else {
- pi->link_mode = 0;
+ pi->dlc->defer_setup = 0;
+
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
}
- pi->dlc->link_mode = pi->link_mode;
+ pi->dlc->sec_level = pi->sec_level;
+ pi->dlc->role_switch = pi->role_switch;
}
static struct proto rfcomm_proto = {
@@ -406,7 +413,8 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
rfcomm_pi(sk)->channel = sa->rc_channel;
- d->link_mode = rfcomm_pi(sk)->link_mode;
+ d->sec_level = rfcomm_pi(sk)->sec_level;
+ d->role_switch = rfcomm_pi(sk)->role_switch;
err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel);
if (!err)
@@ -554,6 +562,9 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int sent = 0;
+ if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
+ return -ENOTCONN;
+
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -570,8 +581,11 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
+ if (!skb) {
+ if (sent == 0)
+ sent = err;
break;
+ }
skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
@@ -630,10 +644,16 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
int err = 0;
size_t target, copied = 0;
long timeo;
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ rfcomm_dlc_accept(d);
+ return 0;
+ }
+
if (flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -710,7 +730,7 @@ out:
return copied ? : err;
}
-static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
{
struct sock *sk = sock->sk;
int err = 0;
@@ -727,7 +747,14 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
break;
}
- rfcomm_pi(sk)->link_mode = opt;
+ if (opt & RFCOMM_LM_AUTH)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & RFCOMM_LM_ENCRYPT)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & RFCOMM_LM_SECURE)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER);
break;
default:
@@ -739,12 +766,76 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
return err;
}
-static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ rfcomm_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ bt_sk(sk)->defer_setup = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sock *l2cap_sk;
struct rfcomm_conninfo cinfo;
int len, err = 0;
+ u32 opt;
BT_DBG("sk %p", sk);
@@ -755,12 +846,32 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
switch (optname) {
case RFCOMM_LM:
- if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval))
+ switch (rfcomm_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = RFCOMM_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (rfcomm_pi(sk)->role_switch)
+ opt |= RFCOMM_LM_MASTER;
+
+ if (put_user(opt, (u32 __user *) optval))
err = -EFAULT;
break;
case RFCOMM_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !rfcomm_pi(sk)->dlc->defer_setup) {
err = -ENOTCONN;
break;
}
@@ -785,6 +896,60 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
return err;
}
+static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = rfcomm_pi(sk)->sec_level;
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk __maybe_unused = sock->sk;
@@ -888,6 +1053,10 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
done:
bh_unlock_sock(parent);
+
+ if (bt_sk(parent)->defer_setup)
+ parent->sk_state_change(parent);
+
return result;
}
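For reference, a minimal userspace sketch of the new SOL_BLUETOOTH options on an RFCOMM socket; struct bt_security and the BT_SECURITY/BT_DEFER_SETUP constants are assumed to come from a matching <bluetooth/bluetooth.h>, and error handling is trimmed:

	#include <string.h>
	#include <stdint.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h>
	#include <bluetooth/rfcomm.h>

	static int rfcomm_set_security(int sk)
	{
		struct bt_security sec;
		uint32_t defer = 1;

		/* Authentication + encryption, i.e. the old RFCOMM_LM_AUTH |
		 * RFCOMM_LM_ENCRYPT combination. */
		memset(&sec, 0, sizeof(sec));
		sec.level = BT_SECURITY_MEDIUM;
		if (setsockopt(sk, SOL_BLUETOOTH, BT_SECURITY, &sec, sizeof(sec)) < 0)
			return -1;

		/* On a bound or listening socket: defer connection setup until
		 * the first recv(), which ends up calling rfcomm_dlc_accept(). */
		if (setsockopt(sk, SOL_BLUETOOTH, BT_DEFER_SETUP, &defer, sizeof(defer)) < 0)
			return -1;

		return 0;
	}

The old RFCOMM_LM_* link-mode bits keep working through rfcomm_sock_setsockopt_old() above; BT_DEFER_SETUP is only accepted while the socket is bound or listening.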
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 46fd8bf9a69..51ae0c3e470 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -195,7 +195,7 @@ static int sco_connect(struct sock *sk)
else
type = SCO_LINK;
- hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING);
+ hcon = hci_connect(hdev, type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
if (!hcon)
goto done;
@@ -668,7 +668,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
return err;
}
-static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sco_options opts;
@@ -723,6 +723,31 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
return err;
}
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_SCO)
+ return sco_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int sco_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -832,10 +857,30 @@ done:
/* ----- SCO interface with lower layer (HCI) ----- */
static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
{
+ register struct sock *sk;
+ struct hlist_node *node;
+ int lm = 0;
+
+ if (type != SCO_LINK && type != ESCO_LINK)
+ return 0;
+
BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
- /* Always accept connection */
- return HCI_LM_ACCEPT;
+ /* Find listening sockets */
+ read_lock(&sco_sk_list.lock);
+ sk_for_each(sk, node, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) ||
+ !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+ lm |= HCI_LM_ACCEPT;
+ break;
+ }
+ }
+ read_unlock(&sco_sk_list.lock);
+
+ return lm;
}
static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
@@ -857,7 +902,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
return 0;
}
-static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason)
+static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
{
BT_DBG("hcon %p reason %d", hcon, reason);
@@ -940,7 +985,7 @@ static struct hci_proto sco_hci_proto = {
.id = HCI_PROTO_SCO,
.connect_ind = sco_connect_ind,
.connect_cfm = sco_connect_cfm,
- .disconn_ind = sco_disconn_ind,
+ .disconn_cfm = sco_disconn_cfm,
.recv_scodata = sco_recv_scodata
};
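With the reworked sco_connect_ind() above, an incoming (e)SCO link is only accepted when a listening SCO socket matches the local adapter address or BDADDR_ANY; a minimal sketch of such a listener, assuming the usual struct sockaddr_sco from <bluetooth/sco.h>:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h>
	#include <bluetooth/sco.h>

	static int sco_listen_any(void)
	{
		struct sockaddr_sco addr;
		int sk;

		sk = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);
		if (sk < 0)
			return -1;

		memset(&addr, 0, sizeof(addr));
		addr.sco_family = AF_BLUETOOTH;
		bacpy(&addr.sco_bdaddr, BDADDR_ANY);	/* matches the BDADDR_ANY test above */

		if (bind(sk, (struct sockaddr *) &addr, sizeof(addr)) < 0 ||
		    listen(sk, 1) < 0) {
			close(sk);
			return -1;
		}

		return sk;
	}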
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ba7be195803..fcffb3fb117 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index d90e8dd975f..547bafc79e2 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -273,8 +273,7 @@ int can_send(struct sk_buff *skb, int loop)
err = net_xmit_errno(err);
if (err) {
- if (newskb)
- kfree_skb(newskb);
+ kfree_skb(newskb);
return err;
}
diff --git a/net/core/Makefile b/net/core/Makefile
index 26a37cb3192..796f46eece5 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 5e2ac0c4b07..d0de644b378 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
- kfree_skb(skb);
+ consume_skb(skb);
sk_mem_reclaim_partial(sk);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d393fc997cd..052dd478d3e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,14 +135,6 @@
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
-enum {
- GRO_MERGED,
- GRO_MERGED_FREE,
- GRO_HELD,
- GRO_NORMAL,
- GRO_DROP,
-};
-
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -1672,23 +1664,12 @@ static int dev_gso_segment(struct sk_buff *skb)
return 0;
}
-static void tstamp_tx(struct sk_buff *skb)
-{
- union skb_shared_tx *shtx =
- skb_tx(skb);
- if (unlikely(shtx->software &&
- !shtx->in_progress)) {
- skb_tstamp_tx(skb, NULL);
- }
-}
-
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
const struct net_device_ops *ops = dev->netdev_ops;
int rc;
- prefetch(&dev->netdev_ops->ndo_start_xmit);
if (likely(!skb->next)) {
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
@@ -1715,8 +1696,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
* the skb destructor before the call and restoring it
* afterwards, then doing the skb_orphan() ourselves?
*/
- if (likely(!rc))
- tstamp_tx(skb);
return rc;
}
@@ -1732,7 +1711,6 @@ gso:
skb->next = nskb;
return rc;
}
- tstamp_tx(skb);
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -1745,17 +1723,11 @@ out_kfree_skb:
}
static u32 skb_tx_hashrnd;
-static int skb_tx_hashrnd_initialized = 0;
-static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
+u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
u32 hash;
- if (unlikely(!skb_tx_hashrnd_initialized)) {
- get_random_bytes(&skb_tx_hashrnd, 4);
- skb_tx_hashrnd_initialized = 1;
- }
-
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
} else if (skb->sk && skb->sk->sk_hash) {
@@ -1767,6 +1739,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
+EXPORT_SYMBOL(skb_tx_hash);
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
@@ -2273,12 +2246,6 @@ int netif_receive_skb(struct sk_buff *skb)
rcu_read_lock();
- /* Don't receive packets in an exiting network namespace */
- if (!net_alive(dev_net(skb->dev))) {
- kfree_skb(skb);
- goto out;
- }
-
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2499,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
+ if (netpoll_rx_on(skb))
+ return GRO_NORMAL;
+
for (p = napi->gro_list; p; p = p->next) {
NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
skb_mac_header(p), skb_gro_mac_header(skb));
@@ -2657,9 +2627,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
- __napi_complete(napi);
local_irq_enable();
- break;
+ napi_complete(napi);
+ goto out;
}
local_irq_enable();
@@ -2668,6 +2638,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
napi_gro_flush(napi);
+out:
return work;
}
@@ -2741,7 +2712,7 @@ void netif_napi_del(struct napi_struct *napi)
struct sk_buff *skb, *next;
list_del_init(&napi->dev_list);
- kfree(napi->skb);
+ kfree_skb(napi->skb);
for (skb = napi->gro_list; skb; skb = next) {
next = skb->next;
@@ -4355,6 +4326,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
+/* Some devices need to (re-)set their netdev_ops inside
+ * ->init() or similar. If that happens, we have to set up
+ * the compat pointers again.
+ */
+void netdev_resync_ops(struct net_device *dev)
+{
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ dev->init = ops->ndo_init;
+ dev->uninit = ops->ndo_uninit;
+ dev->open = ops->ndo_open;
+ dev->change_rx_flags = ops->ndo_change_rx_flags;
+ dev->set_rx_mode = ops->ndo_set_rx_mode;
+ dev->set_multicast_list = ops->ndo_set_multicast_list;
+ dev->set_mac_address = ops->ndo_set_mac_address;
+ dev->validate_addr = ops->ndo_validate_addr;
+ dev->do_ioctl = ops->ndo_do_ioctl;
+ dev->set_config = ops->ndo_set_config;
+ dev->change_mtu = ops->ndo_change_mtu;
+ dev->neigh_setup = ops->ndo_neigh_setup;
+ dev->tx_timeout = ops->ndo_tx_timeout;
+ dev->get_stats = ops->ndo_get_stats;
+ dev->vlan_rx_register = ops->ndo_vlan_rx_register;
+ dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
+ dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ dev->poll_controller = ops->ndo_poll_controller;
+#endif
+#endif
+}
+EXPORT_SYMBOL(netdev_resync_ops);
+
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -4399,27 +4403,7 @@ int register_netdevice(struct net_device *dev)
* This is temporary until all network devices are converted.
*/
if (dev->netdev_ops) {
- const struct net_device_ops *ops = dev->netdev_ops;
-
- dev->init = ops->ndo_init;
- dev->uninit = ops->ndo_uninit;
- dev->open = ops->ndo_open;
- dev->change_rx_flags = ops->ndo_change_rx_flags;
- dev->set_rx_mode = ops->ndo_set_rx_mode;
- dev->set_multicast_list = ops->ndo_set_multicast_list;
- dev->set_mac_address = ops->ndo_set_mac_address;
- dev->validate_addr = ops->ndo_validate_addr;
- dev->do_ioctl = ops->ndo_do_ioctl;
- dev->set_config = ops->ndo_set_config;
- dev->change_mtu = ops->ndo_change_mtu;
- dev->tx_timeout = ops->ndo_tx_timeout;
- dev->get_stats = ops->ndo_get_stats;
- dev->vlan_rx_register = ops->ndo_vlan_rx_register;
- dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
- dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
-#ifdef CONFIG_NET_POLL_CONTROLLER
- dev->poll_controller = ops->ndo_poll_controller;
-#endif
+ netdev_resync_ops(dev);
} else {
char drivername[64];
pr_info("%s (%s): not using net_device_ops yet\n",
@@ -5291,6 +5275,14 @@ out:
subsys_initcall(net_dev_init);
+static int __init initialize_hashrnd(void)
+{
+ get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ return 0;
+}
+
+late_initcall_sync(initialize_hashrnd);
+
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
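netdev_resync_ops() is meant for drivers that pick a different netdev_ops table at runtime; a rough sketch of the intended call site (the my_* identifiers are placeholders, not existing symbols):

	static int my_ndo_init(struct net_device *dev)
	{
		/* Choose an ops table based on probed hardware features, then
		 * refresh the compat function pointers that register_netdevice()
		 * copied from the table installed before ->init() ran. */
		if (my_hw_has_extra_features(dev))
			dev->netdev_ops = &my_extended_netdev_ops;

		netdev_resync_ops(dev);

		return 0;
	}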
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 00000000000..9fd0dc3cca9
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,263 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <net/genetlink.h>
+
+#include <trace/skb.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+static void send_dm_alert(struct work_struct *unused);
+
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+struct sock *dm_sock;
+
+struct per_cpu_dm_data {
+ struct work_struct dm_alert_work;
+ struct sk_buff *skb;
+ atomic_t dm_hit_count;
+ struct timer_list send_timer;
+};
+
+static struct genl_family net_drop_monitor_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 1,
+ .maxattr = NET_DM_CMD_MAX,
+};
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+
+
+static void reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ data->skb = genlmsg_new(al, GFP_KERNEL);
+ genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg));
+ memset(msg, 0, al);
+ atomic_set(&data->dm_hit_count, dm_hit_limit);
+}
+
+static void send_dm_alert(struct work_struct *unused)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+ /*
+ * Grab the skb we're about to send
+ */
+ skb = data->skb;
+
+ /*
+ * Replace it with a new one
+ */
+ reset_per_cpu_data(data);
+
+ /*
+ * Ship it!
+ */
+ genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
+
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period. Note that it operates under the timer interrupt
+ * so we don't need to disable preemption here
+ */
+static void sched_send_work(unsigned long unused)
+{
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct nlmsghdr *nlh;
+ int i;
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+
+ if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
+ /*
+ * we're already at zero, discard this hit
+ */
+ goto out;
+ }
+
+ nlh = (struct nlmsghdr *)data->skb->data;
+ msg = genlmsg_data(nlmsg_data(nlh));
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
+ msg->points[i].count++;
+ goto out;
+ }
+ }
+
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+ memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
+ msg->points[msg->entries].count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer_on(&data->send_timer, smp_processor_id());
+ }
+
+out:
+ return;
+}
+
+static int set_all_monitor_traces(int state)
+{
+ int rc = 0;
+
+ switch (state) {
+ case TRACE_ON:
+ rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ break;
+ case TRACE_OFF:
+ rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+
+ tracepoint_synchronize_unregister();
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (rc)
+ return -EINPROGRESS;
+ return rc;
+}
+
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return set_all_monitor_traces(TRACE_ON);
+ break;
+ case NET_DM_CMD_STOP:
+ return set_all_monitor_traces(TRACE_OFF);
+ break;
+ }
+
+ return -ENOTSUPP;
+}
+
+
+static struct genl_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .doit = net_dm_cmd_config,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .doit = net_dm_cmd_trace,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .doit = net_dm_cmd_trace,
+ },
+};
+
+static int __init init_net_drop_monitor(void)
+{
+ int cpu;
+ int rc, i, ret;
+ struct per_cpu_dm_data *data;
+ printk(KERN_INFO "Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ if (genl_register_family(&net_drop_monitor_family) < 0) {
+ printk(KERN_ERR "Could not create drop monitor netlink family\n");
+ return -EFAULT;
+ }
+
+ rc = -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) {
+ ret = genl_register_ops(&net_drop_monitor_family,
+ &dropmon_ops[i]);
+ if (ret) {
+ printk(KERN_CRIT "failed to register operation %d\n",
+ dropmon_ops[i].cmd);
+ goto out_unreg;
+ }
+ }
+
+ rc = 0;
+
+ for_each_present_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ reset_per_cpu_data(data);
+ INIT_WORK(&data->dm_alert_work, send_dm_alert);
+ init_timer(&data->send_timer);
+ data->send_timer.data = cpu;
+ data->send_timer.function = sched_send_work;
+ }
+ goto out;
+
+out_unreg:
+ genl_unregister_family(&net_drop_monitor_family);
+out:
+ return rc;
+}
+
+late_initcall(init_net_drop_monitor);
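The drop monitor attaches to the kfree_skb tracepoint declared in <trace/skb.h>; any other probe would hook it the same way. A minimal sketch (my_drop_probe is a placeholder name):

	#include <linux/module.h>
	#include <linux/skbuff.h>
	#include <trace/skb.h>

	static void my_drop_probe(struct sk_buff *skb, void *location)
	{
		/* location is the return address recorded by kfree_skb() */
		pr_debug("dropped skb %p (len %u) at %p\n", skb, skb->len, location);
	}

	static int __init my_probe_init(void)
	{
		return register_trace_kfree_skb(my_drop_probe);
	}

	static void __exit my_probe_exit(void)
	{
		unregister_trace_kfree_skb(my_drop_probe);
		tracepoint_synchronize_unregister();
	}

	module_init(my_probe_init);
	module_exit(my_probe_exit);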
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 947710a36ce..244ca56dffa 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
return 0;
}
-static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr)
+static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
{
struct ethtool_rxnfc cmd;
- if (!dev->ethtool_ops->set_rxhash)
+ if (!dev->ethtool_ops->set_rxnfc)
return -EOPNOTSUPP;
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
- return dev->ethtool_ops->set_rxhash(dev, &cmd);
+ return dev->ethtool_ops->set_rxnfc(dev, &cmd);
}
-static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
{
struct ethtool_rxnfc info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
- if (!dev->ethtool_ops->get_rxhash)
+ if (!ops->get_rxnfc)
return -EOPNOTSUPP;
if (copy_from_user(&info, useraddr, sizeof(info)))
return -EFAULT;
- dev->ethtool_ops->get_rxhash(dev, &info);
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ rule_buf = kmalloc(info.rule_cnt * sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = -EFAULT;
if (copy_to_user(useraddr, &info, sizeof(info)))
- return -EFAULT;
- return 0;
+ goto err_out;
+
+ if (rule_buf) {
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ if (copy_to_user(useraddr, rule_buf,
+ info.rule_cnt * sizeof(u32)))
+ goto err_out;
+ }
+ ret = 0;
+
+err_out:
+ if (rule_buf)
+ kfree(rule_buf);
+
+ return ret;
}
static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
@@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GFLAGS:
case ETHTOOL_GPFLAGS:
case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
break;
default:
if (!capable(CAP_NET_ADMIN))
@@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
dev->ethtool_ops->set_priv_flags);
break;
case ETHTOOL_GRXFH:
- rc = ethtool_get_rxhash(dev, useraddr);
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, useraddr);
break;
case ETHTOOL_SRXFH:
- rc = ethtool_set_rxhash(dev, useraddr);
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, useraddr);
break;
case ETHTOOL_GGRO:
rc = ethtool_get_gro(dev, useraddr);
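From userspace, the ETHTOOL_GRXCLSRLALL path above expects the rule_locs array to follow struct ethtool_rxnfc, sized by rule_cnt. A hedged sketch of the ioctl sequence; it assumes the driver reports the rule count in rule_cnt for ETHTOOL_GRXCLSRLCNT, and error handling is minimal:

	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	/* Fetch all RX classification rule locations for ifname via fd
	 * (an ordinary AF_INET datagram socket). Caller frees the result. */
	static struct ethtool_rxnfc *get_rx_cls_rules(int fd, const char *ifname)
	{
		struct ethtool_rxnfc cnt, *nfc;
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

		memset(&cnt, 0, sizeof(cnt));
		cnt.cmd = ETHTOOL_GRXCLSRLCNT;
		ifr.ifr_data = (void *) &cnt;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
			return NULL;

		nfc = calloc(1, sizeof(*nfc) + cnt.rule_cnt * sizeof(__u32));
		if (!nfc)
			return NULL;

		nfc->cmd = ETHTOOL_GRXCLSRLALL;
		nfc->rule_cnt = cnt.rule_cnt;
		ifr.ifr_data = (void *) nfc;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
			free(nfc);
			return NULL;
		}

		return nfc;	/* nfc->rule_locs[] holds nfc->rule_cnt entries */
	}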
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 32b3a0152d7..98691e1466b 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
goto errout;
}
- err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, ops->nlgroup, err);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 278a142d104..a1cbce7fdae 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg)
write_unlock(&neigh->lock);
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
} else {
out:
write_unlock(&neigh->lock);
@@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
@@ -1656,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
neigh_release(neigh);
goto out_dev_put;
}
@@ -2534,7 +2536,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6ac29a46e23..2da59a0ac4a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (endp == buf)
goto err;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return -ERESTARTSYS;
+
if (dev_isalive(net)) {
if ((ret = (*set)(net, new)) == 0)
ret = len;
@@ -496,7 +498,7 @@ int netdev_register_kobject(struct net_device *net)
dev->groups = groups;
BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
- dev_set_name(dev, net->name);
+ dev_set_name(dev, "%s", net->name);
#ifdef CONFIG_SYSFS
*groups++ = &netstat_group;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 00000000000..c8fb45665e4
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,29 @@
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <trace/skb.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+
+DEFINE_TRACE(kfree_skb);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 55151faaf90..e3bebd36f05 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -32,24 +32,14 @@ static __net_init int setup_net(struct net *net)
{
/* Must be called with net_mutex held */
struct pernet_operations *ops;
- int error;
- struct net_generic *ng;
+ int error = 0;
atomic_set(&net->count, 1);
+
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
#endif
- error = -ENOMEM;
- ng = kzalloc(sizeof(struct net_generic) +
- INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL);
- if (ng == NULL)
- goto out;
-
- ng->len = INITIAL_NET_GEN_PTRS;
- rcu_assign_pointer(net->gen, ng);
-
- error = 0;
list_for_each_entry(ops, &pernet_list, list) {
if (ops->init) {
error = ops->init(net);
@@ -70,24 +60,50 @@ out_undo:
}
rcu_barrier();
- kfree(ng);
goto out;
}
+static struct net_generic *net_alloc_generic(void)
+{
+ struct net_generic *ng;
+ size_t generic_size = sizeof(struct net_generic) +
+ INITIAL_NET_GEN_PTRS * sizeof(void *);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->len = INITIAL_NET_GEN_PTRS;
+
+ return ng;
+}
+
#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
{
- return kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+out_free:
+ kfree(ng);
+ goto out;
}
static void net_free(struct net *net)
{
- if (!net)
- return;
-
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
printk(KERN_EMERG "network namespace not free! Usage: %d\n",
@@ -112,27 +128,28 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
err = -ENOMEM;
new_net = net_alloc();
if (!new_net)
- goto out;
+ goto out_err;
mutex_lock(&net_mutex);
err = setup_net(new_net);
- if (err)
- goto out_unlock;
-
- rtnl_lock();
- list_add_tail(&new_net->list, &net_namespace_list);
- rtnl_unlock();
-
-
-out_unlock:
+ if (!err) {
+ rtnl_lock();
+ list_add_tail(&new_net->list, &net_namespace_list);
+ rtnl_unlock();
+ }
mutex_unlock(&net_mutex);
+
+ if (err)
+ goto out_free;
out:
put_net(old_net);
- if (err) {
- net_free(new_net);
- new_net = ERR_PTR(err);
- }
return new_net;
+
+out_free:
+ net_free(new_net);
+out_err:
+ new_net = ERR_PTR(err);
+ goto out;
}
static void cleanup_net(struct work_struct *work)
@@ -140,9 +157,6 @@ static void cleanup_net(struct work_struct *work)
struct pernet_operations *ops;
struct net *net;
- /* Be very certain incoming network packets will not find us */
- rcu_barrier();
-
net = container_of(work, struct net, work);
mutex_lock(&net_mutex);
@@ -188,6 +202,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
static int __init net_ns_init(void)
{
+ struct net_generic *ng;
int err;
printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
@@ -202,6 +217,12 @@ static int __init net_ns_init(void)
panic("Could not create netns workq");
#endif
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
mutex_lock(&net_mutex);
err = setup_net(&init_net);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 65498483325..32d419f5ac9 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3275,8 +3275,7 @@ static void pktgen_stop(struct pktgen_thread *t)
list_for_each_entry(pkt_dev, &t->if_list, list) {
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
}
@@ -3303,8 +3302,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
if (!cur->removal_mark)
continue;
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
@@ -3328,8 +3326,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
@@ -3393,8 +3390,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (!netif_running(odev)) {
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
goto out;
}
@@ -3415,8 +3411,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
if ((++pkt_dev->clone_count >= pkt_dev->clone_skb)
|| (!pkt_dev->skb)) {
/* build a new pkt */
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = fill_packet(odev, pkt_dev);
if (pkt_dev->skb == NULL) {
@@ -3498,8 +3493,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* Done with this */
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
}
out:;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 790dd205bb5..d78030f88bd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
return nlmsg_unicast(rtnl, skb, pid);
}
-int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
int report = 0;
@@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
if (nlh)
report = nlmsg_report(nlh);
- return nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ nlmsg_notify(rtnl, skb, pid, group, report, flags);
}
void rtnl_set_sk_err(struct net *net, u32 group, int error)
@@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e5a8351ff12..6acbf9e79eb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,6 +65,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
+#include <trace/skb.h>
#include "kmap_skb.h"
@@ -146,14 +147,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
}
EXPORT_SYMBOL(skb_under_panic);
-void skb_truesize_bug(struct sk_buff *skb)
-{
- WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) "
- "len=%u, sizeof(sk_buff)=%Zd\n",
- skb->truesize, skb->len, sizeof(struct sk_buff));
-}
-EXPORT_SYMBOL(skb_truesize_bug);
-
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
@@ -450,11 +443,32 @@ void kfree_skb(struct sk_buff *skb)
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
+ trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);
/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero.
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that fact, whereas consume_skb
+ * is meant for frames that were consumed normally (no drop is traced).
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
+
+/**
* skb_recycle_check - check if skb can be reused for receive
* @skb: buffer
* @skb_size: minimum receive buffer size
@@ -1216,8 +1230,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
}
if (!pskb_pull(list, eat)) {
- if (clone)
- kfree_skb(clone);
+ kfree_skb(clone);
return NULL;
}
break;
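The kfree_skb()/consume_skb() split above feeds the new drop-monitor tracepoint: only kfree_skb() is traced as a drop. A sketch of the intended usage in a TX completion path (my_tx_complete is a placeholder):

	#include <linux/skbuff.h>

	/* Frames that actually left the device were consumed, not dropped,
	 * so they must not be reported through trace_kfree_skb(). */
	static void my_tx_complete(struct sk_buff *skb, bool tx_ok)
	{
		if (tx_ok)
			consume_skb(skb);
		else
			kfree_skb(skb);		/* counted by the drop monitor */
	}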
diff --git a/net/core/sock.c b/net/core/sock.c
index 40887e76652..0620046e4eb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -150,7 +150,7 @@ static const char *af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
"sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
"sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
+ "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
"sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
@@ -165,7 +165,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
"slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
"slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
"slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
"slock-27" , "slock-28" , "slock-AF_CAN" ,
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
@@ -180,7 +180,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
"clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
"clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
"clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
"clock-27" , "clock-28" , "clock-AF_CAN" ,
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
@@ -725,7 +725,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < 0)
return -EINVAL;
- v.val = 0;
+ memset(&v, 0, sizeof(v));
switch(optname) {
case SO_DEBUG:
@@ -1185,7 +1185,6 @@ void sock_rfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- skb_truesize_check(skb);
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_uncharge(skb->sk, skb->truesize);
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 83d3398559e..7db1de0497c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,6 +11,7 @@
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/init.h>
+#include <net/ip.h>
#include <net/sock.h>
static struct ctl_table net_core_table[] = {
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 45f95e55f87..7ea557b7c6b 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -20,6 +20,9 @@
/* We can spread an ack vector across multiple options */
#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN 16
+
#define DCCP_ACKVEC_STATE_RECEIVED 0
#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 08a569ff02d..d6bc47363b1 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
* Hence a safe upper bound for the maximum option length is 1020-28 = 992
*/
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
#define DCCP_MAX_PACKET_HDR 28
#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
+
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 22a618af489..36bcc00654d 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
u32 ccmps = dccp_determine_ccmps(dp);
- int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+ u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
/* Account for header lengths and IPv4/v6 option overhead */
cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
/*
- * FIXME: this should come from the CCID infrastructure, where, say,
- * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
- * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
- * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
- * make it a multiple of 4
+ * Leave enough headroom for common DCCP header options.
+ * This only considers options which may appear on DCCP-Data packets, as
+ * per table 3 in RFC 4340, 5.8. When running out of space for other
+ * options (e.g. Ack Vector, which can take up to 255 bytes), it is better
+ * to schedule a separate Ack. Thus we leave headroom for the following:
+ * - 1 byte for Slow Receiver (11.6)
+ * - 6 bytes for Timestamp (13.1)
+ * - 10 bytes for Timestamp Echo (13.3)
+ * - 8 bytes for NDP count (7.7, when activated)
+ * - 6 bytes for Data Checksum (9.3)
+ * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
*/
-
- cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4);
+ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+ (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
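Worked example of the new headroom: with the NDP-count feature active and Ack Vectors enabled, the reservation is roundup(1 + 6 + 10 + 8 + 6 + 16, 4) = 48 bytes; with both disabled it is roundup(1 + 6 + 10 + 6, 4) = 24 bytes, compared with the fixed roundup(5 + 6 + 10 + 6 + 6 + 6, 4) = 40 bytes of the old estimate.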
@@ -270,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block)
const int len = skb->len;
if (sk->sk_state == DCCP_PARTOPEN) {
- /* See 8.1.5. Handshake Completion */
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
inet_csk_schedule_ack(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
inet_csk(sk)->icsk_rto,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 12bf7d4c16c..9647d911f91 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1246,11 +1246,12 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case TIOCINQ:
lock_sock(sk);
- if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) {
+ skb = skb_peek(&scp->other_receive_queue);
+ if (skb) {
amount = skb->len;
} else {
- struct sk_buff *skb = sk->sk_receive_queue.next;
- for(;;) {
+ skb = sk->sk_receive_queue.next;
+ for (;;) {
if (skb ==
(struct sk_buff *)&sk->sk_receive_queue)
break;
@@ -1579,16 +1580,16 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us
default:
#ifdef CONFIG_NETFILTER
{
- int val, len;
+ int ret, len;
if(get_user(len, optlen))
return -EFAULT;
- val = nf_getsockopt(sk, PF_DECnet, optname,
+ ret = nf_getsockopt(sk, PF_DECnet, optname,
optval, &len);
- if (val >= 0)
- val = put_user(len, optlen);
- return val;
+ if (ret >= 0)
+ ret = put_user(len, optlen);
+ return ret;
}
#endif
case DSO_STREAM:
@@ -2071,8 +2072,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
}
out:
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
release_sock(sk);
@@ -2112,9 +2112,8 @@ static struct notifier_block dn_dev_notifier = {
extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
-static struct packet_type dn_dix_packet_type = {
+static struct packet_type dn_dix_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_DNA_RT),
- .dev = NULL, /* All devices */
.func = dn_route_rcv,
};
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index daf2b98b15f..1c6a5bb6f0c 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -684,7 +684,6 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
return -ENODEV;
if ((dn_db = dev->dn_ptr) == NULL) {
- int err;
dn_db = dn_dev_create(dev, &err);
if (!dn_db)
return err;
@@ -769,7 +768,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
+ rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
@@ -1322,6 +1322,7 @@ static inline int is_dn_dev(struct net_device *dev)
}
static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&dev_base_lock)
{
int i;
struct net_device *dev;
@@ -1364,6 +1365,7 @@ static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void dn_dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(&dev_base_lock)
{
read_unlock(&dev_base_lock);
}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5130dee0b38..0cc4394117d 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -380,7 +380,6 @@ static int dn_return_short(struct sk_buff *skb)
unsigned char *ptr;
__le16 *src;
__le16 *dst;
- __le16 tmp;
/* Add back headers */
skb_push(skb, skb->data - skb_network_header(skb));
@@ -399,10 +398,7 @@ static int dn_return_short(struct sk_buff *skb)
ptr += 2;
*ptr = 0; /* Zero hop count */
- /* Swap source and destination */
- tmp = *src;
- *src = *dst;
- *dst = tmp;
+ swap(*src, *dst);
skb->pkt_type = PACKET_OUTGOING;
dn_rt_finish_output(skb, NULL, NULL);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 69ad9280c69..67054b0d550 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+ rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 965397af9a8..5bcd592ae6d 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -179,7 +179,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
}
if (write) {
- int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
+ len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
if (copy_from_user(addr, buffer, len))
return -EFAULT;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 49211b35725..c51b55400dc 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -41,13 +41,13 @@ config NET_DSA_MV88E6XXX_NEED_PPU
default n
config NET_DSA_MV88E6131
- bool "Marvell 88E6131 ethernet switch chip support"
+ bool "Marvell 88E6095/6095F/6131 ethernet switch chip support"
select NET_DSA_MV88E6XXX
select NET_DSA_MV88E6XXX_NEED_PPU
select NET_DSA_TAG_DSA
---help---
- This enables support for the Marvell 88E6131 ethernet switch
- chip.
+ This enables support for the Marvell 88E6095/6095F/6131
+ ethernet switch chips.
config NET_DSA_MV88E6123_61_65
bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 33e99462023..71489f69a42 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/dsa.c - Hardware switch handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -67,12 +67,13 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name)
/* basic switch operations **************************************************/
static struct dsa_switch *
-dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
- struct mii_bus *bus, struct net_device *dev)
+dsa_switch_setup(struct dsa_switch_tree *dst, int index,
+ struct device *parent, struct mii_bus *bus)
{
+ struct dsa_chip_data *pd = dst->pd->chip + index;
+ struct dsa_switch_driver *drv;
struct dsa_switch *ds;
int ret;
- struct dsa_switch_driver *drv;
char *name;
int i;
@@ -81,11 +82,12 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
*/
drv = dsa_switch_probe(bus, pd->sw_addr, &name);
if (drv == NULL) {
- printk(KERN_ERR "%s: could not detect attached switch\n",
- dev->name);
+ printk(KERN_ERR "%s[%d]: could not detect attached switch\n",
+ dst->master_netdev->name, index);
return ERR_PTR(-EINVAL);
}
- printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name);
+ printk(KERN_INFO "%s[%d]: detected a %s switch\n",
+ dst->master_netdev->name, index, name);
/*
@@ -95,18 +97,16 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
if (ds == NULL)
return ERR_PTR(-ENOMEM);
- ds->pd = pd;
- ds->master_netdev = dev;
- ds->master_mii_bus = bus;
-
+ ds->dst = dst;
+ ds->index = index;
+ ds->pd = dst->pd->chip + index;
ds->drv = drv;
- ds->tag_protocol = drv->tag_protocol;
+ ds->master_mii_bus = bus;
/*
* Validate supplied switch configuration.
*/
- ds->cpu_port = -1;
for (i = 0; i < DSA_MAX_PORTS; i++) {
char *name;
@@ -115,32 +115,28 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
continue;
if (!strcmp(name, "cpu")) {
- if (ds->cpu_port != -1) {
+ if (dst->cpu_switch != -1) {
printk(KERN_ERR "multiple cpu ports?!\n");
ret = -EINVAL;
goto out;
}
- ds->cpu_port = i;
+ dst->cpu_switch = index;
+ dst->cpu_port = i;
+ } else if (!strcmp(name, "dsa")) {
+ ds->dsa_port_mask |= 1 << i;
} else {
- ds->valid_port_mask |= 1 << i;
+ ds->phys_port_mask |= 1 << i;
}
}
- if (ds->cpu_port == -1) {
- printk(KERN_ERR "no cpu port?!\n");
- ret = -EINVAL;
- goto out;
- }
-
/*
- * If we use a tagging format that doesn't have an ethertype
- * field, make sure that all packets from this point on get
- * sent to the tag format's receive function. (Which will
- * discard received packets until we set ds->ports[] below.)
+ * If the CPU connects to this switch, set the switch tree
+ * tagging protocol to the preferred tagging format of this
+ * switch.
*/
- wmb();
- dev->dsa_ptr = (void *)ds;
+ if (ds->dst->cpu_switch == index)
+ ds->dst->tag_protocol = drv->tag_protocol;
/*
@@ -150,7 +146,7 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
if (ret < 0)
goto out;
- ret = drv->set_addr(ds, dev->dev_addr);
+ ret = drv->set_addr(ds, dst->master_netdev->dev_addr);
if (ret < 0)
goto out;
@@ -169,18 +165,18 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
/*
* Create network devices for physical switch ports.
*/
- wmb();
for (i = 0; i < DSA_MAX_PORTS; i++) {
struct net_device *slave_dev;
- if (!(ds->valid_port_mask & (1 << i)))
+ if (!(ds->phys_port_mask & (1 << i)))
continue;
slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]);
if (slave_dev == NULL) {
- printk(KERN_ERR "%s: can't create dsa slave "
- "device for port %d(%s)\n",
- dev->name, i, pd->port_names[i]);
+ printk(KERN_ERR "%s[%d]: can't create dsa "
+ "slave device for port %d(%s)\n",
+ dst->master_netdev->name,
+ index, i, pd->port_names[i]);
continue;
}
@@ -192,7 +188,6 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
out_free:
mdiobus_free(ds->slave_mii_bus);
out:
- dev->dsa_ptr = NULL;
kfree(ds);
return ERR_PTR(ret);
}
@@ -212,35 +207,42 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
*/
bool dsa_uses_dsa_tags(void *dsa_ptr)
{
- struct dsa_switch *ds = dsa_ptr;
+ struct dsa_switch_tree *dst = dsa_ptr;
- return !!(ds->tag_protocol == htons(ETH_P_DSA));
+ return !!(dst->tag_protocol == htons(ETH_P_DSA));
}
bool dsa_uses_trailer_tags(void *dsa_ptr)
{
- struct dsa_switch *ds = dsa_ptr;
+ struct dsa_switch_tree *dst = dsa_ptr;
- return !!(ds->tag_protocol == htons(ETH_P_TRAILER));
+ return !!(dst->tag_protocol == htons(ETH_P_TRAILER));
}
/* link polling *************************************************************/
static void dsa_link_poll_work(struct work_struct *ugly)
{
- struct dsa_switch *ds;
+ struct dsa_switch_tree *dst;
+ int i;
+
+ dst = container_of(ugly, struct dsa_switch_tree, link_poll_work);
- ds = container_of(ugly, struct dsa_switch, link_poll_work);
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
- ds->drv->poll_link(ds);
- mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ));
+ if (ds != NULL && ds->drv->poll_link != NULL)
+ ds->drv->poll_link(ds);
+ }
+
+ mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
}
-static void dsa_link_poll_timer(unsigned long _ds)
+static void dsa_link_poll_timer(unsigned long _dst)
{
- struct dsa_switch *ds = (void *)_ds;
+ struct dsa_switch_tree *dst = (void *)_dst;
- schedule_work(&ds->link_poll_work);
+ schedule_work(&dst->link_poll_work);
}
@@ -303,18 +305,14 @@ static int dsa_probe(struct platform_device *pdev)
static int dsa_version_printed;
struct dsa_platform_data *pd = pdev->dev.platform_data;
struct net_device *dev;
- struct mii_bus *bus;
- struct dsa_switch *ds;
+ struct dsa_switch_tree *dst;
+ int i;
if (!dsa_version_printed++)
printk(KERN_NOTICE "Distributed Switch Architecture "
"driver version %s\n", dsa_driver_version);
- if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL)
- return -EINVAL;
-
- bus = dev_to_mii_bus(pd->mii_bus);
- if (bus == NULL)
+ if (pd == NULL || pd->netdev == NULL)
return -EINVAL;
dev = dev_to_net_device(pd->netdev);
@@ -326,36 +324,79 @@ static int dsa_probe(struct platform_device *pdev)
return -EEXIST;
}
- ds = dsa_switch_setup(&pdev->dev, pd, bus, dev);
- if (IS_ERR(ds)) {
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (dst == NULL) {
dev_put(dev);
- return PTR_ERR(ds);
+ return -ENOMEM;
}
- if (ds->drv->poll_link != NULL) {
- INIT_WORK(&ds->link_poll_work, dsa_link_poll_work);
- init_timer(&ds->link_poll_timer);
- ds->link_poll_timer.data = (unsigned long)ds;
- ds->link_poll_timer.function = dsa_link_poll_timer;
- ds->link_poll_timer.expires = round_jiffies(jiffies + HZ);
- add_timer(&ds->link_poll_timer);
+ platform_set_drvdata(pdev, dst);
+
+ dst->pd = pd;
+ dst->master_netdev = dev;
+ dst->cpu_switch = -1;
+ dst->cpu_port = -1;
+
+ for (i = 0; i < pd->nr_chips; i++) {
+ struct mii_bus *bus;
+ struct dsa_switch *ds;
+
+ bus = dev_to_mii_bus(pd->chip[i].mii_bus);
+ if (bus == NULL) {
+ printk(KERN_ERR "%s[%d]: no mii bus found for "
+ "dsa switch\n", dev->name, i);
+ continue;
+ }
+
+ ds = dsa_switch_setup(dst, i, &pdev->dev, bus);
+ if (IS_ERR(ds)) {
+ printk(KERN_ERR "%s[%d]: couldn't create dsa switch "
+ "instance (error %ld)\n", dev->name, i,
+ PTR_ERR(ds));
+ continue;
+ }
+
+ dst->ds[i] = ds;
+ if (ds->drv->poll_link != NULL)
+ dst->link_poll_needed = 1;
}
- platform_set_drvdata(pdev, ds);
+ /*
+ * If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+ dev->dsa_ptr = (void *)dst;
+
+ if (dst->link_poll_needed) {
+ INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
+ init_timer(&dst->link_poll_timer);
+ dst->link_poll_timer.data = (unsigned long)dst;
+ dst->link_poll_timer.function = dsa_link_poll_timer;
+ dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
+ add_timer(&dst->link_poll_timer);
+ }
return 0;
}
static int dsa_remove(struct platform_device *pdev)
{
- struct dsa_switch *ds = platform_get_drvdata(pdev);
+ struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ int i;
- if (ds->drv->poll_link != NULL)
- del_timer_sync(&ds->link_poll_timer);
+ if (dst->link_poll_needed)
+ del_timer_sync(&dst->link_poll_timer);
flush_scheduled_work();
- dsa_switch_destroy(ds);
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL)
+ dsa_switch_destroy(ds);
+ }
return 0;
}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7063378a1eb..41055f33d28 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -1,6 +1,6 @@
/*
* net/dsa/dsa_priv.h - Hardware switch handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,42 +19,107 @@
struct dsa_switch {
/*
- * Configuration data for the platform device that owns
- * this dsa switch instance.
+ * Parent switch tree, and switch index.
*/
- struct dsa_platform_data *pd;
+ struct dsa_switch_tree *dst;
+ int index;
/*
- * References to network device and mii bus to use.
+ * Configuration data for this switch.
*/
- struct net_device *master_netdev;
- struct mii_bus *master_mii_bus;
+ struct dsa_chip_data *pd;
/*
- * The used switch driver and frame tagging type.
+ * The used switch driver.
*/
struct dsa_switch_driver *drv;
- __be16 tag_protocol;
+
+ /*
+ * Reference to mii bus to use.
+ */
+ struct mii_bus *master_mii_bus;
/*
* Slave mii_bus and devices for the individual ports.
*/
- int cpu_port;
- u32 valid_port_mask;
- struct mii_bus *slave_mii_bus;
- struct net_device *ports[DSA_MAX_PORTS];
+ u32 dsa_port_mask;
+ u32 phys_port_mask;
+ struct mii_bus *slave_mii_bus;
+ struct net_device *ports[DSA_MAX_PORTS];
+};
+
+struct dsa_switch_tree {
+ /*
+ * Configuration data for the platform device that owns
+ * this dsa switch tree instance.
+ */
+ struct dsa_platform_data *pd;
+
+ /*
+ * Reference to network device to use, and which tagging
+ * protocol to use.
+ */
+ struct net_device *master_netdev;
+ __be16 tag_protocol;
+
+ /*
+ * The switch and port to which the CPU is attached.
+ */
+ s8 cpu_switch;
+ s8 cpu_port;
/*
* Link state polling.
*/
- struct work_struct link_poll_work;
- struct timer_list link_poll_timer;
+ int link_poll_needed;
+ struct work_struct link_poll_work;
+ struct timer_list link_poll_timer;
+
+ /*
+ * Data for the individual switch chips.
+ */
+ struct dsa_switch *ds[DSA_MAX_SWITCHES];
};
+static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
+{
+ return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port);
+}
+
+static inline u8 dsa_upstream_port(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+
+ /*
+ * If this is the root switch (i.e. the switch that connects
+ * to the CPU), return the cpu port number on this switch.
+ * Else return the (DSA) port number that connects to the
+ * switch that is one hop closer to the cpu.
+ */
+ if (dst->cpu_switch == ds->index)
+ return dst->cpu_port;
+ else
+ return ds->pd->rtable[dst->cpu_switch];
+}
+
struct dsa_slave_priv {
+ /*
+ * The linux network interface corresponding to this
+ * switch port.
+ */
struct net_device *dev;
+
+ /*
+ * Which switch this port is a part of, and the port index
+ * for this port.
+ */
struct dsa_switch *parent;
- int port;
+ u8 port;
+
+ /*
+ * The phylib phy_device pointer for the PHY connected
+ * to this port.
+ */
struct phy_device *phy;
};
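
The restructuring above splits per-chip state (struct dsa_switch) from tree-wide state (struct dsa_switch_tree), and dsa_upstream_port() picks the port that leads toward the CPU: the CPU port itself on the root switch, or the routing-table entry for the CPU-attached switch elsewhere. Below is a minimal standalone sketch of that lookup; the structure names, field layout and port numbers are hypothetical stand-ins, not the kernel definitions.

#include <stdio.h>

/* Hypothetical, cut-down mirror of the structures above, just to
 * illustrate how dsa_upstream_port() resolves the egress port. */
struct chip {
	int index;          /* this switch's index in the tree */
	int rtable[4];      /* port that leads toward switch i */
};

struct tree {
	int cpu_switch;     /* which chip the CPU attaches to */
	int cpu_port;       /* port on that chip              */
};

static int upstream_port(const struct tree *t, const struct chip *c)
{
	/* Root switch: use the CPU port itself; otherwise follow the
	 * DSA routing table one hop toward the CPU-attached switch. */
	if (t->cpu_switch == c->index)
		return t->cpu_port;
	return c->rtable[t->cpu_switch];
}

int main(void)
{
	struct tree t = { .cpu_switch = 0, .cpu_port = 10 };
	struct chip root = { .index = 0 };
	struct chip leaf = { .index = 1, .rtable = { 8, -1, -1, -1 } };

	printf("root upstream port: %d\n", upstream_port(&t, &root)); /* 10 */
	printf("leaf upstream port: %d\n", upstream_port(&t, &leaf)); /* 8  */
	return 0;
}

On a two-chip tree the leaf switch's CPU-bound traffic leaves on whatever port its rtable names for the CPU-attached switch, which is what the per-port register setup in the chip drivers below relies on.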
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
index 85081ae9fe8..83277f463af 100644
--- a/net/dsa/mv88e6060.c
+++ b/net/dsa/mv88e6060.c
@@ -1,6 +1,6 @@
/*
* net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -81,7 +81,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds)
/*
* Reset the switch.
*/
- REG_WRITE(REG_GLOBAL, 0x0A, 0xa130);
+ REG_WRITE(REG_GLOBAL, 0x0a, 0xa130);
/*
* Wait up to one second for reset to complete.
@@ -128,7 +128,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
* state to Forwarding. Additionally, if this is the CPU
* port, enable Ingress and Egress Trailer tagging mode.
*/
- REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003);
+ REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003);
/*
* Port based VLAN map: give each port its own address
@@ -138,9 +138,9 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
*/
REG_WRITE(addr, 0x06,
((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ (dsa_is_cpu_port(ds, p) ?
+ ds->phys_port_mask :
+ (1 << ds->dst->cpu_port)));
/*
* Port Association Vector: when learning source addresses
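
The port-based VLAN map write above follows the same pattern in all three chip drivers: the top nibble selects a per-port address database, the CPU port may reach every physical port, and each physical port may only reach the CPU/upstream port. A small sketch of the resulting register words, with made-up port numbers and masks:

#include <stdio.h>
#include <stdint.h>

/* Sketch of the port-based VLAN map words written above; port numbers
 * and the physical-port mask are illustrative only. */
int main(void)
{
	int cpu_port = 5;
	uint16_t phys_port_mask = 0x001f;   /* ports 0-4 are physical */
	int p;

	for (p = 0; p <= 5; p++) {
		unsigned int val = (p & 0xf) << 12;

		if (p == cpu_port)
			val |= phys_port_mask;  /* CPU talks to all ports  */
		else
			val |= 1 << cpu_port;   /* others talk to CPU only */

		printf("port %d: 0x%04x\n", p, val);
	}
	return 0;
}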
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
index 10031872221..52faaa21a4d 100644
--- a/net/dsa/mv88e6123_61_65.c
+++ b/net/dsa/mv88e6123_61_65.c
@@ -1,6 +1,6 @@
/*
* net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -98,17 +98,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
return ret;
/*
- * Configure the cpu port, and configure the cpu port as the
- * port to which ingress and egress monitor frames are to be
- * sent.
+ * Configure the upstream port, and configure the upstream
+ * port as the port to which ingress and egress monitor frames
+ * are to be sent.
*/
- REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110));
+ REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110));
/*
* Disable remote management for now, and set the switch's
- * DSA device number to zero.
+ * DSA device number.
*/
- REG_WRITE(REG_GLOBAL, 0x1c, 0x0000);
+ REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f);
/*
* Send all frames with destination addresses matching
@@ -133,10 +133,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
/*
- * Map all DSA device IDs to the CPU port.
+ * Program the DSA routing table.
*/
- for (i = 0; i < 32; i++)
- REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+ for (i = 0; i < 32; i++) {
+ int nexthop;
+
+ nexthop = 0x1f;
+ if (i != ds->index && i < ds->dst->pd->nr_chips)
+ nexthop = ds->pd->rtable[i] & 0x1f;
+
+ REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+ }
/*
* Clear all trunk masks.
@@ -176,12 +183,18 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
{
int addr = REG_PORT(p);
+ u16 val;
/*
* MAC Forcing register: don't force link, speed, duplex
- * or flow control state to any particular values.
+ * or flow control state to any particular values on physical
+ * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+ * full duplex.
*/
- REG_WRITE(addr, 0x01, 0x0003);
+ if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+ REG_WRITE(addr, 0x01, 0x003e);
+ else
+ REG_WRITE(addr, 0x01, 0x0003);
/*
* Do not limit the period of time that this port can be
@@ -192,37 +205,50 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
/*
* Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
- * configure the requested (DSA/EDSA) tagging mode if this is
- * the CPU port, disable Header mode, enable IGMP/MLD snooping,
- * disable VLAN tunneling, determine priority by looking at
- * 802.1p and IP priority fields (IP prio has precedence), and
- * set STP state to Forwarding. Finally, if this is the CPU
- * port, additionally enable forwarding of unknown unicast and
- * multicast addresses.
- */
- REG_WRITE(addr, 0x04,
- (p == ds->cpu_port) ?
- (ds->tag_protocol == htons(ETH_P_DSA)) ?
- 0x053f : 0x373f :
- 0x0433);
+ * disable Header mode, enable IGMP/MLD snooping, disable VLAN
+ * tunneling, determine priority by looking at 802.1p and IP
+ * priority fields (IP prio has precedence), and set STP state
+ * to Forwarding.
+ *
+ * If this is the CPU link, use DSA or EDSA tagging depending
+ * on which tagging mode was configured.
+ *
+ * If this is a link to another switch, use DSA tagging mode.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown unicasts and multicasts.
+ */
+ val = 0x0433;
+ if (dsa_is_cpu_port(ds, p)) {
+ if (ds->dst->tag_protocol == htons(ETH_P_EDSA))
+ val |= 0x3300;
+ else
+ val |= 0x0100;
+ }
+ if (ds->dsa_port_mask & (1 << p))
+ val |= 0x0100;
+ if (p == dsa_upstream_port(ds))
+ val |= 0x000c;
+ REG_WRITE(addr, 0x04, val);
/*
* Port Control 1: disable trunking. Also, if this is the
* CPU port, enable learn messages to be sent to this port.
*/
- REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+ REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
/*
* Port based VLAN map: give each port its own address
* database, allow the CPU port to talk to each of the 'real'
* ports, and allow each of the 'real' ports to only talk to
- * the CPU port.
- */
- REG_WRITE(addr, 0x06,
- ((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ * the upstream port.
+ */
+ val = (p & 0xf) << 12;
+ if (dsa_is_cpu_port(ds, p))
+ val |= ds->phys_port_mask;
+ else
+ val |= 1 << dsa_upstream_port(ds);
+ REG_WRITE(addr, 0x06, val);
/*
* Default VLAN ID and priority: don't set a default VLAN
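
The "program the DSA routing table" loops introduced above build one register word per possible source device: the update bit, the device number in bits 12:8, and the next-hop port (or 0x1f for unknown devices and the switch itself) in the low five bits. A standalone sketch with hypothetical chip counts and routing entries:

#include <stdio.h>

/* Sketch of how the global2 register 0x06 words are composed when the
 * DSA routing table is programmed (values are illustrative only). */
int main(void)
{
	int my_index = 1;            /* this switch's index in the tree */
	int nr_chips = 3;            /* switches in the tree            */
	int rtable[3] = { 5, 0, 6 }; /* hypothetical next-hop ports     */
	int i;

	for (i = 0; i < 32; i++) {
		int nexthop = 0x1f;            /* "no route" marker */
		unsigned int word;

		if (i != my_index && i < nr_chips)
			nexthop = rtable[i] & 0x1f;

		word = 0x8000 | (i << 8) | nexthop;
		if (i < nr_chips)
			printf("dev %2d -> reg word 0x%04x\n", i, word);
	}
	return 0;
}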
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
index 70fae2444cb..bb2b41bc854 100644
--- a/net/dsa/mv88e6131.c
+++ b/net/dsa/mv88e6131.c
@@ -1,6 +1,6 @@
/*
- * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support
- * Copyright (c) 2008 Marvell Semiconductor
+ * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -21,6 +21,8 @@ static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr)
ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
if (ret >= 0) {
ret &= 0xfff0;
+ if (ret == 0x0950)
+ return "Marvell 88E6095/88E6095F";
if (ret == 0x1060)
return "Marvell 88E6131";
}
@@ -36,7 +38,7 @@ static int mv88e6131_switch_reset(struct dsa_switch *ds)
/*
* Set all ports to the disabled state.
*/
- for (i = 0; i < 8; i++) {
+ for (i = 0; i < 11; i++) {
ret = REG_READ(REG_PORT(i), 0x04);
REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
}
@@ -100,17 +102,17 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL, 0x19, 0x8100);
/*
- * Disable ARP mirroring, and configure the cpu port as the
- * port to which ingress and egress monitor frames are to be
- * sent.
+ * Disable ARP mirroring, and configure the upstream port as
+ * the port to which ingress and egress monitor frames are to
+ * be sent.
*/
- REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0);
+ REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0);
/*
* Disable cascade port functionality, and set the switch's
- * DSA device number to zero.
+ * DSA device number.
*/
- REG_WRITE(REG_GLOBAL, 0x1c, 0xe000);
+ REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f));
/*
* Send all frames with destination addresses matching
@@ -127,16 +129,23 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
/*
- * Map all DSA device IDs to the CPU port.
+ * Program the DSA routing table.
*/
- for (i = 0; i < 32; i++)
- REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+ for (i = 0; i < 32; i++) {
+ int nexthop;
+
+ nexthop = 0x1f;
+ if (i != ds->index && i < ds->dst->pd->nr_chips)
+ nexthop = ds->pd->rtable[i] & 0x1f;
+
+ REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+ }
/*
* Clear all trunk masks.
*/
for (i = 0; i < 8; i++)
- REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff);
+ REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff);
/*
* Clear all trunk mappings.
@@ -156,12 +165,18 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
{
int addr = REG_PORT(p);
+ u16 val;
/*
* MAC Forcing register: don't force link, speed, duplex
- * or flow control state to any particular values.
+ * or flow control state to any particular values on physical
+ * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+ * full duplex.
*/
- REG_WRITE(addr, 0x01, 0x0003);
+ if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+ REG_WRITE(addr, 0x01, 0x003e);
+ else
+ REG_WRITE(addr, 0x01, 0x0003);
/*
* Port Control: disable Core Tag, disable Drop-on-Lock,
@@ -169,29 +184,40 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
* enable IGMP/MLD snoop, disable DoubleTag, disable VLAN
* tunneling, determine priority by looking at 802.1p and
* IP priority fields (IP prio has precedence), and set STP
- * state to Forwarding. Finally, if this is the CPU port,
- * additionally enable DSA tagging and forwarding of unknown
- * unicast addresses.
+ * state to Forwarding.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown unicasts, and enable DSA tagging
+ * mode.
+ *
+ * If this is the link to another switch, use DSA tagging
+ * mode, but do not enable forwarding of unknown unicasts.
*/
- REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433);
+ val = 0x0433;
+ if (p == dsa_upstream_port(ds))
+ val |= 0x0104;
+ if (ds->dsa_port_mask & (1 << p))
+ val |= 0x0100;
+ REG_WRITE(addr, 0x04, val);
/*
* Port Control 1: disable trunking. Also, if this is the
* CPU port, enable learn messages to be sent to this port.
*/
- REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+ REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
/*
* Port based VLAN map: give each port its own address
* database, allow the CPU port to talk to each of the 'real'
* ports, and allow each of the 'real' ports to only talk to
- * the CPU port.
+ * the upstream port.
*/
- REG_WRITE(addr, 0x06,
- ((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ val = (p & 0xf) << 12;
+ if (dsa_is_cpu_port(ds, p))
+ val |= ds->phys_port_mask;
+ else
+ val |= 1 << dsa_upstream_port(ds);
+ REG_WRITE(addr, 0x06, val);
/*
* Default VLAN ID and priority: don't set a default VLAN
@@ -207,13 +233,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
* untagged frames on this port, do a destination address
* lookup on received packets as usual, don't send a copy
* of all transmitted/received frames on this port to the
- * CPU, and configure the CPU port number. Also, if this
- * is the CPU port, enable forwarding of unknown multicast
- * addresses.
+ * CPU, and configure the upstream port number.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown multicast addresses.
*/
- REG_WRITE(addr, 0x08,
- ((p == ds->cpu_port) ? 0x00c0 : 0x0080) |
- ds->cpu_port);
+ val = 0x0080 | dsa_upstream_port(ds);
+ if (p == dsa_upstream_port(ds))
+ val |= 0x0040;
+ REG_WRITE(addr, 0x08, val);
/*
* Rate Control: disable ingress rate limiting.
@@ -268,7 +296,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
if (ret < 0)
return ret;
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < 11; i++) {
ret = mv88e6131_setup_port(ds, i);
if (ret < 0)
return ret;
@@ -279,7 +307,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
static int mv88e6131_port_to_phy_addr(int port)
{
- if (port >= 0 && port != 3 && port <= 7)
+ if (port >= 0 && port <= 11)
return port;
return -1;
}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a68fd79e9ec..ed131181215 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1,6 +1,6 @@
/*
* net/dsa/slave.c - Slave device handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
{
struct dsa_switch *ds = bus->priv;
- if (ds->valid_port_mask & (1 << addr))
+ if (ds->phys_port_mask & (1 << addr))
return ds->drv->phy_read(ds, addr, reg);
return 0xffff;
@@ -29,7 +29,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
{
struct dsa_switch *ds = bus->priv;
- if (ds->valid_port_mask & (1 << addr))
+ if (ds->phys_port_mask & (1 << addr))
return ds->drv->phy_write(ds, addr, reg, val);
return 0;
@@ -43,15 +43,24 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds)
ds->slave_mii_bus->write = dsa_slave_phy_write;
snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x",
ds->master_mii_bus->id, ds->pd->sw_addr);
- ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev);
+ ds->slave_mii_bus->parent = &ds->master_mii_bus->dev;
}
/* slave device handling ****************************************************/
+static int dsa_slave_init(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ dev->iflink = p->parent->dst->master_netdev->ifindex;
+
+ return 0;
+}
+
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
int err;
if (!(master->flags & IFF_UP))
@@ -89,7 +98,7 @@ out:
static int dsa_slave_close(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
dev_mc_unsync(master, dev);
dev_unicast_unsync(master, dev);
@@ -107,7 +116,7 @@ static int dsa_slave_close(struct net_device *dev)
static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
if (change & IFF_ALLMULTI)
dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -118,7 +127,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
static void dsa_slave_set_rx_mode(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
dev_mc_sync(master, dev);
dev_unicast_sync(master, dev);
@@ -127,7 +136,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
struct sockaddr *addr = a;
int err;
@@ -288,6 +297,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
#ifdef CONFIG_NET_DSA_TAG_DSA
static const struct net_device_ops dsa_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = dsa_xmit,
@@ -300,6 +310,7 @@ static const struct net_device_ops dsa_netdev_ops = {
#endif
#ifdef CONFIG_NET_DSA_TAG_EDSA
static const struct net_device_ops edsa_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = edsa_xmit,
@@ -312,6 +323,7 @@ static const struct net_device_ops edsa_netdev_ops = {
#endif
#ifdef CONFIG_NET_DSA_TAG_TRAILER
static const struct net_device_ops trailer_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = trailer_xmit,
@@ -328,7 +340,7 @@ struct net_device *
dsa_slave_create(struct dsa_switch *ds, struct device *parent,
int port, char *name)
{
- struct net_device *master = ds->master_netdev;
+ struct net_device *master = ds->dst->master_netdev;
struct net_device *slave_dev;
struct dsa_slave_priv *p;
int ret;
@@ -343,7 +355,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
slave_dev->tx_queue_len = 0;
- switch (ds->tag_protocol) {
+ switch (ds->dst->tag_protocol) {
#ifdef CONFIG_NET_DSA_TAG_DSA
case htons(ETH_P_DSA):
slave_dev->netdev_ops = &dsa_netdev_ops;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 63e532a69fd..8fa25bafe6c 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -36,7 +36,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct tagged FROM_CPU DSA tag from 802.1q tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x60;
+ dsa_header[0] = 0x60 | p->parent->index;
dsa_header[1] = p->port << 3;
/*
@@ -57,7 +57,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct untagged FROM_CPU DSA tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x40;
+ dsa_header[0] = 0x40 | p->parent->index;
dsa_header[1] = p->port << 3;
dsa_header[2] = 0x00;
dsa_header[3] = 0x00;
@@ -65,7 +65,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
skb->protocol = htons(ETH_P_DSA);
- skb->dev = p->parent->master_netdev;
+ skb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(skb);
return NETDEV_TX_OK;
@@ -78,11 +78,13 @@ out_free:
static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *dsa_header;
+ int source_device;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
skb = skb_unshare(skb, GFP_ATOMIC);
@@ -98,16 +100,24 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
dsa_header = skb->data - 2;
/*
- * Check that frame type is either TO_CPU or FORWARD, and
- * that the source device is zero.
+ * Check that frame type is either TO_CPU or FORWARD.
*/
- if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0)
+ if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
goto out_drop;
/*
- * Check that the source port is a registered DSA port.
+ * Determine source device and port.
*/
+ source_device = dsa_header[0] & 0x1f;
source_port = (dsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
goto out_drop;
@@ -175,7 +185,7 @@ out:
return 0;
}
-static struct packet_type dsa_packet_type = {
+static struct packet_type dsa_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_DSA),
.func = dsa_rcv,
};
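
The tagging changes thread the switch index through the header: dsa_xmit() ORs p->parent->index into byte 0, and dsa_rcv() recovers the source device from the low five bits and the source port from bits 7:3 of byte 1 before indexing dst->ds[]. A toy encode/parse of those header bytes, with illustrative device and port numbers:

#include <stdio.h>
#include <stdint.h>

/* Toy view of the DSA header fields touched above: dsa_xmit() now ORs
 * the switch index into byte 0 (0x40/0x60 | index), and dsa_rcv() pulls
 * the source device and port back out.  Values are illustrative only. */
int main(void)
{
	/* A FORWARD-type header as a switch might emit toward the CPU:
	 * bits 7:6 = 11 (FORWARD), bits 4:0 = source device. */
	uint8_t hdr[4] = { 0xc0 | 2, 5 << 3, 0x00, 0x00 };
	int source_device, source_port;

	/* Same frame-type check as dsa_rcv(): TO_CPU (00) or FORWARD (11). */
	if ((hdr[0] & 0xc0) != 0x00 && (hdr[0] & 0xc0) != 0xc0) {
		printf("dropped: unexpected frame type\n");
		return 0;
	}

	source_device = hdr[0] & 0x1f;
	source_port = (hdr[1] >> 3) & 0x1f;
	printf("source device %d, source port %d\n", source_device, source_port);
	return 0;
}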
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6197f9a7ef4..815607bd286 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_edsa.c - Ethertype DSA tagging
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -45,7 +45,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x60;
+ edsa_header[4] = 0x60 | p->parent->index;
edsa_header[5] = p->port << 3;
/*
@@ -70,7 +70,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x40;
+ edsa_header[4] = 0x40 | p->parent->index;
edsa_header[5] = p->port << 3;
edsa_header[6] = 0x00;
edsa_header[7] = 0x00;
@@ -78,7 +78,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
skb->protocol = htons(ETH_P_EDSA);
- skb->dev = p->parent->master_netdev;
+ skb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(skb);
return NETDEV_TX_OK;
@@ -91,11 +91,13 @@ out_free:
static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *edsa_header;
+ int source_device;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
skb = skb_unshare(skb, GFP_ATOMIC);
@@ -111,16 +113,24 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
edsa_header = skb->data + 2;
/*
- * Check that frame type is either TO_CPU or FORWARD, and
- * that the source device is zero.
+ * Check that frame type is either TO_CPU or FORWARD.
*/
- if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0)
+ if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
goto out_drop;
/*
- * Check that the source port is a registered DSA port.
+ * Determine source device and port.
*/
+ source_device = edsa_header[0] & 0x1f;
source_port = (edsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
goto out_drop;
@@ -194,7 +204,7 @@ out:
return 0;
}
-static struct packet_type edsa_packet_type = {
+static struct packet_type edsa_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_EDSA),
.func = edsa_rcv,
};
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d7e7f424ff0..1c3e30c38b8 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_trailer.c - Trailer tag format handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -59,7 +59,7 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
nskb->protocol = htons(ETH_P_TRAILER);
- nskb->dev = p->parent->master_netdev;
+ nskb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(nskb);
return NETDEV_TX_OK;
@@ -68,12 +68,14 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *trailer;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
+ ds = dst->ds[0];
skb = skb_unshare(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -111,7 +113,7 @@ out:
return 0;
}
-static struct packet_type trailer_packet_type = {
+static struct packet_type trailer_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_TRAILER),
.func = trailer_rcv,
};
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 7bf35582f65..6f479fa522c 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -1102,7 +1102,7 @@ drop:
return NET_RX_DROP;
}
-static struct packet_type econet_packet_type = {
+static struct packet_type econet_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ECONET),
.func = econet_rcv,
};
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a35..b2cf91e4cca 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
+ and
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
If unsure, say N here.
-choice
+choice
prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
depends on IP_ADVANCED_ROUTER
default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
config ASK_IP_FIB_HASH
bool "FIB_HASH"
---help---
- Current FIB is very proven and good enough for most users.
+ Current FIB is very proven and good enough for most users.
config IP_FIB_TRIE
bool "FIB_TRIE"
---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
+ Use new experimental LC-trie as FIB lookup algorithm.
+ This improves lookup performance if you have a large
+ number of routes.
+
+ LC-trie is a longest matching prefix lookup algorithm which
+ performs better than FIB_HASH for large routing tables.
+ But, it consumes more memory and is more complex.
+
+ LC-trie is described in:
+
+ IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
+ June 1999
+
+ An experimental study of compression methods for dynamic tries
+ Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+
endchoice
config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
<file:Documentation/filesystems/nfsroot.txt> for details.
# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
---help---
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
-
+
If unsure, say Y.
config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 627be4dc7fb..d5aaabbb7cb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1500,7 +1500,7 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
+static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 3f6b7354699..f11931c1838 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
* cache.
*/
- /* Special case: IPv4 duplicate address detection packet (RFC2131) */
- if (sip == 0) {
+ /*
+ * Special case: IPv4 duplicate address detection packet (RFC2131)
+ * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
+ */
+ if (sip == 0 || tip == sip) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type(net, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
out:
if (in_dev)
in_dev_put(in_dev);
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -1225,7 +1228,7 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
+static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
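
The widened arp_process() test above treats both DHCP duplicate-address probes (sender IP 0.0.0.0, RFC 2131) and gratuitous ARP announcements (sender IP equal to target IP, RFC 3927 section 2.4) through the same special case. A tiny sketch of just that predicate, with made-up addresses:

#include <stdio.h>
#include <stdint.h>

/* Illustrative-only version of the widened check in arp_process(). */
static int is_probe_or_announce(uint32_t sip, uint32_t tip)
{
	return sip == 0 || tip == sip;
}

int main(void)
{
	uint32_t host = 0xc0a80001;	/* 192.168.0.1, hypothetical */

	printf("DHCP probe:       %d\n", is_probe_or_announce(0, host));
	printf("ARP announce:     %d\n", is_probe_or_announce(host, host));
	printf("ordinary request: %d\n", is_probe_or_announce(0xc0a80002, host));
	return 0;
}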
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 6bb2635b5de..7bc992976d2 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,11 +3,16 @@
*
* This is an implementation of the CIPSO 2.2 protocol as specified in
* draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory. While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
* have chosen to adopt the protocol and over the years it has become a
* de-facto standard for labeled networking.
*
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
* Author: Paul Moore <paul.moore@hp.com>
*
*/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d519a6a6672..126bb911880 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e47..cafcc49d099 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
+ if (rpf == 1)
goto e_inval;
fl.oif = dev->ifindex;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc7..f831df50090 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
+ rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 382800a62b3..3f50807237e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1207,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
int __init icmp_init(void)
{
- return register_pernet_device(&icmp_sk_ops);
+ return register_pernet_subsys(&icmp_sk_ops);
}
EXPORT_SYMBOL(icmp_err_convert);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786..eaf3e2c8646 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
+ __releases(&f->lock)
{
struct inet_frag_queue *q;
struct hlist_node *n;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000ee..7985346653b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph = ip_hdr(head);
iph->frag_off = 0;
iph->tot_len = htons(len);
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 07a188afb3a..e62510d5ea5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -491,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -803,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
#endif
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5079dfbc6f3..9054139795a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
@@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = {
.priority = 1,
};
-static char banner[] __initdata =
+static const char banner[] __initconst =
KERN_INFO "IPv4 over IPv4 tunneling driver\n";
static void ipip_destroy_tunnels(struct ipip_net *ipn)
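
The tunnel error-throttling tests in ip_gre.c and ipip.c are rewritten with time_before(), the usual wrap-safe jiffies comparison. A standalone sketch of what the macro boils down to, using a local stand-in macro and illustrative tick values rather than real jiffies:

#include <stdio.h>
#include <limits.h>

/* Minimal sketch of the time_before() idiom used above: the macro
 * reduces to a signed comparison of the difference, which keeps the
 * test correct even when the counter wraps.  Numbers are made up. */
#define my_time_before(a, b)  ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long err_time = ULONG_MAX - 9; /* 10 ticks before a wrap */
	unsigned long timeo = 100;              /* hypothetical timeout   */

	/* 30 ticks after err_time (counter has wrapped): still inside. */
	printf("%d\n", my_time_before(20UL, err_time + timeo));   /* 1 */
	/* 210 ticks after err_time: window has expired.              */
	printf("%d\n", my_time_before(200UL, err_time + timeo));  /* 0 */
	return 0;
}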
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90b2f3c192f..2451aeb5ac2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
return NULL;
}
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 xmit_size_goal, old_size_goal;
+
+ xmit_size_goal = mss_now;
+
+ if (large_allowed && sk_can_gso(sk)) {
+ xmit_size_goal = ((sk->sk_gso_max_size - 1) -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
+
+ xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
+
+ /* We try hard to avoid divides here */
+ old_size_goal = tp->xmit_size_goal_segs * mss_now;
+
+ if (likely(old_size_goal <= xmit_size_goal &&
+ old_size_goal + mss_now > xmit_size_goal)) {
+ xmit_size_goal = old_size_goal;
+ } else {
+ tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
+ }
+ }
+
+ return max(xmit_size_goal, mss_now);
+}
+
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
+
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+
+ return mss_now;
+}
+
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
size_t psize, int flags)
{
@@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
@@ -761,8 +801,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
@@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (--iovlen >= 0) {
int seglen = iov->iov_len;
@@ -1007,8 +1045,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
@@ -1045,8 +1082,7 @@ out_err:
*/
static int tcp_recv_urg(struct sock *sk, long timeo,
- struct msghdr *msg, int len, int flags,
- int *addr_len)
+ struct msghdr *msg, int len, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1661,7 +1697,7 @@ out:
return err;
recv_urg:
- err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ err = tcp_recv_urg(sk, timeo, msg, len, flags);
goto out;
}
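
tcp_send_mss()/tcp_xmit_size_goal() above cache the transmit size goal as a whole number of segments (tp->xmit_size_goal_segs) and only redo the divide when the raw goal drifts by more than one MSS. A rough standalone model of that caching, with hypothetical structure and field names and an arbitrary MSS; the real function additionally bounds the goal and never returns less than the current MSS:

#include <stdio.h>

/* Rough model of the size-goal caching above: the goal is kept as a
 * whole number of segments and only recomputed (one divide) when the
 * raw goal drifts by a full MSS. */
struct conn {
	unsigned int size_goal_segs;   /* cached goal, in segments */
};

static unsigned int size_goal(struct conn *c, unsigned int mss,
			      unsigned int raw_goal)
{
	unsigned int old = c->size_goal_segs * mss;

	if (old <= raw_goal && old + mss > raw_goal)
		return old;                    /* cache still good, no divide */

	c->size_goal_segs = raw_goal / mss;    /* re-quantize */
	return c->size_goal_segs * mss;
}

int main(void)
{
	struct conn c = { 0 };
	unsigned int mss = 1448;

	printf("%u\n", size_goal(&c, mss, 64000)); /* 63712 = 44 * 1448   */
	printf("%u\n", size_goal(&c, mss, 64500)); /* cache hit: 63712    */
	printf("%u\n", size_goal(&c, mss, 30000)); /* re-quantized: 28960 */
	return 0;
}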
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 7eb7636db0d..3b53fd1af23 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4ec5b4e97c4..e92beb9e55e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp)
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+{
+ if (tp->snd_cwnd_cnt >= w) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else {
+ tp->snd_cwnd_cnt++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
@@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tp->snd_cwnd++;
}
} else {
- /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
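
tcp_cong_avoid_ai() factors out the additive-increase step now shared by Reno, BIC and CUBIC: the window grows by one segment for every w acknowledged segments, i.e. cwnd += 1/w per ACK. A self-contained model of the counter logic; the clamp, starting window and ACK count are arbitrary demo values:

#include <stdio.h>

/* Standalone model of tcp_cong_avoid_ai(): grow cwnd by one segment
 * for every w segments acknowledged. */
struct cc {
	unsigned int cwnd;
	unsigned int cwnd_cnt;
	unsigned int clamp;
};

static void cong_avoid_ai(struct cc *c, unsigned int w)
{
	if (c->cwnd_cnt >= w) {
		if (c->cwnd < c->clamp)
			c->cwnd++;
		c->cwnd_cnt = 0;
	} else {
		c->cwnd_cnt++;
	}
}

int main(void)
{
	struct cc c = { .cwnd = 10, .cwnd_cnt = 0, .clamp = 1000 };
	int ack;

	/* Reno-style: w == cwnd, so roughly one increase per cwnd ACKs. */
	for (ack = 0; ack < 25; ack++)
		cong_avoid_ai(&c, c.cwnd);

	printf("cwnd after 25 ACKs: %u\n", c.cwnd);  /* grew from 10 to 12 */
	return 0;
}

BIC and CUBIC pass their own ca->cnt as w instead of cwnd, which is exactly the duplicated block the helper removes from tcp_bic.c and tcp_cubic.c above.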
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ee467ec40c4..71d5f2f29fa 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
} else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 937549b8a92..26d5c7fc7de 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
return;
/* achieved throughput calculations */
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- icsk->icsk_ca_state != TCP_CA_Disorder) {
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
ca->packetcount = 0;
ca->lasttime = now;
return;
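
The measure_achieved_throughput() change swaps two equality tests for a single bitmask membership check on the congestion-avoidance state. A small sketch of the idiom with a local stand-in enum (not the kernel's TCP_CA_* definitions):

#include <stdio.h>

/* Sketch of the state test rewritten above: build a mask of accepted
 * states once and test membership with one AND. */
enum ca_state { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss };

#define CAF(s)	(1 << (s))

int main(void)
{
	unsigned int accepted = CAF(CA_Open) | CAF(CA_Disorder);
	int s;

	for (s = CA_Open; s <= CA_Loss; s++)
		printf("state %d accepted: %d\n", s, !!((1 << s) & accepted));
	return 0;
}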
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d75c7e..2bc8e27a163 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -64,6 +64,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/kernel.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
@@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk)
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (after(received_upto, ack_seq) &&
- (tcp_is_fack(tp) ||
- !before(received_upto,
- ack_seq + tp->reordering * tp->mss_cache))) {
+ /* TODO: We would like to get rid of tcp_is_fack(tp) only
+ * constraint here (see above) but figuring out that at
+ * least tp->reordering SACK blocks reside between ack_seq
+ * and received_upto is not easy task to do cheaply with
+ * the available datastructures.
+ *
+ * Whether FACK should check here for tp->reordering segs
+ * in-between one could argue for either way (it would be
+ * rather simple to implement as we could count fack_count
+ * during the walk and do tp->fackets_out - fack_count).
+ */
+ if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1374,7 +1383,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss)
+ unsigned int pcount, int shifted, int mss,
+ int dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1410,7 +1420,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
/* We discard results */
- tcp_sacktag_one(skb, sk, state, 0, pcount);
+ tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1561,7 +1571,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!skb_shift(prev, skb, len))
goto fallback;
- if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
@@ -1580,7 +1590,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
len = skb->len;
if (skb_shift(prev, skb, len)) {
pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
+ tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
}
out:
@@ -1793,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
for (i = used_sacks - 1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
- struct tcp_sack_block tmp;
-
- tmp = sp[j];
- sp[j] = sp[j + 1];
- sp[j + 1] = tmp;
+ swap(sp[j], sp[j + 1]);
/* Track where the first SACK block goes to */
if (j == first_sack_index)
@@ -2452,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk)
return 0;
}
+/* New heuristic: this is possible only because we now restart the timer
+ * each time something is ACKed. Hence, we can detect timed-out packets
+ * during fast retransmit without falling back to slow start.
+ *
+ * The usefulness of this as-is is questionable, since we would need to know
+ * which segment is the next to time out, and that is relatively expensive to
+ * find in the general case unless we add a data structure just for it. The
+ * current approach rarely finds the right one, and when it finally does find
+ * _something_ it usually marks a large part of the window right away
+ * (because a retransmission with a larger timestamp blocks the loop from
+ * advancing). -ij
+ */
+static void tcp_timeout_skbs(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
+ return;
+
+ skb = tp->scoreboard_skb_hint;
+ if (tp->scoreboard_skb_hint == NULL)
+ skb = tcp_write_queue_head(sk);
+
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ tcp_skb_mark_lost(tp, skb);
+ }
+
+ tp->scoreboard_skb_hint = skb;
+
+ tcp_verify_left_out(tp);
+}
+
/* Mark head of queue up as lost. With RFC3517 SACK, the count is
* against the sacked "cnt", otherwise it's against the facked "cnt"
*/
@@ -2524,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
tcp_mark_head_lost(sk, sacked_upto);
}
- /* New heuristics: it is possible only after we switched
- * to restart timer each time when something is ACKed.
- * Hence, we can detect timed out packets during fast
- * retransmit without falling to slow start.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
- struct sk_buff *skb;
-
- skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
- : tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
-
- tp->scoreboard_skb_hint = skb;
-
- tcp_verify_left_out(tp);
- }
+ tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2812,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2840,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
u32 prior_lost = tp->lost_out;
tcp_for_write_queue(skb, sk) {
@@ -3177,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u32 end_seq;
u32 acked_pcount;
u8 sacked = scb->sacked;
@@ -3192,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
break;
fully_acked = 0;
- end_seq = tp->snd_una;
} else {
acked_pcount = tcp_skb_pcount(skb);
- end_seq = scb->end_seq;
- }
-
- /* MTU probing checks */
- if (fully_acked && icsk->icsk_mtup.probe_size &&
- !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
- tcp_mtup_probe_success(sk, skb);
}
if (sacked & TCPCB_RETRANS) {
@@ -3266,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
+ }
+
tcp_ack_update_rtt(sk, flag, seq_rtt);
tcp_rearm_rto(sk);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
+ int delta;
+
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
- /* No need to care for underflows here because
- * the lost_skb_hint gets NULLed if we're past it
- * (or something non-trivial happened)
- */
- if (tcp_is_fack(tp))
- tp->lost_cnt_hint -= pkts_acked;
- else
- tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
+ delta = tcp_is_fack(tp) ? pkts_acked :
+ prior_sacked - tp->sacked_out;
+ tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3395,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
@@ -3571,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
int prior_packets;
int frto_cwnd = 0;
- /* If the ack is newer than sent or older than previous acks
+ /* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (after(ack, tp->snd_nxt))
- goto uninteresting_ack;
-
if (before(ack, prior_snd_una))
goto old_ack;
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
+ if (after(ack, tp->snd_nxt))
+ goto invalid_ack;
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3600,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
@@ -3669,6 +3686,10 @@ no_queue:
tcp_ack_probe(sk);
return 1;
+invalid_ack:
+ SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return -1;
+
old_ack:
if (TCP_SKB_CB(skb)->sacked) {
tcp_sacktag_write_queue(sk, skb, prior_snd_una);
@@ -3676,8 +3697,7 @@ old_ack:
tcp_try_keep_open(sk);
}
-uninteresting_ack:
- SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
@@ -3865,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* Not only, also it occurs for expired timestamps.
*/
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ if (tcp_paws_check(&tp->rx_opt, 0))
tcp_store_ts_recent(tp);
}
}
@@ -3918,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(sk, skb));
+
+ return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
+ !tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
@@ -4078,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
}
}
@@ -4133,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
sp[i] = sp[i + 1];
continue;
@@ -4143,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
}
}
-static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
- struct tcp_sack_block *sack2)
-{
- __u32 tmp;
-
- tmp = sack1->start_seq;
- sack1->start_seq = sack2->start_seq;
- sack2->start_seq = tmp;
-
- tmp = sack1->end_seq;
- sack1->end_seq = sack2->end_seq;
- sack2->end_seq = tmp;
-}
-
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4171,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
if (tcp_sack_extend(sp, seq, end_seq)) {
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
- tcp_sack_swap(sp, sp - 1);
+ swap(*sp, *(sp - 1));
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
@@ -4197,7 +4199,6 @@ new_sack:
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
}
/* RCV.NXT advances, some SACKs should be eaten. */
@@ -4211,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
}
@@ -4232,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
this_sack++;
sp++;
}
- if (num_sacks != tp->rx_opt.num_sacks) {
- tp->rx_opt.num_sacks = num_sacks;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
- }
+ tp->rx_opt.num_sacks = num_sacks;
}
/* This one checks to see if we can put data from the
@@ -4312,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
TCP_ECN_accept_cwr(tp, skb);
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
@@ -4434,8 +4427,6 @@ drop:
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = 1;
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
@@ -5156,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
@@ -5309,8 +5301,8 @@ slow_path:
return -res;
step5:
- if (th->ack)
- tcp_ack(sk, skb, FLAG_SLOWPATH);
+ if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5408,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5509,7 +5501,7 @@ discard:
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
- tcp_paws_check(&tp->rx_opt, 0))
+ tcp_paws_reject(&tp->rx_opt, 0))
goto discard_and_undo;
if (th->syn) {
@@ -5647,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* step 5: check the ACK field */
if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -5669,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
- TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
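The new #include <linux/kernel.h> at the top of this tcp_input.c diff is there for the generic swap() used in the SACK-block sort and in tcp_sack_new_ofo_skb() above, which also lets the old tcp_sack_swap() helper be deleted. The macro is assumed to be the usual linux/kernel.h definition (not part of this diff):

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

so swap(sp[j], sp[j + 1]) does exactly what the removed three-statement tcp_sack_block copy did.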
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6b962f56ab..d0a314879d8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
- if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
- /* Some OSes (unknown ones, but I see them on web server, which
- * contains information interesting only for windows'
- * users) do not send their stamp in SYN. It is easy case.
- * We simply do not advertise TS support.
- */
- tmp_opt.saw_tstamp = 0;
- tmp_opt.tstamp_ok = 0;
- }
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
@@ -2443,7 +2434,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
inet_hashinfo_init(&tcp_hashinfo);
- if (register_pernet_device(&tcp_sk_ops))
+ if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f67effbb102..43bbba7926e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_prequeue_init(newtp);
- tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
+ tcp_init_wl(newtp, treq->rcv_isn);
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
@@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.saw_tstamp = 0;
newtp->rx_opt.dsack = 0;
- newtp->rx_opt.eff_sacks = 0;
-
newtp->rx_opt.num_sacks = 0;
+
newtp->urg_data = 0;
if (sock_flag(newsk, SOCK_KEEPOPEN))
@@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* from another data.
*/
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
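Both tcp_input.c and tcp_minisocks.c now call tcp_paws_check()/tcp_paws_reject() instead of open-coding the PAWS test. The helpers themselves live in include/net/tcp.h and are not part of this section; judging only from the call sites rewritten above, their shape is roughly the following sketch, an approximation rather than the exact implementation:

/* True when the peer's timestamp is acceptable within paws_win, or when
 * ts_recent is older than TCP_PAWS_24DAYS and therefore considered stale. */
static inline int tcp_paws_check(const struct tcp_options_received *rx_opt,
				 int paws_win)
{
	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
		return 1;
	if (get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)
		return 1;
	return 0;
}

static inline int tcp_paws_reject(const struct tcp_options_received *rx_opt,
				  int rst)
{
	if (tcp_paws_check(rx_opt, 0))
		return 0;
	/* A further relaxation for RST segments is assumed here; omitted. */
	return 1;
}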
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dda42f0bd7a..c1f259d2d33 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(sp[this_sack].end_seq);
}
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
}
}
@@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned size = 0;
+ unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
size += TCPOLEN_TSTAMP_ALIGNED;
}
- if (unlikely(tp->rx_opt.eff_sacks)) {
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, tp->rx_opt.eff_sacks,
+ min_t(unsigned, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
- if (unlikely(tcp_urg_mode(tp) &&
- between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
- th->urg_ptr = htons(tp->snd_up - tcb->seq);
- th->urg = 1;
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = 0xFFFF;
+ th->urg = 1;
+ }
}
tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
@@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
struct sk_buff *buff;
int nsize, old_factor;
int nlen;
- u16 flags;
+ u8 flags;
BUG_ON(len > skb->len);
- tcp_clear_retrans_hints_partial(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_verify_left_out(tp);
}
tcp_adjust_fackets_out(sk, skb, diff);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
+ tp->lost_cnt_hint -= diff;
}
/* Link BUFF into the send queue. */
@@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
return 0;
}
@@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-/* Bound MSS / TSO packet size with the half of the window */
-static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
-{
- if (tp->max_window && pktsize > (tp->max_window >> 1))
- return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
- else
- return pktsize;
-}
-
/* This function synchronizes snd mss to the current pmtu/exthdr set.
tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT count
@@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
+unsigned int tcp_current_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- u16 xmit_size_goal;
- int doing_tso = 0;
unsigned header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
mss_now = tp->mss_cache;
- if (large_allowed && sk_can_gso(sk))
- doing_tso = 1;
-
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
@@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
mss_now -= delta;
}
- xmit_size_goal = mss_now;
-
- if (doing_tso) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
-
- xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
- xmit_size_goal -= (xmit_size_goal % mss_now);
- }
- tp->xmit_size_goal = xmit_size_goal;
-
return mss_now;
}
@@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk);
return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
tp->nonagle : TCP_NAGLE_PUSH)));
}
@@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
{
struct sk_buff *buff;
int nlen = skb->len - len;
- u16 flags;
+ u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
@@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
if (limit >= sk->sk_gso_max_size)
goto send_now;
+ /* Middle of the queue won't get any more data; is it already fully sendable? */
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+ goto send_now;
+
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk)
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
- tp->rx_opt.eff_sacks)
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
/* Very simple search strategy: just double the MSS. */
- mss_now = tcp_current_mss(sk, 0);
+ mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache;
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
@@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
int skb_size, next_skb_size;
- u16 flags;
skb_size = skb->len;
next_skb_size = next_skb->len;
- flags = TCP_SKB_CB(skb)->flags;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
@@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
- cur_mss = tcp_current_mss(sk, 0);
+ cur_mss = tcp_current_mss(sk);
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
@@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (skb->len > cur_mss) {
if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
+ } else {
+ tcp_init_tso_segs(sk, skb, cur_mss);
}
tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -2023,7 +2007,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- /* First pass: retransmit lost packets. */
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
@@ -2062,7 +2045,7 @@ begin_fwd:
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+ if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
@@ -2101,7 +2084,7 @@ void tcp_send_fin(struct sock *sk)
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- mss_now = tcp_current_mss(sk, 1);
+ mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -2326,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk)
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
- tcp_init_wl(tp, tp->write_seq, 0);
+ tcp_init_wl(tp, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
@@ -2513,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk)
if ((skb = tcp_send_head(sk)) != NULL &&
before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
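The urgent-pointer hunk in tcp_transmit_skb() above also covers the case where snd_up lies more than 0xFFFF bytes past this segment's sequence number, which the 16-bit urg_ptr field cannot express. A small worked example of the two branches (numbers purely illustrative):

/* Case 1: snd_up within 64K of this segment.
 *   tcb->seq = 1000, tp->snd_up = 1500
 *   -> urg_ptr = 500, URG set (as before).
 *
 * Case 2: snd_up too far ahead to encode.
 *   tcb->seq = 1000, tp->snd_up = 200000, tp->snd_nxt = 50000
 *   before(snd_up, seq + 0x10000) fails, but
 *   after(seq + 0xFFFF, snd_nxt) holds (66535 > 50000), so
 *   urg_ptr saturates to 0xFFFF with URG set, keeping the advertised
 *   urgent point beyond everything transmitted so far.
 */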
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 25524d4e372..59f5b5e7c56 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
static ssize_t tcpprobe_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- int error = 0, cnt = 0;
+ int error = 0;
+ size_t cnt = 0;
- if (!buf || len < 0)
+ if (!buf)
return -EINVAL;
while (cnt < len) {
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 2747ec7bfb6..a76513779e2 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,6 @@
/* Tom Kelly's Scalable TCP
*
- * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ * See http://www.deneholme.net/tom/scalable/
*
* John Heffner <jheffner@sc.edu>
*/
@@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
- else {
- tp->snd_cwnd_cnt++;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- }
+ else
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0170e914f1b..b144a26359b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder ||
- icsk->icsk_ca_state == TCP_CA_Recovery) {
- if (tcp_is_sack(tp)) {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- } else {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- }
+ if (icsk->icsk_ca_state == TCP_CA_Disorder) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
+ } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
} else {
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index d08b2e855c2..e9bbff74648 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9ec843a9bbb..66b6821b984 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else {
/* Reno */
-
- if (tp->snd_cwnd_cnt < tp->snd_cwnd)
- tp->snd_cwnd_cnt++;
-
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4bd178a111d..05b7abb99f6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
sk = sknext;
} while (sknext);
} else
- kfree_skb(skb);
+ consume_skb(skb);
spin_unlock(&hslot->lock);
return 0;
}
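The udp.c hunk above swaps kfree_skb() for consume_skb() on the path where the skb was actually delivered rather than dropped. With the drop-monitor work elsewhere in this series, the two frees are meant to be distinguishable; a minimal sketch of the intended convention (only the function names are from net/core/skbuff.c, the drop accounting itself is assumed):

/* skb handled/delivered normally: free it without flagging a drop */
consume_skb(skb);

/* skb really being thrown away: free it and let drop monitoring see it */
kfree_skb(skb);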
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 03e2a1ad71e..8499da9e76a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -40,6 +40,7 @@
#include <linux/errno.h>
#include <linux/types.h>
+#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
@@ -493,15 +494,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
read_unlock(&dev_base_lock);
}
-static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
+static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
{
struct net *net;
net = (struct net *)table->extra2;
if (p == &net->ipv6.devconf_dflt->forwarding)
- return;
+ return 0;
+
+ if (!rtnl_trylock())
+ return -ERESTARTSYS;
- rtnl_lock();
if (p == &net->ipv6.devconf_all->forwarding) {
__s32 newf = net->ipv6.devconf_all->forwarding;
net->ipv6.devconf_dflt->forwarding = newf;
@@ -512,6 +515,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
if (*p)
rt6_purge_dflt_routers(net);
+ return 1;
}
#endif
@@ -587,6 +591,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
{
struct inet6_ifaddr *ifa = NULL;
struct rt6_info *rt;
+ struct net *net = dev_net(idev->dev);
int hash;
int err = 0;
int addr_type = ipv6_addr_type(addr);
@@ -603,6 +608,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
goto out2;
}
+ if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) {
+ err = -EACCES;
+ goto out2;
+ }
+
write_lock(&addrconf_hash_lock);
/* Ignore adding duplicate addresses on an interface */
@@ -1206,16 +1216,12 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
}
break;
} else if (minihiscore < miniscore) {
- struct ipv6_saddr_score *tmp;
-
if (hiscore->ifa)
in6_ifa_put(hiscore->ifa);
in6_ifa_hold(score->ifa);
- tmp = hiscore;
- hiscore = score;
- score = tmp;
+ swap(hiscore, score);
/* restore our iterator */
score->ifa = hiscore->ifa;
@@ -1430,6 +1436,11 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
+
+ if (net_ratelimit())
+ printk(KERN_INFO "%s: IPv6 duplicate address detected!\n",
+ ifp->idev->dev->name);
+
if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
struct in6_addr addr;
@@ -1440,11 +1451,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
ipv6_addr_equal(&ifp->addr, &addr)) {
/* DAD failed for link-local based on MAC address */
idev->cnf.disable_ipv6 = 1;
+
+ printk(KERN_INFO "%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
}
}
- if (net_ratelimit())
- printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
addrconf_dad_stop(ifp);
}
@@ -2599,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
ASSERT_RTNL();
- if ((dev->flags & IFF_LOOPBACK) && how == 1)
- how = 0;
-
rt6_ifdown(net, dev);
neigh_ifdown(&nd_tbl, dev);
@@ -2823,11 +2832,6 @@ static void addrconf_dad_timer(unsigned long data)
read_unlock_bh(&idev->lock);
goto out;
}
- if (idev->cnf.accept_dad > 1 && idev->cnf.disable_ipv6) {
- read_unlock_bh(&idev->lock);
- addrconf_dad_failure(ifp);
- return;
- }
spin_lock_bh(&ifp->lock);
if (ifp->probes == 0) {
/*
@@ -3638,7 +3642,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3849,7 +3854,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3919,7 +3925,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
@@ -3974,7 +3981,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (write)
- addrconf_fixup_forwarding(ctl, valp, val);
+ ret = addrconf_fixup_forwarding(ctl, valp, val);
return ret;
}
@@ -4010,8 +4017,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table,
}
*valp = new;
- addrconf_fixup_forwarding(table, valp, val);
- return 1;
+ return addrconf_fixup_forwarding(table, valp, val);
}
static struct addrconf_sysctl_table
@@ -4437,25 +4443,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb)
EXPORT_SYMBOL(unregister_inet6addr_notifier);
-static void addrconf_net_exit(struct net *net)
-{
- struct net_device *dev;
-
- rtnl_lock();
- /* clean dev list */
- for_each_netdev(net, dev) {
- if (__in6_dev_get(dev) == NULL)
- continue;
- addrconf_ifdown(dev, 1);
- }
- addrconf_ifdown(net->loopback_dev, 2);
- rtnl_unlock();
-}
-
-static struct pernet_operations addrconf_net_ops = {
- .exit = addrconf_net_exit,
-};
-
/*
* Init / cleanup code
*/
@@ -4497,10 +4484,6 @@ int __init addrconf_init(void)
if (err)
goto errlo;
- err = register_pernet_device(&addrconf_net_ops);
- if (err)
- return err;
-
register_netdevice_notifier(&ipv6_dev_notf);
addrconf_verify(0);
@@ -4530,15 +4513,22 @@ errlo:
void addrconf_cleanup(void)
{
struct inet6_ifaddr *ifa;
+ struct net_device *dev;
int i;
unregister_netdevice_notifier(&ipv6_dev_notf);
- unregister_pernet_device(&addrconf_net_ops);
-
unregister_pernet_subsys(&addrconf_ops);
rtnl_lock();
+ /* clean dev list */
+ for_each_netdev(&init_net, dev) {
+ if (__in6_dev_get(dev) == NULL)
+ continue;
+ addrconf_ifdown(dev, 1);
+ }
+ addrconf_ifdown(init_net.loopback_dev, 2);
+
/*
* Check hash table.
*/
@@ -4559,6 +4549,4 @@ void addrconf_cleanup(void)
del_timer(&addr_chk_timer);
rtnl_unlock();
-
- unregister_pernet_subsys(&addrconf_net_ops);
}
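addrconf_fixup_forwarding() above now uses rtnl_trylock() and propagates -ERESTARTSYS through the sysctl handlers, so a write that races with an RTNL holder gets restarted instead of sleeping on the RTNL under the sysctl machinery. A condensed sketch of the resulting handler pattern, simplified from the hunks above (fixup_under_rtnl is a hypothetical name, error handling trimmed):

static int fixup_under_rtnl(struct ctl_table *table, int *valp, int old)
{
	if (!rtnl_trylock())
		return -ERESTARTSYS;	/* caller restarts the syscall */
	/* ... propagate the new value, purge default routers, ... */
	rtnl_unlock();
	return 1;
}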
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fa2ac7ee662..fbf533cc9dc 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -72,6 +72,10 @@ MODULE_LICENSE("GPL");
static struct list_head inetsw6[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw6_lock);
+static int disable_ipv6 = 0;
+module_param_named(disable, disable_ipv6, int, 0);
+MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional");
+
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -889,7 +893,7 @@ out_unlock:
return err;
}
-static struct packet_type ipv6_packet_type = {
+static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.func = ipv6_rcv,
.gso_send_check = ipv6_gso_send_check,
@@ -1001,10 +1005,21 @@ static int __init inet6_init(void)
{
struct sk_buff *dummy_skb;
struct list_head *r;
- int err;
+ int err = 0;
BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
+ /* Register the socket-side information for inet6_create. */
+ for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ if (disable_ipv6) {
+ printk(KERN_INFO
+ "IPv6: Loaded, but administratively disabled, "
+ "reboot required to enable\n");
+ goto out;
+ }
+
err = proto_register(&tcpv6_prot, 1);
if (err)
goto out;
@@ -1022,10 +1037,6 @@ static int __init inet6_init(void)
goto out_unregister_udplite_proto;
- /* Register the socket-side information for inet6_create. */
- for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
- INIT_LIST_HEAD(r);
-
/* We MUST register RAW sockets before we create the ICMP6,
* IGMP6, or NDISC control sockets.
*/
@@ -1191,6 +1202,9 @@ module_init(inet6_init);
static void __exit inet6_exit(void)
{
+ if (disable_ipv6)
+ return;
+
/* First of all disallow new sockets creation. */
sock_unregister(PF_INET6);
/* Disallow any further netlink messages */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 8fe267feb81..1bcc3431859 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -258,11 +258,11 @@ unique:
if (twp != NULL) {
*twp = tw;
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw != NULL) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 40f324655e2..d31df0f4bc9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -218,8 +218,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (opt)
sock_kfree_s(sk, opt, opt->tot_len);
pktopt = xchg(&np->pktoptions, NULL);
- if (pktopt)
- kfree_skb(pktopt);
+ kfree_skb(pktopt);
sk->sk_destruct = inet_sock_destruct;
/*
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3cd83b85e9e..9f061d1adbc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
&ipv6_hdr(ra)->saddr);
nlmsg_end(skb, nlh);
- err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL,
- GFP_ATOMIC);
- if (err < 0)
- goto errout;
-
+ rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
return;
nla_put_failure:
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 165b256a6fa..41b8a956e1b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -205,8 +205,9 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmpv6: ICMPv6 checksum failed\n");
+ if (LOG_INVALID(net, IPPROTO_ICMPV6))
+ nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: ICMPv6 checksum failed ");
return -NF_ACCEPT;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ed4d79a9e4a..058a5e4a60c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -528,14 +528,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
if (!ipv6_ext_hdr(nexthdr)) {
return -1;
}
- if (len < (int)sizeof(struct ipv6_opt_hdr)) {
- pr_debug("too short\n");
- return -1;
- }
if (nexthdr == NEXTHDR_NONE) {
pr_debug("next header is none\n");
return -1;
}
+ if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+ pr_debug("too short\n");
+ return -1;
+ }
if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
BUG();
if (nexthdr == NEXTHDR_AUTH)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3c575118fca..e9ac7a12f59 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -452,6 +452,7 @@ err:
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
struct sk_buff *fp, *head = fq->q.fragments;
int payload_len;
unsigned int nhoff;
@@ -551,8 +552,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
head->csum);
rcu_read_lock();
- IP6_INC_STATS_BH(dev_net(dev),
- __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
return 1;
@@ -566,8 +566,7 @@ out_oom:
printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
out_fail:
rcu_read_lock();
- IP6_INC_STATS_BH(dev_net(dev),
- __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
rcu_read_unlock();
return -1;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c3d486a3eda..1394ddb6e35 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
- info->nlh, gfp_any());
+ rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d3467e563f0..664ab82e03b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -188,9 +188,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
}
nt = netdev_priv(dev);
- ipip6_tunnel_init(dev);
nt->parms = *parms;
+ ipip6_tunnel_init(dev);
if (parms->i_flags & SIT_ISATAP)
dev->priv_flags |= IFF_ISATAP;
@@ -454,7 +454,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -658,7 +658,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
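The two sit.c hunks convert open-coded jiffies arithmetic to time_before(). The jiffies helpers compare via a signed difference, so the test stays correct across a jiffies wraparound; a self-contained sketch of the idea (macro bodies assumed to mirror linux/jiffies.h, typechecking omitted):

#define time_after(a, b)  ((long)((b) - (a)) < 0)
#define time_before(a, b) time_after(b, a)

/* e.g. err_time recorded just before the counter wraps: */
unsigned long err_time = (unsigned long)-5;	/* 5 ticks before wrap */
unsigned long now = 10;				/* 15 ticks later, wrapped */
/* time_before(now, err_time + 20) is still true:
 * (long)((err_time + 20) - now) == 5 > 0 */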
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 00f1269e11e..4b5aa185426 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -533,8 +533,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (inet6_rsk(req)->pktopts)
- kfree_skb(inet6_rsk(req)->pktopts);
+ kfree_skb(inet6_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
@@ -1611,8 +1610,7 @@ ipv6_pktoptions:
}
}
- if (opt_skb)
- kfree_skb(opt_skb);
+ kfree_skb(opt_skb);
return 0;
}
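These two tcp_ipv6.c hunks, like the similar ones in ipv6_sockglue.c, af_iucv.c, af_key.c and llc_conn.c, drop the explicit NULL check before kfree_skb(). That is safe because kfree_skb() tolerates NULL; a sketch of the relevant entry check (the rest of the function is elided):

void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;		/* NULL is explicitly a no-op */
	/* ... drop a reference and free when it hits zero ... */
}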
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 0e685b05496..f417b77fa0e 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -69,7 +69,7 @@ __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
for (i = 0; i < n; i++) {
dst[count[class[i] - 1]++] = src[i];
- src[i] = 0;
+ src[i] = NULL;
}
return 0;
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 43d0ffc6d56..1627050e29f 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1958,12 +1958,12 @@ static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
SOCKOPS_WRAP(ipx_dgram, PF_IPX);
-static struct packet_type ipx_8023_packet_type = {
+static struct packet_type ipx_8023_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_802_3),
.func = ipx_rcv,
};
-static struct packet_type ipx_dix_packet_type = {
+static struct packet_type ipx_dix_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPX),
.func = ipx_rcv,
};
@@ -1975,15 +1975,15 @@ static struct notifier_block ipx_dev_notifier = {
extern struct datalink_proto *make_EII_client(void);
extern void destroy_EII_client(struct datalink_proto *);
-static unsigned char ipx_8022_type = 0xE0;
-static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
-static char ipx_EII_err_msg[] __initdata =
+static const unsigned char ipx_8022_type = 0xE0;
+static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
+static const char ipx_EII_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with Ethernet II\n";
-static char ipx_8023_err_msg[] __initdata =
+static const char ipx_8023_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with 802.3\n";
-static char ipx_llc_err_msg[] __initdata =
+static const char ipx_llc_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with 802.2\n";
-static char ipx_snap_err_msg[] __initdata =
+static const char ipx_snap_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with SNAP\n";
static int __init ipx_init(void)
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index ea319e3ddc1..bf92e147344 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -149,13 +149,14 @@ int irda_device_is_receiving(struct net_device *dev)
IRDA_DEBUG(2, "%s()\n", __func__);
- if (!dev->do_ioctl) {
+ if (!dev->netdev_ops->ndo_do_ioctl) {
IRDA_ERROR("%s: do_ioctl not impl. by device driver\n",
__func__);
return -1;
}
- ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING);
+ ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req,
+ SIOCGRECEIVING);
if (ret < 0)
return ret;
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 05112be9956..724bcf951b8 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,6 +45,16 @@ static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev);
static void irlan_eth_set_multicast_list( struct net_device *dev);
static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
+static const struct net_device_ops irlan_eth_netdev_ops = {
+ .ndo_open = irlan_eth_open,
+ .ndo_stop = irlan_eth_close,
+ .ndo_start_xmit = irlan_eth_xmit,
+ .ndo_get_stats = irlan_eth_get_stats,
+ .ndo_set_multicast_list = irlan_eth_set_multicast_list,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
/*
* Function irlan_eth_setup (dev)
*
@@ -53,14 +63,11 @@ static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
*/
static void irlan_eth_setup(struct net_device *dev)
{
- dev->open = irlan_eth_open;
- dev->stop = irlan_eth_close;
- dev->hard_start_xmit = irlan_eth_xmit;
- dev->get_stats = irlan_eth_get_stats;
- dev->set_multicast_list = irlan_eth_set_multicast_list;
+ ether_setup(dev);
+
+ dev->netdev_ops = &irlan_eth_netdev_ops;
dev->destructor = free_netdev;
- ether_setup(dev);
/*
* Lets do all queueing in IrTTP instead of this device driver.
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 1bb607f2f5c..303a68d9273 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(irda_debug);
/* Packet type handler.
* Tell the kernel how IrDA packets should be handled.
*/
-static struct packet_type irda_packet_type = {
+static struct packet_type irda_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IRDA),
.func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */
};
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index eb8a2a0b6eb..49e786535dc 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1171,8 +1171,7 @@ static void iucv_callback_txdone(struct iucv_path *path,
spin_unlock_irqrestore(&list->lock, flags);
- if (this)
- kfree_skb(this);
+ kfree_skb(this);
}
BUG_ON(!this);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 7dcbde3ea7d..643c1be2d02 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -313,8 +313,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
if (one_sk != NULL)
err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
- if (skb2)
- kfree_skb(skb2);
+ kfree_skb(skb2);
kfree_skb(skb);
return err;
}
@@ -3573,8 +3572,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb,
out:
if (err && hdr && pfkey_error(hdr, err, sk) == 0)
err = 0;
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
return err ? : len;
}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 56fd85ab358..febae702685 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1118,11 +1118,11 @@ static const struct proto_ops llc_ui_ops = {
.sendpage = sock_no_sendpage,
};
-static char llc_proc_err_msg[] __initdata =
+static const char llc_proc_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
-static char llc_sysctl_err_msg[] __initdata =
+static const char llc_sysctl_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the sysctl entries\n";
-static char llc_sock_err_msg[] __initdata =
+static const char llc_sock_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the network family\n";
static int __init llc2_init(void)
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 5c6d89c6d51..3477624a490 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -332,8 +332,7 @@ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked)
for (i = 0; i < pdu_pos && i < q_len; i++) {
skb = skb_dequeue(&llc->pdu_unack_q);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
nbr_acked++;
}
out:
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index a7fe1adc378..ff4c0ab96a6 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -147,12 +147,12 @@ void llc_sap_close(struct llc_sap *sap)
kfree(sap);
}
-static struct packet_type llc_packet_type = {
+static struct packet_type llc_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_802_2),
.func = llc_rcv,
};
-static struct packet_type llc_tr_packet_type = {
+static struct packet_type llc_tr_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_TR_802_2),
.func = llc_rcv,
};
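Several static struct packet_type definitions in this section (af_inet6.c, af_ipx.c, irmod.c, llc_core.c) gain __read_mostly. On architectures that provide it, the annotation places the object in a read-mostly data section so these rarely-written structures don't share cache lines with frequently-written data; a sketch of the typical definition (arch-specific and assumed, not part of this diff):

/* e.g. the x86 flavour */
#define __read_mostly __attribute__((__section__(".data.read_mostly")))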
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 3503a3d2131..0e3ab88bb70 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -9,6 +9,7 @@ mac80211-y := \
wpa.o \
scan.o \
ht.o agg-tx.o agg-rx.o \
+ ibss.o \
mlme.o \
iface.o \
rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 3112bfd441b..a95affc9462 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -129,7 +129,6 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
u8 dialog_token, u16 status, u16 policy,
u16 buf_size, u16 timeout)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
@@ -151,8 +150,9 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 1232d9f01ca..1df116d4d6e 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -49,7 +49,6 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
u16 agg_size, u16 timeout)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u16 capab;
@@ -69,8 +68,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
@@ -132,9 +131,24 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
state = &sta->ampdu_mlme.tid_state_tx[tid];
- if (local->hw.ampdu_queues)
- ieee80211_stop_queue(&local->hw, sta->tid_to_tx_q[tid]);
+ if (local->hw.ampdu_queues) {
+ if (initiator) {
+ /*
+ * Stop the AC queue so that we do not send unaggregated
+ * frames before the delba has gone out.
+ */
+ ieee80211_stop_queue_by_reason(&local->hw,
+ local->hw.queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+ /*
+ * Pretend the driver woke the queue, just in case
+ * it disabled it before the session was stopped.
+ */
+ ieee80211_wake_queue(
+ &local->hw, local->hw.queues + sta->tid_to_tx_q[tid]);
+ }
*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
@@ -144,8 +158,6 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
/* HW shall not deny going back to legacy */
if (WARN_ON(ret)) {
*state = HT_AGG_STATE_OPERATIONAL;
- if (local->hw.ampdu_queues)
- ieee80211_wake_queue(&local->hw, sta->tid_to_tx_q[tid]);
}
return ret;
@@ -189,14 +201,19 @@ static void sta_addba_resp_timer_expired(unsigned long data)
spin_unlock_bh(&sta->lock);
}
+static inline int ieee80211_ac_from_tid(int tid)
+{
+ return ieee802_1d_to_ac[tid & 7];
+}
+
int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
{
struct ieee80211_local *local = hw_to_local(hw);
struct sta_info *sta;
struct ieee80211_sub_if_data *sdata;
- u16 start_seq_num;
u8 *state;
- int ret = 0;
+ int i, qn = -1, ret = 0;
+ u16 start_seq_num;
if (WARN_ON(!local->ops->ampdu_action))
return -EINVAL;
@@ -209,6 +226,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
ra, tid);
#endif /* CONFIG_MAC80211_HT_DEBUG */
+ if (hw->ampdu_queues && ieee80211_ac_from_tid(tid) == 0) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+ printk(KERN_DEBUG "rejecting on voice AC\n");
+#endif
+ return -EINVAL;
+ }
+
rcu_read_lock();
sta = sta_info_get(local, ra);
@@ -217,7 +241,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
printk(KERN_DEBUG "Could not find the station\n");
#endif
ret = -ENOENT;
- goto exit;
+ goto unlock;
}
/*
@@ -230,11 +254,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
sta->sdata->vif.type != NL80211_IFTYPE_AP) {
ret = -EINVAL;
- goto exit;
+ goto unlock;
}
spin_lock_bh(&sta->lock);
+ sdata = sta->sdata;
+
/* we have tried too many times, receiver does not want A-MPDU */
if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
ret = -EBUSY;
@@ -252,6 +278,42 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
goto err_unlock_sta;
}
+ if (hw->ampdu_queues) {
+ spin_lock(&local->queue_stop_reason_lock);
+ /* reserve a new queue for this session */
+ for (i = 0; i < local->hw.ampdu_queues; i++) {
+ if (local->ampdu_ac_queue[i] < 0) {
+ qn = i;
+ local->ampdu_ac_queue[qn] =
+ ieee80211_ac_from_tid(tid);
+ break;
+ }
+ }
+ spin_unlock(&local->queue_stop_reason_lock);
+
+ if (qn < 0) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+ printk(KERN_DEBUG "BA request denied - "
+ "queue unavailable for tid %d\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+ ret = -ENOSPC;
+ goto err_unlock_sta;
+ }
+
+ /*
+ * If we successfully allocate the session, we can't have
+ * anything going on on the queue this TID maps into, so
+ * stop it for now. This is a "virtual" stop using the same
+ * mechanism that drivers will use.
+ *
+ * XXX: queue up frames for this session in the sta_info
+ * struct instead to avoid hitting all other STAs.
+ */
+ ieee80211_stop_queue_by_reason(
+ &local->hw, hw->queues + qn,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+
/* prepare A-MPDU MLME for Tx aggregation */
sta->ampdu_mlme.tid_tx[tid] =
kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
@@ -262,8 +324,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
tid);
#endif
ret = -ENOMEM;
- goto err_unlock_sta;
+ goto err_return_queue;
}
+
/* Tx timer */
sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
sta_addba_resp_timer_expired;
@@ -271,49 +334,25 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
(unsigned long)&sta->timer_to_tid[tid];
init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
- if (hw->ampdu_queues) {
- /* create a new queue for this aggregation */
- ret = ieee80211_ht_agg_queue_add(local, sta, tid);
-
- /* case no queue is available to aggregation
- * don't switch to aggregation */
- if (ret) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
- printk(KERN_DEBUG "BA request denied - "
- "queue unavailable for tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
- goto err_unlock_queue;
- }
- }
- sdata = sta->sdata;
-
/* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
* call back right away, it must see that the flow has begun */
*state |= HT_ADDBA_REQUESTED_MSK;
- /* This is slightly racy because the queue isn't stopped */
start_seq_num = sta->tid_seq[tid];
ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
&sta->sta, tid, &start_seq_num);
if (ret) {
- /* No need to requeue the packets in the agg queue, since we
- * held the tx lock: no packet could be enqueued to the newly
- * allocated queue */
- if (hw->ampdu_queues)
- ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "BA request denied - HW unavailable for"
" tid %d\n", tid);
#endif /* CONFIG_MAC80211_HT_DEBUG */
*state = HT_AGG_STATE_IDLE;
- goto err_unlock_queue;
+ goto err_free;
}
+ sta->tid_to_tx_q[tid] = qn;
- /* Will put all the packets in the new SW queue */
- if (hw->ampdu_queues)
- ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
spin_unlock_bh(&sta->lock);
/* send an addBA request */
@@ -322,7 +361,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
sta->ampdu_mlme.dialog_token_allocator;
sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
-
ieee80211_send_addba_request(sta->sdata, ra, tid,
sta->ampdu_mlme.tid_tx[tid]->dialog_token,
sta->ampdu_mlme.tid_tx[tid]->ssn,
@@ -334,15 +372,24 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
#endif
- goto exit;
+ goto unlock;
-err_unlock_queue:
+ err_free:
kfree(sta->ampdu_mlme.tid_tx[tid]);
sta->ampdu_mlme.tid_tx[tid] = NULL;
- ret = -EBUSY;
-err_unlock_sta:
+ err_return_queue:
+ if (qn >= 0) {
+ /* We failed, so start queue again right away. */
+ ieee80211_wake_queue_by_reason(hw, hw->queues + qn,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ /* give queue back to pool */
+ spin_lock(&local->queue_stop_reason_lock);
+ local->ampdu_ac_queue[qn] = -1;
+ spin_unlock(&local->queue_stop_reason_lock);
+ }
+ err_unlock_sta:
spin_unlock_bh(&sta->lock);
-exit:
+ unlock:
rcu_read_unlock();
return ret;
}
@@ -375,7 +422,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
state = &sta->ampdu_mlme.tid_state_tx[tid];
spin_lock_bh(&sta->lock);
- if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
+ if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) {
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
*state);
@@ -385,7 +432,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
return;
}
- WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK);
+ if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK))
+ goto out;
*state |= HT_ADDBA_DRV_READY_MSK;
@@ -393,9 +441,18 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
#endif
- if (hw->ampdu_queues)
- ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+ if (hw->ampdu_queues) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
+ */
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
}
+
+ out:
spin_unlock_bh(&sta->lock);
rcu_read_unlock();
}
@@ -485,7 +542,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
struct ieee80211_local *local = hw_to_local(hw);
struct sta_info *sta;
u8 *state;
- int agg_queue;
if (tid >= STA_TID_NUM) {
#ifdef CONFIG_MAC80211_HT_DEBUG
@@ -527,19 +583,19 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
ieee80211_send_delba(sta->sdata, ra, tid,
WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
- if (hw->ampdu_queues) {
- agg_queue = sta->tid_to_tx_q[tid];
- ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
+ spin_lock_bh(&sta->lock);
- /* We just requeued the all the frames that were in the
- * removed queue, and since we might miss a softirq we do
- * netif_schedule_queue. ieee80211_wake_queue is not used
- * here as this queue is not necessarily stopped
+ if (*state & HT_AGG_STATE_INITIATOR_MSK &&
+ hw->ampdu_queues) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
*/
- netif_schedule_queue(netdev_get_tx_queue(local->mdev,
- agg_queue));
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
}
- spin_lock_bh(&sta->lock);
+
*state = HT_AGG_STATE_IDLE;
sta->ampdu_mlme.addba_req_num[tid] = 0;
kfree(sta->ampdu_mlme.tid_tx[tid]);
@@ -613,12 +669,21 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
#endif /* CONFIG_MAC80211_HT_DEBUG */
if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
== WLAN_STATUS_SUCCESS) {
+ u8 curstate = *state;
+
*state |= HT_ADDBA_RECEIVED_MSK;
- sta->ampdu_mlme.addba_req_num[tid] = 0;
- if (*state == HT_AGG_STATE_OPERATIONAL &&
- local->hw.ampdu_queues)
- ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+ if (hw->ampdu_queues && *state != curstate &&
+ *state == HT_AGG_STATE_OPERATIONAL) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
+ */
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+ sta->ampdu_mlme.addba_req_num[tid] = 0;
if (local->ops->ampdu_action) {
(void)local->ops->ampdu_action(hw,
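
The hunks above drop the old per-aggregation hardware queues and instead virtually stop the AC queue the TID maps to while the ADDBA handshake is pending, waking it again on success or on any error path. Below is a minimal, self-contained sketch of that bookkeeping (names, sizes and the reason values are invented, not the mac80211 API): a queue only becomes runnable once every reason that stopped it has been cleared.

#include <stdbool.h>
#include <stdio.h>

#define N_QUEUES 8

enum stop_reason { REASON_DRIVER, REASON_PS, REASON_CSA, REASON_AGGREGATION };

static unsigned long stop_reasons[N_QUEUES];

/* Record one more reason why this queue must not transmit. */
static void stop_queue(int q, enum stop_reason r)
{
        stop_reasons[q] |= 1UL << r;
}

/* Drop one reason; the queue is runnable again only when none remain. */
static bool wake_queue(int q, enum stop_reason r)
{
        stop_reasons[q] &= ~(1UL << r);
        return stop_reasons[q] == 0;
}

int main(void)
{
        stop_queue(2, REASON_AGGREGATION);      /* ADDBA handshake pending */
        stop_queue(2, REASON_PS);               /* something else also stopped it */
        printf("runnable after agg wake: %d\n", wake_queue(2, REASON_AGGREGATION));
        printf("runnable after ps wake:  %d\n", wake_queue(2, REASON_PS));
        return 0;
}
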
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c8d969be440..58693e52d45 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -341,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->filled = STATION_INFO_INACTIVE_TIME |
STATION_INFO_RX_BYTES |
STATION_INFO_TX_BYTES |
+ STATION_INFO_RX_PACKETS |
+ STATION_INFO_TX_PACKETS |
STATION_INFO_TX_BITRATE;
sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
sinfo->rx_bytes = sta->rx_bytes;
sinfo->tx_bytes = sta->tx_bytes;
+ sinfo->rx_packets = sta->rx_packets;
+ sinfo->tx_packets = sta->tx_packets;
if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
sinfo->filled |= STATION_INFO_SIGNAL;
@@ -447,7 +451,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata,
* This is a kludge. beacon interval should really be part
* of the beacon information.
*/
- if (params->interval) {
+ if (params->interval && (sdata->local->hw.conf.beacon_int !=
+ params->interval)) {
sdata->local->hw.conf.beacon_int = params->interval;
err = ieee80211_hw_config(sdata->local,
IEEE80211_CONF_CHANGE_BEACON_INTERVAL);
@@ -1180,45 +1185,45 @@ static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata,
u8 subtype, u8 *ies, size_t ies_len)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
switch (subtype) {
case IEEE80211_STYPE_PROBE_REQ >> 4:
if (local->ops->hw_scan)
break;
- kfree(ifsta->ie_probereq);
- ifsta->ie_probereq = ies;
- ifsta->ie_probereq_len = ies_len;
+ kfree(ifmgd->ie_probereq);
+ ifmgd->ie_probereq = ies;
+ ifmgd->ie_probereq_len = ies_len;
return 0;
case IEEE80211_STYPE_PROBE_RESP >> 4:
- kfree(ifsta->ie_proberesp);
- ifsta->ie_proberesp = ies;
- ifsta->ie_proberesp_len = ies_len;
+ kfree(ifmgd->ie_proberesp);
+ ifmgd->ie_proberesp = ies;
+ ifmgd->ie_proberesp_len = ies_len;
return 0;
case IEEE80211_STYPE_AUTH >> 4:
- kfree(ifsta->ie_auth);
- ifsta->ie_auth = ies;
- ifsta->ie_auth_len = ies_len;
+ kfree(ifmgd->ie_auth);
+ ifmgd->ie_auth = ies;
+ ifmgd->ie_auth_len = ies_len;
return 0;
case IEEE80211_STYPE_ASSOC_REQ >> 4:
- kfree(ifsta->ie_assocreq);
- ifsta->ie_assocreq = ies;
- ifsta->ie_assocreq_len = ies_len;
+ kfree(ifmgd->ie_assocreq);
+ ifmgd->ie_assocreq = ies;
+ ifmgd->ie_assocreq_len = ies_len;
return 0;
case IEEE80211_STYPE_REASSOC_REQ >> 4:
- kfree(ifsta->ie_reassocreq);
- ifsta->ie_reassocreq = ies;
- ifsta->ie_reassocreq_len = ies_len;
+ kfree(ifmgd->ie_reassocreq);
+ ifmgd->ie_reassocreq = ies;
+ ifmgd->ie_reassocreq_len = ies_len;
return 0;
case IEEE80211_STYPE_DEAUTH >> 4:
- kfree(ifsta->ie_deauth);
- ifsta->ie_deauth = ies;
- ifsta->ie_deauth_len = ies_len;
+ kfree(ifmgd->ie_deauth);
+ ifmgd->ie_deauth = ies;
+ ifmgd->ie_deauth_len = ies_len;
return 0;
case IEEE80211_STYPE_DISASSOC >> 4:
- kfree(ifsta->ie_disassoc);
- ifsta->ie_disassoc = ies;
- ifsta->ie_disassoc_len = ies_len;
+ kfree(ifmgd->ie_disassoc);
+ ifmgd->ie_disassoc = ies;
+ ifmgd->ie_disassoc_len = ies_len;
return 0;
}
@@ -1248,7 +1253,6 @@ static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy,
switch (sdata->vif.type) {
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
ret = set_mgmt_extra_ie_sta(sdata, params->subtype,
ies, ies_len);
break;
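
The sta_set_sinfo() change above adds per-station packet counters next to the byte counters; cfg80211 only trusts fields whose bit is set in the "filled" mask. A stand-alone sketch of that pattern, with invented flag names and a cut-down structure rather than the real STATION_INFO_* values:

#include <stdint.h>
#include <stdio.h>

#define INFO_RX_BYTES   (1U << 0)
#define INFO_TX_BYTES   (1U << 1)
#define INFO_RX_PACKETS (1U << 2)
#define INFO_TX_PACKETS (1U << 3)

struct station_info_sketch {
        uint32_t filled;                /* which fields below are valid */
        uint64_t rx_bytes, tx_bytes;
        uint32_t rx_packets, tx_packets;
};

static void print_info(const struct station_info_sketch *si)
{
        if (si->filled & INFO_RX_PACKETS)
                printf("rx packets: %u\n", si->rx_packets);
        if (si->filled & INFO_TX_PACKETS)
                printf("tx packets: %u\n", si->tx_packets);
}

int main(void)
{
        struct station_info_sketch si = {
                .filled = INFO_RX_PACKETS | INFO_TX_PACKETS,
                .rx_packets = 1200,
                .tx_packets = 900,
        };
        print_info(&si);
        return 0;
}
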
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c5421930172..e3420329f4e 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -94,31 +94,31 @@ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC);
IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC);
IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC);
-/* STA/IBSS attributes */
-IEEE80211_IF_FILE(state, u.sta.state, DEC);
-IEEE80211_IF_FILE(bssid, u.sta.bssid, MAC);
-IEEE80211_IF_FILE(prev_bssid, u.sta.prev_bssid, MAC);
-IEEE80211_IF_FILE(ssid_len, u.sta.ssid_len, SIZE);
-IEEE80211_IF_FILE(aid, u.sta.aid, DEC);
-IEEE80211_IF_FILE(ap_capab, u.sta.ap_capab, HEX);
-IEEE80211_IF_FILE(capab, u.sta.capab, HEX);
-IEEE80211_IF_FILE(extra_ie_len, u.sta.extra_ie_len, SIZE);
-IEEE80211_IF_FILE(auth_tries, u.sta.auth_tries, DEC);
-IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC);
-IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX);
-IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC);
-IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC);
+/* STA attributes */
+IEEE80211_IF_FILE(state, u.mgd.state, DEC);
+IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
+IEEE80211_IF_FILE(prev_bssid, u.mgd.prev_bssid, MAC);
+IEEE80211_IF_FILE(ssid_len, u.mgd.ssid_len, SIZE);
+IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
+IEEE80211_IF_FILE(ap_capab, u.mgd.ap_capab, HEX);
+IEEE80211_IF_FILE(capab, u.mgd.capab, HEX);
+IEEE80211_IF_FILE(extra_ie_len, u.mgd.extra_ie_len, SIZE);
+IEEE80211_IF_FILE(auth_tries, u.mgd.auth_tries, DEC);
+IEEE80211_IF_FILE(assoc_tries, u.mgd.assoc_tries, DEC);
+IEEE80211_IF_FILE(auth_algs, u.mgd.auth_algs, HEX);
+IEEE80211_IF_FILE(auth_alg, u.mgd.auth_alg, DEC);
+IEEE80211_IF_FILE(auth_transaction, u.mgd.auth_transaction, DEC);
static ssize_t ieee80211_if_fmt_flags(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
{
return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n",
- sdata->u.sta.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "",
sdata->vif.bss_conf.use_cts_prot ? "CTS prot\n" : "");
}
__IEEE80211_IF_FILE(flags);
@@ -283,9 +283,11 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
#endif
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
add_sta_files(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ /* XXX */
+ break;
case NL80211_IFTYPE_AP:
add_ap_files(sdata);
break;
@@ -418,9 +420,11 @@ static void del_files(struct ieee80211_sub_if_data *sdata)
#endif
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
del_sta_files(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ /* XXX */
+ break;
case NL80211_IFTYPE_AP:
del_ap_files(sdata);
break;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 82ea0b63a38..4e3c72f20de 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -17,6 +17,7 @@
#include <net/wireless.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
+#include "rate.h"
void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
struct ieee80211_ht_cap *ht_cap_ie,
@@ -93,7 +94,9 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_bss_ht_conf ht;
+ struct sta_info *sta;
u32 changed = 0;
bool enable_ht = true, ht_changed;
enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
@@ -136,6 +139,16 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
if (ht_changed) {
/* channel_type change automatically detected */
ieee80211_hw_config(local, 0);
+
+ rcu_read_lock();
+
+ sta = sta_info_get(local, ifmgd->bssid);
+ if (sta)
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_HT_CHANGED);
+
+ rcu_read_unlock();
+
}
/* disable HT */
@@ -169,7 +182,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
u16 initiator, u16 reason_code)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u16 params;
@@ -190,8 +202,9 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
new file mode 100644
index 00000000000..f4becc12904
--- /dev/null
+++ b/net/mac80211/ibss.c
@@ -0,0 +1,907 @@
+/*
+ * IBSS mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "rate.h"
+
+#define IEEE80211_SCAN_INTERVAL (2 * HZ)
+#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
+
+#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
+#define IEEE80211_IBSS_MERGE_DELAY 0x400000
+#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
+
+#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
+
+
+static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ u16 auth_alg, auth_transaction, status_code;
+
+ if (len < 24 + 6)
+ return;
+
+ auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+ auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+ status_code = le16_to_cpu(mgmt->u.auth.status_code);
+
+ /*
+ * IEEE 802.11 standard does not require authentication in IBSS
+ * networks and most implementations do not seem to use it.
+ * However, try to reply to authentication attempts if someone
+ * has actually implemented this.
+ */
+ if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
+ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0,
+ sdata->u.ibss.bssid, 0);
+}
+
+static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const int beacon_int,
+ const int freq,
+ const size_t supp_rates_len,
+ const u8 *supp_rates,
+ const u16 capability, u64 tsf)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int res = 0, rates, i, j;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos;
+ struct ieee80211_supported_band *sband;
+ union iwreq_data wrqu;
+
+ if (local->ops->reset_tsf) {
+ /* Reset own TSF to allow time synchronization work. */
+ local->ops->reset_tsf(local_to_hw(local));
+ }
+
+ if ((ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) &&
+ memcmp(ifibss->bssid, bssid, ETH_ALEN) == 0)
+ return res;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+ "response\n", sdata->dev->name);
+ return -ENOMEM;
+ }
+
+ if (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) {
+ /* Remove possible STA entries from other IBSS networks. */
+ sta_info_flush_delayed(sdata);
+ }
+
+ memcpy(ifibss->bssid, bssid, ETH_ALEN);
+ res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
+ if (res)
+ return res;
+
+ local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
+
+ sdata->drop_unencrypted = capability &
+ WLAN_CAPABILITY_PRIVACY ? 1 : 0;
+
+ res = ieee80211_set_freq(sdata, freq);
+
+ if (res)
+ return res;
+
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ /* Build IBSS probe response */
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *)
+ skb_put(skb, 24 + sizeof(mgmt->u.beacon));
+ memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_RESP);
+ memset(mgmt->da, 0xff, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
+ mgmt->u.beacon.beacon_int =
+ cpu_to_le16(local->hw.conf.beacon_int);
+ mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
+ mgmt->u.beacon.capab_info = cpu_to_le16(capability);
+
+ pos = skb_put(skb, 2 + ifibss->ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ifibss->ssid_len;
+ memcpy(pos, ifibss->ssid, ifibss->ssid_len);
+
+ rates = supp_rates_len;
+ if (rates > 8)
+ rates = 8;
+ pos = skb_put(skb, 2 + rates);
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = rates;
+ memcpy(pos, supp_rates, rates);
+
+ if (sband->band == IEEE80211_BAND_2GHZ) {
+ pos = skb_put(skb, 2 + 1);
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = ieee80211_frequency_to_channel(freq);
+ }
+
+ pos = skb_put(skb, 2 + 2);
+ *pos++ = WLAN_EID_IBSS_PARAMS;
+ *pos++ = 2;
+ /* FIX: set ATIM window based on scan results */
+ *pos++ = 0;
+ *pos++ = 0;
+
+ if (supp_rates_len > 8) {
+ rates = supp_rates_len - 8;
+ pos = skb_put(skb, 2 + rates);
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = rates;
+ memcpy(pos, &supp_rates[8], rates);
+ }
+
+ ifibss->probe_resp = skb;
+
+ ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
+ IEEE80211_IFCC_BEACON_ENABLED);
+
+
+ rates = 0;
+ for (i = 0; i < supp_rates_len; i++) {
+ int bitrate = (supp_rates[i] & 0x7f) * 5;
+ for (j = 0; j < sband->n_bitrates; j++)
+ if (sband->bitrates[j].bitrate == bitrate)
+ rates |= BIT(j);
+ }
+
+ ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
+
+ ifibss->flags |= IEEE80211_IBSS_PREV_BSSID_SET;
+ ifibss->state = IEEE80211_IBSS_MLME_JOINED;
+ mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
+
+ memset(&wrqu, 0, sizeof(wrqu));
+ memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+ wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
+
+ return res;
+}
+
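
__ieee80211_sta_join_ibss() above builds the IBSS ProbeResp/beacon template by appending information elements (SSID, supported rates, DS params, IBSS params) one after another, each as an ID byte, a length byte and the payload. A rough user-space sketch of that layout, assuming made-up helper names and only two elements:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EID_SSID       0
#define EID_SUPP_RATES 1

/* Append one (id, length, payload) element and return the new tail. */
static uint8_t *put_ie(uint8_t *pos, uint8_t eid, const void *data, uint8_t len)
{
        *pos++ = eid;
        *pos++ = len;
        memcpy(pos, data, len);
        return pos + len;
}

int main(void)
{
        uint8_t frame[64], *pos = frame;
        const char *ssid = "test-ibss";
        /* rates in 802.11 units of 500 kb/s: 1, 2, 5.5 and 11 Mb/s */
        const uint8_t rates[] = { 2, 4, 11, 22 };

        pos = put_ie(pos, EID_SSID, ssid, (uint8_t)strlen(ssid));
        pos = put_ie(pos, EID_SUPP_RATES, rates, sizeof(rates));

        printf("built %zu bytes of IEs\n", (size_t)(pos - frame));
        return 0;
}
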
+static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss *bss)
+{
+ return __ieee80211_sta_join_ibss(sdata,
+ bss->cbss.bssid,
+ bss->cbss.beacon_interval,
+ bss->cbss.channel->center_freq,
+ bss->supp_rates_len, bss->supp_rates,
+ bss->cbss.capability,
+ bss->cbss.tsf);
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems,
+ bool beacon)
+{
+ struct ieee80211_local *local = sdata->local;
+ int freq;
+ struct ieee80211_bss *bss;
+ struct sta_info *sta;
+ struct ieee80211_channel *channel;
+ u64 beacon_timestamp, rx_timestamp;
+ u32 supp_rates = 0;
+ enum ieee80211_band band = rx_status->band;
+
+ if (elems->ds_params && elems->ds_params_len == 1)
+ freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
+ else
+ freq = rx_status->freq;
+
+ channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+ if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+ return;
+
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
+ memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
+ supp_rates = ieee80211_sta_get_rates(local, elems, band);
+
+ rcu_read_lock();
+
+ sta = sta_info_get(local, mgmt->sa);
+ if (sta) {
+ u32 prev_rates;
+
+ prev_rates = sta->sta.supp_rates[band];
+ /* make sure mandatory rates are always added */
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(local, band);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ if (sta->sta.supp_rates[band] != prev_rates)
+ printk(KERN_DEBUG "%s: updated supp_rates set "
+ "for %pM based on beacon info (0x%llx | "
+ "0x%llx -> 0x%llx)\n",
+ sdata->dev->name,
+ sta->sta.addr,
+ (unsigned long long) prev_rates,
+ (unsigned long long) supp_rates,
+ (unsigned long long) sta->sta.supp_rates[band]);
+#endif
+ } else
+ ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
+
+ rcu_read_unlock();
+ }
+
+ bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+ channel, beacon);
+ if (!bss)
+ return;
+
+ /* was just updated in ieee80211_bss_info_update */
+ beacon_timestamp = bss->cbss.tsf;
+
+ /* check if we need to merge IBSS */
+
+ /* merge only on beacons (???) */
+ if (!beacon)
+ goto put_bss;
+
+ /* we use a fixed BSSID */
+ if (sdata->u.ibss.flags & IEEE80211_IBSS_BSSID_SET)
+ goto put_bss;
+
+ /* not an IBSS */
+ if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
+ goto put_bss;
+
+ /* different channel */
+ if (bss->cbss.channel != local->oper_channel)
+ goto put_bss;
+
+ /* different SSID */
+ if (elems->ssid_len != sdata->u.ibss.ssid_len ||
+ memcmp(elems->ssid, sdata->u.ibss.ssid,
+ sdata->u.ibss.ssid_len))
+ goto put_bss;
+
+ /* same BSSID */
+ if (memcmp(bss->cbss.bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0)
+ goto put_bss;
+
+ if (rx_status->flag & RX_FLAG_TSFT) {
+ /*
+ * For correct IBSS merging we need mactime; since mactime is
+ * defined as the time the first data symbol of the frame hits
+ * the PHY, and the timestamp of the beacon is defined as "the
+ * time that the data symbol containing the first bit of the
+ * timestamp is transmitted to the PHY plus the transmitting
+ * STA's delays through its local PHY from the MAC-PHY
+ * interface to its interface with the WM" (802.11 11.1.2)
+ * - equals the time this bit arrives at the receiver - we have
+ * to take into account the offset between the two.
+ *
+ * E.g. at 1 MBit that means mactime is 192 usec earlier
+ * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
+ */
+ int rate;
+
+ if (rx_status->flag & RX_FLAG_HT)
+ rate = 65; /* TODO: HT rates */
+ else
+ rate = local->hw.wiphy->bands[band]->
+ bitrates[rx_status->rate_idx].bitrate;
+
+ rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
+ } else if (local && local->ops && local->ops->get_tsf)
+ /* second best option: get current TSF */
+ rx_timestamp = local->ops->get_tsf(local_to_hw(local));
+ else
+ /* can't merge without knowing the TSF */
+ rx_timestamp = -1LLU;
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
+ "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
+ mgmt->sa, mgmt->bssid,
+ (unsigned long long)rx_timestamp,
+ (unsigned long long)beacon_timestamp,
+ (unsigned long long)(rx_timestamp - beacon_timestamp),
+ jiffies);
+#endif
+
+ /* give slow hardware some time to do the TSF sync */
+ if (rx_timestamp < IEEE80211_IBSS_MERGE_DELAY)
+ goto put_bss;
+
+ if (beacon_timestamp > rx_timestamp) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: beacon TSF higher than "
+ "local TSF - IBSS merge with BSSID %pM\n",
+ sdata->dev->name, mgmt->bssid);
+#endif
+ ieee80211_sta_join_ibss(sdata, bss);
+ ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
+ }
+
+ put_bss:
+ ieee80211_rx_bss_put(local, bss);
+}
+
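
The merge logic above corrects mactime (taken at the first data symbol) to the point where the beacon's timestamp field was actually sampled: 24 header bytes at the frame's bitrate, with the rate kept in mac80211's 100 kb/s units. A small worked example of that offset:

#include <stdio.h>

/* Time, in microseconds, taken by the 24-byte 802.11 header at the given
 * bitrate (rate in 100 kb/s units, as in struct ieee80211_rate). */
static unsigned long header_offset_usec(unsigned int rate_100kbps)
{
        return 24 * 8 * 10 / rate_100kbps;
}

int main(void)
{
        printf("1 Mb/s:  %lu usec\n", header_offset_usec(10));   /* 192 */
        printf("11 Mb/s: %lu usec\n", header_offset_usec(110));  /* 17  */
        return 0;
}
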
+/*
+ * Add a new IBSS station, will also be called by the RX code when,
+ * in IBSS mode, receiving a frame from a yet-unknown station, hence
+ * must be callable in atomic context.
+ */
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+ u8 *bssid, u8 *addr, u32 supp_rates)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int band = local->hw.conf.channel->band;
+
+ /* TODO: Could consider removing the least recently used entry and
+ * allow new one to be added. */
+ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "%s: No room for a new IBSS STA "
+ "entry %pM\n", sdata->dev->name, addr);
+ }
+ return NULL;
+ }
+
+ if (compare_ether_addr(bssid, sdata->u.ibss.bssid))
+ return NULL;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
+ wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
+#endif
+
+ sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+ if (!sta)
+ return NULL;
+
+ set_sta_flags(sta, WLAN_STA_AUTHORIZED);
+
+ /* make sure mandatory rates are always added */
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(local, band);
+
+ rate_control_rate_init(sta);
+
+ if (sta_info_insert(sta))
+ return NULL;
+
+ return sta;
+}
+
+static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int active = 0;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sta->sdata == sdata &&
+ time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
+ jiffies)) {
+ active++;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return active;
+}
+
+
+static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
+
+ ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
+ if (ieee80211_sta_active_ibss(sdata))
+ return;
+
+ if ((ifibss->flags & IEEE80211_IBSS_BSSID_SET) &&
+ (!(ifibss->flags & IEEE80211_IBSS_AUTO_CHANNEL_SEL)))
+ return;
+
+ printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
+ "IBSS networks with same SSID (merge)\n", sdata->dev->name);
+
+ /* XXX maybe racy? */
+ if (sdata->local->scan_req)
+ return;
+
+ memcpy(sdata->local->int_scan_req.ssids[0].ssid,
+ ifibss->ssid, IEEE80211_MAX_SSID_LEN);
+ sdata->local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
+ ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
+}
+
+static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+ u8 bssid[ETH_ALEN];
+ u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
+ u16 capability;
+ int i;
+
+ if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) {
+ memcpy(bssid, ifibss->bssid, ETH_ALEN);
+ } else {
+ /* Generate random, not broadcast, locally administered BSSID. Mix in
+ * own MAC address to make sure that devices that do not have proper
+ * random number generator get different BSSID. */
+ get_random_bytes(bssid, ETH_ALEN);
+ for (i = 0; i < ETH_ALEN; i++)
+ bssid[i] ^= sdata->dev->dev_addr[i];
+ bssid[0] &= ~0x01;
+ bssid[0] |= 0x02;
+ }
+
+ printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
+ sdata->dev->name, bssid);
+
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ if (local->hw.conf.beacon_int == 0)
+ local->hw.conf.beacon_int = 100;
+
+ capability = WLAN_CAPABILITY_IBSS;
+
+ if (sdata->default_key)
+ capability |= WLAN_CAPABILITY_PRIVACY;
+ else
+ sdata->drop_unencrypted = 0;
+
+ pos = supp_rates;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ int rate = sband->bitrates[i].bitrate;
+ *pos++ = (u8) (rate / 5);
+ }
+
+ return __ieee80211_sta_join_ibss(sdata,
+ bssid, local->hw.conf.beacon_int,
+ local->hw.conf.channel->center_freq,
+ sband->n_bitrates, supp_rates,
+ capability, 0);
+}
+
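
When no fixed BSSID is configured, ieee80211_sta_create_ibss() above derives one from random bytes mixed with the local MAC address, then clears the group bit and sets the locally-administered bit. A stand-alone approximation of that step (helper name invented, plain rand() instead of the kernel RNG):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ETH_ALEN 6

static void make_ibss_bssid(uint8_t bssid[ETH_ALEN], const uint8_t own[ETH_ALEN])
{
        int i;

        for (i = 0; i < ETH_ALEN; i++)
                bssid[i] = (uint8_t)rand() ^ own[i];
        bssid[0] &= ~0x01;      /* not a group (multicast) address */
        bssid[0] |= 0x02;       /* locally administered */
}

int main(void)
{
        const uint8_t own[ETH_ALEN] = { 0x00, 0x1b, 0x2c, 0x3d, 0x4e, 0x5f };
        uint8_t bssid[ETH_ALEN];

        srand((unsigned)time(NULL));
        make_ibss_bssid(bssid, own);
        printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
               bssid[0], bssid[1], bssid[2], bssid[3], bssid[4], bssid[5]);
        return 0;
}
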
+static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss *bss;
+ const u8 *bssid = NULL;
+ int active_ibss;
+
+ if (ifibss->ssid_len == 0)
+ return -EINVAL;
+
+ active_ibss = ieee80211_sta_active_ibss(sdata);
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
+ sdata->dev->name, active_ibss);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (active_ibss)
+ return 0;
+
+ if (ifibss->flags & IEEE80211_IBSS_BSSID_SET)
+ bssid = ifibss->bssid;
+ bss = (void *)cfg80211_get_bss(local->hw.wiphy, NULL, bssid,
+ ifibss->ssid, ifibss->ssid_len,
+ WLAN_CAPABILITY_IBSS,
+ WLAN_CAPABILITY_IBSS);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ if (bss)
+ printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
+ "%pM\n", bss->cbss.bssid, ifibss->bssid);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (bss &&
+ (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) ||
+ memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN))) {
+ int ret;
+
+ printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
+ " based on configured SSID\n",
+ sdata->dev->name, bss->cbss.bssid);
+
+ ret = ieee80211_sta_join_ibss(sdata, bss);
+ ieee80211_rx_bss_put(local, bss);
+ return ret;
+ } else if (bss)
+ ieee80211_rx_bss_put(local, bss);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG " did not try to join ibss\n");
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ /* Selected IBSS not found in current scan results - try to scan */
+ if (ifibss->state == IEEE80211_IBSS_MLME_JOINED &&
+ !ieee80211_sta_active_ibss(sdata)) {
+ mod_timer(&ifibss->timer, jiffies +
+ IEEE80211_IBSS_MERGE_INTERVAL);
+ } else if (time_after(jiffies, local->last_scan_completed +
+ IEEE80211_SCAN_INTERVAL)) {
+ printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
+ "join\n", sdata->dev->name);
+
+ /* XXX maybe racy? */
+ if (local->scan_req)
+ return -EBUSY;
+
+ memcpy(local->int_scan_req.ssids[0].ssid,
+ ifibss->ssid, IEEE80211_MAX_SSID_LEN);
+ local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
+ return ieee80211_request_scan(sdata, &local->int_scan_req);
+ } else if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) {
+ int interval = IEEE80211_SCAN_INTERVAL;
+
+ if (time_after(jiffies, ifibss->ibss_join_req +
+ IEEE80211_IBSS_JOIN_TIMEOUT)) {
+ if (!(local->oper_channel->flags &
+ IEEE80211_CHAN_NO_IBSS))
+ return ieee80211_sta_create_ibss(sdata);
+ printk(KERN_DEBUG "%s: IBSS not allowed on"
+ " %d MHz\n", sdata->dev->name,
+ local->hw.conf.channel->center_freq);
+
+ /* No IBSS found - decrease scan interval and continue
+ * scanning. */
+ interval = IEEE80211_SCAN_INTERVAL_SLOW;
+ }
+
+ ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+ mod_timer(&ifibss->timer, jiffies + interval);
+ return 0;
+ }
+
+ return 0;
+}
+
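
ieee80211_sta_find_ibss() above is essentially a decision table: do nothing while the IBSS has active peers, join a matching BSS if the scan found one, otherwise scan again, and only create a new IBSS once the join timeout has expired on a channel that allows it. A rough paraphrase of that ordering (times in seconds instead of jiffies, names invented, details such as the slow scan interval omitted):

#include <stdio.h>

enum ibss_action {
        IBSS_NONE, IBSS_JOIN_EXISTING, IBSS_SCAN, IBSS_CREATE_OWN, IBSS_WAIT
};

static enum ibss_action find_ibss(int active_ibss, int found_other_bss,
                                  int secs_since_scan, int secs_since_join_req,
                                  int channel_allows_ibss)
{
        if (active_ibss)
                return IBSS_NONE;               /* already have peers */
        if (found_other_bss)
                return IBSS_JOIN_EXISTING;      /* merge into the found IBSS */
        if (secs_since_scan > 2)
                return IBSS_SCAN;               /* look for something to join */
        if (secs_since_join_req > 7 && channel_allows_ibss)
                return IBSS_CREATE_OWN;         /* give up searching, beacon ourselves */
        return IBSS_WAIT;                       /* re-arm the timer, keep searching */
}

int main(void)
{
        printf("action: %d\n", find_ibss(0, 0, 5, 10, 1));      /* -> IBSS_SCAN */
        return 0;
}
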
+static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int tx_last_beacon;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *resp;
+ u8 *pos, *end;
+
+ if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
+ len < 24 + 2 || !ifibss->probe_resp)
+ return;
+
+ if (local->ops->tx_last_beacon)
+ tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
+ else
+ tx_last_beacon = 1;
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
+ " (tx_last_beacon=%d)\n",
+ sdata->dev->name, mgmt->sa, mgmt->da,
+ mgmt->bssid, tx_last_beacon);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (!tx_last_beacon)
+ return;
+
+ if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 &&
+ memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
+ return;
+
+ end = ((u8 *) mgmt) + len;
+ pos = mgmt->u.probe_req.variable;
+ if (pos[0] != WLAN_EID_SSID ||
+ pos + 2 + pos[1] > end) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
+ "from %pM\n",
+ sdata->dev->name, mgmt->sa);
+#endif
+ return;
+ }
+ if (pos[1] != 0 &&
+ (pos[1] != ifibss->ssid_len ||
+ memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len) != 0)) {
+ /* Ignore ProbeReq for foreign SSID */
+ return;
+ }
+
+ /* Reply with ProbeResp */
+ skb = skb_copy(ifibss->probe_resp, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ resp = (struct ieee80211_mgmt *) skb->data;
+ memcpy(resp->da, mgmt->sa, ETH_ALEN);
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
+ sdata->dev->name, resp->da);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+ ieee80211_tx_skb(sdata, skb, 0);
+}
+
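
The ProbeReq handler above only replies if the first IE is a well-formed SSID element and it carries either the wildcard SSID or exactly our own. A simplified, self-contained version of that check (constants redefined locally):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define EID_SSID 0

static bool probe_req_matches(const unsigned char *pos, const unsigned char *end,
                              const unsigned char *ssid, size_t ssid_len)
{
        if (pos + 2 > end || pos[0] != EID_SSID || pos + 2 + pos[1] > end)
                return false;                   /* malformed SSID IE */
        if (pos[1] == 0)
                return true;                    /* wildcard probe */
        return pos[1] == ssid_len && memcmp(pos + 2, ssid, ssid_len) == 0;
}

int main(void)
{
        const unsigned char req[] = { EID_SSID, 4, 't', 'e', 's', 't' };

        printf("match: %d\n", probe_req_matches(req, req + sizeof(req),
                                                (const unsigned char *)"test", 4));
        return 0;
}
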
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
+ return; /* ignore ProbeResp to foreign address */
+
+ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+ &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false);
+}
+
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ /* Process beacon from the current BSS */
+ baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
+}
+
+static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+
+ rx_status = (struct ieee80211_rx_status *) skb->cb;
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_REQ:
+ ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_PROBE_RESP:
+ ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+static void ieee80211_ibss_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data, u.ibss.work);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_ibss *ifibss;
+ struct sk_buff *skb;
+
+ if (!netif_running(sdata->dev))
+ return;
+
+ if (local->sw_scanning || local->hw_scanning)
+ return;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC))
+ return;
+ ifibss = &sdata->u.ibss;
+
+ while ((skb = skb_dequeue(&ifibss->skb_queue)))
+ ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+
+ if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request))
+ return;
+
+ switch (ifibss->state) {
+ case IEEE80211_IBSS_MLME_SEARCH:
+ ieee80211_sta_find_ibss(sdata);
+ break;
+ case IEEE80211_IBSS_MLME_JOINED:
+ ieee80211_sta_merge_ibss(sdata);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+
+static void ieee80211_ibss_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+
+ set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request);
+ queue_work(local->hw.workqueue, &ifibss->work);
+}
+
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ INIT_WORK(&ifibss->work, ieee80211_ibss_work);
+ setup_timer(&ifibss->timer, ieee80211_ibss_timer,
+ (unsigned long) sdata);
+ skb_queue_head_init(&ifibss->skb_queue);
+
+ ifibss->flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+}
+
+int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ ifibss->flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
+
+ if (ifibss->ssid_len)
+ ifibss->flags |= IEEE80211_IBSS_SSID_SET;
+ else
+ ifibss->flags &= ~IEEE80211_IBSS_SSID_SET;
+
+ ifibss->ibss_join_req = jiffies;
+ ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+
+ return ieee80211_sta_find_ibss(sdata);
+}
+
+int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ if (len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+
+ if (ifibss->ssid_len != len || memcmp(ifibss->ssid, ssid, len) != 0) {
+ memset(ifibss->ssid, 0, sizeof(ifibss->ssid));
+ memcpy(ifibss->ssid, ssid, len);
+ ifibss->ssid_len = len;
+ }
+
+ return ieee80211_ibss_commit(sdata);
+}
+
+int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ memcpy(ssid, ifibss->ssid, ifibss->ssid_len);
+ *len = ifibss->ssid_len;
+
+ return 0;
+}
+
+int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ if (is_valid_ether_addr(bssid)) {
+ memcpy(ifibss->bssid, bssid, ETH_ALEN);
+ ifibss->flags |= IEEE80211_IBSS_BSSID_SET;
+ } else {
+ memset(ifibss->bssid, 0, ETH_ALEN);
+ ifibss->flags &= ~IEEE80211_IBSS_BSSID_SET;
+ }
+
+ if (netif_running(sdata->dev)) {
+ if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) {
+ printk(KERN_DEBUG "%s: Failed to config new BSSID to "
+ "the low-level driver\n", sdata->dev->name);
+ }
+ }
+
+ return ieee80211_ibss_commit(sdata);
+}
+
+/* scan finished notification */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata = local->scan_sdata;
+ struct ieee80211_if_ibss *ifibss;
+
+ if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ ifibss = &sdata->u.ibss;
+ if ((!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) ||
+ !ieee80211_sta_active_ibss(sdata))
+ ieee80211_sta_find_ibss(sdata);
+ }
+}
+
+ieee80211_rx_result
+ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+
+ if (skb->len < 24)
+ return RX_DROP_MONITOR;
+
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_RESP:
+ case IEEE80211_STYPE_BEACON:
+ memcpy(skb->cb, rx_status, sizeof(*rx_status));
+ case IEEE80211_STYPE_PROBE_REQ:
+ case IEEE80211_STYPE_AUTH:
+ skb_queue_tail(&sdata->u.ibss.skb_queue, skb);
+ queue_work(local->hw.workqueue, &sdata->u.ibss.work);
+ return RX_QUEUED;
+ }
+
+ return RX_DROP_MONITOR;
+}
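
ieee80211_ibss_rx_mgmt() above defers beacons, probe requests/responses and auth frames to the IBSS work item, copying the RX status into skb->cb only for the subtypes that later need it. A minimal sketch of the subtype dispatch, assuming the standard little-endian frame-control layout and locally redefined constants:

#include <stdint.h>
#include <stdio.h>

#define FCTL_STYPE       0x00f0
#define STYPE_PROBE_REQ  0x0040
#define STYPE_PROBE_RESP 0x0050
#define STYPE_BEACON     0x0080
#define STYPE_AUTH       0x00b0

/* frame_control is transmitted little-endian on the air. */
static uint16_t le16_to_host(const uint8_t *p)
{
        return (uint16_t)p[0] | ((uint16_t)p[1] << 8);
}

static int ibss_wants_frame(const uint8_t *hdr)
{
        switch (le16_to_host(hdr) & FCTL_STYPE) {
        case STYPE_BEACON:
        case STYPE_PROBE_RESP:
        case STYPE_PROBE_REQ:
        case STYPE_AUTH:
                return 1;       /* queue for the IBSS work item */
        default:
                return 0;       /* drop to monitor interfaces */
        }
}

int main(void)
{
        const uint8_t beacon_fc[2] = { 0x80, 0x00 };    /* mgmt, beacon */

        printf("queued: %d\n", ibss_wants_frame(beacon_fc));
        return 0;
}
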
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2cb743ed9f9..fbb91f1aebb 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -239,7 +239,7 @@ struct mesh_preq_queue {
u8 flags;
};
-/* flags used in struct ieee80211_if_sta.flags */
+/* flags used in struct ieee80211_if_managed.flags */
#define IEEE80211_STA_SSID_SET BIT(0)
#define IEEE80211_STA_BSSID_SET BIT(1)
#define IEEE80211_STA_PREV_BSSID_SET BIT(2)
@@ -262,31 +262,30 @@ struct mesh_preq_queue {
#define IEEE80211_STA_REQ_AUTH 2
#define IEEE80211_STA_REQ_RUN 3
-/* STA/IBSS MLME states */
-enum ieee80211_sta_mlme_state {
- IEEE80211_STA_MLME_DISABLED,
- IEEE80211_STA_MLME_DIRECT_PROBE,
- IEEE80211_STA_MLME_AUTHENTICATE,
- IEEE80211_STA_MLME_ASSOCIATE,
- IEEE80211_STA_MLME_ASSOCIATED,
- IEEE80211_STA_MLME_IBSS_SEARCH,
- IEEE80211_STA_MLME_IBSS_JOINED,
-};
-
/* bitfield of allowed auth algs */
#define IEEE80211_AUTH_ALG_OPEN BIT(0)
#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1)
#define IEEE80211_AUTH_ALG_LEAP BIT(2)
-struct ieee80211_if_sta {
+struct ieee80211_if_managed {
struct timer_list timer;
struct timer_list chswitch_timer;
struct work_struct work;
struct work_struct chswitch_work;
+
u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
+
u8 ssid[IEEE80211_MAX_SSID_LEN];
- enum ieee80211_sta_mlme_state state;
size_t ssid_len;
+
+ enum {
+ IEEE80211_STA_MLME_DISABLED,
+ IEEE80211_STA_MLME_DIRECT_PROBE,
+ IEEE80211_STA_MLME_AUTHENTICATE,
+ IEEE80211_STA_MLME_ASSOCIATE,
+ IEEE80211_STA_MLME_ASSOCIATED,
+ } state;
+
u16 aid;
u16 ap_capab, capab;
u8 *extra_ie; /* to be added to the end of AssocReq */
@@ -319,10 +318,6 @@ struct ieee80211_if_sta {
IEEE80211_MFP_REQUIRED
} mfp; /* management frame protection */
- unsigned long ibss_join_req;
- struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
- u32 supp_rates_bits[IEEE80211_NUM_BANDS];
-
int wmm_last_param_set;
/* Extra IE data for management frames */
@@ -342,6 +337,42 @@ struct ieee80211_if_sta {
size_t ie_disassoc_len;
};
+enum ieee80211_ibss_flags {
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL = BIT(0),
+ IEEE80211_IBSS_AUTO_BSSID_SEL = BIT(1),
+ IEEE80211_IBSS_BSSID_SET = BIT(2),
+ IEEE80211_IBSS_PREV_BSSID_SET = BIT(3),
+ IEEE80211_IBSS_SSID_SET = BIT(4),
+};
+
+enum ieee80211_ibss_request {
+ IEEE80211_IBSS_REQ_RUN = 0,
+};
+
+struct ieee80211_if_ibss {
+ struct timer_list timer;
+ struct work_struct work;
+
+ struct sk_buff_head skb_queue;
+
+ u8 ssid[IEEE80211_MAX_SSID_LEN];
+ u8 ssid_len;
+
+ u32 flags;
+
+ u8 bssid[ETH_ALEN];
+
+ unsigned long request;
+
+ unsigned long ibss_join_req;
+ struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
+
+ enum {
+ IEEE80211_IBSS_MLME_SEARCH,
+ IEEE80211_IBSS_MLME_JOINED,
+ } state;
+};
+
struct ieee80211_if_mesh {
struct work_struct work;
struct timer_list housekeeping_timer;
@@ -445,7 +476,8 @@ struct ieee80211_sub_if_data {
struct ieee80211_if_ap ap;
struct ieee80211_if_wds wds;
struct ieee80211_if_vlan vlan;
- struct ieee80211_if_sta sta;
+ struct ieee80211_if_managed mgd;
+ struct ieee80211_if_ibss ibss;
#ifdef CONFIG_MAC80211_MESH
struct ieee80211_if_mesh mesh;
#endif
@@ -564,12 +596,10 @@ enum {
enum queue_stop_reason {
IEEE80211_QUEUE_STOP_REASON_DRIVER,
IEEE80211_QUEUE_STOP_REASON_PS,
- IEEE80211_QUEUE_STOP_REASON_CSA
+ IEEE80211_QUEUE_STOP_REASON_CSA,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
};
-/* maximum number of hardware queues we support. */
-#define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES)
-
struct ieee80211_master_priv {
struct ieee80211_local *local;
};
@@ -582,9 +612,15 @@ struct ieee80211_local {
const struct ieee80211_ops *ops;
- unsigned long queue_pool[BITS_TO_LONGS(QD_MAX_QUEUES)];
- unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
+ /* AC queue corresponding to each AMPDU queue */
+ s8 ampdu_ac_queue[IEEE80211_MAX_AMPDU_QUEUES];
+ unsigned int amdpu_ac_stop_refcnt[IEEE80211_MAX_AMPDU_QUEUES];
+
+ unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES +
+ IEEE80211_MAX_AMPDU_QUEUES];
+ /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */
spinlock_t queue_stop_reason_lock;
+
struct net_device *mdev; /* wmaster# - "master" 802.11 device */
int open_count;
int monitors, cooked_mntrs;
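
The new ieee80211_local fields above replace the old queue pool: queue indices at or beyond hw->queues are virtual aggregation queues, and ampdu_ac_queue[] records which AC queue each one rides on (or -1 when unused), with a stop refcount guarded by queue_stop_reason_lock. A toy model of that mapping, with invented sizes and without any locking:

#include <stdio.h>

#define N_HW_QUEUES    4
#define N_AMPDU_QUEUES 8

static signed char ampdu_ac_queue[N_AMPDU_QUEUES] = {
        -1, -1, -1, -1, -1, -1, -1, -1
};
static unsigned int ampdu_ac_stop_refcnt[N_AMPDU_QUEUES];

/* Map a queue number onto the AC queue that actually transmits for it. */
static int resolve_queue(int queue)
{
        if (queue < N_HW_QUEUES)
                return queue;                           /* a real AC queue */
        return ampdu_ac_queue[queue - N_HW_QUEUES];     /* -1 if slot unused */
}

int main(void)
{
        ampdu_ac_queue[2] = 1;          /* aggregation slot 2 rides on AC queue 1 */
        ampdu_ac_stop_refcnt[2]++;      /* one outstanding stop on that slot */
        printf("queue %d -> AC %d, stops pending %u\n",
               N_HW_QUEUES + 2, resolve_queue(N_HW_QUEUES + 2),
               ampdu_ac_stop_refcnt[2]);
        return 0;
}
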
@@ -888,34 +924,41 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
u32 changed);
void ieee80211_configure_filter(struct ieee80211_local *local);
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
/* wireless extensions */
extern const struct iw_handler_def ieee80211_iw_handler_def;
-/* STA/IBSS code */
+/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
-void ieee80211_scan_work(struct work_struct *work);
-void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
- struct ieee80211_rx_status *rx_status);
+ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status);
+int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata);
int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
-void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta);
-struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
- u8 *bssid, u8 *addr, u32 supp_rates);
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata);
int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason);
int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
-u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
- struct ieee802_11_elems *elems,
- enum ieee80211_band band);
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
- u8 *ssid, size_t ssid_len);
void ieee80211_send_pspoll(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
+/* IBSS code */
+int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata);
+int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
+int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
+int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
+ieee80211_rx_result
+ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status);
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+ u8 *bssid, u8 *addr, u32 supp_rates);
+
/* scan/BSS handling */
+void ieee80211_scan_work(struct work_struct *work);
int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
struct cfg80211_scan_request *req);
int ieee80211_scan_results(struct ieee80211_local *local,
@@ -929,6 +972,7 @@ int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata,
char *ie, size_t len);
void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_scan_failed(struct ieee80211_local *local);
int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
struct cfg80211_scan_request *req);
struct ieee80211_bss *
@@ -1042,6 +1086,25 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
enum queue_stop_reason reason);
void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
enum queue_stop_reason reason);
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason);
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason);
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg,
+ u8 *extra, size_t extra_len,
+ const u8 *bssid, int encrypt);
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+ u8 *ssid, size_t ssid_len,
+ u8 *ie, size_t ie_len);
+
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+ const size_t supp_rates_len,
+ const u8 *supp_rates);
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band);
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index df94b936526..f9f27b9cadb 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -236,7 +236,10 @@ static int ieee80211_open(struct net_device *dev)
break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
- sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ else
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
/* fall through */
default:
conf.vif = &sdata->vif;
@@ -321,11 +324,10 @@ static int ieee80211_open(struct net_device *dev)
* yet be effective. Trigger execution of ieee80211_sta_work
* to fix this.
*/
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- queue_work(local->hw.workqueue, &ifsta->work);
- }
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ queue_work(local->hw.workqueue, &sdata->u.mgd.work);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ queue_work(local->hw.workqueue, &sdata->u.ibss.work);
netif_tx_start_all_queues(dev);
@@ -368,6 +370,18 @@ static int ieee80211_stop(struct net_device *dev)
rcu_read_unlock();
/*
+ * Announce that we are leaving the network if this is a
+ * station interface type. This must be done before removing
+ * all associated stations with sta_info_flush, otherwise the
+ * STA information will be gone and no announcement can be sent.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.state != IEEE80211_STA_MLME_DISABLED)
+ ieee80211_sta_deauthenticate(sdata,
+ WLAN_REASON_DEAUTH_LEAVING);
+ }
+
+ /*
* Remove all stations associated with this interface.
*
* This must be done before calling ops->remove_interface()
@@ -452,15 +466,9 @@ static int ieee80211_stop(struct net_device *dev)
netif_addr_unlock_bh(local->mdev);
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- /* Announce that we are leaving the network. */
- if (sdata->u.sta.state != IEEE80211_STA_MLME_DISABLED)
- ieee80211_sta_deauthenticate(sdata,
- WLAN_REASON_DEAUTH_LEAVING);
-
- memset(sdata->u.sta.bssid, 0, ETH_ALEN);
- del_timer_sync(&sdata->u.sta.chswitch_timer);
- del_timer_sync(&sdata->u.sta.timer);
+ memset(sdata->u.mgd.bssid, 0, ETH_ALEN);
+ del_timer_sync(&sdata->u.mgd.chswitch_timer);
+ del_timer_sync(&sdata->u.mgd.timer);
/*
* If the timer fired while we waited for it, it will have
* requeued the work. Now the work will be running again
@@ -468,8 +476,8 @@ static int ieee80211_stop(struct net_device *dev)
* whether the interface is running, which, at this point,
* it no longer is.
*/
- cancel_work_sync(&sdata->u.sta.work);
- cancel_work_sync(&sdata->u.sta.chswitch_work);
+ cancel_work_sync(&sdata->u.mgd.work);
+ cancel_work_sync(&sdata->u.mgd.chswitch_work);
/*
* When we get here, the interface is marked down.
* Call synchronize_rcu() to wait for the RX path
@@ -477,13 +485,22 @@ static int ieee80211_stop(struct net_device *dev)
* frames at this very time on another CPU.
*/
synchronize_rcu();
- skb_queue_purge(&sdata->u.sta.skb_queue);
+ skb_queue_purge(&sdata->u.mgd.skb_queue);
- sdata->u.sta.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED |
+ sdata->u.mgd.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED |
IEEE80211_STA_TKIP_WEP_USED);
- kfree(sdata->u.sta.extra_ie);
- sdata->u.sta.extra_ie = NULL;
- sdata->u.sta.extra_ie_len = 0;
+ kfree(sdata->u.mgd.extra_ie);
+ sdata->u.mgd.extra_ie = NULL;
+ sdata->u.mgd.extra_ie_len = 0;
+ /* fall through */
+ case NL80211_IFTYPE_ADHOC:
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ memset(sdata->u.ibss.bssid, 0, ETH_ALEN);
+ del_timer_sync(&sdata->u.ibss.timer);
+ cancel_work_sync(&sdata->u.ibss.work);
+ synchronize_rcu();
+ skb_queue_purge(&sdata->u.ibss.skb_queue);
+ }
/* fall through */
case NL80211_IFTYPE_MESH_POINT:
if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -629,19 +646,20 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
if (ieee80211_vif_is_mesh(&sdata->vif))
mesh_rmc_free(sdata);
break;
- case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
- kfree(sdata->u.sta.extra_ie);
- kfree(sdata->u.sta.assocreq_ies);
- kfree(sdata->u.sta.assocresp_ies);
- kfree_skb(sdata->u.sta.probe_resp);
- kfree(sdata->u.sta.ie_probereq);
- kfree(sdata->u.sta.ie_proberesp);
- kfree(sdata->u.sta.ie_auth);
- kfree(sdata->u.sta.ie_assocreq);
- kfree(sdata->u.sta.ie_reassocreq);
- kfree(sdata->u.sta.ie_deauth);
- kfree(sdata->u.sta.ie_disassoc);
+ kfree_skb(sdata->u.ibss.probe_resp);
+ break;
+ case NL80211_IFTYPE_STATION:
+ kfree(sdata->u.mgd.extra_ie);
+ kfree(sdata->u.mgd.assocreq_ies);
+ kfree(sdata->u.mgd.assocresp_ies);
+ kfree(sdata->u.mgd.ie_probereq);
+ kfree(sdata->u.mgd.ie_proberesp);
+ kfree(sdata->u.mgd.ie_auth);
+ kfree(sdata->u.mgd.ie_assocreq);
+ kfree(sdata->u.mgd.ie_reassocreq);
+ kfree(sdata->u.mgd.ie_deauth);
+ kfree(sdata->u.mgd.ie_disassoc);
break;
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_AP_VLAN:
@@ -708,9 +726,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
INIT_LIST_HEAD(&sdata->u.ap.vlans);
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
ieee80211_sta_setup_sdata(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ ieee80211_ibss_setup_sdata(sdata);
+ break;
case NL80211_IFTYPE_MESH_POINT:
if (ieee80211_vif_is_mesh(&sdata->vif))
ieee80211_mesh_init_sdata(sdata);
@@ -798,6 +818,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));
+ ndev->features |= NETIF_F_NETNS_LOCAL;
/* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */
sdata = netdev_priv(ndev);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 19b480de4bb..687acf23054 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -400,7 +400,7 @@ void ieee80211_key_link(struct ieee80211_key *key,
*/
/* same here, the AP could be using QoS */
- ap = sta_info_get(key->local, key->sdata->u.sta.bssid);
+ ap = sta_info_get(key->local, key->sdata->u.mgd.bssid);
if (ap) {
if (test_sta_flags(ap, WLAN_STA_WME))
key->conf.flags |=
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 5667f4e8067..f38db4d37e5 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -169,9 +169,10 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
memset(&conf, 0, sizeof(conf));
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- conf.bssid = sdata->u.sta.bssid;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ conf.bssid = sdata->u.mgd.bssid;
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ conf.bssid = sdata->u.ibss.bssid;
else if (sdata->vif.type == NL80211_IFTYPE_AP)
conf.bssid = sdata->dev->dev_addr;
else if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -210,7 +211,7 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
!!rcu_dereference(sdata->u.ap.beacon);
break;
case NL80211_IFTYPE_ADHOC:
- conf.enable_beacon = !!sdata->u.sta.probe_resp;
+ conf.enable_beacon = !!sdata->u.ibss.probe_resp;
break;
case NL80211_IFTYPE_MESH_POINT:
conf.enable_beacon = true;
@@ -705,7 +706,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
const struct ieee80211_ops *ops)
{
struct ieee80211_local *local;
- int priv_size;
+ int priv_size, i;
struct wiphy *wiphy;
/* Ensure 32-byte alignment of our private data and hw private data.
@@ -779,6 +780,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
setup_timer(&local->dynamic_ps_timer,
ieee80211_dynamic_ps_timer, (unsigned long) local);
+ for (i = 0; i < IEEE80211_MAX_AMPDU_QUEUES; i++)
+ local->ampdu_ac_queue[i] = -1;
+ /* using an s8 won't work with more than that */
+ BUILD_BUG_ON(IEEE80211_MAX_AMPDU_QUEUES > 127);
+
sta_info_init(local);
tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
@@ -855,6 +861,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* mac80211 always supports monitor */
local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
+ else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
+
result = wiphy_register(local->hw.wiphy);
if (result < 0)
goto fail_wiphy_register;
@@ -872,7 +883,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv),
"wmaster%d", ieee80211_master_setup,
- ieee80211_num_queues(hw));
+ hw->queues);
if (!mdev)
goto fail_mdev_alloc;
@@ -916,6 +927,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy));
+ local->mdev->features |= NETIF_F_NETNS_LOCAL;
result = register_netdevice(local->mdev);
if (result < 0)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fbb766afe59..841b8450b3d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -15,11 +15,8 @@
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
-#include <linux/wireless.h>
-#include <linux/random.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
-#include <net/iw_handler.h>
#include <net/mac80211.h>
#include <asm/unaligned.h>
@@ -35,15 +32,6 @@
#define IEEE80211_MONITORING_INTERVAL (2 * HZ)
#define IEEE80211_PROBE_INTERVAL (60 * HZ)
#define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
-#define IEEE80211_SCAN_INTERVAL (2 * HZ)
-#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
-#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
-
-#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
-#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
-
-#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
-
/* utils */
static int ecw2cw(int ecw)
@@ -92,43 +80,6 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss,
return count;
}
-/* also used by mesh code */
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
- struct ieee802_11_elems *elems,
- enum ieee80211_band band)
-{
- struct ieee80211_supported_band *sband;
- struct ieee80211_rate *bitrates;
- size_t num_rates;
- u32 supp_rates;
- int i, j;
- sband = local->hw.wiphy->bands[band];
-
- if (!sband) {
- WARN_ON(1);
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
- }
-
- bitrates = sband->bitrates;
- num_rates = sband->n_bitrates;
- supp_rates = 0;
- for (i = 0; i < elems->supp_rates_len +
- elems->ext_supp_rates_len; i++) {
- u8 rate = 0;
- int own_rate;
- if (i < elems->supp_rates_len)
- rate = elems->supp_rates[i];
- else if (elems->ext_supp_rates)
- rate = elems->ext_supp_rates
- [i - elems->supp_rates_len];
- own_rate = 5 * (rate & 0x7f);
- for (j = 0; j < num_rates; j++)
- if (bitrates[j].bitrate == own_rate)
- supp_rates |= BIT(j);
- }
- return supp_rates;
-}
-
/* frame sending functions */
static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
@@ -137,113 +88,9 @@ static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
memcpy(skb_put(skb, ies_len), ies, ies_len);
}
-/* also used by scanning code */
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
- u8 *ssid, size_t ssid_len)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_supported_band *sband;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
- u8 *pos, *supp_rates, *esupp_rates = NULL;
- int i;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
- sdata->u.sta.ie_probereq_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
- "request\n", sdata->dev->name);
- return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
- memset(mgmt, 0, 24);
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_PROBE_REQ);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- if (dst) {
- memcpy(mgmt->da, dst, ETH_ALEN);
- memcpy(mgmt->bssid, dst, ETH_ALEN);
- } else {
- memset(mgmt->da, 0xff, ETH_ALEN);
- memset(mgmt->bssid, 0xff, ETH_ALEN);
- }
- pos = skb_put(skb, 2 + ssid_len);
- *pos++ = WLAN_EID_SSID;
- *pos++ = ssid_len;
- memcpy(pos, ssid, ssid_len);
-
- supp_rates = skb_put(skb, 2);
- supp_rates[0] = WLAN_EID_SUPP_RATES;
- supp_rates[1] = 0;
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- for (i = 0; i < sband->n_bitrates; i++) {
- struct ieee80211_rate *rate = &sband->bitrates[i];
- if (esupp_rates) {
- pos = skb_put(skb, 1);
- esupp_rates[1]++;
- } else if (supp_rates[1] == 8) {
- esupp_rates = skb_put(skb, 3);
- esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
- esupp_rates[1] = 1;
- pos = &esupp_rates[2];
- } else {
- pos = skb_put(skb, 1);
- supp_rates[1]++;
- }
- *pos = rate->bitrate / 5;
- }
-
- add_extra_ies(skb, sdata->u.sta.ie_probereq,
- sdata->u.sta.ie_probereq_len);
-
- ieee80211_tx_skb(sdata, skb, 0);
-}
-
-static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- int transaction, u8 *extra, size_t extra_len,
- int encrypt)
-{
- struct ieee80211_local *local = sdata->local;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom +
- sizeof(*mgmt) + 6 + extra_len +
- sdata->u.sta.ie_auth_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
- "frame\n", sdata->dev->name);
- return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
- memset(mgmt, 0, 24 + 6);
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_AUTH);
- if (encrypt)
- mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
- mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
- mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
- ifsta->auth_transaction = transaction + 1;
- mgmt->u.auth.status_code = cpu_to_le16(0);
- if (extra)
- memcpy(skb_put(skb, extra_len), extra, extra_len);
- add_extra_ies(skb, sdata->u.sta.ie_auth, sdata->u.sta.ie_auth_len);
-
- ieee80211_tx_skb(sdata, skb, encrypt);
-}
-
-static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
@@ -256,17 +103,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
u32 rates = 0;
size_t e_ies_len;
- if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
- e_ies = sdata->u.sta.ie_reassocreq;
- e_ies_len = sdata->u.sta.ie_reassocreq_len;
+ if (ifmgd->flags & IEEE80211_IBSS_PREV_BSSID_SET) {
+ e_ies = sdata->u.mgd.ie_reassocreq;
+ e_ies_len = sdata->u.mgd.ie_reassocreq_len;
} else {
- e_ies = sdata->u.sta.ie_assocreq;
- e_ies_len = sdata->u.sta.ie_assocreq_len;
+ e_ies = sdata->u.mgd.ie_assocreq;
+ e_ies_len = sdata->u.mgd.ie_assocreq_len;
}
skb = dev_alloc_skb(local->hw.extra_tx_headroom +
- sizeof(*mgmt) + 200 + ifsta->extra_ie_len +
- ifsta->ssid_len + e_ies_len);
+ sizeof(*mgmt) + 200 + ifmgd->extra_ie_len +
+ ifmgd->ssid_len + e_ies_len);
if (!skb) {
printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
"frame\n", sdata->dev->name);
@@ -276,7 +123,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
- capab = ifsta->capab;
+ capab = ifmgd->capab;
if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) {
if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
@@ -285,9 +132,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
}
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (bss) {
if (bss->cbss.capability & WLAN_CAPABILITY_PRIVACY)
capab |= WLAN_CAPABILITY_PRIVACY;
@@ -312,18 +159,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
memset(mgmt, 0, 24);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
- if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
+ if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) {
skb_put(skb, 10);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_REASSOC_REQ);
mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
mgmt->u.reassoc_req.listen_interval =
cpu_to_le16(local->hw.conf.listen_interval);
- memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid,
+ memcpy(mgmt->u.reassoc_req.current_ap, ifmgd->prev_bssid,
ETH_ALEN);
} else {
skb_put(skb, 4);
@@ -335,10 +182,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
}
/* SSID */
- ies = pos = skb_put(skb, 2 + ifsta->ssid_len);
+ ies = pos = skb_put(skb, 2 + ifmgd->ssid_len);
*pos++ = WLAN_EID_SSID;
- *pos++ = ifsta->ssid_len;
- memcpy(pos, ifsta->ssid, ifsta->ssid_len);
+ *pos++ = ifmgd->ssid_len;
+ memcpy(pos, ifmgd->ssid, ifmgd->ssid_len);
/* add all rates which were marked to be used above */
supp_rates_len = rates_len;
@@ -393,12 +240,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
}
}
- if (ifsta->extra_ie) {
- pos = skb_put(skb, ifsta->extra_ie_len);
- memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len);
+ if (ifmgd->extra_ie) {
+ pos = skb_put(skb, ifmgd->extra_ie_len);
+ memcpy(pos, ifmgd->extra_ie, ifmgd->extra_ie_len);
}
- if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
+ if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) {
pos = skb_put(skb, 9);
*pos++ = WLAN_EID_VENDOR_SPECIFIC;
*pos++ = 7; /* len */
@@ -418,11 +265,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
* mode (11a/b/g) if any one of these ciphers is
* configured as pairwise.
*/
- if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
+ if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
sband->ht_cap.ht_supported &&
(ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) &&
ht_ie[1] >= sizeof(struct ieee80211_ht_info) &&
- (!(ifsta->flags & IEEE80211_STA_TKIP_WEP_USED))) {
+ (!(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))) {
struct ieee80211_ht_info *ht_info =
(struct ieee80211_ht_info *)(ht_ie + 2);
u16 cap = sband->ht_cap.cap;
@@ -459,11 +306,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
add_extra_ies(skb, e_ies, e_ies_len);
- kfree(ifsta->assocreq_ies);
- ifsta->assocreq_ies_len = (skb->data + skb->len) - ies;
- ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL);
- if (ifsta->assocreq_ies)
- memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
+ kfree(ifmgd->assocreq_ies);
+ ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies;
+ ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL);
+ if (ifmgd->assocreq_ies)
+ memcpy(ifmgd->assocreq_ies, ies, ifmgd->assocreq_ies_len);
ieee80211_tx_skb(sdata, skb, 0);
}
@@ -473,18 +320,18 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
u16 stype, u16 reason)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u8 *ies;
size_t ies_len;
if (stype == IEEE80211_STYPE_DEAUTH) {
- ies = sdata->u.sta.ie_deauth;
- ies_len = sdata->u.sta.ie_deauth_len;
+ ies = sdata->u.mgd.ie_deauth;
+ ies_len = sdata->u.mgd.ie_deauth_len;
} else {
- ies = sdata->u.sta.ie_disassoc;
- ies_len = sdata->u.sta.ie_disassoc_len;
+ ies = sdata->u.mgd.ie_disassoc;
+ ies_len = sdata->u.mgd.ie_disassoc_len;
}
skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) +
@@ -498,9 +345,9 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
memset(mgmt, 0, 24);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
skb_put(skb, 2);
/* u.deauth.reason_code == u.disassoc.reason_code */
@@ -508,13 +355,13 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
add_extra_ies(skb, ies, ies_len);
- ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED);
+ ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED);
}
void ieee80211_send_pspoll(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_pspoll *pspoll;
struct sk_buff *skb;
u16 fc;
@@ -531,43 +378,20 @@ void ieee80211_send_pspoll(struct ieee80211_local *local,
memset(pspoll, 0, sizeof(*pspoll));
fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM;
pspoll->frame_control = cpu_to_le16(fc);
- pspoll->aid = cpu_to_le16(ifsta->aid);
+ pspoll->aid = cpu_to_le16(ifmgd->aid);
/* aid in PS-Poll has its two MSBs each set to 1 */
pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14);
- memcpy(pspoll->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN);
memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN);
ieee80211_tx_skb(sdata, skb, 0);
-
- return;
}
/* MLME */
-static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
- const size_t supp_rates_len,
- const u8 *supp_rates)
-{
- struct ieee80211_local *local = sdata->local;
- int i, have_higher_than_11mbit = 0;
-
- /* cf. IEEE 802.11 9.2.12 */
- for (i = 0; i < supp_rates_len; i++)
- if ((supp_rates[i] & 0x7f) * 5 > 110)
- have_higher_than_11mbit = 1;
-
- if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
- have_higher_than_11mbit)
- sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
- else
- sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
-
- ieee80211_set_wmm_default(sdata);
-}
-
static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
- struct ieee80211_if_sta *ifsta,
+ struct ieee80211_if_managed *ifmgd,
u8 *wmm_param, size_t wmm_param_len)
{
struct ieee80211_tx_queue_params params;
@@ -575,7 +399,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
int count;
u8 *pos;
- if (!(ifsta->flags & IEEE80211_STA_WMM_ENABLED))
+ if (!(ifmgd->flags & IEEE80211_STA_WMM_ENABLED))
return;
if (!wmm_param)
@@ -584,18 +408,15 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1)
return;
count = wmm_param[6] & 0x0f;
- if (count == ifsta->wmm_last_param_set)
+ if (count == ifmgd->wmm_last_param_set)
return;
- ifsta->wmm_last_param_set = count;
+ ifmgd->wmm_last_param_set = count;
pos = wmm_param + 8;
left = wmm_param_len - 8;
memset(&params, 0, sizeof(params));
- if (!local->ops->conf_tx)
- return;
-
local->wmm_acm = 0;
for (; left >= 4; left -= 4, pos += 4) {
int aci = (pos[0] >> 5) & 0x03;
@@ -603,26 +424,26 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
int queue;
switch (aci) {
- case 1:
+ case 1: /* AC_BK */
queue = 3;
if (acm)
- local->wmm_acm |= BIT(0) | BIT(3);
+ local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
break;
- case 2:
+ case 2: /* AC_VI */
queue = 1;
if (acm)
- local->wmm_acm |= BIT(4) | BIT(5);
+ local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
break;
- case 3:
+ case 3: /* AC_VO */
queue = 0;
if (acm)
- local->wmm_acm |= BIT(6) | BIT(7);
+ local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
break;
- case 0:
+ case 0: /* AC_BE */
default:
queue = 2;
if (acm)
- local->wmm_acm |= BIT(1) | BIT(2);
+ local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
break;
}
@@ -636,9 +457,8 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
local->mdev->name, queue, aci, acm, params.aifs, params.cw_min,
params.cw_max, params.txop);
#endif
- /* TODO: handle ACM (block TX, fallback to next lowest allowed
- * AC for now) */
- if (local->ops->conf_tx(local_to_hw(local), queue, &params)) {
+ if (local->ops->conf_tx &&
+ local->ops->conf_tx(local_to_hw(local), queue, &params)) {
printk(KERN_DEBUG "%s: failed to set TX queue "
"parameters for queue %d\n", local->mdev->name, queue);
}
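Not part of the patch: the WMM hunk above labels the ACI cases, changes which per-UP admission-control bits are recorded for each access category, and only skips conf_tx per call instead of bailing out early. A compact stand-alone sketch of that ACI-to-queue / ACM-bitmask mapping, mirroring the values used in the hunk (an illustration, not the authoritative mac80211 table):

#include <stdio.h>

/* Map a WMM ACI value (0..3) to a TX queue index and, when admission
 * control is mandatory, record a bitmask of the user priorities it
 * covers. Queue numbers and UP pairs mirror the patch hunk. */
static int aci_to_queue(int aci, int acm, unsigned int *acm_bits)
{
	switch (aci) {
	case 1: /* AC_BK */
		if (acm)
			*acm_bits |= (1 << 1) | (1 << 2); /* BK/- */
		return 3;
	case 2: /* AC_VI */
		if (acm)
			*acm_bits |= (1 << 4) | (1 << 5); /* CL/VI */
		return 1;
	case 3: /* AC_VO */
		if (acm)
			*acm_bits |= (1 << 6) | (1 << 7); /* VO/NC */
		return 0;
	case 0: /* AC_BE */
	default:
		if (acm)
			*acm_bits |= (1 << 0) | (1 << 3); /* BE/EE */
		return 2;
	}
}

int main(void)
{
	unsigned int acm_bits = 0;
	int queue = aci_to_queue(2, 1, &acm_bits); /* AC_VI with ACM set */

	printf("queue=%d acm_bits=0x%02x\n", queue, acm_bits); /* queue=1 acm_bits=0x30 */
	return 0;
}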
@@ -671,7 +491,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
#endif
u32 changed = 0;
bool use_protection;
@@ -694,7 +514,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n",
sdata->dev->name,
use_protection ? "enabled" : "disabled",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_cts_prot = use_protection;
@@ -708,7 +528,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
" (BSSID=%pM)\n",
sdata->dev->name,
use_short_preamble ? "short" : "long",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_short_preamble = use_short_preamble;
@@ -722,7 +542,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
" (BSSID=%pM)\n",
sdata->dev->name,
use_short_slot ? "short" : "long",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_short_slot = use_short_slot;
@@ -732,57 +552,57 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
return changed;
}
-static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata)
{
union iwreq_data wrqu;
+
memset(&wrqu, 0, sizeof(wrqu));
- if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
- memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED)
+ memcpy(wrqu.ap_addr.sa_data, sdata->u.mgd.bssid, ETH_ALEN);
wrqu.ap_addr.sa_family = ARPHRD_ETHER;
wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
}
-static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
char *buf;
size_t len;
int i;
union iwreq_data wrqu;
- if (!ifsta->assocreq_ies && !ifsta->assocresp_ies)
+ if (!ifmgd->assocreq_ies && !ifmgd->assocresp_ies)
return;
- buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len +
- ifsta->assocresp_ies_len), GFP_KERNEL);
+ buf = kmalloc(50 + 2 * (ifmgd->assocreq_ies_len +
+ ifmgd->assocresp_ies_len), GFP_KERNEL);
if (!buf)
return;
len = sprintf(buf, "ASSOCINFO(");
- if (ifsta->assocreq_ies) {
+ if (ifmgd->assocreq_ies) {
len += sprintf(buf + len, "ReqIEs=");
- for (i = 0; i < ifsta->assocreq_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocreq_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocreq_ies[i]);
+ ifmgd->assocreq_ies[i]);
}
}
- if (ifsta->assocresp_ies) {
- if (ifsta->assocreq_ies)
+ if (ifmgd->assocresp_ies) {
+ if (ifmgd->assocreq_ies)
len += sprintf(buf + len, " ");
len += sprintf(buf + len, "RespIEs=");
- for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocresp_ies[i]);
+ ifmgd->assocresp_ies[i]);
}
}
len += sprintf(buf + len, ")");
if (len > IW_CUSTOM_MAX) {
len = sprintf(buf, "ASSOCRESPIE=");
- for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocresp_ies[i]);
+ ifmgd->assocresp_ies[i]);
}
}
@@ -797,20 +617,20 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
u32 bss_info_changed)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_conf *conf = &local_to_hw(local)->conf;
struct ieee80211_bss *bss;
bss_info_changed |= BSS_CHANGED_ASSOC;
- ifsta->flags |= IEEE80211_STA_ASSOCIATED;
+ ifmgd->flags |= IEEE80211_STA_ASSOCIATED;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
conf->channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (bss) {
/* set timing information */
sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval;
@@ -823,11 +643,11 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
ieee80211_rx_bss_put(local, bss);
}
- ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
- memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
- ieee80211_sta_send_associnfo(sdata, ifsta);
+ ifmgd->flags |= IEEE80211_STA_PREV_BSSID_SET;
+ memcpy(ifmgd->prev_bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ ieee80211_sta_send_associnfo(sdata);
- ifsta->last_probe = jiffies;
+ ifmgd->last_probe = jiffies;
ieee80211_led_assoc(local, 1);
sdata->vif.bss_conf.assoc = 1;
@@ -856,70 +676,74 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
netif_tx_start_all_queues(sdata->dev);
netif_carrier_on(sdata->dev);
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ ieee80211_sta_send_apinfo(sdata);
}
-static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata)
{
- ifsta->direct_probe_tries++;
- if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->direct_probe_tries++;
+ if (ifmgd->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
/*
* Most likely AP is not in the range so remove the
* bss information associated to the AP
*/
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n",
- sdata->dev->name, ifsta->bssid,
- ifsta->direct_probe_tries);
+ sdata->dev->name, ifmgd->bssid,
+ ifmgd->direct_probe_tries);
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
- set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request);
+ set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifmgd->request);
/* Direct probe is sent to broadcast address as some APs
* will not answer to direct packet in unassociated state.
*/
ieee80211_send_probe_req(sdata, NULL,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len, NULL, 0);
- mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
}
-static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata)
{
- ifsta->auth_tries++;
- if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->auth_tries++;
+ if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
printk(KERN_DEBUG "%s: authentication with AP %pM"
" timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
printk(KERN_DEBUG "%s: authenticate with AP %pM\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
- ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0);
+ ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0,
+ ifmgd->bssid, 0);
+ ifmgd->auth_transaction = 2;
- mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
}
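Not part of the patch: ieee80211_authenticate() above now passes the algorithm and BSSID to ieee80211_send_auth() explicitly and advances auth_transaction itself, but the surrounding retry logic is unchanged: give up after IEEE80211_AUTH_MAX_TRIES attempts, otherwise re-arm the timeout. A toy sketch of that bounded-retry pattern, with a made-up limit standing in for the kernel constant:

#include <stdbool.h>
#include <stdio.h>

#define AUTH_MAX_TRIES 3 /* stand-in for IEEE80211_AUTH_MAX_TRIES */

struct mgd_state {
	int auth_tries;
	bool disabled;     /* corresponds to IEEE80211_STA_MLME_DISABLED */
	bool timer_armed;  /* corresponds to mod_timer(&ifmgd->timer, ...) */
};

/* One authentication attempt: either give up once the retry budget is
 * exhausted, or (pretend to) send an auth frame and re-arm the timeout. */
static void authenticate(struct mgd_state *s)
{
	if (++s->auth_tries > AUTH_MAX_TRIES) {
		s->disabled = true;
		s->timer_armed = false;
		return;
	}
	/* send the auth frame here; advance the expected transaction number */
	s->timer_armed = true;
}

int main(void)
{
	struct mgd_state s = { 0 };
	int i;

	for (i = 0; i < 4; i++)
		authenticate(&s);
	printf("tries=%d disabled=%d\n", s.auth_tries, s.disabled); /* tries=4 disabled=1 */
	return 0;
}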
/*
@@ -927,27 +751,28 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
* if self disconnected or a reason code from the AP.
*/
static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta, bool deauth,
- bool self_disconnected, u16 reason)
+ bool deauth, bool self_disconnected,
+ u16 reason)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
u32 changed = 0, config_changed = 0;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
}
if (deauth) {
- ifsta->direct_probe_tries = 0;
- ifsta->auth_tries = 0;
+ ifmgd->direct_probe_tries = 0;
+ ifmgd->auth_tries = 0;
}
- ifsta->assoc_scan_tries = 0;
- ifsta->assoc_tries = 0;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->assoc_tries = 0;
netif_tx_stop_all_queues(sdata->dev);
netif_carrier_off(sdata->dev);
@@ -963,20 +788,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
IEEE80211_STYPE_DISASSOC, reason);
}
- ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
+ ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
changed |= ieee80211_reset_erp_info(sdata);
ieee80211_led_assoc(local, 0);
changed |= BSS_CHANGED_ASSOC;
sdata->vif.bss_conf.assoc = false;
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ ieee80211_sta_send_apinfo(sdata);
if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) {
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
}
rcu_read_unlock();
@@ -999,7 +824,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
@@ -1020,27 +845,27 @@ static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
return 1;
}
-static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_bss *bss;
int bss_privacy;
int wep_privacy;
int privacy_invoked;
- if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL))
+ if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL))
return 0;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (!bss)
return 0;
bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY);
wep_privacy = !!ieee80211_sta_wep_configured(sdata);
- privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED);
+ privacy_invoked = !!(ifmgd->flags & IEEE80211_STA_PRIVACY_INVOKED);
ieee80211_rx_bss_put(local, bss);
@@ -1050,41 +875,42 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
return 1;
}
-static void ieee80211_associate(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_associate(struct ieee80211_sub_if_data *sdata)
{
- ifsta->assoc_tries++;
- if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->assoc_tries++;
+ if (ifmgd->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
printk(KERN_DEBUG "%s: association with AP %pM"
" timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
printk(KERN_DEBUG "%s: associate with AP %pM\n",
- sdata->dev->name, ifsta->bssid);
- if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+ sdata->dev->name, ifmgd->bssid);
+ if (ieee80211_privacy_mismatch(sdata)) {
printk(KERN_DEBUG "%s: mismatch in privacy configuration and "
"mixed-cell disabled - abort association\n", sdata->dev->name);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
return;
}
- ieee80211_send_assoc(sdata, ifsta);
+ ieee80211_send_assoc(sdata);
- mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
}
-static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_associated(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
int disassoc;
@@ -1094,38 +920,40 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
* for better APs. */
/* TODO: remove expired BSSes */
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATED;
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATED;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
disassoc = 1;
} else {
disassoc = 0;
if (time_after(jiffies,
sta->last_rx + IEEE80211_MONITORING_INTERVAL)) {
- if (ifsta->flags & IEEE80211_STA_PROBEREQ_POLL) {
+ if (ifmgd->flags & IEEE80211_STA_PROBEREQ_POLL) {
printk(KERN_DEBUG "%s: No ProbeResp from "
"current AP %pM - assume out of "
"range\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
disassoc = 1;
} else
- ieee80211_send_probe_req(sdata, ifsta->bssid,
- ifsta->ssid,
- ifsta->ssid_len);
- ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL;
+ ieee80211_send_probe_req(sdata, ifmgd->bssid,
+ ifmgd->ssid,
+ ifmgd->ssid_len,
+ NULL, 0);
+ ifmgd->flags ^= IEEE80211_STA_PROBEREQ_POLL;
} else {
- ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
- if (time_after(jiffies, ifsta->last_probe +
+ ifmgd->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
+ if (time_after(jiffies, ifmgd->last_probe +
IEEE80211_PROBE_INTERVAL)) {
- ifsta->last_probe = jiffies;
- ieee80211_send_probe_req(sdata, ifsta->bssid,
- ifsta->ssid,
- ifsta->ssid_len);
+ ifmgd->last_probe = jiffies;
+ ieee80211_send_probe_req(sdata, ifmgd->bssid,
+ ifmgd->ssid,
+ ifmgd->ssid_len,
+ NULL, 0);
}
}
}
@@ -1133,25 +961,25 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (disassoc)
- ieee80211_set_disassoc(sdata, ifsta, true, true,
+ ieee80211_set_disassoc(sdata, true, true,
WLAN_REASON_PREV_AUTH_NOT_VALID);
else
- mod_timer(&ifsta->timer, jiffies +
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_MONITORING_INTERVAL);
}
-static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name);
- ifsta->flags |= IEEE80211_STA_AUTHENTICATED;
- ieee80211_associate(sdata, ifsta);
+ ifmgd->flags |= IEEE80211_STA_AUTHENTICATED;
+ ieee80211_associate(sdata);
}
static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
@@ -1162,59 +990,37 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems);
if (!elems.challenge)
return;
- ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2,
- elems.challenge_len + 2, 1);
-}
-
-static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_mgmt *mgmt,
- size_t len)
-{
- u16 auth_alg, auth_transaction, status_code;
-
- if (len < 24 + 6)
- return;
-
- auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
- auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
- status_code = le16_to_cpu(mgmt->u.auth.status_code);
-
- /*
- * IEEE 802.11 standard does not require authentication in IBSS
- * networks and most implementations do not seem to use it.
- * However, try to reply to authentication attempts if someone
- * has actually implemented this.
- */
- if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
- ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0);
+ ieee80211_send_auth(sdata, 3, sdata->u.mgd.auth_alg,
+ elems.challenge - 2, elems.challenge_len + 2,
+ sdata->u.mgd.bssid, 1);
+ sdata->u.mgd.auth_transaction = 4;
}
static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 auth_alg, auth_transaction, status_code;
- if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE)
+ if (ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE)
return;
if (len < 24 + 6)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
return;
- if (memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
return;
auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
status_code = le16_to_cpu(mgmt->u.auth.status_code);
- if (auth_alg != ifsta->auth_alg ||
- auth_transaction != ifsta->auth_transaction)
+ if (auth_alg != ifmgd->auth_alg ||
+ auth_transaction != ifmgd->auth_transaction)
return;
if (status_code != WLAN_STATUS_SUCCESS) {
@@ -1223,15 +1029,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
const int num_algs = ARRAY_SIZE(algs);
int i, pos;
algs[0] = algs[1] = algs[2] = 0xff;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
algs[0] = WLAN_AUTH_OPEN;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
algs[1] = WLAN_AUTH_SHARED_KEY;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
algs[2] = WLAN_AUTH_LEAP;
- if (ifsta->auth_alg == WLAN_AUTH_OPEN)
+ if (ifmgd->auth_alg == WLAN_AUTH_OPEN)
pos = 0;
- else if (ifsta->auth_alg == WLAN_AUTH_SHARED_KEY)
+ else if (ifmgd->auth_alg == WLAN_AUTH_SHARED_KEY)
pos = 1;
else
pos = 2;
@@ -1239,101 +1045,101 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
pos++;
if (pos >= num_algs)
pos = 0;
- if (algs[pos] == ifsta->auth_alg ||
+ if (algs[pos] == ifmgd->auth_alg ||
algs[pos] == 0xff)
continue;
if (algs[pos] == WLAN_AUTH_SHARED_KEY &&
!ieee80211_sta_wep_configured(sdata))
continue;
- ifsta->auth_alg = algs[pos];
+ ifmgd->auth_alg = algs[pos];
break;
}
}
return;
}
- switch (ifsta->auth_alg) {
+ switch (ifmgd->auth_alg) {
case WLAN_AUTH_OPEN:
case WLAN_AUTH_LEAP:
- ieee80211_auth_completed(sdata, ifsta);
+ ieee80211_auth_completed(sdata);
break;
case WLAN_AUTH_SHARED_KEY:
- if (ifsta->auth_transaction == 4)
- ieee80211_auth_completed(sdata, ifsta);
+ if (ifmgd->auth_transaction == 4)
+ ieee80211_auth_completed(sdata);
else
- ieee80211_auth_challenge(sdata, ifsta, mgmt, len);
+ ieee80211_auth_challenge(sdata, mgmt, len);
break;
}
}
static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 reason_code;
if (len < 24 + 2)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN))
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
return;
reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
- if (ifsta->flags & IEEE80211_STA_AUTHENTICATED)
+ if (ifmgd->flags & IEEE80211_STA_AUTHENTICATED)
printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n",
sdata->dev->name, reason_code);
- if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE ||
- ifsta->state == IEEE80211_STA_MLME_ASSOCIATE ||
- ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
- mod_timer(&ifsta->timer, jiffies +
+ if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE ||
+ ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE ||
+ ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_RETRY_AUTH_INTERVAL);
}
- ieee80211_set_disassoc(sdata, ifsta, true, false, 0);
- ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED;
+ ieee80211_set_disassoc(sdata, true, false, 0);
+ ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED;
}
static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 reason_code;
if (len < 24 + 2)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN))
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
return;
reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
- if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
+ if (ifmgd->flags & IEEE80211_STA_ASSOCIATED)
printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n",
sdata->dev->name, reason_code);
- if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
- mod_timer(&ifsta->timer, jiffies +
+ if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_RETRY_AUTH_INTERVAL);
}
- ieee80211_set_disassoc(sdata, ifsta, false, false, reason_code);
+ ieee80211_set_disassoc(sdata, false, false, reason_code);
}
static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len,
int reassoc)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
struct sta_info *sta;
@@ -1350,13 +1156,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
/* AssocResp and ReassocResp have identical structure, so process both
* of them in this function. */
- if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE)
+ if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE)
return;
if (len < 24 + 6)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
return;
capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
@@ -1381,7 +1187,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
"comeback duration %u TU (%u ms)\n",
sdata->dev->name, tu, ms);
if (ms > IEEE80211_ASSOC_TIMEOUT)
- mod_timer(&ifsta->timer,
+ mod_timer(&ifmgd->timer,
jiffies + msecs_to_jiffies(ms));
return;
}
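Not part of the patch: the hunk above only re-points the association "comeback" timer at ifmgd->timer; the duration it logs is expressed both in TUs and in milliseconds. The conversion uses the standard 802.11 time unit of 1 TU = 1024 microseconds; a one-liner sketch:

#include <stdio.h>

/* 802.11 timing unit: 1 TU = 1024 microseconds, so ms = tu * 1024 / 1000. */
static unsigned int tu_to_ms(unsigned int tu)
{
	return tu * 1024 / 1000;
}

int main(void)
{
	printf("%u TU = %u ms\n", 1000u, tu_to_ms(1000u)); /* 1000 TU = 1024 ms */
	return 0;
}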
@@ -1392,7 +1198,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
/* if this was a reassociation, ensure we try a "full"
* association next time. This works around some broken APs
* which do not correctly reject reassociation requests. */
- ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
return;
}
@@ -1408,23 +1214,23 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
}
printk(KERN_DEBUG "%s: associated\n", sdata->dev->name);
- ifsta->aid = aid;
- ifsta->ap_capab = capab_info;
+ ifmgd->aid = aid;
+ ifmgd->ap_capab = capab_info;
- kfree(ifsta->assocresp_ies);
- ifsta->assocresp_ies_len = len - (pos - (u8 *) mgmt);
- ifsta->assocresp_ies = kmalloc(ifsta->assocresp_ies_len, GFP_KERNEL);
- if (ifsta->assocresp_ies)
- memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len);
+ kfree(ifmgd->assocresp_ies);
+ ifmgd->assocresp_ies_len = len - (pos - (u8 *) mgmt);
+ ifmgd->assocresp_ies = kmalloc(ifmgd->assocresp_ies_len, GFP_KERNEL);
+ if (ifmgd->assocresp_ies)
+ memcpy(ifmgd->assocresp_ies, pos, ifmgd->assocresp_ies_len);
rcu_read_lock();
/* Add STA entry for the AP */
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
newsta = true;
- sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC);
+ sta = sta_info_alloc(sdata, ifmgd->bssid, GFP_ATOMIC);
if (!sta) {
printk(KERN_DEBUG "%s: failed to alloc STA entry for"
" the AP\n", sdata->dev->name);
@@ -1497,7 +1303,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
else
sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
- if (elems.ht_cap_elem)
+ /* If TKIP/WEP is used, no need to parse AP's HT capabilities */
+ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
elems.ht_cap_elem, &sta->sta.ht_cap);
@@ -1505,7 +1312,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
rate_control_rate_init(sta);
- if (ifsta->flags & IEEE80211_STA_MFP_ENABLED)
+ if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED)
set_sta_flags(sta, WLAN_STA_MFP);
if (elems.wmm_param)
@@ -1524,11 +1331,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (elems.wmm_param)
- ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+ ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
elems.wmm_param_len);
if (elems.ht_info_elem && elems.wmm_param &&
- (ifsta->flags & IEEE80211_STA_WMM_ENABLED))
+ (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
+ !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem,
ap_ht_cap_flags);
@@ -1536,163 +1344,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
* ieee80211_set_associated() will tell the driver */
bss_conf->aid = aid;
bss_conf->assoc_capability = capab_info;
- ieee80211_set_associated(sdata, ifsta, changed);
+ ieee80211_set_associated(sdata, changed);
- ieee80211_associated(sdata, ifsta);
+ ieee80211_associated(sdata);
}
-static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- const u8 *bssid, const int beacon_int,
- const int freq,
- const size_t supp_rates_len,
- const u8 *supp_rates,
- const u16 capability)
-{
- struct ieee80211_local *local = sdata->local;
- int res = 0, rates, i, j;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
- u8 *pos;
- struct ieee80211_supported_band *sband;
- union iwreq_data wrqu;
-
- if (local->ops->reset_tsf) {
- /* Reset own TSF to allow time synchronization work. */
- local->ops->reset_tsf(local_to_hw(local));
- }
-
- if ((ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) &&
- memcmp(ifsta->bssid, bssid, ETH_ALEN) == 0)
- return res;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 +
- sdata->u.sta.ie_proberesp_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
- "response\n", sdata->dev->name);
- return -ENOMEM;
- }
-
- if (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) {
- /* Remove possible STA entries from other IBSS networks. */
- sta_info_flush_delayed(sdata);
- }
-
- memcpy(ifsta->bssid, bssid, ETH_ALEN);
- res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
- if (res)
- return res;
-
- local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
-
- sdata->drop_unencrypted = capability &
- WLAN_CAPABILITY_PRIVACY ? 1 : 0;
-
- res = ieee80211_set_freq(sdata, freq);
-
- if (res)
- return res;
-
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- /* Build IBSS probe response */
-
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *)
- skb_put(skb, 24 + sizeof(mgmt->u.beacon));
- memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_PROBE_RESP);
- memset(mgmt->da, 0xff, ETH_ALEN);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
- mgmt->u.beacon.beacon_int =
- cpu_to_le16(local->hw.conf.beacon_int);
- mgmt->u.beacon.capab_info = cpu_to_le16(capability);
-
- pos = skb_put(skb, 2 + ifsta->ssid_len);
- *pos++ = WLAN_EID_SSID;
- *pos++ = ifsta->ssid_len;
- memcpy(pos, ifsta->ssid, ifsta->ssid_len);
-
- rates = supp_rates_len;
- if (rates > 8)
- rates = 8;
- pos = skb_put(skb, 2 + rates);
- *pos++ = WLAN_EID_SUPP_RATES;
- *pos++ = rates;
- memcpy(pos, supp_rates, rates);
-
- if (sband->band == IEEE80211_BAND_2GHZ) {
- pos = skb_put(skb, 2 + 1);
- *pos++ = WLAN_EID_DS_PARAMS;
- *pos++ = 1;
- *pos++ = ieee80211_frequency_to_channel(freq);
- }
-
- pos = skb_put(skb, 2 + 2);
- *pos++ = WLAN_EID_IBSS_PARAMS;
- *pos++ = 2;
- /* FIX: set ATIM window based on scan results */
- *pos++ = 0;
- *pos++ = 0;
-
- if (supp_rates_len > 8) {
- rates = supp_rates_len - 8;
- pos = skb_put(skb, 2 + rates);
- *pos++ = WLAN_EID_EXT_SUPP_RATES;
- *pos++ = rates;
- memcpy(pos, &supp_rates[8], rates);
- }
-
- add_extra_ies(skb, sdata->u.sta.ie_proberesp,
- sdata->u.sta.ie_proberesp_len);
-
- ifsta->probe_resp = skb;
-
- ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
- IEEE80211_IFCC_BEACON_ENABLED);
-
-
- rates = 0;
- for (i = 0; i < supp_rates_len; i++) {
- int bitrate = (supp_rates[i] & 0x7f) * 5;
- for (j = 0; j < sband->n_bitrates; j++)
- if (sband->bitrates[j].bitrate == bitrate)
- rates |= BIT(j);
- }
- ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates;
-
- ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
-
- ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
- ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
- mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
-
- ieee80211_led_assoc(local, true);
-
- memset(&wrqu, 0, sizeof(wrqu));
- memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
- wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
-
- return res;
-}
-
-static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_bss *bss)
-{
- return __ieee80211_sta_join_ibss(sdata, ifsta,
- bss->cbss.bssid,
- bss->cbss.beacon_interval,
- bss->cbss.channel->center_freq,
- bss->supp_rates_len, bss->supp_rates,
- bss->cbss.capability);
-}
-
static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
size_t len,
@@ -1703,11 +1360,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
int freq;
struct ieee80211_bss *bss;
- struct sta_info *sta;
struct ieee80211_channel *channel;
- u64 beacon_timestamp, rx_timestamp;
- u32 supp_rates = 0;
- enum ieee80211_band band = rx_status->band;
if (elems->ds_params && elems->ds_params_len == 1)
freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
@@ -1719,133 +1372,18 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
return;
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
- memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
- supp_rates = ieee80211_sta_get_rates(local, elems, band);
-
- rcu_read_lock();
-
- sta = sta_info_get(local, mgmt->sa);
- if (sta) {
- u32 prev_rates;
-
- prev_rates = sta->sta.supp_rates[band];
- /* make sure mandatory rates are always added */
- sta->sta.supp_rates[band] = supp_rates |
- ieee80211_mandatory_rates(local, band);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- if (sta->sta.supp_rates[band] != prev_rates)
- printk(KERN_DEBUG "%s: updated supp_rates set "
- "for %pM based on beacon info (0x%llx | "
- "0x%llx -> 0x%llx)\n",
- sdata->dev->name,
- sta->sta.addr,
- (unsigned long long) prev_rates,
- (unsigned long long) supp_rates,
- (unsigned long long) sta->sta.supp_rates[band]);
-#endif
- } else {
- ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
- }
-
- rcu_read_unlock();
- }
-
bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
channel, beacon);
if (!bss)
return;
if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) &&
- (memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0)) {
+ (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN) == 0)) {
struct ieee80211_channel_sw_ie *sw_elem =
(struct ieee80211_channel_sw_ie *)elems->ch_switch_elem;
ieee80211_process_chanswitch(sdata, sw_elem, bss);
}
- /* was just updated in ieee80211_bss_info_update */
- beacon_timestamp = bss->cbss.tsf;
-
- if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
- goto put_bss;
-
- /* check if we need to merge IBSS */
-
- /* merge only on beacons (???) */
- if (!beacon)
- goto put_bss;
-
- /* we use a fixed BSSID */
- if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET)
- goto put_bss;
-
- /* not an IBSS */
- if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
- goto put_bss;
-
- /* different channel */
- if (bss->cbss.channel != local->oper_channel)
- goto put_bss;
-
- /* different SSID */
- if (elems->ssid_len != sdata->u.sta.ssid_len ||
- memcmp(elems->ssid, sdata->u.sta.ssid,
- sdata->u.sta.ssid_len))
- goto put_bss;
-
- if (rx_status->flag & RX_FLAG_TSFT) {
- /*
- * For correct IBSS merging we need mactime; since mactime is
- * defined as the time the first data symbol of the frame hits
- * the PHY, and the timestamp of the beacon is defined as "the
- * time that the data symbol containing the first bit of the
- * timestamp is transmitted to the PHY plus the transmitting
- * STA's delays through its local PHY from the MAC-PHY
- * interface to its interface with the WM" (802.11 11.1.2)
- * - equals the time this bit arrives at the receiver - we have
- * to take into account the offset between the two.
- *
- * E.g. at 1 MBit that means mactime is 192 usec earlier
- * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
- */
- int rate;
-
- if (rx_status->flag & RX_FLAG_HT)
- rate = 65; /* TODO: HT rates */
- else
- rate = local->hw.wiphy->bands[band]->
- bitrates[rx_status->rate_idx].bitrate;
-
- rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
- } else if (local && local->ops && local->ops->get_tsf)
- /* second best option: get current TSF */
- rx_timestamp = local->ops->get_tsf(local_to_hw(local));
- else
- /* can't merge without knowing the TSF */
- rx_timestamp = -1LLU;
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
- "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
- mgmt->sa, mgmt->bssid,
- (unsigned long long)rx_timestamp,
- (unsigned long long)beacon_timestamp,
- (unsigned long long)(rx_timestamp - beacon_timestamp),
- jiffies);
-#endif
-
- if (beacon_timestamp > rx_timestamp) {
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: beacon TSF higher than "
- "local TSF - IBSS merge with BSSID %pM\n",
- sdata->dev->name, mgmt->bssid);
-#endif
- ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss);
- ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
- }
-
- put_bss:
ieee80211_rx_bss_put(local, bss);
}
@@ -1857,7 +1395,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
{
size_t baselen;
struct ieee802_11_elems elems;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
return; /* ignore ProbeResp to foreign address */
@@ -1873,20 +1410,19 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
/* direct probe may be part of the association flow */
if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE,
- &ifsta->request)) {
+ &sdata->u.mgd.request)) {
printk(KERN_DEBUG "%s direct probe responded\n",
sdata->dev->name);
- ieee80211_authenticate(sdata, ifsta);
+ ieee80211_authenticate(sdata);
}
}
-
static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
size_t len,
struct ieee80211_rx_status *rx_status)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
size_t baselen;
struct ieee802_11_elems elems;
struct ieee80211_local *local = sdata->local;
@@ -1905,21 +1441,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return;
- ifsta = &sdata->u.sta;
- if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED) ||
- memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
+ ifmgd = &sdata->u.mgd;
+
+ if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED) ||
+ memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
return;
if (rx_status->freq != local->hw.conf.channel->center_freq)
return;
- ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+ ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
elems.wmm_param_len);
- if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK &&
- local->hw.conf.flags & IEEE80211_CONF_PS) {
- directed_tim = ieee80211_check_tim(&elems, ifsta->aid);
+ if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
+ directed_tim = ieee80211_check_tim(&elems, ifmgd->aid);
if (directed_tim) {
if (local->hw.conf.dynamic_ps_timeout > 0) {
@@ -1954,14 +1490,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
erp_valid, erp_value);
- if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param) {
+ if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
+ !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) {
struct sta_info *sta;
struct ieee80211_supported_band *sband;
u16 ap_ht_cap_flags;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
@@ -1997,85 +1534,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ieee80211_bss_info_change_notify(sdata, changed);
}
-
-static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_mgmt *mgmt,
- size_t len)
+ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status)
{
struct ieee80211_local *local = sdata->local;
- int tx_last_beacon;
- struct sk_buff *skb;
- struct ieee80211_mgmt *resp;
- u8 *pos, *end;
-
- if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED ||
- len < 24 + 2 || !ifsta->probe_resp)
- return;
-
- if (local->ops->tx_last_beacon)
- tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
- else
- tx_last_beacon = 1;
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
- " (tx_last_beacon=%d)\n",
- sdata->dev->name, mgmt->sa, mgmt->da,
- mgmt->bssid, tx_last_beacon);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (!tx_last_beacon)
- return;
-
- if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0 &&
- memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
- return;
-
- end = ((u8 *) mgmt) + len;
- pos = mgmt->u.probe_req.variable;
- if (pos[0] != WLAN_EID_SSID ||
- pos + 2 + pos[1] > end) {
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
- "from %pM\n",
- sdata->dev->name, mgmt->sa);
-#endif
- return;
- }
- if (pos[1] != 0 &&
- (pos[1] != ifsta->ssid_len ||
- memcmp(pos + 2, ifsta->ssid, ifsta->ssid_len) != 0)) {
- /* Ignore ProbeReq for foreign SSID */
- return;
- }
-
- /* Reply with ProbeResp */
- skb = skb_copy(ifsta->probe_resp, GFP_KERNEL);
- if (!skb)
- return;
-
- resp = (struct ieee80211_mgmt *) skb->data;
- memcpy(resp->da, mgmt->sa, ETH_ALEN);
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
- sdata->dev->name, resp->da);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
- ieee80211_tx_skb(sdata, skb, 0);
-}
-
-void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
- struct ieee80211_rx_status *rx_status)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
struct ieee80211_mgmt *mgmt;
u16 fc;
if (skb->len < 24)
- goto fail;
-
- ifsta = &sdata->u.sta;
+ return RX_DROP_MONITOR;
mgmt = (struct ieee80211_mgmt *) skb->data;
fc = le16_to_cpu(mgmt->frame_control);
@@ -2090,147 +1558,68 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *
case IEEE80211_STYPE_REASSOC_RESP:
case IEEE80211_STYPE_DEAUTH:
case IEEE80211_STYPE_DISASSOC:
- skb_queue_tail(&ifsta->skb_queue, skb);
- queue_work(local->hw.workqueue, &ifsta->work);
- return;
+ skb_queue_tail(&sdata->u.mgd.skb_queue, skb);
+ queue_work(local->hw.workqueue, &sdata->u.mgd.work);
+ return RX_QUEUED;
}
- fail:
- kfree_skb(skb);
+ return RX_DROP_MONITOR;
}
static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
struct ieee80211_rx_status *rx_status;
- struct ieee80211_if_sta *ifsta;
struct ieee80211_mgmt *mgmt;
u16 fc;
- ifsta = &sdata->u.sta;
-
rx_status = (struct ieee80211_rx_status *) skb->cb;
mgmt = (struct ieee80211_mgmt *) skb->data;
fc = le16_to_cpu(mgmt->frame_control);
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- switch (fc & IEEE80211_FCTL_STYPE) {
- case IEEE80211_STYPE_PROBE_REQ:
- ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt,
- skb->len);
- break;
- case IEEE80211_STYPE_PROBE_RESP:
- ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_BEACON:
- ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_AUTH:
- ieee80211_rx_mgmt_auth_ibss(sdata, ifsta, mgmt,
- skb->len);
- break;
- }
- } else { /* NL80211_IFTYPE_STATION */
- switch (fc & IEEE80211_FCTL_STYPE) {
- case IEEE80211_STYPE_PROBE_RESP:
- ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_BEACON:
- ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_AUTH:
- ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len);
- break;
- case IEEE80211_STYPE_ASSOC_RESP:
- ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
- skb->len, 0);
- break;
- case IEEE80211_STYPE_REASSOC_RESP:
- ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
- skb->len, 1);
- break;
- case IEEE80211_STYPE_DEAUTH:
- ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len);
- break;
- case IEEE80211_STYPE_DISASSOC:
- ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt,
- skb->len);
- break;
- }
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_RESP:
+ ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ASSOC_RESP:
+ ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 0);
+ break;
+ case IEEE80211_STYPE_REASSOC_RESP:
+ ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 1);
+ break;
+ case IEEE80211_STYPE_DEAUTH:
+ ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_DISASSOC:
+ ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len);
+ break;
}
kfree_skb(skb);
}
-
-static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
-{
- struct ieee80211_local *local = sdata->local;
- int active = 0;
- struct sta_info *sta;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
- if (sta->sdata == sdata &&
- time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
- jiffies)) {
- active++;
- break;
- }
- }
-
- rcu_read_unlock();
-
- return active;
-}
-
-
-static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
-
- ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
- if (ieee80211_sta_active_ibss(sdata))
- return;
-
- if ((sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) &&
- (!(sdata->u.sta.flags & IEEE80211_STA_AUTO_CHANNEL_SEL)))
- return;
-
- printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
- "IBSS networks with same SSID (merge)\n", sdata->dev->name);
-
- /* XXX maybe racy? */
- if (sdata->local->scan_req)
- return;
-
- memcpy(sdata->local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- sdata->local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
-}
-
-
static void ieee80211_sta_timer(unsigned long data)
{
struct ieee80211_sub_if_data *sdata =
(struct ieee80211_sub_if_data *) data;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
- set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
}
-static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
if (local->ops->reset_tsf) {
@@ -2238,191 +1627,39 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
local->ops->reset_tsf(local_to_hw(local));
}
- ifsta->wmm_last_param_set = -1; /* allow any WMM update */
+ ifmgd->wmm_last_param_set = -1; /* allow any WMM update */
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN)
- ifsta->auth_alg = WLAN_AUTH_OPEN;
- else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
- ifsta->auth_alg = WLAN_AUTH_SHARED_KEY;
- else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP)
- ifsta->auth_alg = WLAN_AUTH_LEAP;
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
+ ifmgd->auth_alg = WLAN_AUTH_OPEN;
+ else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
+ ifmgd->auth_alg = WLAN_AUTH_SHARED_KEY;
+ else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
+ ifmgd->auth_alg = WLAN_AUTH_LEAP;
else
- ifsta->auth_alg = WLAN_AUTH_OPEN;
- ifsta->auth_transaction = -1;
- ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
- ifsta->assoc_scan_tries = 0;
- ifsta->direct_probe_tries = 0;
- ifsta->auth_tries = 0;
- ifsta->assoc_tries = 0;
+ ifmgd->auth_alg = WLAN_AUTH_OPEN;
+ ifmgd->auth_transaction = -1;
+ ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->direct_probe_tries = 0;
+ ifmgd->auth_tries = 0;
+ ifmgd->assoc_tries = 0;
netif_tx_stop_all_queues(sdata->dev);
netif_carrier_off(sdata->dev);
}
-static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_supported_band *sband;
- u8 *pos;
- u8 bssid[ETH_ALEN];
- u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
- u16 capability;
- int i;
-
- if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) {
- memcpy(bssid, ifsta->bssid, ETH_ALEN);
- } else {
- /* Generate random, not broadcast, locally administered BSSID. Mix in
- * own MAC address to make sure that devices that do not have proper
- * random number generator get different BSSID. */
- get_random_bytes(bssid, ETH_ALEN);
- for (i = 0; i < ETH_ALEN; i++)
- bssid[i] ^= sdata->dev->dev_addr[i];
- bssid[0] &= ~0x01;
- bssid[0] |= 0x02;
- }
-
- printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
- sdata->dev->name, bssid);
-
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- if (local->hw.conf.beacon_int == 0)
- local->hw.conf.beacon_int = 100;
-
- capability = WLAN_CAPABILITY_IBSS;
-
- if (sdata->default_key)
- capability |= WLAN_CAPABILITY_PRIVACY;
- else
- sdata->drop_unencrypted = 0;
-
- pos = supp_rates;
- for (i = 0; i < sband->n_bitrates; i++) {
- int rate = sband->bitrates[i].bitrate;
- *pos++ = (u8) (rate / 5);
- }
-
- return __ieee80211_sta_join_ibss(sdata, ifsta,
- bssid, local->hw.conf.beacon_int,
- local->hw.conf.channel->center_freq,
- sband->n_bitrates, supp_rates,
- capability);
-}
-
-
-static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_bss *bss;
- int active_ibss;
-
- if (ifsta->ssid_len == 0)
- return -EINVAL;
-
- active_ibss = ieee80211_sta_active_ibss(sdata);
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
- sdata->dev->name, active_ibss);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (active_ibss)
- return 0;
-
- if (ifsta->flags & IEEE80211_STA_BSSID_SET)
- bss = ieee80211_rx_bss_get(local, ifsta->bssid, 0,
- ifsta->ssid, ifsta->ssid_len);
- else
- bss = (void *)cfg80211_get_ibss(local->hw.wiphy,
- NULL,
- ifsta->ssid, ifsta->ssid_len);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- if (bss)
- printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
- "%pM\n", bss->cbss.bssid, ifsta->bssid);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (bss &&
- (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) ||
- memcmp(ifsta->bssid, bss->cbss.bssid, ETH_ALEN))) {
- int ret;
-
- printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
- " based on configured SSID\n",
- sdata->dev->name, bss->cbss.bssid);
-
- ret = ieee80211_sta_join_ibss(sdata, ifsta, bss);
- ieee80211_rx_bss_put(local, bss);
- return ret;
- } else if (bss)
- ieee80211_rx_bss_put(local, bss);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG " did not try to join ibss\n");
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- /* Selected IBSS not found in current scan results - try to scan */
- if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED &&
- !ieee80211_sta_active_ibss(sdata)) {
- mod_timer(&ifsta->timer, jiffies +
- IEEE80211_IBSS_MERGE_INTERVAL);
- } else if (time_after(jiffies, local->last_scan_completed +
- IEEE80211_SCAN_INTERVAL)) {
- printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
- "join\n", sdata->dev->name);
-
- /* XXX maybe racy? */
- if (local->scan_req)
- return -EBUSY;
-
- memcpy(local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- return ieee80211_request_scan(sdata, &local->int_scan_req);
- } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) {
- int interval = IEEE80211_SCAN_INTERVAL;
-
- if (time_after(jiffies, ifsta->ibss_join_req +
- IEEE80211_IBSS_JOIN_TIMEOUT)) {
- if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) &&
- (!(local->oper_channel->flags &
- IEEE80211_CHAN_NO_IBSS)))
- return ieee80211_sta_create_ibss(sdata, ifsta);
- if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) {
- printk(KERN_DEBUG "%s: IBSS not allowed on"
- " %d MHz\n", sdata->dev->name,
- local->hw.conf.channel->center_freq);
- }
-
- /* No IBSS found - decrease scan interval and continue
- * scanning. */
- interval = IEEE80211_SCAN_INTERVAL_SLOW;
- }
-
- ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
- mod_timer(&ifsta->timer, jiffies + interval);
- return 0;
- }
-
- return 0;
-}
-
-
-static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_bss *bss;
- u8 *bssid = ifsta->bssid, *ssid = ifsta->ssid;
- u8 ssid_len = ifsta->ssid_len;
+ u8 *bssid = ifmgd->bssid, *ssid = ifmgd->ssid;
+ u8 ssid_len = ifmgd->ssid_len;
u16 capa_mask = WLAN_CAPABILITY_ESS;
u16 capa_val = WLAN_CAPABILITY_ESS;
struct ieee80211_channel *chan = local->oper_channel;
- if (ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+ if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL |
IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL)) {
capa_mask |= WLAN_CAPABILITY_PRIVACY;
@@ -2430,13 +1667,13 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
capa_val |= WLAN_CAPABILITY_PRIVACY;
}
- if (ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL)
+ if (ifmgd->flags & IEEE80211_STA_AUTO_CHANNEL_SEL)
chan = NULL;
- if (ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL)
+ if (ifmgd->flags & IEEE80211_STA_AUTO_BSSID_SEL)
bssid = NULL;
- if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) {
+ if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) {
ssid = NULL;
ssid_len = 0;
}
@@ -2447,16 +1684,16 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
if (bss) {
ieee80211_set_freq(sdata, bss->cbss.channel->center_freq);
- if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
+ if (!(ifmgd->flags & IEEE80211_STA_SSID_SET))
ieee80211_sta_set_ssid(sdata, bss->ssid,
bss->ssid_len);
ieee80211_sta_set_bssid(sdata, bss->cbss.bssid);
ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len,
bss->supp_rates);
- if (sdata->u.sta.mfp == IEEE80211_MFP_REQUIRED)
- sdata->u.sta.flags |= IEEE80211_STA_MFP_ENABLED;
+ if (sdata->u.mgd.mfp == IEEE80211_MFP_REQUIRED)
+ sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED;
else
- sdata->u.sta.flags &= ~IEEE80211_STA_MFP_ENABLED;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED;
/* Send out direct probe if no probe resp was received or
* the one we have is outdated
@@ -2464,31 +1701,34 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
if (!bss->last_probe_resp ||
time_after(jiffies, bss->last_probe_resp
+ IEEE80211_SCAN_RESULT_EXPIRE))
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
else
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
ieee80211_rx_bss_put(local, bss);
- ieee80211_sta_reset_auth(sdata, ifsta);
+ ieee80211_sta_reset_auth(sdata);
return 0;
} else {
- if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
- ifsta->assoc_scan_tries++;
+ if (ifmgd->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
+ ifmgd->assoc_scan_tries++;
/* XXX maybe racy? */
if (local->scan_req)
return -1;
memcpy(local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
+ ifmgd->ssid, IEEE80211_MAX_SSID_LEN);
+ if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL)
local->int_scan_req.ssids[0].ssid_len = 0;
else
- local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- ieee80211_start_scan(sdata, &local->int_scan_req);
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
- set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
+ local->int_scan_req.ssids[0].ssid_len = ifmgd->ssid_len;
+
+ if (ieee80211_start_scan(sdata, &local->int_scan_req))
+ ieee80211_scan_failed(local);
+
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
} else {
- ifsta->assoc_scan_tries = 0;
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
}
}
return -1;
@@ -2498,9 +1738,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
static void ieee80211_sta_work(struct work_struct *work)
{
struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data, u.sta.work);
+ container_of(work, struct ieee80211_sub_if_data, u.mgd.work);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
struct sk_buff *skb;
if (!netif_running(sdata->dev))
@@ -2509,60 +1749,60 @@ static void ieee80211_sta_work(struct work_struct *work)
if (local->sw_scanning || local->hw_scanning)
return;
- if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC))
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
- ifsta = &sdata->u.sta;
+ ifmgd = &sdata->u.mgd;
- while ((skb = skb_dequeue(&ifsta->skb_queue)))
+ while ((skb = skb_dequeue(&ifmgd->skb_queue)))
ieee80211_sta_rx_queued_mgmt(sdata, skb);
- if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
- ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
- ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
- test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
- ieee80211_start_scan(sdata, local->scan_req);
+ if (ifmgd->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
+ ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+ ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE &&
+ test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request)) {
+ /*
+ * The call to ieee80211_start_scan can fail but ieee80211_request_scan
+ * (which queued ieee80211_sta_work) did not return an error. Thus, call
+ * ieee80211_scan_failed here if ieee80211_start_scan fails in order to
+ * notify the scan requester.
+ */
+ if (ieee80211_start_scan(sdata, local->scan_req))
+ ieee80211_scan_failed(local);
return;
}
- if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
- if (ieee80211_sta_config_auth(sdata, ifsta))
+ if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request)) {
+ if (ieee80211_sta_config_auth(sdata))
return;
- clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
- } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
+ clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
+ } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request))
return;
- switch (ifsta->state) {
+ switch (ifmgd->state) {
case IEEE80211_STA_MLME_DISABLED:
break;
case IEEE80211_STA_MLME_DIRECT_PROBE:
- ieee80211_direct_probe(sdata, ifsta);
+ ieee80211_direct_probe(sdata);
break;
case IEEE80211_STA_MLME_AUTHENTICATE:
- ieee80211_authenticate(sdata, ifsta);
+ ieee80211_authenticate(sdata);
break;
case IEEE80211_STA_MLME_ASSOCIATE:
- ieee80211_associate(sdata, ifsta);
+ ieee80211_associate(sdata);
break;
case IEEE80211_STA_MLME_ASSOCIATED:
- ieee80211_associated(sdata, ifsta);
- break;
- case IEEE80211_STA_MLME_IBSS_SEARCH:
- ieee80211_sta_find_ibss(sdata, ifsta);
- break;
- case IEEE80211_STA_MLME_IBSS_JOINED:
- ieee80211_sta_merge_ibss(sdata, ifsta);
+ ieee80211_associated(sdata);
break;
default:
WARN_ON(1);
break;
}
- if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+ if (ieee80211_privacy_mismatch(sdata)) {
printk(KERN_DEBUG "%s: privacy configuration mismatch and "
"mixed-cell disabled - disassociate\n", sdata->dev->name);
- ieee80211_set_disassoc(sdata, ifsta, false, true,
+ ieee80211_set_disassoc(sdata, false, true,
WLAN_REASON_UNSPECIFIED);
}
}
@@ -2571,155 +1811,106 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
{
if (sdata->vif.type == NL80211_IFTYPE_STATION)
queue_work(sdata->local->hw.workqueue,
- &sdata->u.sta.work);
+ &sdata->u.mgd.work);
}
/* interface setup */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
- ifsta = &sdata->u.sta;
- INIT_WORK(&ifsta->work, ieee80211_sta_work);
- INIT_WORK(&ifsta->chswitch_work, ieee80211_chswitch_work);
- setup_timer(&ifsta->timer, ieee80211_sta_timer,
+ ifmgd = &sdata->u.mgd;
+ INIT_WORK(&ifmgd->work, ieee80211_sta_work);
+ INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
+ setup_timer(&ifmgd->timer, ieee80211_sta_timer,
(unsigned long) sdata);
- setup_timer(&ifsta->chswitch_timer, ieee80211_chswitch_timer,
+ setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer,
(unsigned long) sdata);
- skb_queue_head_init(&ifsta->skb_queue);
+ skb_queue_head_init(&ifmgd->skb_queue);
- ifsta->capab = WLAN_CAPABILITY_ESS;
- ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN |
+ ifmgd->capab = WLAN_CAPABILITY_ESS;
+ ifmgd->auth_algs = IEEE80211_AUTH_ALG_OPEN |
IEEE80211_AUTH_ALG_SHARED_KEY;
- ifsta->flags |= IEEE80211_STA_CREATE_IBSS |
+ ifmgd->flags |= IEEE80211_STA_CREATE_IBSS |
IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL;
if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
- ifsta->flags |= IEEE80211_STA_WMM_ENABLED;
-}
-
-/*
- * Add a new IBSS station, will also be called by the RX code when,
- * in IBSS mode, receiving a frame from a yet-unknown station, hence
- * must be callable in atomic context.
- */
-struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
- u8 *bssid,u8 *addr, u32 supp_rates)
-{
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- int band = local->hw.conf.channel->band;
-
- /* TODO: Could consider removing the least recently used entry and
- * allow new one to be added. */
- if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "%s: No room for a new IBSS STA "
- "entry %pM\n", sdata->dev->name, addr);
- }
- return NULL;
- }
-
- if (compare_ether_addr(bssid, sdata->u.sta.bssid))
- return NULL;
-
-#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
- wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
-#endif
-
- sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
- if (!sta)
- return NULL;
-
- set_sta_flags(sta, WLAN_STA_AUTHORIZED);
-
- /* make sure mandatory rates are always added */
- sta->sta.supp_rates[band] = supp_rates |
- ieee80211_mandatory_rates(local, band);
-
- rate_control_rate_init(sta);
-
- if (sta_info_insert(sta))
- return NULL;
-
- return sta;
+ ifmgd->flags |= IEEE80211_STA_WMM_ENABLED;
}
/* configuration hooks */
-void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
- if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
- if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
+ if ((ifmgd->flags & (IEEE80211_STA_BSSID_SET |
IEEE80211_STA_AUTO_BSSID_SEL)) &&
- (ifsta->flags & (IEEE80211_STA_SSID_SET |
+ (ifmgd->flags & (IEEE80211_STA_SSID_SET |
IEEE80211_STA_AUTO_SSID_SEL))) {
- if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED)
- ieee80211_set_disassoc(sdata, ifsta, true, true,
+ if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED)
+ ieee80211_set_disassoc(sdata, true, true,
WLAN_REASON_DEAUTH_LEAVING);
- set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
}
}
-int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- if (len > IEEE80211_MAX_SSID_LEN)
- return -EINVAL;
+ ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
- ifsta = &sdata->u.sta;
+ if (ifmgd->ssid_len)
+ ifmgd->flags |= IEEE80211_STA_SSID_SET;
+ else
+ ifmgd->flags &= ~IEEE80211_STA_SSID_SET;
- if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) {
- memset(ifsta->ssid, 0, sizeof(ifsta->ssid));
- memcpy(ifsta->ssid, ssid, len);
- ifsta->ssid_len = len;
- }
+ return 0;
+}
- ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd;
- if (len)
- ifsta->flags |= IEEE80211_STA_SSID_SET;
- else
- ifsta->flags &= ~IEEE80211_STA_SSID_SET;
+ if (len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+
+ ifmgd = &sdata->u.mgd;
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- ifsta->ibss_join_req = jiffies;
- ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
- return ieee80211_sta_find_ibss(sdata, ifsta);
+ if (ifmgd->ssid_len != len || memcmp(ifmgd->ssid, ssid, len) != 0) {
+ memset(ifmgd->ssid, 0, sizeof(ifmgd->ssid));
+ memcpy(ifmgd->ssid, ssid, len);
+ ifmgd->ssid_len = len;
}
- return 0;
+ return ieee80211_sta_commit(sdata);
}
int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- memcpy(ssid, ifsta->ssid, ifsta->ssid_len);
- *len = ifsta->ssid_len;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ memcpy(ssid, ifmgd->ssid, ifmgd->ssid_len);
+ *len = ifmgd->ssid_len;
return 0;
}
int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
{
- struct ieee80211_if_sta *ifsta;
-
- ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
if (is_valid_ether_addr(bssid)) {
- memcpy(ifsta->bssid, bssid, ETH_ALEN);
- ifsta->flags |= IEEE80211_STA_BSSID_SET;
+ memcpy(ifmgd->bssid, bssid, ETH_ALEN);
+ ifmgd->flags |= IEEE80211_STA_BSSID_SET;
} else {
- memset(ifsta->bssid, 0, ETH_ALEN);
- ifsta->flags &= ~IEEE80211_STA_BSSID_SET;
+ memset(ifmgd->bssid, 0, ETH_ALEN);
+ ifmgd->flags &= ~IEEE80211_STA_BSSID_SET;
}
if (netif_running(sdata->dev)) {
@@ -2729,47 +1920,44 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
}
}
- return ieee80211_sta_set_ssid(sdata, ifsta->ssid, ifsta->ssid_len);
+ return ieee80211_sta_commit(sdata);
}
int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- kfree(ifsta->extra_ie);
+ kfree(ifmgd->extra_ie);
if (len == 0) {
- ifsta->extra_ie = NULL;
- ifsta->extra_ie_len = 0;
+ ifmgd->extra_ie = NULL;
+ ifmgd->extra_ie_len = 0;
return 0;
}
- ifsta->extra_ie = kmalloc(len, GFP_KERNEL);
- if (!ifsta->extra_ie) {
- ifsta->extra_ie_len = 0;
+ ifmgd->extra_ie = kmalloc(len, GFP_KERNEL);
+ if (!ifmgd->extra_ie) {
+ ifmgd->extra_ie_len = 0;
return -ENOMEM;
}
- memcpy(ifsta->extra_ie, ie, len);
- ifsta->extra_ie_len = len;
+ memcpy(ifmgd->extra_ie, ie, len);
+ ifmgd->extra_ie_len = len;
return 0;
}
int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-
printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n",
sdata->dev->name, reason);
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
- ieee80211_set_disassoc(sdata, ifsta, true, true, reason);
+ ieee80211_set_disassoc(sdata, true, true, reason);
return 0;
}
int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n",
sdata->dev->name, reason);
@@ -2777,10 +1965,10 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
- if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED))
- return -1;
+ if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED))
+ return -ENOLINK;
- ieee80211_set_disassoc(sdata, ifsta, false, true, reason);
+ ieee80211_set_disassoc(sdata, false, true, reason);
return 0;
}
@@ -2788,14 +1976,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata = local->scan_sdata;
- struct ieee80211_if_sta *ifsta;
-
- if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- ifsta = &sdata->u.sta;
- if ((!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) ||
- !ieee80211_sta_active_ibss(sdata))
- ieee80211_sta_find_ibss(sdata, ifsta);
- }
/* Restart STA timers */
rcu_read_lock();
@@ -2842,3 +2022,36 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work);
}
+
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int powersave)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *nullfunc;
+ __le16 fc;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
+ "frame\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
+ memset(nullfunc, 0, 24);
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_TODS);
+ if (powersave)
+ fc |= cpu_to_le16(IEEE80211_FCTL_PM);
+ nullfunc->frame_control = fc;
+ memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+
+ ieee80211_tx_skb(sdata, skb, 0);
+}
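A note on the conversion running through this mlme.c diff: the old combined ieee80211_if_sta state (sdata->u.sta) is being split so that managed-mode code uses sdata->u.mgd while the IBSS code moves to its own sdata->u.ibss. Below is a minimal, self-contained sketch of that per-interface-type union layout; the type and field names are illustrative stand-ins, not the real ieee80211_i.h definitions.

#include <stdio.h>

enum if_type { IF_TYPE_STATION, IF_TYPE_ADHOC };

struct if_managed { int state; unsigned char bssid[6]; };  /* managed-mode MLME state */
struct if_ibss    { unsigned char bssid[6]; };             /* IBSS-mode state */

struct sub_if_data {
        enum if_type type;
        union {
                struct if_managed mgd;   /* valid when type == IF_TYPE_STATION */
                struct if_ibss ibss;     /* valid when type == IF_TYPE_ADHOC */
        } u;
};

int main(void)
{
        struct sub_if_data sdata = { .type = IF_TYPE_STATION };

        /* only the union member matching the interface type may be touched */
        if (sdata.type == IF_TYPE_STATION)
                sdata.u.mgd.state = 1;
        printf("state=%d\n", sdata.u.mgd.state);
        return 0;
}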
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 928da625e28..b9164c9a956 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -62,6 +62,18 @@ static inline void rate_control_rate_init(struct sta_info *sta)
ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
}
+static inline void rate_control_rate_update(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta, u32 changed)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+
+ if (ref->ops->rate_update)
+ ref->ops->rate_update(ref->priv, sband, ista,
+ priv_sta, changed);
+}
static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
struct ieee80211_sta *sta,
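The rate_control_rate_update() wrapper added above only calls into the driver when the hook is actually provided, the usual optional-callback convention for an ops structure. A standalone sketch of that convention, with made-up names rather than the real rate_control_ops layout:

#include <stdio.h>

struct ops {
        void (*update)(void *priv, int changed);   /* optional hook, may be NULL */
};

static void call_update(const struct ops *ops, void *priv, int changed)
{
        if (ops->update)                           /* tolerate implementations without the hook */
                ops->update(priv, changed);
}

static void my_update(void *priv, int changed)
{
        printf("update, changed=0x%x\n", changed);
}

int main(void)
{
        struct ops with_hook = { .update = my_update };
        struct ops without_hook = { 0 };

        call_update(&with_hook, NULL, 0x4);
        call_update(&without_hook, NULL, 0x4);     /* silently skipped */
        return 0;
}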
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1327d424bf3..66f7ecf51b9 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -838,7 +838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
NL80211_IFTYPE_ADHOC);
- if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0)
+ if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0)
sta->last_rx = jiffies;
} else
if (!is_multicast_ether_addr(hdr->addr1) ||
@@ -1702,13 +1702,13 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
return;
}
- if (compare_ether_addr(mgmt->sa, sdata->u.sta.bssid) != 0 ||
- compare_ether_addr(mgmt->bssid, sdata->u.sta.bssid) != 0) {
+ if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 ||
+ compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) {
/* Not from the current AP. */
return;
}
- if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATE) {
+ if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATE) {
/* Association in progress; ignore SA Query */
return;
}
@@ -1727,7 +1727,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
memset(resp, 0, 24);
memcpy(resp->da, mgmt->sa, ETH_ALEN);
memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(resp->bssid, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN);
resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
@@ -1745,7 +1745,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
{
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
struct ieee80211_bss *bss;
int len = rx->skb->len;
@@ -1803,6 +1802,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
case WLAN_CATEGORY_SPECTRUM_MGMT:
if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
return RX_DROP_MONITOR;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return RX_DROP_MONITOR;
+
switch (mgmt->u.action.u.measurement.action_code) {
case WLAN_ACTION_SPCT_MSR_REQ:
if (len < (IEEE80211_MIN_ACTION_SIZE +
@@ -1815,12 +1818,13 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
sizeof(mgmt->u.action.u.chan_switch)))
return RX_DROP_MONITOR;
- if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0)
+ if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN))
return RX_DROP_MONITOR;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, sdata->u.mgd.bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ sdata->u.mgd.ssid,
+ sdata->u.mgd.ssid_len);
if (!bss)
return RX_DROP_MONITOR;
@@ -1876,11 +1880,14 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
sdata->vif.type != NL80211_IFTYPE_ADHOC)
return RX_DROP_MONITOR;
- if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
- return RX_DROP_MONITOR;
- ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
- return RX_QUEUED;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
+ return RX_DROP_MONITOR;
+ return ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
+ }
+
+ return ieee80211_ibss_rx_mgmt(sdata, rx->skb, rx->status);
}
static void ieee80211_rx_michael_mic_report(struct net_device *dev,
@@ -2083,7 +2090,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_STATION:
if (!bssid)
return 0;
- if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
+ if (!ieee80211_bssid_match(bssid, sdata->u.mgd.bssid)) {
if (!(rx->flags & IEEE80211_RX_IN_SCAN))
return 0;
rx->flags &= ~IEEE80211_RX_RA_MATCH;
@@ -2101,7 +2108,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
if (ieee80211_is_beacon(hdr->frame_control)) {
return 1;
}
- else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
+ else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
if (!(rx->flags & IEEE80211_RX_IN_SCAN))
return 0;
rx->flags &= ~IEEE80211_RX_RA_MATCH;
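With this change ieee80211_rx_h_mgmt() simply returns the per-mode handler's verdict: frames the MLME cares about are queued onto a work item and reported as RX_QUEUED, everything else falls through to RX_DROP_MONITOR. A compact, self-contained sketch of that classify-then-queue shape (the types and stype values below are placeholders, not the real rx machinery):

#include <stdio.h>

enum rx_result { RX_QUEUED, RX_DROP_MONITOR };

struct frame { int stype; };

/* stand-in for skb_queue_tail() + queue_work() in the kernel */
static void queue_for_mlme(const struct frame *f)
{
        printf("queued stype %d for the MLME work item\n", f->stype);
}

static enum rx_result rx_mgmt(const struct frame *f)
{
        switch (f->stype) {
        case 1: /* e.g. AUTH */
        case 2: /* e.g. DEAUTH */
                queue_for_mlme(f);
                return RX_QUEUED;
        default:
                return RX_DROP_MONITOR;   /* not interesting: hand to monitor interfaces */
        }
}

int main(void)
{
        struct frame auth = { 1 }, other = { 9 };

        printf("%d %d\n", rx_mgmt(&auth), rx_mgmt(&other));
        return 0;
}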
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f883ab9f1e6..5030a3c8750 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -63,20 +63,15 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
{
struct ieee80211_bss *bss;
int clen;
- enum cfg80211_signal_type sigtype = CFG80211_SIGNAL_TYPE_NONE;
s32 signal = 0;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
- sigtype = CFG80211_SIGNAL_TYPE_MBM;
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
signal = rx_status->signal * 100;
- } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
- sigtype = CFG80211_SIGNAL_TYPE_UNSPEC;
+ else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
signal = (rx_status->signal * 100) / local->hw.max_signal;
- }
bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel,
- mgmt, len, signal, sigtype,
- GFP_ATOMIC);
+ mgmt, len, signal, GFP_ATOMIC);
if (!bss)
return NULL;
@@ -207,34 +202,16 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
return RX_QUEUED;
}
-void ieee80211_send_nullfunc(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- int powersave)
+void ieee80211_scan_failed(struct ieee80211_local *local)
{
- struct sk_buff *skb;
- struct ieee80211_hdr *nullfunc;
- __le16 fc;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
- "frame\n", sdata->dev->name);
+ if (WARN_ON(!local->scan_req))
return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
- memset(nullfunc, 0, 24);
- fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
- IEEE80211_FCTL_TODS);
- if (powersave)
- fc |= cpu_to_le16(IEEE80211_FCTL_PM);
- nullfunc->frame_control = fc;
- memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN);
- memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
-
- ieee80211_tx_skb(sdata, skb, 0);
+
+ /* notify cfg80211 about the failed scan */
+ if (local->scan_req != &local->int_scan_req)
+ cfg80211_scan_done(local->scan_req, true);
+
+ local->scan_req = NULL;
}
void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
@@ -280,6 +257,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
netif_addr_unlock(local->mdev);
netif_tx_unlock_bh(local->mdev);
+ if (local->ops->sw_scan_complete)
+ local->ops->sw_scan_complete(local_to_hw(local));
+
mutex_lock(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
if (!netif_running(sdata->dev))
@@ -287,7 +267,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
/* Tell AP we're back */
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
ieee80211_send_nullfunc(local, sdata, 0);
netif_tx_wake_all_queues(sdata->dev);
}
@@ -305,6 +285,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
done:
ieee80211_mlme_notify_scan_completed(local);
+ ieee80211_ibss_notify_scan_completed(local);
ieee80211_mesh_notify_scan_completed(local);
}
EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -367,7 +348,8 @@ void ieee80211_scan_work(struct work_struct *work)
ieee80211_send_probe_req(
sdata, NULL,
local->scan_req->ssids[i].ssid,
- local->scan_req->ssids[i].ssid_len);
+ local->scan_req->ssids[i].ssid_len,
+ local->scan_req->ie, local->scan_req->ie_len);
next_delay = IEEE80211_CHANNEL_TIME;
break;
}
@@ -428,6 +410,8 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
}
local->sw_scanning = true;
+ if (local->ops->sw_scan_start)
+ local->ops->sw_scan_start(local_to_hw(local));
mutex_lock(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
@@ -442,7 +426,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
IEEE80211_IFCC_BEACON_ENABLED);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
netif_tx_stop_all_queues(sdata->dev);
ieee80211_send_nullfunc(local, sdata, 1);
}
@@ -477,7 +461,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
struct cfg80211_scan_request *req)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
if (!req)
return -EINVAL;
@@ -502,9 +486,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
return -EBUSY;
}
- ifsta = &sdata->u.sta;
- set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ ifmgd = &sdata->u.mgd;
+ set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
return 0;
}
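The new ieee80211_scan_failed() only reports the aborted scan to cfg80211 when the pending request did not originate from mac80211's internal scan, and clears the pending request either way. A small self-contained sketch of that "notify external requesters only" shape, with invented names:

#include <stdio.h>

struct scan_request { const char *origin; };

static struct scan_request internal_req = { "internal" };
static struct scan_request *pending;

static void scan_failed(void)
{
        if (!pending)
                return;                          /* nothing outstanding: nothing to report */
        if (pending != &internal_req)
                printf("telling requester: %s scan aborted\n", pending->origin);
        pending = NULL;                          /* the slot is free again either way */
}

int main(void)
{
        struct scan_request user_req = { "userspace" };

        pending = &user_req;
        scan_failed();                           /* reported */
        pending = &internal_req;
        scan_failed();                           /* silently dropped */
        return 0;
}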
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 47bb2aed281..5f7a2624ed7 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -88,16 +88,16 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
void ieee80211_chswitch_work(struct work_struct *work)
{
struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data, u.sta.chswitch_work);
+ container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work);
struct ieee80211_bss *bss;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
if (!netif_running(sdata->dev))
return;
- bss = ieee80211_rx_bss_get(sdata->local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(sdata->local, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (!bss)
goto exit;
@@ -108,7 +108,7 @@ void ieee80211_chswitch_work(struct work_struct *work)
ieee80211_rx_bss_put(sdata->local, bss);
exit:
- ifsta->flags &= ~IEEE80211_STA_CSA_RECEIVED;
+ ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED;
ieee80211_wake_queues_by_reason(&sdata->local->hw,
IEEE80211_QUEUE_STOP_REASON_CSA);
}
@@ -117,9 +117,9 @@ void ieee80211_chswitch_timer(unsigned long data)
{
struct ieee80211_sub_if_data *sdata =
(struct ieee80211_sub_if_data *) data;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work);
+ queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
}
void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
@@ -127,14 +127,14 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
struct ieee80211_bss *bss)
{
struct ieee80211_channel *new_ch;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num);
/* FIXME: Handle ADHOC later */
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return;
- if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATED)
+ if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATED)
return;
if (sdata->local->sw_scanning || sdata->local->hw_scanning)
@@ -143,7 +143,7 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
/* Disregard subsequent beacons if we are already running a timer
processing a CSA */
- if (ifsta->flags & IEEE80211_STA_CSA_RECEIVED)
+ if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED)
return;
new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
@@ -153,12 +153,12 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
sdata->local->csa_channel = new_ch;
if (sw_elem->count <= 1) {
- queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work);
+ queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
} else {
ieee80211_stop_queues_by_reason(&sdata->local->hw,
IEEE80211_QUEUE_STOP_REASON_CSA);
- ifsta->flags |= IEEE80211_STA_CSA_RECEIVED;
- mod_timer(&ifsta->chswitch_timer,
+ ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED;
+ mod_timer(&ifmgd->chswitch_timer,
jiffies +
msecs_to_jiffies(sw_elem->count *
bss->cbss.beacon_interval));
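For the channel-switch timer armed above: the code passes count * beacon_interval straight to msecs_to_jiffies(), so with, say, count = 5 and a beacon interval of 100 the timer fires roughly 500 ms after the announcement (a beacon TU is 1.024 ms, so this slightly undershoots five true beacon intervals).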
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 634f65c0130..4ba3c540fcf 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -202,6 +202,18 @@ void sta_info_destroy(struct sta_info *sta)
/* Make sure timer won't free the tid_rx struct, see below */
if (tid_rx)
tid_rx->shutdown = true;
+
+ /*
+ * The stop callback cannot find this station any more, but
+ * it didn't complete its work -- start the queue if necessary
+ */
+ if (sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_INITIATOR_MSK &&
+ sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_REQ_STOP_BA_MSK &&
+ local->hw.ampdu_queues)
+ ieee80211_wake_queue_by_reason(&local->hw,
+ local->hw.queues + sta->tid_to_tx_q[i],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+
spin_unlock_bh(&sta->lock);
/*
@@ -275,8 +287,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
* enable session_timer's data differentiation. refer to
* sta_rx_agg_session_timer_expired for usage */
sta->timer_to_tid[i] = i;
- /* tid to tx queue: initialize according to HW (0 is valid) */
- sta->tid_to_tx_q[i] = ieee80211_num_queues(&local->hw);
+ sta->tid_to_tx_q[i] = -1;
/* rx */
sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE;
sta->ampdu_mlme.tid_rx[i] = NULL;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d9653231992..1f45573c580 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -90,6 +90,7 @@ struct tid_ampdu_tx {
* @buf_size: buffer size for incoming A-MPDUs
* @timeout: reset timer value (in TUs).
* @dialog_token: dialog token for aggregation session
+ * @shutdown: this session is being shut down due to STA removal
*/
struct tid_ampdu_rx {
struct sk_buff **reorder_buf;
@@ -200,7 +201,7 @@ struct sta_ampdu_mlme {
* @tid_seq: per-TID sequence numbers for sending to this STA
* @ampdu_mlme: A-MPDU state machine state
* @timer_to_tid: identity mapping to ID timers
- * @tid_to_tx_q: map tid to tx queue
+ * @tid_to_tx_q: map tid to tx queue (invalid == negative values)
* @llid: Local link ID
* @plid: Peer link ID
* @reason: Cancel reason on PLINK_HOLDING state
@@ -275,7 +276,7 @@ struct sta_info {
*/
struct sta_ampdu_mlme ampdu_mlme;
u8 timer_to_tid[STA_TID_NUM];
- u8 tid_to_tx_q[STA_TID_NUM];
+ s8 tid_to_tx_q[STA_TID_NUM];
#ifdef CONFIG_MAC80211_MESH
/*
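The tid_to_tx_q change above (u8 to s8, initialised to -1 in sta_info_alloc) trades the old "one past the last queue" marker for a signed sentinel. A tiny standalone illustration of why the signed type is what makes the "unassigned" check work; STA_TID_NUM is reused here only as a name:

#include <stdio.h>

#define STA_TID_NUM 16

int main(void)
{
        signed char tid_to_tx_q[STA_TID_NUM];
        int tid;

        for (tid = 0; tid < STA_TID_NUM; tid++)
                tid_to_tx_q[tid] = -1;           /* no aggregation queue assigned yet */

        tid_to_tx_q[5] = 2;                      /* TID 5 mapped to queue 2 */

        for (tid = 0; tid < STA_TID_NUM; tid++)
                if (tid_to_tx_q[tid] >= 0)       /* with u8, -1 stores as 255 and this test never filters */
                        printf("TID %d -> queue %d\n", tid, tid_to_tx_q[tid]);
        return 0;
}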
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 33926831c64..457238a2f3f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -784,6 +784,8 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
skb_copy_queue_mapping(frag, first);
frag->do_not_encrypt = first->do_not_encrypt;
+ frag->dev = first->dev;
+ frag->iif = first->iif;
pos += copylen;
left -= copylen;
@@ -876,7 +878,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
return TX_CONTINUE;
}
-
/* actual transmit path */
/*
@@ -1016,12 +1017,20 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
tx->sta = sta_info_get(local, hdr->addr1);
if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+ unsigned long flags;
qc = ieee80211_get_qos_ctl(hdr);
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+ spin_lock_irqsave(&tx->sta->lock, flags);
state = &tx->sta->ampdu_mlme.tid_state_tx[tid];
- if (*state == HT_AGG_STATE_OPERATIONAL)
+ if (*state == HT_AGG_STATE_OPERATIONAL) {
info->flags |= IEEE80211_TX_CTL_AMPDU;
+ if (local->hw.ampdu_queues)
+ skb_set_queue_mapping(
+ skb, tx->local->hw.queues +
+ tx->sta->tid_to_tx_q[tid]);
+ }
+ spin_unlock_irqrestore(&tx->sta->lock, flags);
}
if (is_multicast_ether_addr(hdr->addr1)) {
@@ -1085,7 +1094,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
int ret, i;
if (skb) {
- if (netif_subqueue_stopped(local->mdev, skb))
+ if (ieee80211_queue_stopped(&local->hw,
+ skb_get_queue_mapping(skb)))
return IEEE80211_TX_PENDING;
ret = local->ops->tx(local_to_hw(local), skb);
@@ -1101,8 +1111,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
info = IEEE80211_SKB_CB(tx->extra_frag[i]);
info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
IEEE80211_TX_CTL_FIRST_FRAGMENT);
- if (netif_subqueue_stopped(local->mdev,
- tx->extra_frag[i]))
+ if (ieee80211_queue_stopped(&local->hw,
+ skb_get_queue_mapping(tx->extra_frag[i])))
return IEEE80211_TX_FRAG_AGAIN;
ret = local->ops->tx(local_to_hw(local),
@@ -1625,7 +1635,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
case NL80211_IFTYPE_STATION:
fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
/* BSSID SA DA */
- memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
memcpy(hdr.addr3, skb->data, ETH_ALEN);
hdrlen = 24;
@@ -1634,7 +1644,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
/* DA SA BSSID */
memcpy(hdr.addr1, skb->data, ETH_ALEN);
memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
- memcpy(hdr.addr3, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
hdrlen = 24;
break;
default:
@@ -1920,7 +1930,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
struct ieee80211_tx_info *info;
struct ieee80211_sub_if_data *sdata = NULL;
struct ieee80211_if_ap *ap = NULL;
- struct ieee80211_if_sta *ifsta = NULL;
struct beacon_data *beacon;
struct ieee80211_supported_band *sband;
enum ieee80211_band band = local->hw.conf.channel->band;
@@ -1972,13 +1981,13 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
} else
goto out;
} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
struct ieee80211_hdr *hdr;
- ifsta = &sdata->u.sta;
- if (!ifsta->probe_resp)
+ if (!ifibss->probe_resp)
goto out;
- skb = skb_copy(ifsta->probe_resp, GFP_ATOMIC);
+ skb = skb_copy(ifibss->probe_resp, GFP_ATOMIC);
if (!skb)
goto out;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 73c7d7345ab..e0431a1d218 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -344,15 +344,36 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- /* we don't need to track ampdu queues */
- if (queue < ieee80211_num_regular_queues(hw)) {
- __clear_bit(reason, &local->queue_stop_reasons[queue]);
+ if (queue >= hw->queues) {
+ if (local->ampdu_ac_queue[queue - hw->queues] < 0)
+ return;
+
+ /*
+ * for virtual aggregation queues, we need to refcount the
+ * internal mac80211 disable (multiple times!), keep track of
+ * driver disable _and_ make sure the regular queue is
+ * actually enabled.
+ */
+ if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
+ local->amdpu_ac_stop_refcnt[queue - hw->queues]--;
+ else
+ __clear_bit(reason, &local->queue_stop_reasons[queue]);
- if (local->queue_stop_reasons[queue] != 0)
- /* someone still has this queue stopped */
+ if (local->queue_stop_reasons[queue] ||
+ local->amdpu_ac_stop_refcnt[queue - hw->queues])
return;
+
+ /* now go on to treat the corresponding regular queue */
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
}
+ __clear_bit(reason, &local->queue_stop_reasons[queue]);
+
+ if (local->queue_stop_reasons[queue] != 0)
+ /* someone still has this queue stopped */
+ return;
+
if (test_bit(queue, local->queues_pending)) {
set_bit(queue, local->queues_pending_run);
tasklet_schedule(&local->tx_pending_tasklet);
@@ -361,8 +382,8 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
}
}
-static void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
- enum queue_stop_reason reason)
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason)
{
struct ieee80211_local *local = hw_to_local(hw);
unsigned long flags;
@@ -384,15 +405,33 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- /* we don't need to track ampdu queues */
- if (queue < ieee80211_num_regular_queues(hw))
- __set_bit(reason, &local->queue_stop_reasons[queue]);
+ if (queue >= hw->queues) {
+ if (local->ampdu_ac_queue[queue - hw->queues] < 0)
+ return;
+
+ /*
+ * for virtual aggregation queues, we need to refcount the
+ * internal mac80211 disable (multiple times!), keep track of
+ * driver disable _and_ make sure the regular queue is
+ * actually enabled.
+ */
+ if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
+ local->amdpu_ac_stop_refcnt[queue - hw->queues]++;
+ else
+ __set_bit(reason, &local->queue_stop_reasons[queue]);
+
+ /* now go on to treat the corresponding regular queue */
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
+ }
+
+ __set_bit(reason, &local->queue_stop_reasons[queue]);
netif_stop_subqueue(local->mdev, queue);
}
-static void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
- enum queue_stop_reason reason)
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason)
{
struct ieee80211_local *local = hw_to_local(hw);
unsigned long flags;
@@ -418,7 +457,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- for (i = 0; i < ieee80211_num_queues(hw); i++)
+ for (i = 0; i < hw->queues; i++)
__ieee80211_stop_queue(hw, i, reason);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
@@ -434,6 +473,16 @@ EXPORT_SYMBOL(ieee80211_stop_queues);
int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
{
struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+
+ if (queue >= hw->queues) {
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ if (queue < 0)
+ return true;
+ }
+
return __netif_subqueue_stopped(local->mdev, queue);
}
EXPORT_SYMBOL(ieee80211_queue_stopped);
@@ -701,6 +750,27 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
local->ops->conf_tx(local_to_hw(local), i, &qparam);
}
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+ const size_t supp_rates_len,
+ const u8 *supp_rates)
+{
+ struct ieee80211_local *local = sdata->local;
+ int i, have_higher_than_11mbit = 0;
+
+ /* cf. IEEE 802.11 9.2.12 */
+ for (i = 0; i < supp_rates_len; i++)
+ if ((supp_rates[i] & 0x7f) * 5 > 110)
+ have_higher_than_11mbit = 1;
+
+ if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+ have_higher_than_11mbit)
+ sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+ else
+ sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+ ieee80211_set_wmm_default(sdata);
+}
+
void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
int encrypt)
{
@@ -767,3 +837,161 @@ u32 ieee80211_mandatory_rates(struct ieee80211_local *local,
mandatory_rates |= BIT(i);
return mandatory_rates;
}
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg,
+ u8 *extra, size_t extra_len,
+ const u8 *bssid, int encrypt)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ const u8 *ie_auth = NULL;
+ int ie_auth_len = 0;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ ie_auth_len = sdata->u.mgd.ie_auth_len;
+ ie_auth = sdata->u.mgd.ie_auth;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+ sizeof(*mgmt) + 6 + extra_len + ie_auth_len);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
+ "frame\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+ memset(mgmt, 0, 24 + 6);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_AUTH);
+ if (encrypt)
+ mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+ memcpy(mgmt->da, bssid, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(mgmt->bssid, bssid, ETH_ALEN);
+ mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
+ mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+ mgmt->u.auth.status_code = cpu_to_le16(0);
+ if (extra)
+ memcpy(skb_put(skb, extra_len), extra, extra_len);
+ if (ie_auth)
+ memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len);
+
+ ieee80211_tx_skb(sdata, skb, encrypt);
+}
+
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+ u8 *ssid, size_t ssid_len,
+ u8 *ie, size_t ie_len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL;
+ int i, extra_preq_ie_len = 0;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ extra_preq_ie_len = sdata->u.mgd.ie_probereq_len;
+ extra_preq_ie = sdata->u.mgd.ie_probereq;
+ break;
+ default:
+ break;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
+ ie_len + extra_preq_ie_len);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+ "request\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_REQ);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ if (dst) {
+ memcpy(mgmt->da, dst, ETH_ALEN);
+ memcpy(mgmt->bssid, dst, ETH_ALEN);
+ } else {
+ memset(mgmt->da, 0xff, ETH_ALEN);
+ memset(mgmt->bssid, 0xff, ETH_ALEN);
+ }
+ pos = skb_put(skb, 2 + ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ssid_len;
+ memcpy(pos, ssid, ssid_len);
+
+ supp_rates = skb_put(skb, 2);
+ supp_rates[0] = WLAN_EID_SUPP_RATES;
+ supp_rates[1] = 0;
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ struct ieee80211_rate *rate = &sband->bitrates[i];
+ if (esupp_rates) {
+ pos = skb_put(skb, 1);
+ esupp_rates[1]++;
+ } else if (supp_rates[1] == 8) {
+ esupp_rates = skb_put(skb, 3);
+ esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
+ esupp_rates[1] = 1;
+ pos = &esupp_rates[2];
+ } else {
+ pos = skb_put(skb, 1);
+ supp_rates[1]++;
+ }
+ *pos = rate->bitrate / 5;
+ }
+
+ if (ie)
+ memcpy(skb_put(skb, ie_len), ie, ie_len);
+ if (extra_preq_ie)
+ memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie,
+ extra_preq_ie_len);
+
+ ieee80211_tx_skb(sdata, skb, 0);
+}
+
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rate *bitrates;
+ size_t num_rates;
+ u32 supp_rates;
+ int i, j;
+ sband = local->hw.wiphy->bands[band];
+
+ if (!sband) {
+ WARN_ON(1);
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+ }
+
+ bitrates = sband->bitrates;
+ num_rates = sband->n_bitrates;
+ supp_rates = 0;
+ for (i = 0; i < elems->supp_rates_len +
+ elems->ext_supp_rates_len; i++) {
+ u8 rate = 0;
+ int own_rate;
+ if (i < elems->supp_rates_len)
+ rate = elems->supp_rates[i];
+ else if (elems->ext_supp_rates)
+ rate = elems->ext_supp_rates
+ [i - elems->supp_rates_len];
+ own_rate = 5 * (rate & 0x7f);
+ for (j = 0; j < num_rates; j++)
+ if (bitrates[j].bitrate == own_rate)
+ supp_rates |= BIT(j);
+ }
+ return supp_rates;
+}
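ieee80211_sta_get_rates() above masks off the basic-rate flag (0x80), converts each advertised rate from 500 kb/s units to the 100 kb/s units of the band's bitrate table, and sets the matching bit in a bitmap. The same matching in a standalone sketch, with a made-up four-entry rate table:

#include <stdio.h>

int main(void)
{
        /* local rate table in units of 100 kb/s (1, 2, 5.5 and 11 Mb/s) */
        const int bitrates[] = { 10, 20, 55, 110 };
        const int num_rates = 4;

        /* rates from an IE: units of 500 kb/s, bit 0x80 flags a basic rate */
        const unsigned char supp_rates[] = { 0x82, 0x84, 0x0b, 0x16 };
        unsigned int bitmap = 0;
        int i, j;

        for (i = 0; i < 4; i++) {
                int rate = 5 * (supp_rates[i] & 0x7f);   /* 0x82 -> 10, 0x16 -> 110 */

                for (j = 0; j < num_rates; j++)
                        if (bitrates[j] == rate)
                                bitmap |= 1u << j;
        }
        printf("supported-rate bitmap: 0x%x\n", bitmap);  /* prints 0xf: all four matched */
        return 0;
}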
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 2b023dce8b2..935c63ed3df 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -132,139 +132,37 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev,
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
return -EOPNOTSUPP;
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length);
if (ret)
return ret;
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
+ ieee80211_sta_req_auth(sdata);
return 0;
}
return -EOPNOTSUPP;
}
-static u8 ieee80211_get_wstats_flags(struct ieee80211_local *local)
-{
- u8 wstats_flags = 0;
-
- wstats_flags |= local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
- IEEE80211_HW_SIGNAL_DBM) ?
- IW_QUAL_QUAL_UPDATED : IW_QUAL_QUAL_INVALID;
- wstats_flags |= local->hw.flags & IEEE80211_HW_NOISE_DBM ?
- IW_QUAL_NOISE_UPDATED : IW_QUAL_NOISE_INVALID;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- wstats_flags |= IW_QUAL_DBM;
-
- return wstats_flags;
-}
-
-static int ieee80211_ioctl_giwrange(struct net_device *dev,
- struct iw_request_info *info,
- struct iw_point *data, char *extra)
-{
- struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
- struct iw_range *range = (struct iw_range *) extra;
- enum ieee80211_band band;
- int c = 0;
-
- data->length = sizeof(struct iw_range);
- memset(range, 0, sizeof(struct iw_range));
-
- range->we_version_compiled = WIRELESS_EXT;
- range->we_version_source = 21;
- range->retry_capa = IW_RETRY_LIMIT;
- range->retry_flags = IW_RETRY_LIMIT;
- range->min_retry = 0;
- range->max_retry = 255;
- range->min_rts = 0;
- range->max_rts = 2347;
- range->min_frag = 256;
- range->max_frag = 2346;
-
- range->encoding_size[0] = 5;
- range->encoding_size[1] = 13;
- range->num_encoding_sizes = 2;
- range->max_encoding_tokens = NUM_DEFAULT_KEYS;
-
- /* cfg80211 requires this, and enforces 0..100 */
- if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
- range->max_qual.level = 100;
- else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- range->max_qual.level = -110;
- else
- range->max_qual.level = 0;
-
- if (local->hw.flags & IEEE80211_HW_NOISE_DBM)
- range->max_qual.noise = -110;
- else
- range->max_qual.noise = 0;
-
- range->max_qual.qual = 100;
- range->max_qual.updated = ieee80211_get_wstats_flags(local);
-
- range->avg_qual.qual = 50;
- /* not always true but better than nothing */
- range->avg_qual.level = range->max_qual.level / 2;
- range->avg_qual.noise = range->max_qual.noise / 2;
- range->avg_qual.updated = ieee80211_get_wstats_flags(local);
-
- range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
- IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
-
-
- for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
- int i;
- struct ieee80211_supported_band *sband;
-
- sband = local->hw.wiphy->bands[band];
-
- if (!sband)
- continue;
-
- for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
- struct ieee80211_channel *chan = &sband->channels[i];
-
- if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
- range->freq[c].i =
- ieee80211_frequency_to_channel(
- chan->center_freq);
- range->freq[c].m = chan->center_freq;
- range->freq[c].e = 6;
- c++;
- }
- }
- }
- range->num_channels = c;
- range->num_frequency = c;
-
- IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
- IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
- IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
-
- range->scan_capa |= IW_SCAN_CAPA_ESSID;
-
- return 0;
-}
-
-
static int ieee80211_ioctl_siwfreq(struct net_device *dev,
struct iw_request_info *info,
struct iw_freq *freq, char *extra)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
- sdata->vif.type == NL80211_IFTYPE_STATION)
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
/* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */
if (freq->e == 0) {
if (freq->m < 0) {
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
- sdata->vif.type == NL80211_IFTYPE_STATION)
- sdata->u.sta.flags |=
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ sdata->u.ibss.flags |=
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags |=
IEEE80211_STA_AUTO_CHANNEL_SEL;
return 0;
} else
@@ -301,32 +199,35 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
{
struct ieee80211_sub_if_data *sdata;
size_t len = data->length;
+ int ret;
/* iwconfig uses nul termination in SSID.. */
if (len > 0 && ssid[len - 1] == '\0')
len--;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- int ret;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
if (len > IEEE80211_MAX_SSID_LEN)
return -EINVAL;
- memcpy(sdata->u.sta.ssid, ssid, len);
- sdata->u.sta.ssid_len = len;
+ memcpy(sdata->u.mgd.ssid, ssid, len);
+ sdata->u.mgd.ssid_len = len;
return 0;
}
+
if (data->flags)
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
else
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL;
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL;
+
ret = ieee80211_sta_set_ssid(sdata, ssid, len);
if (ret)
return ret;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+
+ ieee80211_sta_req_auth(sdata);
return 0;
- }
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ return ieee80211_ibss_set_ssid(sdata, ssid, len);
return -EOPNOTSUPP;
}
@@ -340,8 +241,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int res = ieee80211_sta_get_ssid(sdata, ssid, &len);
if (res == 0) {
data->length = len;
@@ -349,6 +249,14 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
} else
data->flags = 0;
return res;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ int res = ieee80211_ibss_get_ssid(sdata, ssid, &len);
+ if (res == 0) {
+ data->length = len;
+ data->flags = 1;
+ } else
+ data->flags = 0;
+ return res;
}
return -EOPNOTSUPP;
@@ -362,26 +270,35 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int ret;
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
- memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data,
+ memcpy(sdata->u.mgd.bssid, (u8 *) &ap_addr->sa_data,
ETH_ALEN);
return 0;
}
if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL |
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL;
else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
else
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
if (ret)
return ret;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+ ieee80211_sta_req_auth(sdata);
return 0;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
+ sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
+ sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL;
+ else
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_BSSID_SEL;
+
+ return ieee80211_ibss_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
/*
* If it is necessary to update the WDS peer address
@@ -410,17 +327,20 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED ||
- sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATED) {
ap_addr->sa_family = ARPHRD_ETHER;
- memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
- return 0;
- } else {
+ memcpy(&ap_addr->sa_data, sdata->u.mgd.bssid, ETH_ALEN);
+ } else
memset(&ap_addr->sa_data, 0, ETH_ALEN);
- return 0;
- }
+ return 0;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->u.ibss.state == IEEE80211_IBSS_MLME_JOINED) {
+ ap_addr->sa_family = ARPHRD_ETHER;
+ memcpy(&ap_addr->sa_data, sdata->u.ibss.bssid, ETH_ALEN);
+ } else
+ memset(&ap_addr->sa_data, 0, ETH_ALEN);
+ return 0;
} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
ap_addr->sa_family = ARPHRD_ETHER;
memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
@@ -486,7 +406,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev,
rcu_read_lock();
- sta = sta_info_get(local, sdata->u.sta.bssid);
+ sta = sta_info_get(local, sdata->u.mgd.bssid);
if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS))
rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate;
@@ -687,8 +607,7 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev,
struct iw_mlme *mlme = (struct iw_mlme *) extra;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
switch (mlme->cmd) {
@@ -784,8 +703,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev,
erq->flags |= IW_ENCODE_ENABLED;
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- switch (ifsta->auth_alg) {
+ switch (sdata->u.mgd.auth_alg) {
case WLAN_AUTH_OPEN:
case WLAN_AUTH_LEAP:
erq->flags |= IW_ENCODE_OPEN;
@@ -849,7 +767,7 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev,
ret = ieee80211_hw_config(local,
IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT);
- if (!(sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED))
+ if (!(sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED))
return ret;
if (conf->dynamic_ps_timeout > 0 &&
@@ -908,10 +826,10 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
if (data->value & (IW_AUTH_CIPHER_WEP40 |
IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP))
- sdata->u.sta.flags |=
+ sdata->u.mgd.flags |=
IEEE80211_STA_TKIP_WEP_USED;
else
- sdata->u.sta.flags &=
+ sdata->u.mgd.flags &=
~IEEE80211_STA_TKIP_WEP_USED;
}
break;
@@ -922,21 +840,20 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
if (sdata->vif.type != NL80211_IFTYPE_STATION)
ret = -EINVAL;
else {
- sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
/*
* Privacy invoked by wpa_supplicant, store the
* value and allow associating to a protected
* network without having a key up front.
*/
if (data->value)
- sdata->u.sta.flags |=
+ sdata->u.mgd.flags |=
IEEE80211_STA_PRIVACY_INVOKED;
}
break;
case IW_AUTH_80211_AUTH_ALG:
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- sdata->u.sta.auth_algs = data->value;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.auth_algs = data->value;
else
ret = -EOPNOTSUPP;
break;
@@ -945,17 +862,16 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
ret = -EOPNOTSUPP;
break;
}
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
switch (data->value) {
case IW_AUTH_MFP_DISABLED:
- sdata->u.sta.mfp = IEEE80211_MFP_DISABLED;
+ sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED;
break;
case IW_AUTH_MFP_OPTIONAL:
- sdata->u.sta.mfp = IEEE80211_MFP_OPTIONAL;
+ sdata->u.mgd.mfp = IEEE80211_MFP_OPTIONAL;
break;
case IW_AUTH_MFP_REQUIRED:
- sdata->u.sta.mfp = IEEE80211_MFP_REQUIRED;
+ sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED;
break;
default:
ret = -EINVAL;
@@ -980,9 +896,9 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
rcu_read_lock();
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- sta = sta_info_get(local, sdata->u.sta.bssid);
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sta = sta_info_get(local, sdata->u.mgd.bssid);
+
if (!sta) {
wstats->discard.fragment = 0;
wstats->discard.misc = 0;
@@ -991,10 +907,45 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
wstats->qual.noise = 0;
wstats->qual.updated = IW_QUAL_ALL_INVALID;
} else {
- wstats->qual.level = sta->last_signal;
- wstats->qual.qual = sta->last_qual;
- wstats->qual.noise = sta->last_noise;
- wstats->qual.updated = ieee80211_get_wstats_flags(local);
+ wstats->qual.updated = 0;
+ /*
+ * mirror what cfg80211 does for iwrange/scan results,
+ * otherwise userspace gets confused.
+ */
+ if (local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
+ IEEE80211_HW_SIGNAL_DBM)) {
+ wstats->qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ wstats->qual.updated |= IW_QUAL_QUAL_UPDATED;
+ } else {
+ wstats->qual.updated |= IW_QUAL_LEVEL_INVALID;
+ wstats->qual.updated |= IW_QUAL_QUAL_INVALID;
+ }
+
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
+ wstats->qual.level = sta->last_signal;
+ wstats->qual.qual = sta->last_signal;
+ } else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+ int sig = sta->last_signal;
+
+ wstats->qual.updated |= IW_QUAL_DBM;
+ wstats->qual.level = sig;
+ if (sig < -110)
+ sig = -110;
+ else if (sig > -40)
+ sig = -40;
+ wstats->qual.qual = sig + 110;
+ }
+
+ if (local->hw.flags & IEEE80211_HW_NOISE_DBM) {
+ /*
+ * This assumes that if driver reports noise, it also
+ * reports signal in dBm.
+ */
+ wstats->qual.noise = sta->last_noise;
+ wstats->qual.updated |= IW_QUAL_NOISE_UPDATED;
+ } else {
+ wstats->qual.updated |= IW_QUAL_NOISE_INVALID;
+ }
}
rcu_read_unlock();
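The block above converts a dBm signal report into the iwconfig-style quality number by clamping the signal to the [-110, -40] dBm window and adding 110, so quality runs from 0 (worst) to 70 (best). A small standalone sketch of that mapping, using only the clamp values visible above:

#include <stdio.h>

static int dbm_to_qual(int sig)
{
	if (sig < -110)			/* same clamp as the code above */
		sig = -110;
	else if (sig > -40)
		sig = -40;
	return sig + 110;		/* 0 (worst) .. 70 (best) */
}

int main(void)
{
	int samples[] = { -95, -70, -42, -30 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%4d dBm -> qual %d\n", samples[i], dbm_to_qual(samples[i]));
	return 0;
}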
@@ -1011,9 +962,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev,
switch (data->flags & IW_AUTH_INDEX) {
case IW_AUTH_80211_AUTH_ALG:
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- data->value = sdata->u.sta.auth_algs;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ data->value = sdata->u.mgd.auth_algs;
else
ret = -EOPNOTSUPP;
break;
@@ -1116,7 +1066,7 @@ static const iw_handler ieee80211_handler[] =
(iw_handler) NULL, /* SIOCSIWSENS */
(iw_handler) NULL, /* SIOCGIWSENS */
(iw_handler) NULL /* not used */, /* SIOCSIWRANGE */
- (iw_handler) ieee80211_ioctl_giwrange, /* SIOCGIWRANGE */
+ (iw_handler) cfg80211_wext_giwrange, /* SIOCGIWRANGE */
(iw_handler) NULL /* not used */, /* SIOCSIWPRIV */
(iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */
(iw_handler) NULL /* not used */, /* SIOCSIWSTATS */
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index ac71b38f7cb..0b8ad1f4ecd 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -99,10 +99,13 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb)
/* in case we are a client verify acm is not set for this ac */
while (unlikely(local->wmm_acm & BIT(skb->priority))) {
if (wme_downgrade_ac(skb)) {
- /* The old code would drop the packet in this
- * case.
+ /*
+ * This should not really happen. The AP has marked all
+ * lower ACs to require admission control, which is not
+ * a reasonable configuration. Allow the frame to be
+ * transmitted using AC_BK as a workaround.
*/
- return 0;
+ break;
}
}
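For context: wme_downgrade_ac() (not part of this hunk) lowers skb->priority by one access category per call, and the loop above keeps downgrading while the AP's ACM mask still covers the current priority; with this change the frame is finally sent on AC_BK instead of being dropped when even the lowest AC demands admission control. A hedged, standalone illustration of that downgrade chain; the priority steps follow the usual VO -> VI -> BE -> BK order but are written out locally rather than copied from the kernel:

#include <stdio.h>

/* Illustrative only: one-step downgrade expressed in 802.1d priorities,
 * conceptually what wme_downgrade_ac() does. */
static int downgrade(int *prio)
{
	switch (*prio) {
	case 6: case 7: *prio = 5; return 0;	/* VO -> VI */
	case 4: case 5: *prio = 3; return 0;	/* VI -> BE */
	case 0: case 3: *prio = 2; return 0;	/* BE -> BK */
	default:        return -1;		/* already BK, nowhere to go */
	}
}

int main(void)
{
	int prio = 7;			/* start at voice */
	unsigned int acm_mask = 0xff;	/* AP (bogusly) requires ACM everywhere */

	while (acm_mask & (1u << prio))
		if (downgrade(&prio))
			break;		/* give up: transmit on AC_BK anyway */
	printf("transmitting with 802.1d priority %d\n", prio);
	return 0;
}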
@@ -114,9 +117,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
{
struct ieee80211_master_priv *mpriv = netdev_priv(dev);
struct ieee80211_local *local = mpriv->local;
- struct ieee80211_hw *hw = &local->hw;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct sta_info *sta;
u16 queue;
u8 tid;
@@ -124,29 +125,11 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
if (unlikely(queue >= local->hw.queues))
queue = local->hw.queues - 1;
- if (skb->requeue) {
- if (!hw->ampdu_queues)
- return queue;
-
- rcu_read_lock();
- sta = sta_info_get(local, hdr->addr1);
- tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
- if (sta) {
- int ampdu_queue = sta->tid_to_tx_q[tid];
-
- if ((ampdu_queue < ieee80211_num_queues(hw)) &&
- test_bit(ampdu_queue, local->queue_pool))
- queue = ampdu_queue;
- }
- rcu_read_unlock();
-
- return queue;
- }
-
- /* Now we know the 1d priority, fill in the QoS header if
- * there is one.
+ /*
+ * Now we know the 1d priority, fill in the QoS header if
+ * there is one (and we haven't done this before).
*/
- if (ieee80211_is_data_qos(hdr->frame_control)) {
+ if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) {
u8 *p = ieee80211_get_qos_ctl(hdr);
u8 ack_policy = 0;
tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
@@ -156,140 +139,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
/* qos header is 2 bytes, second reserved */
*p++ = ack_policy | tid;
*p = 0;
-
- if (!hw->ampdu_queues)
- return queue;
-
- rcu_read_lock();
-
- sta = sta_info_get(local, hdr->addr1);
- if (sta) {
- int ampdu_queue = sta->tid_to_tx_q[tid];
-
- if ((ampdu_queue < ieee80211_num_queues(hw)) &&
- test_bit(ampdu_queue, local->queue_pool))
- queue = ampdu_queue;
- }
-
- rcu_read_unlock();
}
return queue;
}
-
-int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid)
-{
- int i;
-
- /* XXX: currently broken due to cb/requeue use */
- return -EPERM;
-
- /* prepare the filter and save it for the SW queue
- * matching the received HW queue */
-
- if (!local->hw.ampdu_queues)
- return -EPERM;
-
- /* try to get a Qdisc from the pool */
- for (i = local->hw.queues; i < ieee80211_num_queues(&local->hw); i++)
- if (!test_and_set_bit(i, local->queue_pool)) {
- ieee80211_stop_queue(local_to_hw(local), i);
- sta->tid_to_tx_q[tid] = i;
-
- /* IF there are already pending packets
- * on this tid first we need to drain them
- * on the previous queue
- * since HT is strict in order */
-#ifdef CONFIG_MAC80211_HT_DEBUG
- if (net_ratelimit())
- printk(KERN_DEBUG "allocated aggregation queue"
- " %d tid %d addr %pM pool=0x%lX\n",
- i, tid, sta->sta.addr,
- local->queue_pool[0]);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
- return 0;
- }
-
- return -EAGAIN;
-}
-
-/**
- * the caller needs to hold netdev_get_tx_queue(local->mdev, X)->lock
- */
-void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid,
- u8 requeue)
-{
- int agg_queue = sta->tid_to_tx_q[tid];
- struct ieee80211_hw *hw = &local->hw;
-
- /* return the qdisc to the pool */
- clear_bit(agg_queue, local->queue_pool);
- sta->tid_to_tx_q[tid] = ieee80211_num_queues(hw);
-
- if (requeue) {
- ieee80211_requeue(local, agg_queue);
- } else {
- struct netdev_queue *txq;
- spinlock_t *root_lock;
- struct Qdisc *q;
-
- txq = netdev_get_tx_queue(local->mdev, agg_queue);
- q = rcu_dereference(txq->qdisc);
- root_lock = qdisc_lock(q);
-
- spin_lock_bh(root_lock);
- qdisc_reset(q);
- spin_unlock_bh(root_lock);
- }
-}
-
-void ieee80211_requeue(struct ieee80211_local *local, int queue)
-{
- struct netdev_queue *txq = netdev_get_tx_queue(local->mdev, queue);
- struct sk_buff_head list;
- spinlock_t *root_lock;
- struct Qdisc *qdisc;
- u32 len;
-
- rcu_read_lock_bh();
-
- qdisc = rcu_dereference(txq->qdisc);
- if (!qdisc || !qdisc->dequeue)
- goto out_unlock;
-
- skb_queue_head_init(&list);
-
- root_lock = qdisc_root_lock(qdisc);
- spin_lock(root_lock);
- for (len = qdisc->q.qlen; len > 0; len--) {
- struct sk_buff *skb = qdisc->dequeue(qdisc);
-
- if (skb)
- __skb_queue_tail(&list, skb);
- }
- spin_unlock(root_lock);
-
- for (len = list.qlen; len > 0; len--) {
- struct sk_buff *skb = __skb_dequeue(&list);
- u16 new_queue;
-
- BUG_ON(!skb);
- new_queue = ieee80211_select_queue(local->mdev, skb);
- skb_set_queue_mapping(skb, new_queue);
-
- txq = netdev_get_tx_queue(local->mdev, new_queue);
-
-
- qdisc = rcu_dereference(txq->qdisc);
- root_lock = qdisc_root_lock(qdisc);
-
- spin_lock(root_lock);
- qdisc_enqueue_root(skb, qdisc);
- spin_unlock(root_lock);
- }
-
-out_unlock:
- rcu_read_unlock_bh();
-}
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index bc62f28a4d3..7520d2e014d 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -21,11 +21,5 @@
extern const int ieee802_1d_to_ac[8];
u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb);
-int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid);
-void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid,
- u8 requeue);
-void ieee80211_requeue(struct ieee80211_local *local, int queue);
#endif /* _WME_H */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 55befe59e1c..dfb447b584d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -728,7 +728,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
NF_CT_ASSERT(skb->nfct);
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
- if (ret < 0) {
+ if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
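A note on the sign handling in this hunk and in the nf_conntrack_proto_tcp.c hunks further down: netfilter verdicts are small non-negative constants (NF_DROP is 0, NF_ACCEPT is 1, NF_REPEAT is 4), and a conntrack ->packet() handler signals "pass this verdict straight to the core" by returning its negative. Because -NF_DROP equals NF_DROP equals 0, the caller has to test ret <= 0 rather than ret < 0, which is what the change above does. A tiny standalone sketch of that convention, with the constants defined locally so it compiles on its own:

#include <stdio.h>

/* Local copies of the verdict values from <linux/netfilter.h>. */
#define NF_DROP   0
#define NF_ACCEPT 1
#define NF_REPEAT 4

/* Hypothetical stand-in for l4proto->packet(): because NF_DROP is 0,
 * "-NF_DROP" and "NF_DROP" are the same value, so a ret < 0 test in the
 * caller would never see it. */
static int l4_packet(int want_repeat)
{
	return want_repeat ? -NF_REPEAT : NF_DROP;
}

int main(void)
{
	int ret = l4_packet(0);

	if (ret <= 0)		/* invalid: hand the inverted code to the core */
		ret = -ret;
	printf("verdict %d (%s)\n", ret, ret == NF_DROP ? "NF_DROP" : "other");
	return 0;
}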
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1b75c9efb0e..7a16bd462f8 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1763,6 +1763,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3, u32 pid, int report)
goto out;
}
+ exp->class = 0;
exp->expectfn = NULL;
exp->flags = 0;
exp->master = ct;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7d3944f02ea..e46f3b79adb 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -861,7 +861,7 @@ static int tcp_packet(struct nf_conn *ct,
*/
if (nf_ct_kill(ct))
return -NF_REPEAT;
- return -NF_DROP;
+ return NF_DROP;
}
/* Fall through */
case TCP_CONNTRACK_IGNORE:
@@ -894,7 +894,7 @@ static int tcp_packet(struct nf_conn *ct,
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: killing out of sync session ");
nf_ct_kill(ct);
- return -NF_DROP;
+ return NF_DROP;
}
ct->proto.tcp.last_index = index;
ct->proto.tcp.last_dir = dir;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3eae3fca29d..fd326ac27ec 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -39,7 +39,7 @@
#endif
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
-#define NFULNL_TIMEOUT_DEFAULT HZ /* every second */
+#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */
@@ -590,8 +590,10 @@ nfulnl_log_packet(u_int8_t pf,
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
- if (qthreshold > li->u.ulog.qthreshold)
- qthreshold = li->u.ulog.qthreshold;
+ if (li->u.ulog.qthreshold)
+ if (qthreshold > li->u.ulog.qthreshold)
+ qthreshold = li->u.ulog.qthreshold;
+
switch (inst->copy_mode) {
case NFULNL_COPY_META:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bfcac92d556..509a95621f9 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -843,59 +843,143 @@ static const struct file_operations xt_table_ops = {
.release = seq_release_net,
};
-static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+/*
+ * Traversal state for the ip{,6}_tables match/target proc files,
+ * used to cross the multi-AF mutexes.
+ */
+struct nf_mttg_trav {
+ struct list_head *head, *curr;
+ uint8_t class, nfproto;
+};
+
+enum {
+ MTTG_TRAV_INIT,
+ MTTG_TRAV_NFP_UNSPEC,
+ MTTG_TRAV_NFP_SPEC,
+ MTTG_TRAV_DONE,
+};
+
+static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
+ bool is_target)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ static const uint8_t next_class[] = {
+ [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
+ [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
+ };
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_INIT:
+ trav->class = MTTG_TRAV_NFP_UNSPEC;
+ mutex_lock(&xt[NFPROTO_UNSPEC].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match;
+ break;
+ case MTTG_TRAV_NFP_UNSPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ mutex_lock(&xt[trav->nfproto].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[trav->nfproto].target : &xt[trav->nfproto].match;
+ trav->class = next_class[trav->class];
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ /* fallthru, _stop will unlock */
+ default:
+ return NULL;
+ }
- mutex_lock(&xt[af].mutex);
- return seq_list_start(&xt[af].match, *pos);
+ if (ppos != NULL)
+ ++*ppos;
+ return trav;
}
-static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
+ bool is_target)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ struct nf_mttg_trav *trav = seq->private;
+ unsigned int j;
- return seq_list_next(v, &xt[af].match, pos);
+ trav->class = MTTG_TRAV_INIT;
+ for (j = 0; j < *pos; ++j)
+ if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL)
+ return NULL;
+ return trav;
}
-static void xt_match_seq_stop(struct seq_file *seq, void *v)
+static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- struct proc_dir_entry *pde = seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ mutex_unlock(&xt[trav->nfproto].mutex);
+ break;
+ }
+}
- mutex_unlock(&xt[af].mutex);
+static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return xt_mttg_seq_start(seq, pos, false);
}
-static int xt_match_seq_show(struct seq_file *seq, void *v)
+static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct xt_match *match = list_entry(v, struct xt_match, list);
+ return xt_mttg_seq_next(seq, v, ppos, false);
+}
- if (strlen(match->name))
- return seq_printf(seq, "%s\n", match->name);
- else
- return 0;
+static int xt_match_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_match *match;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ match = list_entry(trav->curr, struct xt_match, list);
+ return (*match->name == '\0') ? 0 :
+ seq_printf(seq, "%s\n", match->name);
+ }
+ return 0;
}
static const struct seq_operations xt_match_seq_ops = {
.start = xt_match_seq_start,
.next = xt_match_seq_next,
- .stop = xt_match_seq_stop,
+ .stop = xt_mttg_seq_stop,
.show = xt_match_seq_show,
};
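The seq_file plumbing above backs the per-family /proc/net/*_tables_matches and *_tables_targets files: the NFPROTO_UNSPEC list is walked first, then the family-specific list, and each mutex is held only while its own list is being traversed. A minimal userspace reader for the IPv4 match list; whether the file exists depends on ip_tables being loaded:

#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/proc/net/ip_tables_matches", "r");

	if (!f) {
		perror("ip_tables_matches");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		printf("match: %s", line);	/* one extension name per line */
	fclose(f);
	return 0;
}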
static int xt_match_open(struct inode *inode, struct file *file)
{
+ struct seq_file *seq;
+ struct nf_mttg_trav *trav;
int ret;
- ret = seq_open(file, &xt_match_seq_ops);
- if (!ret) {
- struct seq_file *seq = file->private_data;
+ trav = kmalloc(sizeof(*trav), GFP_KERNEL);
+ if (trav == NULL)
+ return -ENOMEM;
- seq->private = PDE(inode);
+ ret = seq_open(file, &xt_match_seq_ops);
+ if (ret < 0) {
+ kfree(trav);
+ return ret;
}
- return ret;
+
+ seq = file->private_data;
+ seq->private = trav;
+ trav->nfproto = (unsigned long)PDE(inode)->data;
+ return 0;
}
static const struct file_operations xt_match_ops = {
@@ -903,62 +987,63 @@ static const struct file_operations xt_match_ops = {
.open = xt_match_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- mutex_lock(&xt[af].mutex);
- return seq_list_start(&xt[af].target, *pos);
+ return xt_mttg_seq_start(seq, pos, true);
}
-static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- return seq_list_next(v, &xt[af].target, pos);
-}
-
-static void xt_target_seq_stop(struct seq_file *seq, void *v)
-{
- struct proc_dir_entry *pde = seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- mutex_unlock(&xt[af].mutex);
+ return xt_mttg_seq_next(seq, v, ppos, true);
}
static int xt_target_seq_show(struct seq_file *seq, void *v)
{
- struct xt_target *target = list_entry(v, struct xt_target, list);
-
- if (strlen(target->name))
- return seq_printf(seq, "%s\n", target->name);
- else
- return 0;
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_target *target;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ target = list_entry(trav->curr, struct xt_target, list);
+ return (*target->name == '\0') ? 0 :
+ seq_printf(seq, "%s\n", target->name);
+ }
+ return 0;
}
static const struct seq_operations xt_target_seq_ops = {
.start = xt_target_seq_start,
.next = xt_target_seq_next,
- .stop = xt_target_seq_stop,
+ .stop = xt_mttg_seq_stop,
.show = xt_target_seq_show,
};
static int xt_target_open(struct inode *inode, struct file *file)
{
+ struct seq_file *seq;
+ struct nf_mttg_trav *trav;
int ret;
- ret = seq_open(file, &xt_target_seq_ops);
- if (!ret) {
- struct seq_file *seq = file->private_data;
+ trav = kmalloc(sizeof(*trav), GFP_KERNEL);
+ if (trav == NULL)
+ return -ENOMEM;
- seq->private = PDE(inode);
+ ret = seq_open(file, &xt_target_seq_ops);
+ if (ret < 0) {
+ kfree(trav);
+ return ret;
}
- return ret;
+
+ seq = file->private_data;
+ seq->private = trav;
+ trav->nfproto = (unsigned long)PDE(inode)->data;
+ return 0;
}
static const struct file_operations xt_target_ops = {
@@ -966,7 +1051,7 @@ static const struct file_operations xt_target_ops = {
.open = xt_target_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
#define FORMAT_TABLES "_tables_names"
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index fe80b614a40..791e030ea90 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -542,7 +542,7 @@ recent_mt_proc_write(struct file *file, const char __user *input,
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
- union nf_inet_addr addr;
+ union nf_inet_addr addr = {};
u_int16_t family;
bool add, succ;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5b33879c642..b73d4e61c5a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -85,6 +85,7 @@ struct netlink_sock {
#define NETLINK_KERNEL_SOCKET 0x1
#define NETLINK_RECV_PKTINFO 0x2
+#define NETLINK_BROADCAST_SEND_ERROR 0x4
static inline struct netlink_sock *nlk_sk(struct sock *sk)
{
@@ -995,12 +996,15 @@ static inline int do_one_broadcast(struct sock *sk,
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
netlink_overrun(sk);
- p->delivery_failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else {
p->congested |= val;
p->delivered = 1;
@@ -1045,10 +1049,9 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
netlink_unlock_table();
- if (info.skb2)
- kfree_skb(info.skb2);
+ kfree_skb(info.skb2);
- if (info.delivery_failure || info.failure)
+ if (info.delivery_failure)
return -ENOBUFS;
if (info.delivered) {
@@ -1088,6 +1091,13 @@ out:
return 0;
}
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @pid: the PID of a process that we want to skip (if any)
+ * @group: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ */
void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
{
struct netlink_set_err_data info;
@@ -1097,7 +1107,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
info.exclude_sk = ssk;
info.pid = pid;
info.group = group;
- info.code = code;
+ /* sk->sk_err wants a positive error value */
+ info.code = -code;
read_lock(&nl_table_lock);
@@ -1164,6 +1175,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
err = 0;
break;
}
+ case NETLINK_BROADCAST_ERROR:
+ if (val)
+ nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+ else
+ nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
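From userspace this is an ordinary boolean option at the SOL_NETLINK level: a listener that sets it asks the kernel to propagate delivery failures back to the broadcast sender (as -ENOBUFS) instead of only flagging them on the listener's own socket. A hedged sketch, assuming NETLINK_BROADCAST_ERROR is exported by the matching <linux/netlink.h> change (value 4 in this series) and falling back to local defines otherwise:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
#ifndef NETLINK_BROADCAST_ERROR
#define NETLINK_BROADCAST_ERROR 4	/* value added by this patch set */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 ||
	    setsockopt(fd, SOL_NETLINK, NETLINK_BROADCAST_ERROR,
		       &one, sizeof(one)) < 0) {
		perror("NETLINK_BROADCAST_ERROR");
		return 1;
	}
	printf("delivery failures will now be reported to broadcasters\n");
	return 0;
}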
@@ -1196,6 +1214,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
err = 0;
break;
+ case NETLINK_BROADCAST_ERROR:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
@@ -1522,8 +1550,7 @@ EXPORT_SYMBOL(netlink_set_nonroot);
static void netlink_destroy_callback(struct netlink_callback *cb)
{
- if (cb->skb)
- kfree_skb(cb->skb);
+ kfree_skb(cb->skb);
kfree(cb);
}
@@ -1740,12 +1767,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
exclude_pid = pid;
}
- /* errors reported via destination sk->sk_err */
- nlmsg_multicast(sk, skb, exclude_pid, group, flags);
+ /* errors reported via destination sk->sk_err, but propagate
+ * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+ err = nlmsg_multicast(sk, skb, exclude_pid, group, flags);
}
- if (report)
- err = nlmsg_unicast(sk, skb, pid);
+ if (report) {
+ int err2;
+
+ err2 = nlmsg_unicast(sk, skb, pid);
+ if (!err || err == -ESRCH)
+ err = err2;
+ }
return err;
}
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index cba7849de98..6d9c58ec56a 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1037,6 +1037,10 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
unsigned char *asmptr;
int size;
+ /* NetRom empty data frame has no meaning: don't send */
+ if (len == 0)
+ return 0;
+
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
return -EINVAL;
@@ -1167,6 +1171,11 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
skb_reset_transport_header(skb);
copied = skb->len;
+ /* NetRom empty data frame has no meaning: ignore it */
+ if (copied == 0) {
+ goto out;
+ }
+
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
@@ -1182,7 +1191,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
msg->msg_namelen = sizeof(*sax);
- skb_free_datagram(sk, skb);
+out: skb_free_datagram(sk, skb);
release_sock(sk);
return copied;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fc4a7885c4..74776de523e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -584,7 +584,7 @@ drop_n_restore:
skb->len = skb_len;
}
drop:
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -756,8 +756,7 @@ ring_is_full:
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, 0);
- if (copy_skb)
- kfree_skb(copy_skb);
+ kfree_skb(copy_skb);
goto drop_n_restore;
}
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 81795ea8779..a662e62a99c 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -382,9 +382,8 @@ out:
return NET_RX_DROP;
}
-static struct packet_type phonet_packet_type = {
+static struct packet_type phonet_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_PHONET),
- .dev = NULL,
.func = phonet_rcv,
};
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 1ceea1f9241..cec4e595168 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, dev_net(dev), 0,
- RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+ rtnl_notify(skb, dev_net(dev), 0,
+ RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 00000000000..796773b5df9
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
+
+config RDS
+ tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
+ depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
+ depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+ ---help---
+ RDS provides reliable, sequenced delivery of datagrams
+ over InfiniBand.
+
+config RDS_DEBUG
+ bool "Debugging messages"
+ depends on RDS
+ default n
+
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 00000000000..51f27585fa0
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
+obj-$(CONFIG_RDS) += rds.o
+rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
+ recv.o send.o stats.o sysctl.o threads.o transport.o \
+ loop.o page.o rdma.o \
+ rdma_transport.o \
+ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+ ib_sysctl.o ib_rdma.o \
+ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
+ iw_sysctl.o iw_rdma.o
+
+ifeq ($(CONFIG_RDS_DEBUG), y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 00000000000..20cf16fc572
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/poll.h>
+#include <linux/version.h>
+#include <net/sock.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "rdma_transport.h"
+
+/* this is just used for stats gathering :/ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path. sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+static int rds_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ if (sk == NULL)
+ goto out;
+
+ rs = rds_sk_to_rs(sk);
+
+ sock_orphan(sk);
+ /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
+ * that ensures the recv path has completed messing
+ * with the socket. */
+ rds_clear_recv_queue(rs);
+ rds_cong_remove_socket(rs);
+ rds_remove_bound(rs);
+ rds_send_drop_to(rs, NULL);
+ rds_rdma_drop_keys(rs);
+ rds_notify_queue_get(rs, NULL);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+ list_del_init(&rs->rs_item);
+ rds_sock_count--;
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ sock->sk = NULL;
+ sock_put(sk);
+out:
+ return 0;
+}
+
+/*
+ * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ __rds_wake_sk_sleep(rds_rs_to_sk(rs));
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ /* racy, don't care */
+ if (peer) {
+ if (!rs->rs_conn_addr)
+ return -ENOTCONN;
+
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr;
+ } else {
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr;
+ }
+
+ sin->sin_family = AF_INET;
+
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ * - there is data on the receive queue.
+ * - to signal that a previously congested destination may have become
+ * uncongested
+ * - A notification has been queued to the socket (this can be a congestion
+ * update, or a RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ */
+static unsigned int rds_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ unsigned int mask = 0;
+ unsigned long flags;
+
+ poll_wait(file, sk->sk_sleep, wait);
+
+ poll_wait(file, &rds_poll_waitq, wait);
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!rs->rs_cong_monitor) {
+ /* When a congestion map was updated, we signal POLLIN for
+ * "historical" reasons. Applications can also poll for
+ * WRBAND instead. */
+ if (rds_cong_updated_since(&rs->rs_cong_track))
+ mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+ } else {
+ spin_lock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ mask |= (POLLIN | POLLRDNORM);
+ spin_unlock(&rs->rs_lock);
+ }
+ if (!list_empty(&rs->rs_recv_queue)
+ || !list_empty(&rs->rs_notify_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+ mask |= (POLLOUT | POLLWRNORM);
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ return mask;
+}
+
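As the comment above rds_poll() explains, POLLIN may mean queued data, a congestion-map update, or a queued notification, while POLLOUT only promises local send-queue room and not that the next sendmsg() will succeed. A minimal userspace sketch of waiting on an RDS socket; AF_RDS comes from the socket.h change elsewhere in this series, so it is guarded here with the value (21) used there:

#include <stdio.h>
#include <poll.h>
#include <sys/socket.h>

#ifndef AF_RDS
#define AF_RDS 21	/* PF_RDS value registered by this patch set */
#endif

int main(void)
{
	struct pollfd pfd;
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket(AF_RDS)");
		return 1;
	}
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	if (poll(&pfd, 1, 1000) > 0) {
		if (pfd.revents & POLLIN)
			printf("data, congestion update or notification queued\n");
		if (pfd.revents & POLLOUT)
			printf("room on the send queue (send may still fail)\n");
	}
	return 0;
}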
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
+static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
+ int len)
+{
+ struct sockaddr_in sin;
+ int ret = 0;
+
+ /* racing with another thread binding seems ok here */
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_from_user(&sin, optval, sizeof(sin))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_send_drop_to(rs, &sin);
+out:
+ return ret;
+}
+
+static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+ int optlen)
+{
+ int value;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(value, (int __user *) optval))
+ return -EFAULT;
+ *optvar = !!value;
+ return 0;
+}
+
+static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ int ret;
+
+ ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+ if (ret == 0) {
+ if (rs->rs_cong_monitor) {
+ rds_cong_add_socket(rs);
+ } else {
+ rds_cong_remove_socket(rs);
+ rs->rs_cong_mask = 0;
+ rs->rs_cong_notify = 0;
+ }
+ }
+ return ret;
+}
+
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret;
+
+ if (level != SOL_RDS) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_CANCEL_SENT_TO:
+ ret = rds_cancel_sent_to(rs, optval, optlen);
+ break;
+ case RDS_GET_MR:
+ ret = rds_get_mr(rs, optval, optlen);
+ break;
+ case RDS_FREE_MR:
+ ret = rds_free_mr(rs, optval, optlen);
+ break;
+ case RDS_RECVERR:
+ ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+ break;
+ case RDS_CONG_MONITOR:
+ ret = rds_cong_monitor(rs, optval, optlen);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+out:
+ return ret;
+}
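The socket options dispatched above live at the SOL_RDS level. As one concrete example, RDS_CANCEL_SENT_TO (handled by rds_cancel_sent_to() earlier in this file) drops any messages still queued toward a given destination. A hedged userspace sketch: SOL_RDS and RDS_CANCEL_SENT_TO are assumed to carry the values given to them by the uapi headers added elsewhere in this series (real programs should use <linux/rds.h>), and the socket would additionally have to be bound for the call to succeed:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef AF_RDS
#define AF_RDS 21			/* assumption: value from this series */
#endif
#ifndef SOL_RDS
#define SOL_RDS 276			/* assumption: value from this series */
#endif
#ifndef RDS_CANCEL_SENT_TO
#define RDS_CANCEL_SENT_TO 1		/* assumption: value from this series */
#endif

int main(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket(AF_RDS)");
		return 1;
	}
	/* Drop queued-but-unsent messages to 192.0.2.1:4000; note that
	 * rds_cancel_sent_to() above requires the socket to be bound. */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr("192.0.2.1");
	sin.sin_port = htons(4000);
	if (setsockopt(fd, SOL_RDS, RDS_CANCEL_SENT_TO, &sin, sizeof(sin)) < 0)
		perror("RDS_CANCEL_SENT_TO");
	return 0;
}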
+
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret = -ENOPROTOOPT, len;
+
+ if (level != SOL_RDS)
+ goto out;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_INFO_FIRST ... RDS_INFO_LAST:
+ ret = rds_info_getsockopt(sock, optname, optval,
+ optlen);
+ break;
+
+ case RDS_RECVERR:
+ if (len < sizeof(int))
+ ret = -EINVAL;
+ else
+ if (put_user(rs->rs_recverr, (int __user *) optval)
+ || put_user(sizeof(int), optlen))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+out:
+ return ret;
+
+}
+
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (sin->sin_family != AF_INET) {
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ goto out;
+ }
+
+ rs->rs_conn_addr = sin->sin_addr.s_addr;
+ rs->rs_conn_port = sin->sin_port;
+
+out:
+ release_sock(sk);
+ return ret;
+}
+
+static struct proto rds_proto = {
+ .name = "RDS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rds_sock),
+};
+
+static struct proto_ops rds_proto_ops = {
+ .family = AF_RDS,
+ .owner = THIS_MODULE,
+ .release = rds_release,
+ .bind = rds_bind,
+ .connect = rds_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = rds_getname,
+ .poll = rds_poll,
+ .ioctl = rds_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = rds_setsockopt,
+ .getsockopt = rds_getsockopt,
+ .sendmsg = rds_sendmsg,
+ .recvmsg = rds_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+ unsigned long flags;
+ struct rds_sock *rs;
+
+ sock_init_data(sock, sk);
+ sock->ops = &rds_proto_ops;
+ sk->sk_protocol = protocol;
+
+ rs = rds_sk_to_rs(sk);
+ spin_lock_init(&rs->rs_lock);
+ rwlock_init(&rs->rs_recv_lock);
+ INIT_LIST_HEAD(&rs->rs_send_queue);
+ INIT_LIST_HEAD(&rs->rs_recv_queue);
+ INIT_LIST_HEAD(&rs->rs_notify_queue);
+ INIT_LIST_HEAD(&rs->rs_cong_list);
+ spin_lock_init(&rs->rs_rdma_lock);
+ rs->rs_rdma_keys = RB_ROOT;
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+ list_add_tail(&rs->rs_item, &rds_sock_list);
+ rds_sock_count++;
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ return 0;
+}
+
+static int rds_create(struct net *net, struct socket *sock, int protocol)
+{
+ struct sock *sk;
+
+ if (sock->type != SOCK_SEQPACKET || protocol)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ return __rds_create(sock, sk, protocol);
+}
+
+void rds_sock_addref(struct rds_sock *rs)
+{
+ sock_hold(rds_rs_to_sk(rs));
+}
+
+void rds_sock_put(struct rds_sock *rs)
+{
+ sock_put(rds_rs_to_sk(rs));
+}
+
+static struct net_proto_family rds_family_ops = {
+ .family = AF_RDS,
+ .create = rds_create,
+ .owner = THIS_MODULE,
+};
+
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_sock *rs;
+ struct sock *sk;
+ struct rds_incoming *inc;
+ unsigned long flags;
+ unsigned int total = 0;
+
+ len /= sizeof(struct rds_info_message);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sk = rds_rs_to_sk(rs);
+ read_lock(&rs->rs_recv_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(inc, iter, inc->i_saddr,
+ rs->rs_bound_addr, 1);
+ }
+
+ read_unlock(&rs->rs_recv_lock);
+ }
+
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_sock_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_info_socket sinfo;
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ len /= sizeof(struct rds_info_socket);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+
+ if (len < rds_sock_count)
+ goto out;
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sinfo.sndbuf = rds_sk_sndbuf(rs);
+ sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+ sinfo.bound_addr = rs->rs_bound_addr;
+ sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_port = rs->rs_bound_port;
+ sinfo.connected_port = rs->rs_conn_port;
+ sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+ rds_info_copy(iter, &sinfo, sizeof(sinfo));
+ }
+
+out:
+ lens->nr = rds_sock_count;
+ lens->each = sizeof(struct rds_info_socket);
+
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+}
+
+static void __exit rds_exit(void)
+{
+ rds_rdma_exit();
+ sock_unregister(rds_family_ops.family);
+ proto_unregister(&rds_proto);
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_sysctl_exit();
+ rds_threads_exit();
+ rds_stats_exit();
+ rds_page_exit();
+ rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+}
+module_exit(rds_exit);
+
+static int __init rds_init(void)
+{
+ int ret;
+
+ ret = rds_conn_init();
+ if (ret)
+ goto out;
+ ret = rds_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rds_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rds_stats_init();
+ if (ret)
+ goto out_sysctl;
+ ret = proto_register(&rds_proto, 1);
+ if (ret)
+ goto out_stats;
+ ret = sock_register(&rds_family_ops);
+ if (ret)
+ goto out_proto;
+
+ rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+
+ /* ib/iwarp transports currently compiled-in */
+ ret = rds_rdma_init();
+ if (ret)
+ goto out_sock;
+ goto out;
+
+out_sock:
+ sock_unregister(rds_family_ops.family);
+out_proto:
+ proto_unregister(&rds_proto);
+out_stats:
+ rds_stats_exit();
+out_sysctl:
+ rds_sysctl_exit();
+out_threads:
+ rds_threads_exit();
+out_conn:
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_page_exit();
+out:
+ return ret;
+}
+module_init(rds_init);
+
+#define DRV_VERSION "4.0"
+#define DRV_RELDATE "Feb 12, 2009"
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+ " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 00000000000..c17cc39160c
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include "rds.h"
+
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
+static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rb_root rds_bind_tree = RB_ROOT;
+
+static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
+ struct rds_sock *insert)
+{
+ struct rb_node **p = &rds_bind_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_sock *rs;
+ u64 cmp;
+ u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
+
+ while (*p) {
+ parent = *p;
+ rs = rb_entry(parent, struct rds_sock, rs_bound_node);
+
+ cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
+ be16_to_cpu(rs->rs_bound_port);
+
+ if (needle < cmp)
+ p = &(*p)->rb_left;
+ else if (needle > cmp)
+ p = &(*p)->rb_right;
+ else
+ return rs;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->rs_bound_node, parent, p);
+ rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+ }
+ return NULL;
+}
+
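rds_bind_tree_walk() above orders sockets by a single 64-bit key with the IPv4 address in the high half and the port in the low half, so lookup and insert share one comparison loop. A tiny worked example of the packing, independent of the kernel types:

#include <stdio.h>
#include <stdint.h>

/* Pack a host-order IPv4 address and port the same way the bind tree does:
 * address in bits 63..32, port in bits 15..0. */
static uint64_t bind_key(uint32_t addr, uint16_t port)
{
	return ((uint64_t)addr << 32) | port;
}

int main(void)
{
	/* 10.0.0.1 == 0x0a000001, port 0x1234 */
	printf("key = 0x%016llx\n",
	       (unsigned long long)bind_key(0x0a000001u, 0x1234));
	return 0;
}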
+/*
+ * Return the rds_sock bound at the given local address.
+ *
+ * The rx path can race with rds_release. We notice if rds_release() has
+ * marked this socket and don't return a rs ref to the rx path.
+ */
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+{
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+ rs = rds_bind_tree_walk(addr, port, NULL);
+ if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
+ rds_sock_addref(rs);
+ else
+ rs = NULL;
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+ rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
+ ntohs(port));
+ return rs;
+}
+
+/* returns -ve errno or +ve port */
+static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+{
+ unsigned long flags;
+ int ret = -EADDRINUSE;
+ u16 rover, last;
+
+ if (*port != 0) {
+ rover = be16_to_cpu(*port);
+ last = rover;
+ } else {
+ rover = max_t(u16, net_random(), 2);
+ last = rover - 1;
+ }
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ do {
+ if (rover == 0)
+ rover++;
+ if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+ *port = cpu_to_be16(rover);
+ ret = 0;
+ break;
+ }
+ } while (rover++ != last);
+
+ if (ret == 0) {
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = *port;
+ rds_sock_addref(rs);
+
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
+ }
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+ return ret;
+}
+
+void rds_remove_bound(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ if (rs->rs_bound_addr) {
+ rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port));
+
+ rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+ rds_sock_put(rs);
+ rs->rs_bound_addr = 0;
+ }
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+}
+
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct rds_transport *trans;
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in) ||
+ sin->sin_family != AF_INET ||
+ rs->rs_bound_addr ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ if (ret)
+ goto out;
+
+ trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ if (trans == NULL) {
+ ret = -EADDRNOTAVAIL;
+ rds_remove_bound(rs);
+ goto out;
+ }
+
+ rs->rs_transport = trans;
+ ret = 0;
+
+out:
+ release_sock(sk);
+ return ret;
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 00000000000..710e4599d76
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include <asm-generic/bitops/le.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs. An application that regularly encounters this
+ * "back-pressure" is considered buggy.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested. As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up. This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently. This is much easier to implement than some
+ * finer-grained communication of per-port congestion. The sender does a very
+ * inexpensive bit test to check whether the port it's about to send to is
+ * congested or not.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock. It's used so infrequently that it's worth keeping it
+ * global to simplify the locking. It's only used in the following
+ * circumstances:
+ *
+ * - on connection buildup to associate a conn with its maps
+ * - on map changes to inform conns of a new map to send
+ *
+ * It's sadly ordered under the socket callback lock and the connection lock.
+ * Receive paths can mark ports congested from interrupt context so the
+ * lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+ struct rds_cong_map *insert)
+{
+ struct rb_node **p = &rds_cong_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_cong_map *map;
+
+ while (*p) {
+ parent = *p;
+ map = rb_entry(parent, struct rds_cong_map, m_rb_node);
+
+ if (addr < map->m_addr)
+ p = &(*p)->rb_left;
+ else if (addr > map->m_addr)
+ p = &(*p)->rb_right;
+ else
+ return map;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->m_rb_node, parent, p);
+ rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+ }
+ return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address. Connections try to allocate
+ * these bitmaps, getting pointers to them in the process. The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+{
+ struct rds_cong_map *map;
+ struct rds_cong_map *ret = NULL;
+ unsigned long zp;
+ unsigned long i;
+ unsigned long flags;
+
+ map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+ if (map == NULL)
+ return NULL;
+
+ map->m_addr = addr;
+ init_waitqueue_head(&map->m_waitq);
+ INIT_LIST_HEAD(&map->m_conn_list);
+
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ zp = get_zeroed_page(GFP_KERNEL);
+ if (zp == 0)
+ goto out;
+ map->m_page_addrs[i] = zp;
+ }
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ ret = rds_cong_tree_walk(addr, map);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (ret == NULL) {
+ ret = map;
+ map = NULL;
+ }
+
+out:
+ if (map) {
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+
+ rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+
+ return ret;
+}
+
+/*
+ * Put the conn on its local map's list. This is called when the conn is
+ * really added to the hash. It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_del_init(&conn->c_map_item);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+ conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+
+ if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+ struct rds_connection *conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+
+ list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+ if (!test_and_set_bit(0, &conn->c_map_queued)) {
+ rds_stats_inc(s_cong_update_queued);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+ rdsdebug("waking map %p for %pI4\n",
+ map, &map->m_addr);
+ rds_stats_inc(s_cong_update_received);
+ atomic_inc(&rds_cong_generation);
+ if (waitqueue_active(&map->m_waitq))
+ wake_up(&map->m_waitq);
+ if (waitqueue_active(&rds_poll_waitq))
+ wake_up_all(&rds_poll_waitq);
+
+ if (portmask && !list_empty(&rds_cong_monitor)) {
+ unsigned long flags;
+ struct rds_sock *rs;
+
+ read_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+ spin_lock(&rs->rs_lock);
+ rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+ rs->rs_cong_mask &= ~portmask;
+ spin_unlock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ rds_wake_sk_sleep(rs);
+ }
+ read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+ }
+}
+
+int rds_cong_updated_since(unsigned long *recent)
+{
+ unsigned long gen = atomic_read(&rds_cong_generation);
+
+ if (likely(*recent == gen))
+ return 0;
+ *recent = gen;
+ return 1;
+}
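
To make the generation-counter scheme concrete, here is a hedged sketch of a poll-side consumer; the rs_cong_track field name is an assumption for illustration (the in-tree consumer is the RDS poll code, not this file), and the poll flags assume <linux/poll.h>.

    /* Illustrative only: wake poll() callers when any congestion map
     * update has arrived since this socket last looked. */
    static unsigned int example_cong_poll_mask(struct rds_sock *rs)
    {
        unsigned int mask = 0;

        if (rds_cong_updated_since(&rs->rs_cong_track))
            mask |= POLLIN | POLLOUT;   /* recheck congested destinations */

        return mask;
    }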
+
+/*
+ * We're called under the locking that protects the socket's receive buffer
+ * consumption. This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("setting congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("clearing congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ if (list_empty(&rs->rs_cong_list))
+ list_add(&rs->rs_cong_list, &rds_cong_monitor);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+ struct rds_cong_map *map;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_del_init(&rs->rs_cong_list);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+ /* update congestion map for now-closed port */
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+ rds_cong_clear_bit(map, rs->rs_bound_port);
+ rds_cong_queue_updates(map);
+ }
+}
+
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+ struct rds_sock *rs)
+{
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ if (nonblock) {
+ if (rs && rs->rs_cong_monitor) {
+ unsigned long flags;
+
+ /* It would have been nice to have an atomic set_bit on
+ * a uint64_t. */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ /* Test again - a congestion update may have arrived in
+ * the meantime. */
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ }
+ rds_stats_inc(s_cong_send_error);
+ return -ENOBUFS;
+ }
+
+ rds_stats_inc(s_cong_send_blocked);
+ rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
+
+ return wait_event_interruptible(map->m_waitq,
+ !rds_cong_test_bit(map, port));
+}
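
For illustration, a minimal sketch of how a send path could use rds_cong_wait() to honour the back-pressure described at the top of this file; the actual send-side caller lives in the RDS send code, and the error mapping here is only an example.

    /* Illustrative only: consult the peer's congestion map before queueing. */
    static int example_check_dest_congestion(struct rds_connection *conn,
                                             __be16 dport, int nonblock,
                                             struct rds_sock *rs)
    {
        int ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);

        if (ret == -ENOBUFS)
            return -EAGAIN;     /* non-blocking send would block */

        return ret;             /* 0, or -ERESTARTSYS if interrupted */
    }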
+
+void rds_cong_exit(void)
+{
+ struct rb_node *node;
+ struct rds_cong_map *map;
+ unsigned long i;
+
+ while ((node = rb_first(&rds_cong_tree))) {
+ map = rb_entry(node, struct rds_cong_map, m_rb_node);
+ rdsdebug("freeing map %p\n", map);
+ rb_erase(&map->m_rb_node, &rds_cong_tree);
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+}
+
+/*
+ * Allocate an RDS message containing a congestion update.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+ struct rds_cong_map *map = conn->c_lcong;
+ struct rds_message *rm;
+
+ rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
+ if (!IS_ERR(rm))
+ rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+
+ return rm;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 00000000000..273f064930a
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <net/inet_hashtables.h>
+
+#include "rds.h"
+#include "loop.h"
+#include "rdma.h"
+
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+{
+ /* Pass NULL, don't need struct net for hash */
+ unsigned long hash = inet_ehashfn(NULL,
+ be32_to_cpu(laddr), 0,
+ be32_to_cpu(faddr), 0);
+ return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
+}
+
+#define rds_conn_info_set(var, test, suffix) do { \
+ if (test) \
+ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
+} while (0)
+
+static inline int rds_conn_is_sending(struct rds_connection *conn)
+{
+ int ret = 0;
+
+ if (!mutex_trylock(&conn->c_send_lock))
+ ret = 1;
+ else
+ mutex_unlock(&conn->c_send_lock);
+
+ return ret;
+}
+
+static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+ __be32 laddr, __be32 faddr,
+ struct rds_transport *trans)
+{
+ struct rds_connection *conn, *ret = NULL;
+ struct hlist_node *pos;
+
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
+ conn->c_trans == trans) {
+ ret = conn;
+ break;
+ }
+ }
+ rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
+ &laddr, &faddr);
+ return ret;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future. It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+void rds_conn_reset(struct rds_connection *conn)
+{
+ rdsdebug("connection %pI4 to %pI4 reset\n",
+ &conn->c_laddr, &conn->c_faddr);
+
+ rds_stats_inc(s_conn_reset);
+ rds_send_reset(conn);
+ conn->c_flags = 0;
+
+ /* Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS. */
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int is_outgoing)
+{
+ struct rds_connection *conn, *tmp, *parent = NULL;
+ struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ conn = rds_conn_lookup(head, laddr, faddr, trans);
+ if (conn
+ && conn->c_loopback
+ && conn->c_trans != &rds_loop_transport
+ && !is_outgoing) {
+ /* This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP. */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+ if (conn)
+ goto out;
+
+ conn = kmem_cache_alloc(rds_conn_slab, gfp);
+ if (conn == NULL) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ memset(conn, 0, sizeof(*conn));
+
+ INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_version = RDS_PROTOCOL_3_0;
+ conn->c_laddr = laddr;
+ conn->c_faddr = faddr;
+ spin_lock_init(&conn->c_lock);
+ conn->c_next_tx_seq = 1;
+
+ mutex_init(&conn->c_send_lock);
+ INIT_LIST_HEAD(&conn->c_send_queue);
+ INIT_LIST_HEAD(&conn->c_retrans);
+
+ ret = rds_cong_get_maps(conn);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+ * can bind to the destination address then we'd rather have the messages
+ * flow over loopback than through either transport.
+ */
+ if (rds_trans_get_preferred(faddr)) {
+ conn->c_loopback = 1;
+ if (is_outgoing && trans->t_prefer_loopback) {
+ /* "outgoing" connection - and the transport
+ * says it wants the connection handled by the
+ * loopback transport. This is what TCP does.
+ */
+ trans = &rds_loop_transport;
+ }
+ }
+
+ conn->c_trans = trans;
+
+ ret = trans->conn_alloc(conn, gfp);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ atomic_set(&conn->c_state, RDS_CONN_DOWN);
+ conn->c_reconnect_jiffies = 0;
+ INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
+ INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
+ INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
+ INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
+ mutex_init(&conn->c_cm_lock);
+ conn->c_flags = 0;
+
+ rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
+ conn, &laddr, &faddr,
+ trans->t_name ? trans->t_name : "[unknown]",
+ is_outgoing ? "(outgoing)" : "");
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ if (parent == NULL) {
+ tmp = rds_conn_lookup(head, laddr, faddr, trans);
+ if (tmp == NULL)
+ hlist_add_head(&conn->c_hash_node, head);
+ } else {
+ tmp = parent->c_passive;
+ if (!tmp)
+ parent->c_passive = conn;
+ }
+
+ if (tmp) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = tmp;
+ } else {
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+out:
+ return conn;
+}
+
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+}
+
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+}
+
+void rds_conn_destroy(struct rds_connection *conn)
+{
+ struct rds_message *rm, *rtmp;
+
+ rdsdebug("freeing conn %p for %pI4 -> "
+ "%pI4\n", conn, &conn->c_laddr,
+ &conn->c_faddr);
+
+ hlist_del_init(&conn->c_hash_node);
+
+ /* wait for the rds thread to shut it down */
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ cancel_delayed_work(&conn->c_conn_w);
+ queue_work(rds_wq, &conn->c_down_w);
+ flush_workqueue(rds_wq);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &conn->c_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (conn->c_xmit_rm)
+ rds_message_put(conn->c_xmit_rm);
+
+ conn->c_trans->conn_free(conn->c_transport_data);
+
+ /*
+ * The congestion maps aren't freed up here. They're
+ * freed by rds_cong_exit() after all the connections
+ * have been freed.
+ */
+ rds_cong_remove_conn(conn);
+
+ BUG_ON(!list_empty(&conn->c_retrans));
+ kmem_cache_free(rds_conn_slab, conn);
+
+ rds_conn_count--;
+}
+
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct list_head *list;
+ struct rds_connection *conn;
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int total = 0;
+ size_t i;
+
+ len /= sizeof(struct rds_info_message);
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ if (want_send)
+ list = &conn->c_send_queue;
+ else
+ list = &conn->c_retrans;
+
+ spin_lock(&conn->c_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(&rm->m_inc, iter,
+ conn->c_laddr,
+ conn->c_faddr, 0);
+ }
+
+ spin_unlock(&conn->c_lock);
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 1);
+}
+
+static void rds_conn_message_info_retrans(struct socket *sock,
+ unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 0);
+}
+
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len)
+{
+ uint64_t buffer[(item_len + 7) / 8];
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct hlist_node *tmp;
+ struct rds_connection *conn;
+ unsigned long flags;
+ size_t i;
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+
+ /* XXX no c_lock usage.. */
+ if (!visitor(conn, buffer))
+ continue;
+
+ /* We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer. */
+ if (len >= item_len) {
+ rds_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+
+static int rds_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_connection *cinfo = buffer;
+
+ cinfo->next_tx_seq = conn->c_next_tx_seq;
+ cinfo->next_rx_seq = conn->c_next_rx_seq;
+ cinfo->laddr = conn->c_laddr;
+ cinfo->faddr = conn->c_faddr;
+ strncpy(cinfo->transport, conn->c_trans->t_name,
+ sizeof(cinfo->transport));
+ cinfo->flags = 0;
+
+ rds_conn_info_set(cinfo->flags,
+ rds_conn_is_sending(conn), SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+ CONNECTING);
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_UP,
+ CONNECTED);
+ return 1;
+}
+
+static void rds_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_conn_info_visitor,
+ sizeof(struct rds_info_connection));
+}
+
+int __init rds_conn_init(void)
+{
+ rds_conn_slab = kmem_cache_create("rds_connection",
+ sizeof(struct rds_connection),
+ 0, 0, NULL);
+ if (rds_conn_slab == NULL)
+ return -ENOMEM;
+
+ rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+
+ return 0;
+}
+
+void rds_conn_exit(void)
+{
+ rds_loop_exit();
+
+ WARN_ON(!hlist_empty(rds_conn_hash));
+
+ kmem_cache_destroy(rds_conn_slab);
+
+ rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+}
+
+/*
+ * Force a disconnect
+ */
+void rds_conn_drop(struct rds_connection *conn)
+{
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+
+ rds_conn_drop(conn);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 00000000000..06a7b798d9a
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+
+#include "rds.h"
+#include "ib.h"
+
+unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
+module_param(fmr_message_size, int, 0444);
+MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+
+struct list_head rds_ib_devices;
+
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+LIST_HEAD(ib_nodev_conns);
+
+void rds_ib_add_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle IB (no iWARP) devices */
+ if (device->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+ if (!rds_ibdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_ibdev->spinlock);
+
+ rds_ibdev->max_wrs = dev_attr->max_qp_wr;
+ rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+
+ rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
+ rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
+ rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
+ rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
+ rds_ibdev->max_fmrs = dev_attr->max_fmr ?
+ min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
+ fmr_pool_size;
+
+ rds_ibdev->dev = device;
+ rds_ibdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_ibdev->pd))
+ goto free_dev;
+
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr))
+ goto err_pd;
+
+ rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
+ if (IS_ERR(rds_ibdev->mr_pool)) {
+ rds_ibdev->mr_pool = NULL;
+ goto err_mr;
+ }
+
+ INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+ list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+ ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+
+ goto free_attr;
+
+err_mr:
+ ib_dereg_mr(rds_ibdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+ kfree(rds_ibdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+void rds_ib_remove_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
+
+ rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+ if (!rds_ibdev)
+ return;
+
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ rds_ib_remove_conns(rds_ibdev);
+
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+
+ ib_dereg_mr(rds_ibdev->mr);
+
+ while (ib_dealloc_pd(rds_ibdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_ibdev->list);
+ kfree(rds_ibdev);
+}
+
+struct ib_client rds_ib_client = {
+ .name = "rds_ib",
+ .add = rds_ib_add_one,
+ .remove = rds_ib_remove_one
+};
+
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_ib_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_ib_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_ib_device *rds_ibdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_ibdev->max_sge;
+ rds_ib_get_mr_info(rds_ibdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_ib_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_ib_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support iWARP devices unless we
+ check node_type. */
+ if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+void rds_ib_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+ rds_ib_remove_nodev_conns();
+ ib_unregister_client(&rds_ib_client);
+ rds_ib_sysctl_exit();
+ rds_ib_recv_exit();
+ rds_trans_unregister(&rds_ib_transport);
+}
+
+struct rds_transport rds_ib_transport = {
+ .laddr_check = rds_ib_laddr_check,
+ .xmit_complete = rds_ib_xmit_complete,
+ .xmit = rds_ib_xmit,
+ .xmit_cong_map = NULL,
+ .xmit_rdma = rds_ib_xmit_rdma,
+ .recv = rds_ib_recv,
+ .conn_alloc = rds_ib_conn_alloc,
+ .conn_free = rds_ib_conn_free,
+ .conn_connect = rds_ib_conn_connect,
+ .conn_shutdown = rds_ib_conn_shutdown,
+ .inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_purge = rds_ib_inc_purge,
+ .inc_free = rds_ib_inc_free,
+ .cm_initiate_connect = rds_ib_cm_initiate_connect,
+ .cm_handle_connect = rds_ib_cm_handle_connect,
+ .cm_connect_complete = rds_ib_cm_connect_complete,
+ .stats_info_copy = rds_ib_stats_info_copy,
+ .exit = rds_ib_exit,
+ .get_mr = rds_ib_get_mr,
+ .sync_mr = rds_ib_sync_mr,
+ .free_mr = rds_ib_free_mr,
+ .flush_mrs = rds_ib_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "infiniband",
+};
+
+int __init rds_ib_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_ib_devices);
+
+ ret = ib_register_client(&rds_ib_client);
+ if (ret)
+ goto out;
+
+ ret = rds_ib_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_ib_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_ib_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_ib_recv_exit();
+out_sysctl:
+ rds_ib_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rds_ib_client);
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 00000000000..8be563a1363
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,367 @@
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FMR_SIZE 256
+#define RDS_FMR_POOL_SIZE 4096
+
+#define RDS_IB_MAX_SGE 8
+#define RDS_IB_RECV_SGE 2
+
+#define RDS_IB_DEFAULT_RECV_WR 1024
+#define RDS_IB_DEFAULT_SEND_WR 256
+
+#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+ struct list_head f_item;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
+};
+
+struct rds_ib_incoming {
+ struct list_head ii_frags;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_ib_send_work {
+ struct rds_message *s_rm;
+ struct rds_rdma_op *s_op;
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IB_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_ib_recv_work {
+ struct rds_ib_incoming *r_ibinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
+struct rds_ib_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_ib_device;
+
+struct rds_ib_connection {
+
+ struct list_head ib_node;
+ struct rds_ib_device *rds_ibdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_ib_work_ring i_send_ring;
+ struct rds_message *i_rm;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_ib_send_work *i_sends;
+
+ /* rx */
+ struct mutex i_recv_mutex;
+ struct rds_ib_work_ring i_recv_ring;
+ struct rds_ib_incoming *i_ibinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_ib_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
+ u64 i_ack_recv; /* last ACK received */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+ u64 i_ack_next; /* next ACK to send */
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair of variables that we need to access
+ * atomically - one for the send credits, and one for the posted
+ * recv credits we need to transfer to the remote peer.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
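
A hedged sketch of the cmpxchg usage the comment above alludes to: atomically taking send credits out of the packed counter. The real logic is rds_ib_send_grab_credits() in ib_send.c; this helper exists only to illustrate how the packing macros combine with atomic_cmpxchg().

    /* Illustrative only: take "wanted" send credits, or none at all. */
    static inline int rds_ib_example_take_credits(atomic_t *credits, u32 wanted)
    {
        int oldval, newval;

        do {
            oldval = atomic_read(credits);
            if (IB_GET_SEND_CREDITS(oldval) < wanted)
                return 0;               /* not enough send credits */
            newval = oldval - IB_SET_SEND_CREDITS(wanted);
        } while (atomic_cmpxchg(credits, oldval, newval) != oldval);

        return wanted;
    }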
+
+struct rds_ib_ipaddr {
+ struct list_head list;
+ __be32 ipaddr;
+};
+
+struct rds_ib_device {
+ struct list_head list;
+ struct list_head ipaddr_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_ib_mr_pool *mr_pool;
+ int fmr_page_shift;
+ int fmr_page_size;
+ u64 fmr_page_mask;
+ unsigned int fmr_max_remaps;
+ unsigned int max_fmrs;
+ int max_sge;
+ unsigned int max_wrs;
+ spinlock_t spinlock; /* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID (~(u64) 0)
+
+struct rds_ib_statistics {
+ uint64_t s_ib_connect_raced;
+ uint64_t s_ib_listen_closed_stale;
+ uint64_t s_ib_tx_cq_call;
+ uint64_t s_ib_tx_cq_event;
+ uint64_t s_ib_tx_ring_full;
+ uint64_t s_ib_tx_throttle;
+ uint64_t s_ib_tx_sg_mapping_failure;
+ uint64_t s_ib_tx_stalled;
+ uint64_t s_ib_tx_credit_updates;
+ uint64_t s_ib_rx_cq_call;
+ uint64_t s_ib_rx_cq_event;
+ uint64_t s_ib_rx_ring_empty;
+ uint64_t s_ib_rx_refill_from_cq;
+ uint64_t s_ib_rx_refill_from_thread;
+ uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_credit_updates;
+ uint64_t s_ib_ack_sent;
+ uint64_t s_ib_ack_send_failure;
+ uint64_t s_ib_ack_send_delayed;
+ uint64_t s_ib_ack_send_piggybacked;
+ uint64_t s_ib_ack_received;
+ uint64_t s_ib_rdma_mr_alloc;
+ uint64_t s_ib_rdma_mr_free;
+ uint64_t s_ib_rdma_mr_used;
+ uint64_t s_ib_rdma_mr_pool_flush;
+ uint64_t s_ib_rdma_mr_pool_wait;
+ uint64_t s_ib_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+extern void rds_ib_add_one(struct ib_device *device);
+extern void rds_ib_remove_one(struct ib_device *device);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int fmr_pool_size;
+extern unsigned int fmr_message_size;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_connect(struct rds_connection *conn);
+void rds_ib_conn_shutdown(struct rds_connection *conn);
+void rds_ib_state_change(struct sock *sk);
+int __init rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_ib_conn_error(conn, fmt...) \
+ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_nodev_conns(void);
+void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int __init rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_ib_inc_purge(struct rds_incoming *inc);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+void rds_ib_xmit_complete(struct rds_connection *conn);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int __init rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+extern ctl_table rds_ib_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
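
For context, a hedged sketch of how a send setup path might use the two helpers above to aim a work request's SGEs at the header and data buffers; the DMA address parameters are placeholders, and the in-tree send code does the equivalent work.

    /* Illustrative only: point sge[0] at the header and sge[1] at the data. */
    static inline void example_fill_send_sges(struct rds_ib_connection *ic,
                                              struct ib_sge *sge, u64 hdr_dma,
                                              u64 data_dma, u32 data_len)
    {
        struct ib_sge *s = rds_ib_header_sge(ic, sge);

        s->addr = hdr_dma;
        s->length = sizeof(struct rds_header);
        s->lkey = ic->i_mr->lkey;

        s = rds_ib_data_sge(ic, sge);
        s->addr = data_dma;
        s->length = data_len;
        s->lkey = ic->i_mr->lkey;
    }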
+
+static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
+{
+#if BITS_PER_LONG == 64
+ *ptr = val;
+#else
+ set_64bit(ptr, val);
+#endif
+}
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 00000000000..0532237bd12
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (rds_ib_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_ib_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack space
+ * by allocating this twice.
+ */
+static void
+rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
+{
+ int ret;
+
+ attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+ ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+ if (ret)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_ib_connect_private *dp = NULL;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev;
+ struct ib_qp_attr qp_attr;
+ int err;
+
+ if (event->param.conn.private_data_len) {
+ dp = event->param.conn.private_data;
+
+ rds_ib_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_laddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ /* Tune RNR behavior */
+ rds_ib_tune_rnr(ic, &qp_attr);
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ if (err)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+ if (err)
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
+ err = rds_ib_add_conn(rds_ibdev, conn);
+ if (err)
+ printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ rds_connect_complete(conn);
+}
+
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_ib_connect_private *dp,
+ u32 protocol_version)
+{
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+ conn_param->retry_count = 7;
+ conn_param->rnr_retry_count = 7;
+
+ if (dp) {
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ default:
+ printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
+ "on connection to %pI4\n", event->event,
+ &conn->c_faddr);
+ break;
+ }
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_ib_device *rds_ibdev;
+ int ret;
+
+ /* rds_ib_add_one creates a rds_ib_device object per IB device,
+ * and allocates a protection domain, memory range and FMR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_ibdev at all.
+ */
+ rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
+ if (rds_ibdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
+ dev->name);
+ return -EOPNOTSUPP;
+ }
+
+ if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+ if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_ibdev->pd;
+ ic->i_mr = rds_ibdev->mr;
+
+ ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_send_ring.w_nr + 1, 0);
+ if (IS_ERR(ic->i_send_cq)) {
+ ret = PTR_ERR(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_recv_ring.w_nr, 0);
+ if (IS_ERR(ic->i_recv_cq)) {
+ ret = PTR_ERR(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+ rdsdebug("ib_create_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ /* XXX negotiate max send/recv with remote? */
+ memset(&attr, 0, sizeof(attr));
+ attr.event_handler = rds_ib_qp_event_handler;
+ attr.qp_context = conn;
+ /* + 1 to allow for the single ack message */
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+ attr.cap.max_send_sge = rds_ibdev->max_sge;
+ attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = ic->i_send_cq;
+ attr.recv_cq = ic->i_recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (ic->i_send_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (ic->i_recv_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (ic->i_ack == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+ if (ic->i_sends == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+ rds_ib_send_init_ring(ic);
+
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+ if (ic->i_recvs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_ib_recv_init_ring(ic);
+ rds_ib_recv_init_ack(ic);
+
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return ret;
+}
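The checks at the top of rds_ib_setup_qp() clamp both work rings to what the HCA reports it can support, while every QP capacity adds one slot for the standalone ACK work request. A minimal user-space sketch of that arithmetic follows; the device limit and requested ring sizes are invented for illustration and are not taken from the patch.

	#include <stdio.h>

	#define ACK_WR	1	/* one extra slot reserved for the ACK WR */

	/* Mirror of the clamp above: ring size plus the ACK slot must fit
	 * within the device's maximum outstanding work requests. */
	static unsigned int clamp_ring(unsigned int want, unsigned int max_wrs)
	{
		if (max_wrs < want + ACK_WR)
			return max_wrs - ACK_WR;
		return want;
	}

	int main(void)
	{
		unsigned int max_wrs = 256;	/* assumed device limit */
		unsigned int send_nr = clamp_ring(1024, max_wrs);
		unsigned int recv_nr = clamp_ring(1024, max_wrs);

		printf("send ring %u, recv ring %u, max_send_wr %u\n",
		       send_nr, recv_nr, send_nr + ACK_WR);
		return 0;
	}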
+
+static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
+{
+ u16 common;
+ u32 version = 0;
+
+ /* rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without being told the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
+ return version;
+}
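The negotiation above starts at RDS_PROTOCOL_3_0 and bumps the version once per right-shift of the shared minor-version bitmask, so the result is major 3 with the highest minor both sides advertise. A small worked example follows; it assumes the usual major-in-high-byte, minor-in-low-byte packing for RDS_PROTOCOL(), which is an assumption for illustration rather than something stated in this patch.

	#include <stdio.h>
	#include <stdint.h>

	/* Assumed encoding, for illustration only. */
	#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | (min))
	#define RDS_PROTOCOL_3_0	RDS_PROTOCOL(3, 0)

	/* Pick the highest minor version supported by both sides, exactly as
	 * the loop above does: shift the common mask right until empty,
	 * bumping the version once per shift. */
	static uint32_t negotiate(uint16_t peer_minors, uint16_t our_minors)
	{
		uint16_t common = peer_minors & our_minors;
		uint32_t version = 0;

		if (common) {
			version = RDS_PROTOCOL_3_0;
			while ((common >>= 1) != 0)
				version++;
		}
		return version;
	}

	int main(void)
	{
		/* Peer advertises minors 0-2, we advertise minors 0-1. */
		uint32_t v = negotiate(0x0007, 0x0003);

		printf("negotiated %u.%u\n", v >> 8, v & 0xff);	/* prints 3.1 */
		return 0;
	}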
+
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+ __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+ struct rds_ib_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_ib_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ u32 version;
+ int err, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_ib_protocol_compatible(dp);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
+ "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+ (unsigned long long)be64_to_cpu(lguid),
+ (unsigned long long)be64_to_cpu(fguid));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exists, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while already connected\n");
+ rds_conn_drop(conn);
+ rds_ib_stats_inc(s_ib_listen_closed_stale);
+ } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_ib_stats_inc(s_ib_connect_raced);
+ }
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_ib_set_protocol(conn, version);
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ /* We got halfway through setting up the ib_connection, if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_ib_setup_qp(conn);
+ if (err) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
+ rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_ib_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_ib_setup_qp(conn);
+ if (ret) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_ib_conn_connect(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what effect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_ib_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int err = 0;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("failed to disconnect, cm: %p err %d\n",
+ ic->i_cm_id, err);
+ }
+
+ wait_event(rds_ib_ring_empty_wait,
+ rds_ib_ring_empty(&ic->i_send_ring) &&
+ rds_ib_ring_empty(&ic->i_recv_ring));
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_ib_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_ib_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+ rdma_destroy_id(ic->i_cm_id);
+
+ /*
+ * Move connection back to the nodev list.
+ */
+ if (ic->rds_ibdev) {
+
+ spin_lock_irq(&ic->rds_ibdev->spinlock);
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+ spin_unlock_irq(&ic->rds_ibdev->spinlock);
+
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+ ic->rds_ibdev = NULL;
+ }
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_ibdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_set_64bit(&ic->i_ack_next, 0);
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ if (ic->i_ibinc) {
+ rds_inc_put(&ic->i_ibinc->ii_inc);
+ ic->i_ibinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+}
+
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_ib_connection *ic;
+ unsigned long flags;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
+ if (ic == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ic->ib_node);
+ mutex_init(&ic->i_recv_mutex);
+
+ /*
+ * rds_ib_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&ib_nodev_conns_lock, flags);
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+ spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+void rds_ib_conn_free(void *arg)
+{
+ struct rds_ib_connection *ic = arg;
+ rdsdebug("ic %p\n", ic);
+ list_del(&ic->ib_node);
+ kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 00000000000..69a6289ed67
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "ib.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_ib_mr {
+ struct rds_ib_device *device;
+ struct rds_ib_mr_pool *pool;
+ struct ib_fmr *fmr;
+ struct list_head list;
+ unsigned int remap_count;
+
+ struct scatterlist *sg;
+ unsigned int sg_len;
+ u64 *dma;
+ int sg_dma_len;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rds_ib_mr_pool {
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct work_struct flush_worker; /* flush worker */
+
+ spinlock_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # of dirty MRs */
+ struct list_head drop_list; /* MRs that have reached their max_maps limit */
+ struct list_head free_list; /* unused MRs */
+ struct list_head clean_list; /* unused & unmapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ struct ib_fmr_attr fmr_attr;
+};
+
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
+
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ spin_unlock_irq(&rds_ibdev->spinlock);
+ return rds_ibdev;
+ }
+ }
+ spin_unlock_irq(&rds_ibdev->spinlock);
+ }
+
+ return NULL;
+}
+
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
+ if (!i_ipaddr)
+ return -ENOMEM;
+
+ i_ipaddr->ipaddr = ipaddr;
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ return 0;
+}
+
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr, *next;
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_ibdev->spinlock);
+}
+
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev_old;
+
+ rds_ibdev_old = rds_ib_get_device(ipaddr);
+ if (rds_ibdev_old)
+ rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+}
+
+int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ BUG_ON(list_empty(&ib_nodev_conns));
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ ic->rds_ibdev = rds_ibdev;
+
+ return 0;
+}
+
+void rds_ib_remove_nodev_conns(void)
+{
+ struct rds_ib_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_splice(&ib_nodev_conns, &tmp_list);
+ INIT_LIST_HEAD(&ib_nodev_conns);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_splice(&rds_ibdev->conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->drop_list);
+ INIT_LIST_HEAD(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+ pool->fmr_attr.max_pages = fmr_message_size;
+ pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+ pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
+ pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
+ pool->max_items = rds_ibdev->max_fmrs;
+
+ return pool;
+}
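The pool sizing above caps the pool at the device's FMR limit, softens that to three quarters of it before freeing becomes aggressive, and bounds the pages pinned by free MRs at a quarter of the theoretical maximum. The numbers below are hypothetical device limits used only to make that arithmetic concrete.

	#include <stdio.h>

	int main(void)
	{
		/* Assumed device capability and FMR message size. */
		unsigned long max_fmrs = 4096;
		unsigned long fmr_message_size = 256;	/* pages per FMR */

		unsigned long max_items       = max_fmrs;
		unsigned long max_items_soft  = max_fmrs * 3 / 4;
		unsigned long max_free_pinned = max_fmrs * fmr_message_size / 4;

		/* Intended ordering: max_items > max_items_soft > max_items / 2 */
		printf("hard cap %lu MRs, soft cap %lu MRs, flush when %lu pages "
		       "are pinned by free MRs\n",
		       max_items, max_items_soft, max_free_pinned);
		return 0;
	}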
+
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+}
+
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ flush_workqueue(rds_wq);
+ rds_ib_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
+ list_del_init(&ibmr->list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ return ibmr;
+}
+
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ while (1) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flush any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+ rds_ib_flush_mr_pool(pool, 0);
+ }
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE),
+ &pool->fmr_attr);
+ if (IS_ERR(ibmr->fmr)) {
+ err = PTR_ERR(ibmr->fmr);
+ ibmr->fmr = NULL;
+ printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
+ goto out_no_cigar;
+ }
+
+ rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (ibmr->fmr)
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
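The allocation strategy described in the comment above boils down to a small retry loop: reuse a clean MR if one exists, otherwise take an item_count slot while under the driver's hard limit, otherwise flush dirty MRs and try again, giving up after a couple of iterations. A user-space model of just that control flow is sketched here; the counters are plain ints and the stubbed pool operations behave in an invented way, purely to show the loop shape rather than the real atomic bookkeeping.

	#include <stdio.h>

	/* Stand-ins for the pool state; values are illustrative only. */
	static int item_count = 1024;		/* already at the cap */
	static const int max_items = 1024;

	static int try_reuse_clean(void) { return 0; }		/* pretend no clean MRs */
	static void flush_dirty(void)    { item_count -= 8; }	/* pretend a flush frees some */

	static int alloc_slot(void)
	{
		int iter = 0;

		for (;;) {
			if (try_reuse_clean())
				return 0;
			/* Grab a slot unless that would exceed the FMR limit. */
			if (++item_count <= max_items)
				return 0;
			item_count--;

			if (++iter > 2)
				return -1;	/* pool depleted, like -EAGAIN above */
			flush_dirty();		/* make room and retry */
		}
	}

	int main(void)
	{
		printf("alloc_slot() -> %d, item_count %d\n", alloc_slot(), item_count);
		return 0;
	}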
+
+static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents,
+ DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~rds_ibdev->fmr_page_mask) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> rds_ibdev->fmr_page_shift;
+ if (page_cnt > fmr_message_size)
+ return -EINVAL;
+
+ dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
+ dma_pages[page_cnt++] =
+ (dma_addr & rds_ibdev->fmr_page_mask) + j;
+ }
+
+ ret = ib_map_phys_fmr(ibmr->fmr,
+ dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping. */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
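The first pass over the scatterlist computes an upper bound on the number of FMR page slots: only the head and tail entries may be unaligned, each unaligned edge adds one page, and the accumulated byte length contributes len >> fmr_page_shift more. A worked example with an assumed 4 KiB FMR page size and an invented bus address:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Assume 4 KiB FMR pages, for illustration. */
		unsigned int page_shift = 12;
		uint64_t page_size = 1ULL << page_shift;
		uint64_t page_mask = ~(page_size - 1);

		/* One entry starting 512 bytes into a page, 8 KiB long. */
		uint64_t dma_addr = 0x10000200;	/* hypothetical bus address */
		uint64_t dma_len = 8192;
		unsigned int page_cnt = 0;

		if (dma_addr & ~page_mask)
			page_cnt++;			/* unaligned start */
		if ((dma_addr + dma_len) & ~page_mask)
			page_cnt++;			/* unaligned end */
		page_cnt += dma_len >> page_shift;

		printf("dma_pages sized for %u FMR page slots (%llu bytes at 0x%llx)\n",
		       page_cnt, (unsigned long long)dma_len,
		       (unsigned long long)dma_addr);
		return 0;
	}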
+
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ if (ibmr->sg_dma_len) {
+ ib_dma_unmap_sg(rds_ibdev->dev,
+ ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ }
+
+ /* Release the s/g list */
+ if (ibmr->sg_len) {
+ unsigned int i;
+
+ for (i = 0; i < ibmr->sg_len; ++i) {
+ struct page *page = sg_page(&ibmr->sg[i]);
+
+ /* FIXME we need a way to tell a r/w MR
+ * from a r/o MR */
+ set_page_dirty(page);
+ put_page(page);
+ }
+ kfree(ibmr->sg);
+
+ ibmr->sg = NULL;
+ ibmr->sg_len = 0;
+ }
+}
+
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ unsigned int pinned = ibmr->sg_len;
+
+ __rds_ib_teardown_mr(ibmr);
+ if (pinned) {
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ atomic_sub(pinned, &pool->free_pinned);
+ }
+}
+
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+{
+ struct rds_ib_mr *ibmr, *next;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+ unsigned long unpinned = 0;
+ unsigned long flags;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ int ret = 0;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+ mutex_lock(&pool->flush_lock);
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ /* Get the list of all MRs to be dropped. Ordering matters -
+ * we want to put drop_list ahead of free_list. */
+ list_splice_init(&pool->free_list, &unmap_list);
+ list_splice_init(&pool->drop_list, &unmap_list);
+ if (free_all)
+ list_splice_init(&pool->clean_list, &unmap_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ free_goal = rds_ib_flush_goal(pool, free_all);
+
+ if (list_empty(&unmap_list))
+ goto out;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, &unmap_list, list)
+ list_add(&ibmr->fmr->list, &fmr_list);
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+ unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
+ if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
+ rds_ib_stats_inc(s_ib_rdma_mr_free);
+ list_del(&ibmr->list);
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+ ncleaned++;
+ }
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ atomic_sub(unpinned, &pool->free_pinned);
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+out:
+ mutex_unlock(&pool->flush_lock);
+ return ret;
+}
+
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+
+ rds_ib_flush_mr_pool(pool, 0);
+}
+
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ unsigned long flags;
+
+ rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+ /* Return it to the pool's free list */
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ list_add(&ibmr->list, &pool->drop_list);
+ else
+ list_add(&ibmr->list, &pool->free_list);
+
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_ib_flush_mr_pool(pool, 0);
+ } else {
+ /* We get here if the user created an MR marked
+ * as use_once and invalidate at the same time. */
+ queue_work(rds_wq, &pool->flush_worker);
+ }
+ }
+}
+
+void rds_ib_flush_mrs(void)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ if (pool)
+ rds_ib_flush_mr_pool(pool, 0);
+ }
+}
+
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_mr *ibmr = NULL;
+ int ret;
+
+ rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+ if (!rds_ibdev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_ibdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->fmr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+
+ ibmr->device = rds_ibdev;
+
+ out:
+ if (ret) {
+ if (ibmr)
+ rds_ib_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ return ibmr;
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 00000000000..5061b550216
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static struct kmem_cache *rds_ib_incoming_slab;
+static struct kmem_cache *rds_ib_frag_slab;
+static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+
+static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_ib_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
+
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_recv_work *recv;
+ u32 i;
+
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_ibinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+ }
+}
+
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ if (recv->r_ibinc) {
+ rds_inc_put(&recv->r_ibinc->ii_inc);
+ recv->r_ibinc = NULL;
+ }
+ if (recv->r_frag) {
+ rds_ib_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_ib_frag_drop_page(recv->r_frag);
+ rds_ib_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rds_ib_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+
+ if (recv->r_ibinc == NULL) {
+ if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
+ kptr_gfp);
+ if (recv->r_ibinc == NULL)
+ goto out;
+ atomic_inc(&rds_ib_allocation);
+ INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
+ rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+ goto out;
+
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
+
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
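Each refilled receive work entry borrows one RDS_FRAG_SIZE slice of the connection's current page: f_offset advances until RDS_PAGE_LAST_OFF, at which point the connection drops its reference and the next refill allocates a fresh page. The sketch below walks that carving with an assumed 16 KiB page and 4 KiB fragments; the sizes are illustrative, not taken from rds.h.

	#include <stdio.h>

	int main(void)
	{
		/* Illustration only: 16 KiB pages carved into 4 KiB frags. */
		const unsigned long PAGE_SZ = 16384;
		const unsigned long FRAG_SZ = 4096;
		const unsigned long LAST_OFF = PAGE_SZ - FRAG_SZ;
		unsigned long offset = 0;
		int page = 0, i;

		for (i = 0; i < 6; i++) {
			printf("recv %d uses page %d at offset %lu\n", i, page, offset);
			if (offset < LAST_OFF) {
				offset += FRAG_SZ;	/* keep carving this page */
			} else {
				page++;			/* drop our ref; allocate next time */
				offset = 0;
			}
		}
		return 0;
	}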
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+ while ((prefill || rds_conn_up(conn))
+ && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_ibinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
+ if (ret) {
+ rds_ib_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_ib_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
+}
+
+void rds_ib_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_drop_page(frag);
+ rds_ib_frag_free(frag);
+ }
+}
+
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+ rds_ib_inc_purge(inc);
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, ibinc);
+ atomic_dec(&rds_ib_allocation);
+ BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+}
+
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (copied < size && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
+ /* XXX needs + offset for multiple recvs per page */
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IB_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+ int ack_required)
+{
+ rds_ib_set_64bit(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_clear_bit();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_clear_bit();
+
+ return ic->i_ack_next;
+}
+
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_ib_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_ib_stats_inc(s_ib_ack_send_failure);
+ /* Need to finesse this later. */
+ BUG();
+ } else
+ rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_ib_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_ib_stats_inc(s_ib_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_ib_send_ack(ic, adv_credits);
+}
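The two flag bits above are the whole ACK scheduling state machine: the receive path sets IB_ACK_REQUESTED, and whoever wins the test-and-set on IB_ACK_IN_FLIGHT gets to post the single ack WR; the send completion clears IN_FLIGHT and retries. A user-space model of that hand-off is sketched below using C11 atomics (an assumption; the kernel code uses set_bit/test_and_set_bit), with the credit throttling omitted.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Flag bits, mirroring IB_ACK_REQUESTED / IB_ACK_IN_FLIGHT. */
	#define ACK_REQUESTED	(1u << 0)
	#define ACK_IN_FLIGHT	(1u << 1)

	static atomic_uint ack_flags;

	/* Receive path: a packet wanted an ACK. */
	static void request_ack(void)
	{
		atomic_fetch_or(&ack_flags, ACK_REQUESTED);
	}

	/* Try to send the single ACK frame; returns true if we posted it. */
	static bool attempt_ack(void)
	{
		if (!(atomic_load(&ack_flags) & ACK_REQUESTED))
			return false;
		/* Only one ACK WR may be in flight; losers back off. */
		if (atomic_fetch_or(&ack_flags, ACK_IN_FLIGHT) & ACK_IN_FLIGHT)
			return false;
		atomic_fetch_and(&ack_flags, ~ACK_REQUESTED);
		return true;		/* the real code posts the ack WR here */
	}

	/* Send completion for the ACK WR: clear IN_FLIGHT and try again. */
	static void ack_send_complete(void)
	{
		atomic_fetch_and(&ack_flags, ~ACK_IN_FLIGHT);
		attempt_ack();
	}

	int main(void)
	{
		request_ack();
		printf("first attempt posted: %d\n", attempt_ack());	/* 1 */
		request_ack();
		printf("second attempt posted: %d\n", attempt_ack());	/* 0, WR busy */
		ack_send_complete();					/* reposts */
		return 0;
	}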
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+ return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+ struct rds_ib_incoming *ibinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, i.e. bits that
+ * were set in the stored map but are clear in the new one. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
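The copy loop's key expression, uncongested |= ~(*src) & *dst, collects exactly the bits that are set in the stored map word but clear in the incoming one before the word is overwritten. A single-word illustration with made-up port bits:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Hypothetical 64-port slice of the congestion map. */
		uint64_t old_map = 0x16ULL;	/* bits 1, 2, 4 set */
		uint64_t new_map = 0x12ULL;	/* bit 2 cleared by the peer */
		uint64_t changed;

		/* Same expression as the copy loop above: set in the stored
		 * map, clear in the incoming one. */
		changed = ~new_map & old_map;

		printf("newly cleared bits: 0x%llx\n",
		       (unsigned long long)changed);	/* prints 0x4 */
		return 0;
	}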
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_ib_process_recv(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, u32 byte_len,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_incoming *ibinc = ic->i_ibinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+ byte_len);
+
+ if (byte_len < sizeof(struct rds_header)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 didn't inclue a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ byte_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ /* This is an ACK-only packet. It gets special treatment
+ * here because historically, ACKs were rather special
+ * beasts.
+ */
+ rds_ib_stats_inc(s_ib_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page. We can leave the frag, though, it will be
+ * reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_ib_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message. Copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (ibinc == NULL) {
+ ibinc = recv->r_ibinc;
+ recv->r_ibinc = NULL;
+ ic->i_ibinc = ibinc;
+
+ hdr = &ibinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &ibinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
+ rds_ib_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_ibinc = NULL;
+
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_ib_cong_recv(conn, ibinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &ibinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&ibinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_ack_state state = { 0, };
+ struct rds_ib_recv_work *recv;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_rx_cq_call);
+
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+
+ rds_ib_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_ib_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
+ }
+
+ rds_ib_ring_free(&ic->i_recv_ring, 1);
+ }
+
+ if (state.ack_next_valid)
+ rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_ib_ring_empty(&ic->i_recv_ring))
+ rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rds_ib_ring_low(&ic->i_recv_ring))
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_ib_recv(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ return ret;
+}
+
+int __init rds_ib_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+ /* Default to 30% of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
+ sizeof(struct rds_ib_incoming),
+ 0, 0, NULL);
+ if (rds_ib_incoming_slab == NULL)
+ goto out;
+
+ rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+ sizeof(struct rds_page_frag),
+ 0, 0, NULL);
+ if (rds_ib_frag_slab == NULL)
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_ib_recv_exit(void)
+{
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 00000000000..99a6ccae964
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
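Because both counters are free-running u32 values, the used count is just their difference in modular arithmetic; it stays correct even after either counter wraps past UINT32_MAX, which is why no locking is needed between the allocator and the free path. A tiny demonstration with invented counter values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Both counters run freely and wrap; only the difference matters. */
		uint32_t alloc_ctr = 5;			/* wrapped past UINT32_MAX back to 5 */
		uint32_t free_ctr = 0xfffffffdU;	/* just below the wrap point */
		uint32_t used = alloc_ctr - free_ctr;	/* modular subtraction */

		printf("used = %u\n", used);	/* prints 8 */
		return 0;
	}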
+
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_ib_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) == 0;
+}
+
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_ib_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
+ if (__rds_ib_ring_empty(ring) &&
+ waitqueue_active(&rds_ib_ring_empty_wait))
+ wake_up(&rds_ib_ring_empty_wait);
+}
+
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_empty(ring);
+}
+
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
+}
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
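
As a worked example of the completion arithmetic above, the sketch below handles the case where the completed wr_id has wrapped past the end of the ring; the ring size of 8 is made up for illustration:

/* Same arithmetic as rds_ib_ring_completed(), on an imaginary 8-entry ring. */
static unsigned int completed_span(unsigned int w_nr, unsigned int wr_id,
				   unsigned int oldest)
{
	if (oldest <= wr_id)
		return wr_id - oldest + 1;
	return w_nr - oldest + wr_id + 1;
}

/* completed_span(8, 2, 6) == 5: entries 6, 7, 0, 1 and 2 have completed. */
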
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 00000000000..cb6c52cb1c4
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "ib.h"
+
+static void rds_ib_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+ struct rds_rdma_op *op)
+{
+ if (op->r_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
+ }
+}
+
+static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_rdma_complete(rm, wc_status);
+
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ }
+
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+
+ rds_message_put(rm);
+ send->s_rm = NULL;
+}
+
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_rm = NULL;
+ send->s_op = NULL;
+
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+ }
+}
+
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_ib_send_unmap_rdma(ic, send->s_op);
+ }
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring, it doesn't alter which entry is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i = 0;
+ int ret;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_ib_stats_inc(s_ib_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_tx_cq_event);
+
+ if (wc.wr_id == RDS_IB_ACK_WR_ID) {
+ if (ic->i_ack_queued + HZ/2 < jiffies)
+ rds_ib_stats_inc(s_ib_tx_stalled);
+ rds_ib_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
+ rds_ib_stats_inc(s_ib_tx_stalled);
+
+ /* If an RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * never learns that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_ib_send_rdma_complete(rm, wc.status);
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_ib_ring_free(&ic->i_send_ring, completed);
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_ib_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overrunning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants the posted
+ * credits advertised regardless of whether any send credits are
+ * available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
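
A rough sketch of the single-atomic packing the comment above describes, with send credits in the low half of the word and posted credits in the high half; the helper names and values here are illustrative stand-ins for the IB_GET_*/IB_SET_* macros in the transport header:

#define DEMO_GET_SEND(v)	((v) & 0xffff)
#define DEMO_GET_POST(v)	((v) >> 16)
#define DEMO_SET_SEND(v)	((v) & 0xffff)
#define DEMO_SET_POST(v)	((v) << 16)

static void demo_credits(void)
{
	unsigned int word = 0;

	word += DEMO_SET_SEND(100);	/* peer granted 100 send credits */
	word += DEMO_SET_POST(7);	/* we posted 7 recv buffers locally */

	/* DEMO_GET_SEND(word) == 100 and DEMO_GET_POST(word) == 7; a sender
	 * consuming credits subtracts DEMO_SET_SEND(n) and, like
	 * rds_ib_send_grab_credits() above, retries when cmpxchg shows that
	 * another thread raced it. */
}
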
+
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and have to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
+{
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
+ }
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->m_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ rds_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->m_rdma_op) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_ib_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ rds_ib_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
+
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_ib_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_ib_stats_inc(s_ib_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_ib_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+ /* Finesse this later */
+ BUG();
+ goto out;
+ }
+
+ ret = sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
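
The work request count computed at the top of rds_ib_xmit() is a plain ceiling division of the payload length by the fragment size; a tiny sketch, assuming a 4096-byte fragment size purely for illustration:

static unsigned int frags_needed(unsigned int h_len, unsigned int frag_size)
{
	if (h_len == 0)
		return 1;	/* a header-only message still needs one WR */
	return (h_len + frag_size - 1) / frag_size;	/* ceil(h_len / frag_size) */
}

/* frags_needed(0, 4096) == 1, frags_needed(4096, 4096) == 1,
 * frags_needed(4097, 4096) == 2 */
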
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->r_remote_addr;
+ u32 pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->r_mapped = 1;
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write, we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->r_count, rds_ibdev->max_sge);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &op->r_sg[0];
+ sent = 0;
+ num_sge = op->r_count;
+
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
+
+ if (num_sge > rds_ibdev->max_sge) {
+ send->s_wr.num_sge = rds_ibdev->max_sge;
+ num_sge -= rds_ibdev->max_sge;
+ } else {
+ send->s_wr.num_sge = num_sge;
+ }
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+ send->s_sge[j].addr =
+ ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = ic->i_mr->lkey;
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+ remote_addr += len;
+ scat++;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ prev->s_wr.send_flags = IB_SEND_SIGNALED;
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &first->s_wr)) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &first->s_wr);
+ }
+
+
+out:
+ return ret;
+}
+
+void rds_ib_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_ib_attempt_ack(ic);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 00000000000..02e3e3d50d4
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
+
+static char *rds_ib_stat_names[] = {
+ "ib_connect_raced",
+ "ib_listen_closed_stale",
+ "ib_tx_cq_call",
+ "ib_tx_cq_event",
+ "ib_tx_ring_full",
+ "ib_tx_throttle",
+ "ib_tx_sg_mapping_failure",
+ "ib_tx_stalled",
+ "ib_tx_credit_updates",
+ "ib_rx_cq_call",
+ "ib_rx_cq_event",
+ "ib_rx_ring_empty",
+ "ib_rx_refill_from_cq",
+ "ib_rx_refill_from_thread",
+ "ib_rx_alloc_limit",
+ "ib_rx_credit_updates",
+ "ib_ack_sent",
+ "ib_ack_send_failure",
+ "ib_ack_send_delayed",
+ "ib_ack_send_piggybacked",
+ "ib_ack_received",
+ "ib_rdma_mr_alloc",
+ "ib_rdma_mr_free",
+ "ib_rdma_mr_used",
+ "ib_rdma_mr_pool_flush",
+ "ib_rdma_mr_pool_wait",
+ "ib_rdma_mr_pool_depleted",
+};
+
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_ib_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_ib_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
+ ARRAY_SIZE(rds_ib_stat_names));
+out:
+ return ARRAY_SIZE(rds_ib_stat_names);
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 00000000000..d87830db93a
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+static struct ctl_table_header *rds_ib_sysctl_hdr;
+
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_ib_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_ib_sysctl_flow_control = 1;
+
+ctl_table rds_ib_sysctl_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_send_wr",
+ .data = &rds_ib_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_wr",
+ .data = &rds_ib_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_wr",
+ .data = &rds_ib_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_ib_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_allocation",
+ .data = &rds_ib_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "flow_control",
+ .data = &rds_ib_sysctl_flow_control,
+ .maxlen = sizeof(rds_ib_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+void rds_ib_sysctl_exit(void)
+{
+ if (rds_ib_sysctl_hdr)
+ unregister_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+int __init rds_ib_sysctl_init(void)
+{
+ rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
+ if (rds_ib_sysctl_hdr == NULL)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 00000000000..1d885535214
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time. The structs are only copied if the user-specified
+ * buffer is big enough. The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+struct rds_info_iterator {
+ struct page **pages;
+ void *addr;
+ unsigned long offset;
+};
+
+static DEFINE_SPINLOCK(rds_info_lock);
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+void rds_info_register_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset] != NULL);
+ rds_info_funcs[offset] = func;
+ spin_unlock(&rds_info_lock);
+}
+
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset] != func);
+ rds_info_funcs[offset] = NULL;
+ spin_unlock(&rds_info_lock);
+}
+
+/*
+ * Typically we hold an atomic kmap across multiple rds_info_copy() calls
+ * because the kmap is so expensive. This must be called before using blocking
+ * operations while holding the mapping and as the iterator is torn down.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+ if (iter->addr != NULL) {
+ kunmap_atomic(iter->addr, KM_USER0);
+ iter->addr = NULL;
+ }
+}
+
+/*
+ * get_user_pages() called flush_dcache_page() on the pages for us.
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes)
+{
+ unsigned long this;
+
+ while (bytes) {
+ if (iter->addr == NULL)
+ iter->addr = kmap_atomic(*iter->pages, KM_USER0);
+
+ this = min(bytes, PAGE_SIZE - iter->offset);
+
+ rdsdebug("page %p addr %p offset %lu this %lu data %p "
+ "bytes %lu\n", *iter->pages, iter->addr,
+ iter->offset, this, data, bytes);
+
+ memcpy(iter->addr + iter->offset, data, this);
+
+ data += this;
+ bytes -= this;
+ iter->offset += this;
+
+ if (iter->offset == PAGE_SIZE) {
+ kunmap_atomic(iter->addr, KM_USER0);
+ iter->addr = NULL;
+ iter->offset = 0;
+ iter->pages++;
+ }
+ }
+}
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace. @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct rds_info_iterator iter;
+ struct rds_info_lengths lens;
+ unsigned long nr_pages = 0;
+ unsigned long start;
+ unsigned long i;
+ rds_info_func func;
+ struct page **pages = NULL;
+ int ret;
+ int len;
+ int total;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* check for all kinds of wrapping and the like */
+ start = (unsigned long)optval;
+ if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* a 0 len call is just trying to probe its length */
+ if (len == 0)
+ goto call_func;
+
+ nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+ >> PAGE_SHIFT;
+
+ pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
+ pages, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (ret != nr_pages) {
+ if (ret > 0)
+ nr_pages = ret;
+ else
+ nr_pages = 0;
+ ret = -EAGAIN; /* XXX ? */
+ goto out;
+ }
+
+ rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+ func = rds_info_funcs[optname - RDS_INFO_FIRST];
+ if (func == NULL) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ iter.pages = pages;
+ iter.addr = NULL;
+ iter.offset = start & (PAGE_SIZE - 1);
+
+ func(sock, len, &iter, &lens);
+ BUG_ON(lens.each == 0);
+
+ total = lens.nr * lens.each;
+
+ rds_info_iter_unmap(&iter);
+
+ if (total > len) {
+ len = total;
+ ret = -ENOSPC;
+ } else {
+ len = total;
+ ret = lens.each;
+ }
+
+ if (put_user(len, optlen))
+ ret = -EFAULT;
+
+out:
+ for (i = 0; pages != NULL && i < nr_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+
+ return ret;
+}
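
Seen from userspace, the contract above (probe with a zero-length buffer, grow it to the size the kernel reports, retry on ENOSPC, and treat the positive return value as the per-element size) would look roughly like the sketch below; SOL_RDS and its fallback value are assumptions here, not part of this file:

#include <errno.h>
#include <stdlib.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276	/* assumed socket level for RDS */
#endif

static void *fetch_rds_info(int fd, int optname, int *each)
{
	socklen_t len = 0;	/* a zero-length call just probes the size */
	void *buf = NULL;

	for (;;) {
		int rc = getsockopt(fd, SOL_RDS, optname, buf, &len);

		if (rc >= 0) {
			*each = rc;	/* bytes per array element */
			return buf;	/* len now holds the bytes copied */
		}
		if (errno != ENOSPC)
			break;		/* genuine failure */

		/* the kernel wrote the size it needs back into len */
		void *bigger = realloc(buf, len);
		if (!bigger)
			break;
		buf = bigger;
	}
	free(buf);
	return NULL;
}
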
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 00000000000..b6c052ca7d2
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+struct rds_info_lengths {
+ unsigned int nr;
+ unsigned int each;
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source. If the snapshot fits in @len then it
+ * should be copied using @iter. The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
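
To make the callback contract above concrete, a hypothetical info source might look like the sketch below; the struct and the value it reports are invented, and only the lens/iter handshake mirrors what this header documents:

/* Sketch only: one fixed-size record, copied only if it fits in @len. */
struct demo_info {
	unsigned int value;
};

static void demo_info_func(struct socket *sock, unsigned int len,
			   struct rds_info_iterator *iter,
			   struct rds_info_lengths *lens)
{
	struct demo_info info = { .value = 42 };

	lens->nr = 1;			/* one record available */
	lens->each = sizeof(info);	/* fixed per-element size */

	if (len >= sizeof(info))
		rds_info_copy(iter, &info, sizeof(info));
}

/* A real source would be registered with rds_info_register_func() and
 * removed with rds_info_deregister_func(). */
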
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 00000000000..1b56905c4c0
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+
+#include "rds.h"
+#include "iw.h"
+
+unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
+unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fastreg_pool_size, int, 0444);
+MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
+module_param(fastreg_message_size, int, 0444);
+MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
+
+struct list_head rds_iw_devices;
+
+DEFINE_SPINLOCK(iw_nodev_conns_lock);
+LIST_HEAD(iw_nodev_conns);
+
+void rds_iw_add_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle iwarp devices */
+ if (device->node_type != RDMA_NODE_RNIC)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
+ if (!rds_iwdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_iwdev->spinlock);
+
+ rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+ rds_iwdev->max_wrs = dev_attr->max_qp_wr;
+ rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+
+ rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
+
+ rds_iwdev->dev = device;
+ rds_iwdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_iwdev->pd))
+ goto free_dev;
+
+ if (!rds_iwdev->dma_local_lkey) {
+ if (device->node_type != RDMA_NODE_RNIC) {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ } else {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
+ }
+ if (IS_ERR(rds_iwdev->mr))
+ goto err_pd;
+ } else
+ rds_iwdev->mr = NULL;
+
+ rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
+ if (IS_ERR(rds_iwdev->mr_pool)) {
+ rds_iwdev->mr_pool = NULL;
+ goto err_mr;
+ }
+
+ INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
+ INIT_LIST_HEAD(&rds_iwdev->conn_list);
+ list_add_tail(&rds_iwdev->list, &rds_iw_devices);
+
+ ib_set_client_data(device, &rds_iw_client, rds_iwdev);
+
+ goto free_attr;
+
+err_mr:
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_iwdev->pd);
+free_dev:
+ kfree(rds_iwdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+void rds_iw_remove_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_cm_id *i_cm_id, *next;
+
+ rds_iwdev = ib_get_client_data(device, &rds_iw_client);
+ if (!rds_iwdev)
+ return;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ rds_iw_remove_conns(rds_iwdev);
+
+ if (rds_iwdev->mr_pool)
+ rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
+
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+
+ while (ib_dealloc_pd(rds_iwdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_iwdev->list);
+ kfree(rds_iwdev);
+}
+
+struct ib_client rds_iw_client = {
+ .name = "rds_iw",
+ .add = rds_iw_add_one,
+ .remove = rds_iw_remove_one
+};
+
+static int rds_iw_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_iw_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_iw_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_iw_device *rds_iwdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_iwdev->max_sge;
+ rds_iw_get_mr_info(rds_iwdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_iw_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_iw_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_iw_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support IB devices unless we
+ check node_type. */
+ if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+void rds_iw_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+ rds_iw_remove_nodev_conns();
+ ib_unregister_client(&rds_iw_client);
+ rds_iw_sysctl_exit();
+ rds_iw_recv_exit();
+ rds_trans_unregister(&rds_iw_transport);
+}
+
+struct rds_transport rds_iw_transport = {
+ .laddr_check = rds_iw_laddr_check,
+ .xmit_complete = rds_iw_xmit_complete,
+ .xmit = rds_iw_xmit,
+ .xmit_cong_map = NULL,
+ .xmit_rdma = rds_iw_xmit_rdma,
+ .recv = rds_iw_recv,
+ .conn_alloc = rds_iw_conn_alloc,
+ .conn_free = rds_iw_conn_free,
+ .conn_connect = rds_iw_conn_connect,
+ .conn_shutdown = rds_iw_conn_shutdown,
+ .inc_copy_to_user = rds_iw_inc_copy_to_user,
+ .inc_purge = rds_iw_inc_purge,
+ .inc_free = rds_iw_inc_free,
+ .cm_initiate_connect = rds_iw_cm_initiate_connect,
+ .cm_handle_connect = rds_iw_cm_handle_connect,
+ .cm_connect_complete = rds_iw_cm_connect_complete,
+ .stats_info_copy = rds_iw_stats_info_copy,
+ .exit = rds_iw_exit,
+ .get_mr = rds_iw_get_mr,
+ .sync_mr = rds_iw_sync_mr,
+ .free_mr = rds_iw_free_mr,
+ .flush_mrs = rds_iw_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "iwarp",
+ .t_prefer_loopback = 1,
+};
+
+int __init rds_iw_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_iw_devices);
+
+ ret = ib_register_client(&rds_iw_client);
+ if (ret)
+ goto out;
+
+ ret = rds_iw_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_iw_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_iw_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_iw_recv_exit();
+out_sysctl:
+ rds_iw_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rds_iw_client);
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 00000000000..0ddda34f2a1
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
+#ifndef _RDS_IW_H
+#define _RDS_IW_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FASTREG_SIZE 20
+#define RDS_FASTREG_POOL_SIZE 2048
+
+#define RDS_IW_MAX_SGE 8
+#define RDS_IW_RECV_SGE 2
+
+#define RDS_IW_DEFAULT_RECV_WR 1024
+#define RDS_IW_DEFAULT_SEND_WR 256
+
+#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+extern struct list_head rds_iw_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+ struct list_head f_item;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
+};
+
+struct rds_iw_incoming {
+ struct list_head ii_frags;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_iw_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_iw_scatterlist {
+ struct scatterlist *list;
+ unsigned int len;
+ int dma_len;
+ unsigned int dma_npages;
+ unsigned int bytes;
+};
+
+struct rds_iw_mapping {
+ spinlock_t m_lock; /* protect the mapping struct */
+ struct list_head m_list;
+ struct rds_iw_mr *m_mr;
+ uint32_t m_rkey;
+ struct rds_iw_scatterlist m_sg;
+};
+
+struct rds_iw_send_work {
+ struct rds_message *s_rm;
+
+ /* We should really put these into a union: */
+ struct rds_rdma_op *s_op;
+ struct rds_iw_mapping *s_mapping;
+ struct ib_mr *s_mr;
+ struct ib_fast_reg_page_list *s_page_list;
+ unsigned char s_remap_count;
+
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IW_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_iw_recv_work {
+ struct rds_iw_incoming *r_iwinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
+struct rds_iw_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_iw_device;
+
+struct rds_iw_connection {
+
+ struct list_head iw_node;
+ struct rds_iw_device *rds_iwdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_iw_work_ring i_send_ring;
+ struct rds_message *i_rm;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_iw_send_work *i_sends;
+
+ /* rx */
+ struct mutex i_recv_mutex;
+ struct rds_iw_work_ring i_recv_ring;
+ struct rds_iw_incoming *i_iwinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_iw_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
+ u64 i_ack_recv; /* last ACK received */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+ u64 i_ack_next; /* next ACK to send */
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair of variables that we need to access
+ * atomically - one for the send credits, and one for the posted
+ * recv credits we need to transfer to the remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+ unsigned int i_dma_local_lkey:1;
+ unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
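+/*
+ * Worked example of the packing above (illustrative only): posted credits
+ * live in the upper 16 bits and send credits in the lower 16 bits, so a
+ * value built as IB_SET_POST_CREDITS(32) | IB_SET_SEND_CREDITS(16) is
+ * 0x00200010, and IB_GET_POST_CREDITS()/IB_GET_SEND_CREDITS() recover
+ * 32 and 16 respectively.
+ */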
+
+struct rds_iw_cm_id {
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+};
+
+struct rds_iw_device {
+ struct list_head list;
+ struct list_head cm_id_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_iw_mr_pool *mr_pool;
+ int page_shift;
+ int max_sge;
+ unsigned int max_wrs;
+ unsigned int dma_local_lkey:1;
+ spinlock_t spinlock; /* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
+#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
+#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
+
+struct rds_iw_statistics {
+ uint64_t s_iw_connect_raced;
+ uint64_t s_iw_listen_closed_stale;
+ uint64_t s_iw_tx_cq_call;
+ uint64_t s_iw_tx_cq_event;
+ uint64_t s_iw_tx_ring_full;
+ uint64_t s_iw_tx_throttle;
+ uint64_t s_iw_tx_sg_mapping_failure;
+ uint64_t s_iw_tx_stalled;
+ uint64_t s_iw_tx_credit_updates;
+ uint64_t s_iw_rx_cq_call;
+ uint64_t s_iw_rx_cq_event;
+ uint64_t s_iw_rx_ring_empty;
+ uint64_t s_iw_rx_refill_from_cq;
+ uint64_t s_iw_rx_refill_from_thread;
+ uint64_t s_iw_rx_alloc_limit;
+ uint64_t s_iw_rx_credit_updates;
+ uint64_t s_iw_ack_sent;
+ uint64_t s_iw_ack_send_failure;
+ uint64_t s_iw_ack_send_delayed;
+ uint64_t s_iw_ack_send_piggybacked;
+ uint64_t s_iw_ack_received;
+ uint64_t s_iw_rdma_mr_alloc;
+ uint64_t s_iw_rdma_mr_free;
+ uint64_t s_iw_rdma_mr_used;
+ uint64_t s_iw_rdma_mr_pool_flush;
+ uint64_t s_iw_rdma_mr_pool_wait;
+ uint64_t s_iw_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_iw_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
+
+static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
+
+static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
+{
+ return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
+}
+
+/* ib.c */
+extern struct rds_transport rds_iw_transport;
+extern void rds_iw_add_one(struct ib_device *device);
+extern void rds_iw_remove_one(struct ib_device *device);
+extern struct ib_client rds_iw_client;
+
+extern unsigned int fastreg_pool_size;
+extern unsigned int fastreg_message_size;
+
+extern spinlock_t iw_nodev_conns_lock;
+extern struct list_head iw_nodev_conns;
+
+/* ib_cm.c */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_iw_conn_free(void *arg);
+int rds_iw_conn_connect(struct rds_connection *conn);
+void rds_iw_conn_shutdown(struct rds_connection *conn);
+void rds_iw_state_change(struct sock *sk);
+int __init rds_iw_listen_init(void);
+void rds_iw_listen_stop(void);
+void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_iw_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_iw_conn_error(conn, fmt...) \
+ __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
+
+/* ib_rdma.c */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_nodev_conns(void);
+void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_iw_sync_mr(void *trans_private, int dir);
+void rds_iw_free_mr(void *trans_private, int invalidate);
+void rds_iw_flush_mrs(void);
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+
+/* ib_recv.c */
+int __init rds_iw_recv_init(void);
+void rds_iw_recv_exit(void);
+int rds_iw_recv(struct rds_connection *conn);
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_iw_inc_purge(struct rds_incoming *inc);
+void rds_iw_inc_free(struct rds_incoming *inc);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
+void rds_iw_attempt_ack(struct rds_iw_connection *ic);
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
+
+/* ib_ring.c */
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
+int rds_iw_ring_low(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_iw_ring_empty_wait;
+
+/* ib_send.c */
+void rds_iw_xmit_complete(struct rds_connection *conn);
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_send_init_ring(struct rds_iw_connection *ic);
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
+#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int __init rds_iw_sysctl_init(void);
+void rds_iw_sysctl_exit(void);
+extern unsigned long rds_iw_sysctl_max_send_wr;
+extern unsigned long rds_iw_sysctl_max_recv_wr;
+extern unsigned long rds_iw_sysctl_max_unsig_wrs;
+extern unsigned long rds_iw_sysctl_max_unsig_bytes;
+extern unsigned long rds_iw_sysctl_max_recv_allocation;
+extern unsigned int rds_iw_sysctl_flow_control;
+extern ctl_table rds_iw_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
+
+static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
+{
+#if BITS_PER_LONG == 64
+ *ptr = val;
+#else
+ set_64bit(ptr, val);
+#endif
+}
+
+#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 00000000000..57ecb3d4b8a
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (rds_iw_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_iw_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connections.
+ */
+void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = NULL;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ int err;
+
+ if (event->param.conn.private_data_len) {
+ dp = event->param.conn.private_data;
+
+ rds_iw_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
+ if (err)
+ printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
+ err = rds_iw_add_conn(rds_iwdev, conn);
+ if (err)
+ printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ rds_connect_complete(conn);
+}
+
+static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_iw_connect_private *dp,
+ u32 protocol_version)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+
+ if (dp) {
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ case IB_EVENT_QP_REQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ default:
+ rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
+ event->event, &conn->c_laddr,
+ &conn->c_faddr);
+ break;
+ }
+}
+
+/*
+ * Create a QP
+ */
+static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
+ struct rds_iw_device *rds_iwdev,
+ struct rds_iw_work_ring *send_ring,
+ void (*send_cq_handler)(struct ib_cq *, void *),
+ struct rds_iw_work_ring *recv_ring,
+ void (*recv_cq_handler)(struct ib_cq *, void *),
+ void *context)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ unsigned int send_size, recv_size;
+ int ret;
+
+ /* The offset of 1 is to accommodate the additional ACK WR. */
+ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
+ recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
+ rds_iw_ring_resize(send_ring, send_size - 1);
+ rds_iw_ring_resize(recv_ring, recv_size - 1);
+
+ memset(attr, 0, sizeof(*attr));
+ attr->event_handler = rds_iw_qp_event_handler;
+ attr->qp_context = context;
+ attr->cap.max_send_wr = send_size;
+ attr->cap.max_recv_wr = recv_size;
+ attr->cap.max_send_sge = rds_iwdev->max_sge;
+ attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
+ attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr->qp_type = IB_QPT_RC;
+
+ attr->send_cq = ib_create_cq(dev, send_cq_handler,
+ rds_iw_cq_event_handler,
+ context, send_size, 0);
+ if (IS_ERR(attr->send_cq)) {
+ ret = PTR_ERR(attr->send_cq);
+ attr->send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
+ rds_iw_cq_event_handler,
+ context, recv_size, 0);
+ if (IS_ERR(attr->recv_cq)) {
+ ret = PTR_ERR(attr->recv_cq);
+ attr->recv_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (attr->send_cq)
+ ib_destroy_cq(attr->send_cq);
+ if (attr->recv_cq)
+ ib_destroy_cq(attr->recv_cq);
+ }
+ return ret;
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_iw_setup_qp(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_iw_device *rds_iwdev;
+ int ret;
+
+ /* rds_iw_add_one creates a rds_iw_device object per IB device,
+ * and allocates a protection domain, memory range and MR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_iwdev at all.
+ */
+ rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
+ if (rds_iwdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+ dev->name);
+ return -EOPNOTSUPP;
+ }
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_iwdev->pd;
+ ic->i_mr = rds_iwdev->mr;
+
+ ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
+ &ic->i_send_ring, rds_iw_send_cq_comp_handler,
+ &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
+ conn);
+ if (ret < 0)
+ goto out;
+
+ ic->i_send_cq = attr.send_cq;
+ ic->i_recv_cq = attr.recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (ic->i_send_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (ic->i_recv_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (ic->i_ack == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
+ if (ic->i_sends == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+ rds_iw_send_init_ring(ic);
+
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
+ if (ic->i_recvs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_iw_recv_init_ring(ic);
+ rds_iw_recv_init_ack(ic);
+
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return ret;
+}
+
+static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
+{
+ u16 common;
+ u32 version = 0;
+
+ /* rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without telling us the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-( */
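+ /*
+ * Worked example (illustrative only, assuming RDS_PROTOCOL() packs the
+ * version so that adding one bumps the minor): if the peer advertises
+ * major 3 with a minor mask of 0x0003 and we also support 0x0003, then
+ * common is 0x0003, the shift loop below runs once before common reaches
+ * zero, and the negotiated version is RDS_PROTOCOL_3_0 + 1, i.e. the
+ * highest common minor, 3.1.
+ */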
+ if (dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
+ return version;
+}
+
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = event->param.conn.private_data;
+ struct rds_iw_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_iw_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_device *rds_iwdev;
+ u32 version;
+ int err, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_iw_protocol_compatible(dp);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
+ &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection still exists, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_iw_stats_inc(s_iw_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_iw_stats_inc(s_iw_connect_raced);
+ }
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_iw_set_protocol(conn, version);
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ /* We got halfway through setting up the ib_connection; if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_iw_setup_qp(conn);
+ if (err) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
+ rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_iw_setup_qp(conn);
+ if (ret) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_iw_conn_connect(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what effect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ /* First, bind to the local address and device. */
+ ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
+ if (ret) {
+ rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
+ &conn->c_laddr, ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ goto out;
+ }
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int err = 0;
+ struct ib_qp_attr qp_attr;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+ " cm: %p err %d\n", ic->i_cm_id, err);
+ }
+
+ if (ic->i_cm_id->qp) {
+ qp_attr.qp_state = IB_QPS_ERR;
+ ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ }
+
+ wait_event(rds_iw_ring_empty_wait,
+ rds_iw_ring_empty(&ic->i_send_ring) &&
+ rds_iw_ring_empty(&ic->i_recv_ring));
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_iw_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_iw_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+
+ /*
+ * If associated with an rds_iw_device:
+ * Move connection back to the nodev list.
+ * Remove cm_id from the device cm_id list.
+ */
+ if (ic->rds_iwdev) {
+
+ spin_lock_irq(&ic->rds_iwdev->spinlock);
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+ spin_unlock_irq(&ic->rds_iwdev->spinlock);
+
+ spin_lock_irq(&iw_nodev_conns_lock);
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+ rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+ ic->rds_iwdev = NULL;
+ }
+
+ rdma_destroy_id(ic->i_cm_id);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_iwdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_iw_set_64bit(&ic->i_ack_next, 0);
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ if (ic->i_iwinc) {
+ rds_inc_put(&ic->i_iwinc->ii_inc);
+ ic->i_iwinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+ rdsdebug("shutdown complete\n");
+}
+
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_iw_connection *ic;
+ unsigned long flags;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
+ if (ic == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ic->iw_node);
+ mutex_init(&ic->i_recv_mutex);
+
+ /*
+ * rds_iw_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+ spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+void rds_iw_conn_free(void *arg)
+{
+ struct rds_iw_connection *ic = arg;
+ rdsdebug("ic %p\n", ic);
+ list_del(&ic->iw_node);
+ kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 00000000000..1c02a8f952d
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "iw.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+ struct rds_iw_device *device;
+ struct rds_iw_mr_pool *pool;
+ struct rdma_cm_id *cm_id;
+
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *page_list;
+
+ struct rds_iw_mapping mapping;
+ unsigned char remap_count;
+};
+
+/*
+ * Our own little MR pool
+ */
+struct rds_iw_mr_pool {
+ struct rds_iw_device *device; /* back ptr to the device that owns us */
+
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct work_struct flush_worker; /* flush worker */
+
+ spinlock_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # of dirty MRs */
+ struct list_head dirty_list; /* dirty mappings */
+ struct list_head clean_list; /* unused & unmapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_message_size; /* in pages */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ int max_pages;
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
+{
+ struct rds_iw_device *iwdev;
+ struct rds_iw_cm_id *i_cm_id;
+
+ *rds_iwdev = NULL;
+ *cm_id = NULL;
+
+ list_for_each_entry(iwdev, &rds_iw_devices, list) {
+ spin_lock_irq(&iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+ struct sockaddr_in *src_addr, *dst_addr;
+
+ src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+ rdsdebug("local ipaddr = %x port %d, "
+ "remote ipaddr = %x port %d"
+ "..looking for %x port %d, "
+ "remote ipaddr = %x port %d\n",
+ src_addr->sin_addr.s_addr,
+ src_addr->sin_port,
+ dst_addr->sin_addr.s_addr,
+ dst_addr->sin_port,
+ rs->rs_bound_addr,
+ rs->rs_bound_port,
+ rs->rs_conn_addr,
+ rs->rs_conn_port);
+#ifdef WORKING_TUPLE_DETECTION
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+ src_addr->sin_port == rs->rs_bound_port &&
+ dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+ dst_addr->sin_port == rs->rs_conn_port) {
+#else
+ /* FIXME - needs to compare the local and remote
+ * ipaddr/port tuple, but the ipaddr is the only
+ * available information in the rds_sock (as the rest are
+ * zeroed). It doesn't appear to be properly populated
+ * during connection setup...
+ */
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
+#endif
+ spin_unlock_irq(&iwdev->spinlock);
+ *rds_iwdev = iwdev;
+ *cm_id = i_cm_id->cm_id;
+ return 0;
+ }
+ }
+ spin_unlock_irq(&iwdev->spinlock);
+ }
+
+ return 1;
+}
+
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
+ if (!i_cm_id)
+ return -ENOMEM;
+
+ i_cm_id->cm_id = cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ return 0;
+}
+
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
+ if (i_cm_id->cm_id == cm_id) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct sockaddr_in *src_addr, *dst_addr;
+ struct rds_iw_device *rds_iwdev_old;
+ struct rds_sock rs;
+ struct rdma_cm_id *pcm_id;
+ int rc;
+
+ src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+
+ rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+ rs.rs_bound_port = src_addr->sin_port;
+ rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+ rs.rs_conn_port = dst_addr->sin_port;
+
+ rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
+ if (rc)
+ rds_iw_remove_cm_id(rds_iwdev, cm_id);
+
+ return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+
+int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&iw_nodev_conns_lock);
+ BUG_ON(list_empty(&iw_nodev_conns));
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ ic->rds_iwdev = rds_iwdev;
+
+ return 0;
+}
+
+void rds_iw_remove_nodev_conns(void)
+{
+ struct rds_iw_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&iw_nodev_conns_lock);
+ list_splice(&iw_nodev_conns, &tmp_list);
+ INIT_LIST_HEAD(&iw_nodev_conns);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_splice(&rds_iwdev->conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_iwdev->conn_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+ struct scatterlist *list, unsigned int sg_len)
+{
+ sg->list = list;
+ sg->len = sg_len;
+ sg->dma_len = 0;
+ sg->dma_npages = 0;
+ sg->bytes = 0;
+}
+
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+ struct rds_iw_scatterlist *sg,
+ unsigned int dma_page_shift)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ u64 *dma_pages = NULL;
+ u64 dma_mask;
+ unsigned int dma_page_size;
+ int i, j, ret;
+
+ dma_page_size = 1 << dma_page_shift;
+ dma_mask = dma_page_size - 1;
+
+ WARN_ON(sg->dma_len);
+
+ sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg->dma_len)) {
+ printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+ return ERR_PTR(-EBUSY);
+ }
+
+ sg->bytes = 0;
+ sg->dma_npages = 0;
+
+ ret = -EINVAL;
+ for (i = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ sg->bytes += dma_len;
+
+ end_addr = dma_addr + dma_len;
+ if (dma_addr & dma_mask) {
+ if (i > 0)
+ goto out_unmap;
+ dma_addr &= ~dma_mask;
+ }
+ if (end_addr & dma_mask) {
+ if (i < sg->dma_len - 1)
+ goto out_unmap;
+ end_addr = (end_addr + dma_mask) & ~dma_mask;
+ }
+
+ sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
+ }
+
+ /* Now gather the dma addrs into one list */
+ if (sg->dma_npages > fastreg_message_size)
+ goto out_unmap;
+
+ dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+ if (!dma_pages) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ for (i = j = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ end_addr = dma_addr + dma_len;
+ dma_addr &= ~dma_mask;
+ for (; dma_addr < end_addr; dma_addr += dma_page_size)
+ dma_pages[j++] = dma_addr;
+ BUG_ON(j > sg->dma_npages);
+ }
+
+ return dma_pages;
+
+out_unmap:
+ ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ sg->dma_len = 0;
+ kfree(dma_pages);
+ return ERR_PTR(ret);
+}
+
+
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->device = rds_iwdev;
+ INIT_LIST_HEAD(&pool->dirty_list);
+ INIT_LIST_HEAD(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
+
+ pool->max_message_size = fastreg_message_size;
+ pool->max_items = fastreg_pool_size;
+ pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
+ pool->max_pages = fastreg_message_size;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = pool->max_items * 3 / 4;
+
+ return pool;
+}
+
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->max_pages;
+}
+
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+ flush_workqueue(rds_wq);
+ rds_iw_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+ struct rds_iw_mr *ibmr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
+ list_del_init(&ibmr->mapping.m_list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ return ibmr;
+}
+
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+ struct rds_iw_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ while (1) {
+ ibmr = rds_iw_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flushing any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ spin_lock_init(&ibmr->mapping.m_lock);
+ INIT_LIST_HEAD(&ibmr->mapping.m_list);
+ ibmr->mapping.m_mr = ibmr;
+
+ err = rds_iw_init_fastreg(pool, ibmr);
+ if (err)
+ goto out_no_cigar;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_device *rds_iwdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+ struct rds_iw_mr *ibmr, *next;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(kill_list);
+ unsigned long flags;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ int ret = 0;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+ mutex_lock(&pool->flush_lock);
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ /* Get the list of all mappings to be destroyed */
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ if (free_all)
+ list_splice_init(&pool->clean_list, &kill_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ free_goal = rds_iw_flush_goal(pool, free_all);
+
+ /* Batched invalidate of dirty MRs.
+ * For FMR based MRs, the mappings on the unmap list are
+ * actually members of an ibmr (ibmr->mapping). They either
+ * migrate to the kill_list, or have been cleaned and should be
+ * moved to the clean_list.
+ * For fastregs, they will be dynamically allocated, and
+ * will be destroyed by the unmap function.
+ */
+ if (!list_empty(&unmap_list)) {
+ ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+ /* If we've been asked to destroy all MRs, move those
+ * that were simply cleaned to the kill list */
+ if (free_all)
+ list_splice_init(&unmap_list, &kill_list);
+ }
+
+ /* Destroy any MRs that are past their best before date */
+ list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+ rds_iw_stats_inc(s_iw_rdma_mr_free);
+ list_del(&ibmr->mapping.m_list);
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+
+ /* Anything that remains are laundered ibmrs, which we can add
+ * back to the clean list. */
+ if (!list_empty(&unmap_list)) {
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+ mutex_unlock(&pool->flush_lock);
+ return ret;
+}
+
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+ rds_iw_flush_mr_pool(pool, 0);
+}
+
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+ rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+ if (!pool)
+ return;
+
+ /* Return it to the pool's free list */
+ rds_iw_free_fastreg(pool, ibmr);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_iw_flush_mr_pool(pool, 0);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time. */
+ queue_work(rds_wq, &pool->flush_worker);
+ }
+ }
+}
+
+void rds_iw_flush_mrs(void)
+{
+ struct rds_iw_device *rds_iwdev;
+
+ list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ if (pool)
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+}
+
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_mr *ibmr = NULL;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
+ if (ret || !cm_id) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_iwdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_iw_alloc_mr(rds_iwdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->cm_id = cm_id;
+ ibmr->device = rds_iwdev;
+
+ ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->mr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+ if (ret) {
+ if (ibmr)
+ rds_iw_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ return ibmr;
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be set up prior to the GET_MR call for
+ * RDMA to be correctly set up. If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched to the MR, and a bit is set showing that RDMA can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct ib_fast_reg_page_list *page_list = NULL;
+ struct ib_mr *mr;
+ int err;
+
+ mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+ return err;
+ }
+
+ /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+ * is not filled in.
+ */
+ page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+ if (IS_ERR(page_list)) {
+ err = PTR_ERR(page_list);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+ ib_dereg_mr(mr);
+ return err;
+ }
+
+ ibmr->page_list = page_list;
+ ibmr->mr = mr;
+ return 0;
+}
+
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+ struct rds_iw_mr *ibmr = mapping->m_mr;
+ struct ib_send_wr f_wr, *failed_wr;
+ int ret;
+
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * counter, which should guarantee uniqueness.
+ */
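+ /*
+ * For illustration (an assumption about ib_update_fast_reg_key(), which
+ * is expected to replace only the low byte of the key): with remap_count
+ * at 0x07, an rkey of 0xAABBCC00 becomes 0xAABBCC07 after the call below.
+ */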
+ ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+ mapping->m_rkey = ibmr->mr->rkey;
+
+ memset(&f_wr, 0, sizeof(f_wr));
+ f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+ f_wr.opcode = IB_WR_FAST_REG_MR;
+ f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+ f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+ f_wr.wr.fast_reg.page_list = ibmr->page_list;
+ f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
+ f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
+ f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ f_wr.wr.fast_reg.iova_start = 0;
+ f_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &f_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+ BUG_ON(failed_wr != &f_wr);
+ if (ret && printk_ratelimit())
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+}
+
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+ struct ib_send_wr s_wr, *failed_wr;
+ int ret = 0;
+
+ if (!ibmr->cm_id->qp || !ibmr->mr)
+ goto out;
+
+ memset(&s_wr, 0, sizeof(s_wr));
+ s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+ s_wr.opcode = IB_WR_LOCAL_INV;
+ s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+ s_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &s_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+ if (ret && printk_ratelimit()) {
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg,
+ unsigned int sg_len)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct rds_iw_mapping *mapping = &ibmr->mapping;
+ u64 *dma_pages;
+ int i, ret = 0;
+
+ rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+ &mapping->m_sg,
+ rds_iwdev->page_shift);
+ if (IS_ERR(dma_pages)) {
+ ret = PTR_ERR(dma_pages);
+ dma_pages = NULL;
+ goto out;
+ }
+
+ if (mapping->m_sg.dma_len > pool->max_message_size) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+ ibmr->page_list->page_list[i] = dma_pages[i];
+
+ ret = rds_iw_rdma_build_fastreg(mapping);
+ if (ret)
+ goto out;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+/*
+ * "Free" a fastreg MR.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ unsigned long flags;
+ int ret;
+
+ if (!ibmr->mapping.m_sg.dma_len)
+ return;
+
+ ret = rds_iw_rdma_fastreg_inv(ibmr);
+ if (ret)
+ return;
+
+ /* Try to post the LOCAL_INV WR to the queue. */
+ spin_lock_irqsave(&pool->list_lock, flags);
+
+ list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+ atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list)
+{
+ struct rds_iw_mapping *mapping, *next;
+ unsigned int ncleaned = 0;
+ LIST_HEAD(laundered);
+
+ /* Batched invalidation of fastreg MRs.
+ * Why do we do it this way, even though we could pipeline unmap
+ * and remap? The reason is the application semantics - when the
+ * application requests an invalidation of MRs, it expects all
+ * previously released R_Keys to become invalid.
+ *
+ * If we implement MR reuse naively, we risk memory corruption
+ * (this has actually been observed). So the default behavior
+ * requires that a MR goes through an explicit unmap operation before
+ * we can reuse it again.
+ *
+ * We could probably improve on this a little, by allowing immediate
+ * reuse of an MR on the same socket (e.g. you could add a small
+ * cache of unused MRs to struct rds_socket - GET_MR could grab one
+ * of these without requiring an explicit invalidate).
+ */
+ while (!list_empty(unmap_list)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+ list_move(&mapping->m_list, &laundered);
+ ncleaned++;
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ /* Move all laundered mappings back to the unmap list.
+ * We do not kill any WRs right now - it doesn't seem the
+ * fastreg API has a max_remap limit. */
+ list_splice_init(&laundered, unmap_list);
+
+ return ncleaned;
+}
+
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ if (ibmr->page_list)
+ ib_free_fast_reg_page_list(ibmr->page_list);
+ if (ibmr->mr)
+ ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 00000000000..a1931f0027a
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;
+static struct kmem_cache *rds_iw_frag_slab;
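+/* Number of rds_iw_incoming structures currently allocated; the refill
+ * path backs off once this reaches rds_iw_sysctl_max_recv_allocation. */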
+static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
+
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
+
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_recv_work *recv;
+ u32 i;
+
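+	/* Each recv WR carries a data SGE for the fragment and a header SGE
+	 * pointing at the preallocated rds_header slot for this ring position. */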
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_iwinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+ }
+}
+
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ if (recv->r_iwinc) {
+ rds_inc_put(&recv->r_iwinc->ii_inc);
+ recv->r_iwinc = NULL;
+ }
+ if (recv->r_frag) {
+ rds_iw_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_iw_frag_drop_page(recv->r_frag);
+ rds_iw_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rds_iw_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+
+ if (recv->r_iwinc == NULL) {
+ if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
+ rds_iw_stats_inc(s_iw_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+ kptr_gfp);
+ if (recv->r_iwinc == NULL)
+ goto out;
+ atomic_inc(&rds_iw_allocation);
+ INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+ rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+ goto out;
+
+ /*
+	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_recv_unmap_page()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
+ get_page(recv->r_frag->f_page);
+
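+	/* The frag now holds its own reference to the page.  If the page
+	 * still has room for another fragment, advance the offset; otherwise
+	 * drop the connection's reference so a fresh page is allocated on
+	 * the next refill. */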
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+	while ((prefill || rds_conn_up(conn)) &&
+	       rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_iwinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
+ if (ret) {
+ rds_iw_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_iw_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
+}
+
+void rds_iw_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
+
+ list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_iw_frag_drop_page(frag);
+ rds_iw_frag_free(frag);
+ }
+}
+
+void rds_iw_inc_free(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+
+ rds_iw_inc_purge(inc);
+ rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
+ BUG_ON(!list_empty(&iwinc->ii_frags));
+ kmem_cache_free(rds_iw_incoming_slab, iwinc);
+ atomic_dec(&rds_iw_allocation);
+ BUG_ON(atomic_read(&rds_iw_allocation) < 0);
+}
+
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
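+	/* Walk the frag list and the iovec array in parallel, copying at most
+	 * the remainder of one frag or one iovec per iteration. */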
+ while (copied < size && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
+ /* XXX needs + offset for multiple recvs per page */
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IW_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+ int ack_required)
+{
+ rds_iw_set_64bit(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_clear_bit();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_clear_bit();
+
+ return ic->i_ack_next;
+}
+
+static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_iw_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_iw_stats_inc(s_iw_ack_send_failure);
+ /* Need to finesse this later. */
+ BUG();
+ } else
+ rds_iw_stats_inc(s_iw_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_iw_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_iw_attempt_ack(struct rds_iw_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_iw_stats_inc(s_iw_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_iw_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_iw_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_iw_stats_inc(s_iw_ack_send_piggybacked);
+ return rds_iw_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_iw_cong_recv(struct rds_connection *conn,
+ struct rds_iw_incoming *iwinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, ie
+ * bits that changed from 0 to 1. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_iw_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_iw_process_recv(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv, u32 byte_len,
+ struct rds_iw_ack_state *state)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_incoming *iwinc = ic->i_iwinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
+ byte_len);
+
+ if (byte_len < sizeof(struct rds_header)) {
+ rds_iw_conn_error(conn, "incoming message "
+				"from %pI4 didn't include a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ byte_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_iw_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_iw_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+		/* This is an ACK-only packet. The reason it gets
+		 * special treatment here is that historically, ACKs
+ * were rather special beasts.
+ */
+ rds_iw_stats_inc(s_iw_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page. We can leave the frag, though, it will be
+ * reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_iw_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message.. copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (iwinc == NULL) {
+ iwinc = recv->r_iwinc;
+ recv->r_iwinc = NULL;
+ ic->i_iwinc = iwinc;
+
+ hdr = &iwinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &iwinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
+ rds_iw_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_iwinc = NULL;
+
+ if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_iw_cong_recv(conn, iwinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &iwinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&iwinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_ack_state state = { 0, };
+ struct rds_iw_recv_work *recv;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_iw_stats_inc(s_iw_rx_cq_call);
+
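+	/* Re-arm the CQ before draining it, so a completion that arrives
+	 * while we are polling still raises a fresh event. */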
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
+
+ rds_iw_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_iw_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_iw_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
+ }
+
+ rds_iw_ring_free(&ic->i_recv_ring, 1);
+ }
+
+ if (state.ack_next_valid)
+ rds_iw_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_iw_ring_empty(&ic->i_recv_ring))
+ rds_iw_stats_inc(s_iw_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rds_iw_ring_low(&ic->i_recv_ring))
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_iw_recv(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_iw_stats_inc(s_iw_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ return ret;
+}
+
+int __init rds_iw_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+	/* Default to roughly one third of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
+ sizeof(struct rds_iw_incoming),
+ 0, 0, NULL);
+ if (rds_iw_incoming_slab == NULL)
+ goto out;
+
+ rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
+ sizeof(struct rds_page_frag),
+ 0, 0, NULL);
+ if (rds_iw_frag_slab == NULL)
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_iw_recv_exit(void)
+{
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ kmem_cache_destroy(rds_iw_frag_slab);
+}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 00000000000..d422d4b5dee
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
+
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
+
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_iw_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) == 0;
+}
+
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_iw_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
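+	/* Connection shutdown waits on this queue for the ring to drain. */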
+ if (__rds_iw_ring_empty(ring) &&
+ waitqueue_active(&rds_iw_ring_empty_wait))
+ wake_up(&rds_iw_ring_empty_wait);
+}
+
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_empty(ring);
+}
+
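+/* The ring counts as "low" when no more than a quarter of its entries are
+ * still outstanding; the recv completion handler uses this to kick a refill. */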
+int rds_iw_ring_low(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
+}
+
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * Returns the number of work requests completed by the completion with
+ * the given wr_id, counted from the oldest outstanding entry and
+ * accounting for ring wrap-around.
+ */
+
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 00000000000..22dd38ffd60
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "iw.h"
+
+static void rds_iw_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
+ struct rds_rdma_op *op)
+{
+ if (op->r_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
+ }
+}
+
+static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_iw_send_rdma_complete(rm, wc_status);
+
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ }
+
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+
+ rds_message_put(rm);
+ send->s_rm = NULL;
+}
+
+void rds_iw_send_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_rm = NULL;
+ send->s_op = NULL;
+ send->s_mapping = NULL;
+
+ send->s_wr.next = NULL;
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+
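+		/* Each send WR carries its own fastreg MR and page list; the
+		 * RDMA READ path uses them to fast-register the local buffer
+		 * it reads into. */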
+ send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+ if (IS_ERR(send->s_mr)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+ break;
+ }
+
+ send->s_page_list = ib_alloc_fast_reg_page_list(
+ ic->i_cm_id->device, fastreg_message_size);
+ if (IS_ERR(send->s_page_list)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+ break;
+ }
+ }
+}
+
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ BUG_ON(!send->s_mr);
+ ib_dereg_mr(send->s_mr);
+ BUG_ON(!send->s_page_list);
+ ib_free_fast_reg_page_list(send->s_page_list);
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_iw_send_unmap_rdma(ic, send->s_op);
+ }
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i;
+ int ret;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_iw_stats_inc(s_iw_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_tx_cq_event);
+
+ if (wc.status != IB_WC_SUCCESS) {
+ printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
+ break;
+ }
+
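+		/* Completions carrying one of the reserved wr_ids do not map
+		 * to a send ring entry; deal with them here and skip the ring
+		 * accounting below. */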
+ if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
+ ic->i_fastreg_posted = 0;
+ continue;
+ }
+
+ if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+ ic->i_fastreg_posted = 1;
+ continue;
+ }
+
+ if (wc.wr_id == RDS_IW_ACK_WR_ID) {
+ if (ic->i_ack_queued + HZ/2 < jiffies)
+ rds_iw_stats_inc(s_iw_tx_stalled);
+ rds_iw_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_iw_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_FAST_REG_MR:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
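+			/* Stamp the entry as reaped; rds_iw_send_clear_ring()
+			 * skips entries carrying this sentinel opcode. */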
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
+ rds_iw_stats_inc(s_iw_tx_stalled);
+
+			/* If an RDMA operation produced an error, signal this right
+			 * away. If we don't, the subsequent SEND that goes with this
+			 * RDMA will be canceled with ERR_WFLUSH, and the application
+			 * will never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_iw_send_rdma_complete(rm, wc.status);
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_iw_ring_free(&ic->i_send_ring, completed);
+
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_iw_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ *    to submit without overrunning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_iw_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+	 * If need_posted is non-zero, then the caller wants the posted
+	 * credits advertised regardless of whether any send credits are
+	 * available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
+
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_iw_stats_inc(s_iw_rx_credit_updates);
+}
+
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and has to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
+{
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
+ }
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+	/* Fastreg support: if the message references a fastreg MR, hold off
+	 * (-EAGAIN) until the fast registration WR has completed on this
+	 * connection. */
+ if (rds_rdma_cookie_key(rm->m_rdma_cookie)
+ && !ic->i_fastreg_posted) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->m_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ rds_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->m_rdma_op) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_iw_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ rds_iw_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
+
+	/* Handle a 0-len message: there is no data SGE, but the header
+	 * still has to be copied into the ring and posted, hence the jump
+	 * to add_header inside the loop below. */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_iw_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_iw_stats_inc(s_iw_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_iw_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+ goto out;
+ }
+
+ ret = sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
+
+static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+{
+ BUG_ON(nent > send->s_page_list->max_page_list_len);
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR.
+ */
+ send->s_wr.opcode = IB_WR_FAST_REG_MR;
+ send->s_wr.wr.fast_reg.length = len;
+ send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
+ send->s_wr.wr.fast_reg.page_list = send->s_page_list;
+ send->s_wr.wr.fast_reg.page_list_len = nent;
+ send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
+ send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
+ send->s_wr.wr.fast_reg.iova_start = sg_addr;
+
+ ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+}
+
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct rds_iw_device *rds_iwdev;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->r_remote_addr;
+ u32 pos, fr_pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->r_mapped = 1;
+ }
+
+ if (!op->r_write) {
+ /* Alloc space on the send queue for the fastreg */
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
+ if (work_alloc != 1) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->r_count, rds_iwdev->max_sge);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ if (!op->r_write) {
+ first = prev = &ic->i_sends[fr_pos];
+ } else {
+ first = send;
+ prev = NULL;
+ }
+ scat = &op->r_sg[0];
+ sent = 0;
+ num_sge = op->r_count;
+
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ /* To avoid the need to have the plumbing to invalidate the fastreg_mr used
+ * for local access after RDS is finished with it, using
+ * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
+ */
+ if (op->r_write)
+ send->s_wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
+
+ if (num_sge > rds_iwdev->max_sge) {
+ send->s_wr.num_sge = rds_iwdev->max_sge;
+ num_sge -= rds_iwdev->max_sge;
+ } else
+ send->s_wr.num_sge = num_sge;
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
+ send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ else {
+ send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
+ }
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+ remote_addr += len;
+
+ scat++;
+ }
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+ send->s_wr.num_sge = 1;
+ send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
+ send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
+ send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ first->s_wr.send_flags = IB_SEND_SIGNALED;
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+	/* On iWARP, local memory access by a remote system (i.e. an RDMA Read) is
+	 * not recommended. Putting the lkey on the wire is a security hole, as it
+	 * can allow access to all memory on the remote system. Some adapters do
+	 * not allow using the lkey for this at all. To bypass this, use a
+	 * fastreg_mr (or possibly a dma_mr).
+	 */
+ if (!op->r_write) {
+ rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
+ op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ work_alloc++;
+ }
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+void rds_iw_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_iw_attempt_ack(ic);
+}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 00000000000..ccc7e8f0bf0
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "iw.h"
+
+DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
+
+static char *rds_iw_stat_names[] = {
+ "iw_connect_raced",
+ "iw_listen_closed_stale",
+ "iw_tx_cq_call",
+ "iw_tx_cq_event",
+ "iw_tx_ring_full",
+ "iw_tx_throttle",
+ "iw_tx_sg_mapping_failure",
+ "iw_tx_stalled",
+ "iw_tx_credit_updates",
+ "iw_rx_cq_call",
+ "iw_rx_cq_event",
+ "iw_rx_ring_empty",
+ "iw_rx_refill_from_cq",
+ "iw_rx_refill_from_thread",
+ "iw_rx_alloc_limit",
+ "iw_rx_credit_updates",
+ "iw_ack_sent",
+ "iw_ack_send_failure",
+ "iw_ack_send_delayed",
+ "iw_ack_send_piggybacked",
+ "iw_ack_received",
+ "iw_rdma_mr_alloc",
+ "iw_rdma_mr_free",
+ "iw_rdma_mr_used",
+ "iw_rdma_mr_pool_flush",
+ "iw_rdma_mr_pool_wait",
+ "iw_rdma_mr_pool_depleted",
+};
+
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_iw_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_iw_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
+ ARRAY_SIZE(rds_iw_stat_names));
+out:
+ return ARRAY_SIZE(rds_iw_stat_names);
+}
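The per-CPU summing above treats struct rds_iw_statistics (defined in iw.h, not shown in this hunk) as a flat array of u64 counters, so its fields must appear in exactly the order of rds_iw_stat_names; a new statistic has to be added to both in lockstep.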
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 00000000000..9590678cd61
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "iw.h"
+
+static struct ctl_table_header *rds_iw_sysctl_hdr;
+
+unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
+unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
+unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_iw_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_iw_sysctl_flow_control = 1;
+
+ctl_table rds_iw_sysctl_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_send_wr",
+ .data = &rds_iw_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_wr",
+ .data = &rds_iw_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_wr",
+ .data = &rds_iw_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_iw_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_allocation",
+ .data = &rds_iw_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "flow_control",
+ .data = &rds_iw_sysctl_flow_control,
+ .maxlen = sizeof(rds_iw_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+void rds_iw_sysctl_exit(void)
+{
+ if (rds_iw_sysctl_hdr)
+ unregister_sysctl_table(rds_iw_sysctl_hdr);
+}
+
+int __init rds_iw_sysctl_init(void)
+{
+ rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
+ if (rds_iw_sysctl_hdr == NULL)
+ return -ENOMEM;
+ return 0;
+}
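With the {net, rds, iw} path built above, these knobs appear as /proc/sys/net/rds/iw/max_send_wr, max_recv_wr, max_unsignaled_wr, max_unsignaled_bytes, max_recv_allocation and flow_control once the table has been registered.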
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 00000000000..4a61997f554
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static DEFINE_SPINLOCK(loop_conns_lock);
+static LIST_HEAD(loop_conns);
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport. At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver. In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg,
+ unsigned int off)
+{
+ BUG_ON(hdr_off || sg || off);
+
+ rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ rds_message_addref(rm); /* for the inc */
+
+ rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ GFP_KERNEL, KM_USER0);
+
+ rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+ NULL);
+
+ rds_inc_put(&rm->m_inc);
+
+ return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
+}
+
+static int rds_loop_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map,
+ unsigned long offset)
+{
+ unsigned long i;
+
+ BUG_ON(offset);
+ BUG_ON(map != conn->c_lcong);
+
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ memcpy((void *)conn->c_fcong->m_page_addrs[i],
+ (void *)map->m_page_addrs[i], PAGE_SIZE);
+ }
+
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+
+ return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+}
+
+/* we need to at least give the worker thread something that succeeds */
+static int rds_loop_recv(struct rds_connection *conn)
+{
+ return 0;
+}
+
+struct rds_loop_connection {
+ struct list_head loop_node;
+ struct rds_connection *conn;
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rds_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_loop_connection *lc;
+ unsigned long flags;
+
+ lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
+ if (lc == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&lc->loop_node);
+ lc->conn = conn;
+ conn->c_transport_data = lc;
+
+ spin_lock_irqsave(&loop_conns_lock, flags);
+ list_add_tail(&lc->loop_node, &loop_conns);
+ spin_unlock_irqrestore(&loop_conns_lock, flags);
+
+ return 0;
+}
+
+static void rds_loop_conn_free(void *arg)
+{
+ struct rds_loop_connection *lc = arg;
+ rdsdebug("lc %p\n", lc);
+ list_del(&lc->loop_node);
+ kfree(lc);
+}
+
+static int rds_loop_conn_connect(struct rds_connection *conn)
+{
+ rds_connect_complete(conn);
+ return 0;
+}
+
+static void rds_loop_conn_shutdown(struct rds_connection *conn)
+{
+}
+
+void rds_loop_exit(void)
+{
+ struct rds_loop_connection *lc, *_lc;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&loop_conns_lock);
+ list_splice(&loop_conns, &tmp_list);
+ INIT_LIST_HEAD(&loop_conns);
+ spin_unlock_irq(&loop_conns_lock);
+
+ list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
+ WARN_ON(lc->conn->c_passive);
+ rds_conn_destroy(lc->conn);
+ }
+}
+
+/*
+ * This is missing .xmit_* because loop doesn't go through generic
+ * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+ .xmit = rds_loop_xmit,
+ .xmit_cong_map = rds_loop_xmit_cong_map,
+ .recv = rds_loop_recv,
+ .conn_alloc = rds_loop_conn_alloc,
+ .conn_free = rds_loop_conn_free,
+ .conn_connect = rds_loop_conn_connect,
+ .conn_shutdown = rds_loop_conn_shutdown,
+ .inc_copy_to_user = rds_message_inc_copy_to_user,
+ .inc_purge = rds_message_inc_purge,
+ .inc_free = rds_message_inc_free,
+ .t_name = "loopback",
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 00000000000..f32b0939a04
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c */
+extern struct rds_transport rds_loop_transport;
+
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 00000000000..5a15dc8d0cd
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
+
+static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
+[RDS_EXTHDR_NONE] = 0,
+[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+};
+
+
+void rds_message_addref(struct rds_message *rm)
+{
+ rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+ atomic_inc(&rm->m_refcount);
+}
+
+/*
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+ unsigned long i;
+
+ if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+ return;
+
+ for (i = 0; i < rm->m_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+ /* XXX will have to put_page for page refs */
+ __free_page(sg_page(&rm->m_sg[i]));
+ }
+ rm->m_nents = 0;
+
+ if (rm->m_rdma_op)
+ rds_rdma_free_op(rm->m_rdma_op);
+ if (rm->m_rdma_mr)
+ rds_mr_put(rm->m_rdma_mr);
+}
+
+void rds_message_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_purge(rm);
+}
+
+void rds_message_put(struct rds_message *rm)
+{
+ rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+
+ if (atomic_dec_and_test(&rm->m_refcount)) {
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ BUG_ON(!list_empty(&rm->m_conn_item));
+ rds_message_purge(rm);
+
+ kfree(rm);
+ }
+}
+
+void rds_message_inc_free(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
+}
+
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq)
+{
+ hdr->h_flags = 0;
+ hdr->h_sport = sport;
+ hdr->h_dport = dport;
+ hdr->h_sequence = cpu_to_be64(seq);
+ hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len)
+{
+ unsigned int ext_len = sizeof(u8) + len;
+ unsigned char *dst;
+
+ /* For now, refuse to add more than one extension header */
+ if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
+ return 0;
+
+ if (type >= __RDS_EXTHDR_MAX
+ || len != rds_exthdr_size[type])
+ return 0;
+
+ if (ext_len >= RDS_HEADER_EXT_SPACE)
+ return 0;
+ dst = hdr->h_exthdr;
+
+ *dst++ = type;
+ memcpy(dst, data, len);
+
+ dst[len] = RDS_EXTHDR_NONE;
+ return 1;
+}
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ * buflen = sizeof(buffer);
+ * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ * if (type == RDS_EXTHDR_NONE)
+ * break;
+ * ...
+ * }
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen)
+{
+ unsigned int offset, ext_type, ext_len;
+ u8 *src = hdr->h_exthdr;
+
+ offset = *pos;
+ if (offset >= RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ /* Get the extension type and length. For now, the
+ * length is implied by the extension type. */
+ ext_type = src[offset++];
+
+ if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
+ goto none;
+ ext_len = rds_exthdr_size[ext_type];
+ if (offset + ext_len > RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ *pos = offset + ext_len;
+ if (ext_len < *buflen)
+ *buflen = ext_len;
+ memcpy(buf, src + offset, *buflen);
+ return ext_type;
+
+none:
+ *pos = RDS_HEADER_EXT_SPACE;
+ *buflen = 0;
+ return RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
+{
+ struct rds_ext_header_version ext_hdr;
+
+ ext_hdr.h_version = cpu_to_be32(version);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
+}
+
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
+{
+ struct rds_ext_header_version ext_hdr;
+ unsigned int pos = 0, len = sizeof(ext_hdr);
+
+ /* We assume the version extension is the only one present */
+ if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
+ return 0;
+ *version = be32_to_cpu(ext_hdr.h_version);
+ return 1;
+}
+
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+ struct rds_ext_header_rdma_dest ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
+ ext_hdr.h_rdma_offset = cpu_to_be32(offset);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+}
+
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+{
+ struct rds_message *rm;
+
+ rm = kzalloc(sizeof(struct rds_message) +
+ (nents * sizeof(struct scatterlist)), gfp);
+ if (!rm)
+ goto out;
+
+ if (nents)
+ sg_init_table(rm->m_sg, nents);
+ atomic_set(&rm->m_refcount, 1);
+ INIT_LIST_HEAD(&rm->m_sock_item);
+ INIT_LIST_HEAD(&rm->m_conn_item);
+ spin_lock_init(&rm->m_rs_lock);
+
+out:
+ return rm;
+}
+
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+ struct rds_message *rm;
+ unsigned int i;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+ rm->m_nents = ceil(total_len, PAGE_SIZE);
+
+ for (i = 0; i < rm->m_nents; ++i) {
+ sg_set_page(&rm->m_sg[i],
+ virt_to_page(page_addrs[i]),
+ PAGE_SIZE, 0);
+ }
+
+ return rm;
+}
+
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len)
+{
+ unsigned long to_copy;
+ unsigned long iov_off;
+ unsigned long sg_off;
+ struct rds_message *rm;
+ struct iovec *iov;
+ struct scatterlist *sg;
+ int ret;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+
+ /*
+ * now allocate and copy in the data payload.
+ */
+ sg = rm->m_sg;
+ iov = first_iov;
+ iov_off = 0;
+ sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+
+ while (total_len) {
+ if (sg_page(sg) == NULL) {
+ ret = rds_page_remainder_alloc(sg, total_len,
+ GFP_HIGHUSER);
+ if (ret)
+ goto out;
+ rm->m_nents++;
+ sg_off = 0;
+ }
+
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+ to_copy = min_t(size_t, to_copy, total_len);
+
+ rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ (void *)sg_page(sg), sg->offset, sg->length, sg_off);
+
+ ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret)
+ goto out;
+
+ iov_off += to_copy;
+ total_len -= to_copy;
+ sg_off += to_copy;
+
+ if (sg_off == sg->length)
+ sg++;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ if (rm)
+ rds_message_put(rm);
+ rm = ERR_PTR(ret);
+ }
+ return rm;
+}
+
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size)
+{
+ struct rds_message *rm;
+ struct iovec *iov;
+ struct scatterlist *sg;
+ unsigned long to_copy;
+ unsigned long iov_off;
+ unsigned long vec_off;
+ int copied;
+ int ret;
+ u32 len;
+
+ rm = container_of(inc, struct rds_message, m_inc);
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ iov = first_iov;
+ iov_off = 0;
+ sg = rm->m_sg;
+ vec_off = 0;
+ copied = 0;
+
+ while (copied < size && copied < len) {
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu from "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ sg_page(sg), sg->offset, sg->length, vec_off);
+
+ ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ vec_off += to_copy;
+ copied += to_copy;
+
+ if (vec_off == sg->length) {
+ vec_off = 0;
+ sg++;
+ }
+ }
+
+ return copied;
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+ wait_event(rds_message_flush_waitq,
+ !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+void rds_message_unmapped(struct rds_message *rm)
+{
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ if (waitqueue_active(&rds_message_flush_waitq))
+ wake_up(&rds_message_flush_waitq);
+}
+
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 00000000000..c460743a89a
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+
+#include "rds.h"
+
+struct rds_page_remainder {
+ struct page *r_page;
+ unsigned long r_offset;
+};
+
+DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
+
+/*
+ * returns 0 on success or -errno on failure.
+ *
+ * We don't have to worry about flush_dcache_page() as this only works
+ * with private pages. If, say, we were to do directed receive to pinned
+ * user pages we'd have to worry more about cache coherence. (Though
+ * the flush_dcache_page() in get_user_pages() would probably be enough).
+ */
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user)
+{
+ unsigned long ret;
+ void *addr;
+
+ if (to_user)
+ rds_stats_add(s_copy_to_user, bytes);
+ else
+ rds_stats_add(s_copy_from_user, bytes);
+
+ addr = kmap_atomic(page, KM_USER0);
+ if (to_user)
+ ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
+ else
+ ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
+ kunmap_atomic(addr, KM_USER0);
+
+ if (ret) {
+ addr = kmap(page);
+ if (to_user)
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ else
+ ret = copy_from_user(addr + offset, ptr, bytes);
+ kunmap(page);
+ if (ret)
+ return -EFAULT;
+ }
+
+ return 0;
+}
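The rds_page_copy_to_user() and rds_page_copy_from_user() calls used in message.c are presumably thin wrappers around this helper (declared in rds.h, not shown here) that pass to_user as 1 and 0 respectively.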
+
+/*
+ * Message allocation uses this to build up regions of a message.
+ *
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
+ *
+ * @gfp is always ORed with __GFP_HIGHMEM. Callers must be prepared to
+ * kmap the pages, etc.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page().
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure. Future partial-page allocations may be
+ * satisfied from that cached region. This lets us waste less memory on
+ * small allocations with minimal complexity. It works because the transmit
+ * path passes read-only page regions down to devices. They hold a page
+ * reference until they are done with the region.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp)
+{
+ struct rds_page_remainder *rem;
+ unsigned long flags;
+ struct page *page;
+ int ret;
+
+ gfp |= __GFP_HIGHMEM;
+
+	/* jump straight to allocation if we're trying for a full page or more */
+ if (bytes >= PAGE_SIZE) {
+ page = alloc_page(gfp);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ } else {
+ sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ while (1) {
+ /* avoid a tiny region getting stuck by tossing it */
+ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+ rds_stats_inc(s_page_remainder_miss);
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+
+ /* hand out a fragment from the cached page */
+ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+ sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+ get_page(sg_page(scat));
+
+ if (rem->r_offset != 0)
+ rds_stats_inc(s_page_remainder_hit);
+
+ rem->r_offset += bytes;
+ if (rem->r_offset == PAGE_SIZE) {
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+ ret = 0;
+ break;
+ }
+
+ /* alloc if there is nothing for us to use */
+ local_irq_restore(flags);
+ put_cpu();
+
+ page = alloc_page(gfp);
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ if (page == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* did someone race to fill the remainder before us? */
+ if (rem->r_page) {
+ __free_page(page);
+ continue;
+ }
+
+ /* otherwise install our page and loop around to alloc */
+ rem->r_page = page;
+ rem->r_offset = 0;
+ }
+
+ local_irq_restore(flags);
+ put_cpu();
+out:
+ rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+ ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+ ret ? 0 : scat->length);
+ return ret;
+}
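As an example of the remainder cache at work: with 4 KiB pages, two back-to-back 1 KiB requests on the same CPU are served from a single page. The first fragment starts at offset 0 and leaves r_offset at 1024, the second is handed out at offset 1024 (counting as a remainder hit), and the cached page is released only once r_offset reaches PAGE_SIZE or a larger request forces it to be tossed.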
+
+static int rds_page_remainder_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ struct rds_page_remainder *rem;
+ long cpu = (long)hcpu;
+
+ rem = &per_cpu(rds_page_remainders, cpu);
+
+ rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+
+ switch (action) {
+ case CPU_DEAD:
+ if (rem->r_page)
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block rds_page_remainder_nb = {
+ .notifier_call = rds_page_remainder_cpu_notify,
+};
+
+void rds_page_exit(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
+ (unsigned long)CPU_DEAD,
+ (void *)(long)i);
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 00000000000..eaeeb91e111
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rdma.h"
+
+/*
+ * XXX
+ * - build with sparse
+ * - should we limit the size of a mr region? let transport return failure?
+ * - should we detect duplicate keys on a socket? hmm.
+ * - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid. It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int. This comes
+ * from being stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+ if ((vec->addr + vec->bytes <= vec->addr) ||
+ (vec->bytes > (u64)UINT_MAX))
+ return 0;
+
+ return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (vec->addr >> PAGE_SHIFT);
+}
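For example, with 4 KiB pages, a vec with addr = 0x1234 and bytes = 0x2000 ends at 0x3234, so the helper returns ((0x3234 + 0xfff) >> 12) - (0x1234 >> 12) = 4 - 1 = 3 pages.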
+
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+ struct rds_mr *insert)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_mr *mr;
+
+ while (*p) {
+ parent = *p;
+ mr = rb_entry(parent, struct rds_mr, r_rb_node);
+
+ if (key < mr->r_key)
+ p = &(*p)->rb_left;
+ else if (key > mr->r_key)
+ p = &(*p)->rb_right;
+ else
+ return mr;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->r_rb_node, parent, p);
+ rb_insert_color(&insert->r_rb_node, root);
+ atomic_inc(&insert->r_refcount);
+ }
+ return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+ struct rds_sock *rs = mr->r_sock;
+ void *trans_private = NULL;
+ unsigned long flags;
+
+ rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+ mr->r_key, atomic_read(&mr->r_refcount));
+
+ if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
+ return;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ if (!RB_EMPTY_NODE(&mr->r_rb_node))
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ trans_private = mr->r_trans_private;
+ mr->r_trans_private = NULL;
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (trans_private)
+ mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+void __rds_put_mr_final(struct rds_mr *mr)
+{
+ rds_destroy_mr(mr);
+ kfree(mr);
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+ struct rds_mr *mr;
+ struct rb_node *node;
+
+ /* Release any MRs associated with this socket */
+ while ((node = rb_first(&rs->rs_rdma_keys))) {
+ mr = container_of(node, struct rds_mr, r_rb_node);
+ if (mr->r_trans == rs->rs_transport)
+ mr->r_invalidate = 0;
+ rds_mr_put(mr);
+ }
+
+ if (rs->rs_transport && rs->rs_transport->flush_mrs)
+ rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+ struct page **pages, int write)
+{
+ int ret;
+
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, user_addr,
+ nr_pages, write, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ if (0 <= ret && (unsigned) ret < nr_pages) {
+ while (ret--)
+ put_page(pages[ret]);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+ u64 *cookie_ret, struct rds_mr **mr_ret)
+{
+ struct rds_mr *mr = NULL, *found;
+ unsigned int nr_pages;
+ struct page **pages = NULL;
+ struct scatterlist *sg;
+ void *trans_private;
+ unsigned long flags;
+ rds_rdma_cookie_t cookie;
+ unsigned int nents;
+ long i;
+ int ret;
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (rs->rs_transport->get_mr == NULL) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ nr_pages = rds_pages_in_vec(&args->vec);
+ if (nr_pages == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+ args->vec.addr, args->vec.bytes, nr_pages);
+
+ /* XXX clamp nr_pages to limit the size of this alloc? */
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+ if (mr == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ atomic_set(&mr->r_refcount, 1);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ mr->r_trans = rs->rs_transport;
+ mr->r_sock = rs;
+
+ if (args->flags & RDS_RDMA_USE_ONCE)
+ mr->r_use_once = 1;
+ if (args->flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ if (args->flags & RDS_RDMA_READWRITE)
+ mr->r_write = 1;
+
+ /*
+ * Pin the pages that make up the user buffer and transfer the page
+ * pointers to the mr's sg array. We check to see if we've mapped
+ * the whole region after transferring the partial page references
+ * to the sg array so that we can have one page ref cleanup path.
+ *
+ * For now we have no flag that tells us whether the mapping is
+ * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
+ * the zero page.
+ */
+ ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+ if (ret < 0)
+ goto out;
+
+ nents = ret;
+ sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ if (sg == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ WARN_ON(!nents);
+ sg_init_table(sg, nents);
+
+ /* Stick all pages into the scatterlist */
+ for (i = 0 ; i < nents; i++)
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+ rdsdebug("RDS: trans_private nents is %u\n", nents);
+
+ /* Obtain a transport specific MR. If this succeeds, the
+ * s/g list is now owned by the MR.
+ * Note that dma_map() implies that pending writes are
+ * flushed to RAM, so no dma_sync is needed here. */
+ trans_private = rs->rs_transport->get_mr(sg, nents, rs,
+ &mr->r_key);
+
+ if (IS_ERR(trans_private)) {
+ for (i = 0 ; i < nents; i++)
+ put_page(sg_page(&sg[i]));
+ kfree(sg);
+ ret = PTR_ERR(trans_private);
+ goto out;
+ }
+
+ mr->r_trans_private = trans_private;
+
+ rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+ mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+ /* The user may pass us an unaligned address, but we can only
+ * map page aligned regions. So we keep the offset, and build
+ * a 64bit cookie containing <R_Key, offset> and pass that
+ * around. */
+ cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+ if (cookie_ret)
+ *cookie_ret = cookie;
+
+ if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Inserting the new MR into the rbtree bumps its
+ * reference count. */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ BUG_ON(found && found != mr);
+
+ rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+ if (mr_ret) {
+ atomic_inc(&mr->r_refcount);
+ *mr_ret = mr;
+ }
+
+ ret = 0;
+out:
+ kfree(pages);
+ if (mr)
+ rds_mr_put(mr);
+ return ret;
+}
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_get_mr_args args;
+
+ if (optlen != sizeof(struct rds_get_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
+ sizeof(struct rds_get_mr_args)))
+ return -EFAULT;
+
+ return __rds_rdma_map(rs, &args, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ */
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_free_mr_args args;
+ struct rds_mr *mr;
+ unsigned long flags;
+
+ if (optlen != sizeof(struct rds_free_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
+ sizeof(struct rds_free_mr_args)))
+ return -EFAULT;
+
+ /* Special case - a null cookie means flush all unused MRs */
+ if (args.cookie == 0) {
+ if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+ return -EINVAL;
+ rs->rs_transport->flush_mrs();
+ return 0;
+ }
+
+ /* Look up the MR given its R_key and remove it from the rbtree
+ * so nobody else finds it.
+ * This should also prevent races with rds_rdma_unuse.
+ */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+ if (mr) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ if (args.flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (!mr)
+ return -EINVAL;
+
+	/*
+	 * Call rds_destroy_mr() ourselves so that we're sure it's done by the
+	 * time we return. If we left it to rds_mr_put(), it might not happen
+	 * until someone else drops their ref.
+	 */
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics.
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+ struct rds_mr *mr;
+ unsigned long flags;
+ int zot_me = 0;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr && (mr->r_use_once || force)) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ zot_me = 1;
+ } else if (mr)
+ atomic_inc(&mr->r_refcount);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ /* May have to issue a dma_sync on this memory region.
+	 * Note we could avoid this if the operation was an RDMA READ,
+ * but at this point we can't tell. */
+ if (mr != NULL) {
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ }
+}
+
+void rds_rdma_free_op(struct rds_rdma_op *ro)
+{
+ unsigned int i;
+
+ for (i = 0; i < ro->r_nents; i++) {
+ struct page *page = sg_page(&ro->r_sg[i]);
+
+		/* Mark page dirty if it was possibly modified, which
+		 * is the case for an RDMA_READ, which copies from remote
+		 * to local memory */
+ if (!ro->r_write)
+ set_page_dirty(page);
+ put_page(page);
+ }
+
+ kfree(ro->r_notifier);
+ kfree(ro);
+}
+
+/*
+ * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ */
+static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
+ struct rds_rdma_args *args)
+{
+ struct rds_iovec vec;
+ struct rds_rdma_op *op = NULL;
+ unsigned int nr_pages;
+ unsigned int max_pages;
+ unsigned int nr_bytes;
+ struct page **pages = NULL;
+ struct rds_iovec __user *local_vec;
+ struct scatterlist *sg;
+ unsigned int nr;
+ unsigned int i, j;
+ int ret;
+
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (args->nr_local > (u64)UINT_MAX) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ nr_pages = 0;
+ max_pages = 0;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ max_pages = max(nr, max_pages);
+ nr_pages += nr;
+ }
+
+ pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
+ if (op == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->r_recverr = rs->rs_recverr;
+ WARN_ON(!nr_pages);
+ sg_init_table(op->r_sg, nr_pages);
+
+ if (op->r_notify || op->r_recverr) {
+ /* We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->r_notifier) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ op->r_notifier->n_user_token = args->user_token;
+ op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+ }
+
+ /* The cookie contains the R_Key of the remote memory region, and
+ * optionally an offset into it. This is how we implement RDMA into
+ * unaligned memory.
+ * When setting up the RDMA, we need to add that offset to the
+ * destination address (which is really an offset into the MR)
+ * FIXME: We may want to move this into ib_rdma.c
+ */
+ op->r_key = rds_rdma_cookie_key(args->cookie);
+ op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+ nr_bytes = 0;
+
+ rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+ (unsigned long long)args->nr_local,
+ (unsigned long long)args->remote_vec.addr,
+ op->r_key);
+
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rs->rs_user_addr = vec.addr;
+ rs->rs_user_bytes = vec.bytes;
+
+ /* did the user change the vec under us? */
+ if (nr > max_pages || op->r_nents + nr > nr_pages) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* If it's a WRITE operation, we want to pin the pages for reading.
+ * If it's a READ operation, we need to pin the pages for writing.
+ */
+ ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+ if (ret < 0)
+ goto out;
+
+ rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
+ nr_bytes, nr, vec.bytes, vec.addr);
+
+ nr_bytes += vec.bytes;
+
+ for (j = 0; j < nr; j++) {
+ unsigned int offset = vec.addr & ~PAGE_MASK;
+
+ sg = &op->r_sg[op->r_nents + j];
+ sg_set_page(sg, pages[j],
+ min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+ offset);
+
+ rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
+ sg->offset, sg->length, vec.addr, vec.bytes);
+
+ vec.addr += sg->length;
+ vec.bytes -= sg->length;
+ }
+
+ op->r_nents += nr;
+ }
+
+
+ if (nr_bytes > args->remote_vec.bytes) {
+ rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+ nr_bytes,
+ (unsigned int) args->remote_vec.bytes);
+ ret = -EINVAL;
+ goto out;
+ }
+ op->r_bytes = nr_bytes;
+
+ ret = 0;
+out:
+ kfree(pages);
+ if (ret) {
+ if (op)
+ rds_rdma_free_op(op);
+ op = ERR_PTR(ret);
+ }
+ return op;
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_op *op;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->m_rdma_op != NULL)
+ return -EINVAL;
+
+ op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ rds_stats_inc(s_send_rdma);
+ rm->m_rdma_op = op;
+ return 0;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ unsigned long flags;
+ struct rds_mr *mr;
+ u32 r_key;
+ int err = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
+ || rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+ /* We are reusing a previously mapped MR here. Most likely, the
+ * application has written to the buffer, so we need to explicitly
+ * flush those writes to RAM. Otherwise the HCA may not see them
+ * when doing a DMA from that buffer.
+ */
+ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr == NULL)
+ err = -EINVAL; /* invalid r_key */
+ else
+ atomic_inc(&mr->r_refcount);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (mr) {
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ rm->m_rdma_mr = mr;
+ }
+ return err;
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
+ || rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+}
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 00000000000..425512098b0
--- /dev/null
+++ b/net/rds/rdma.h
@@ -0,0 +1,84 @@
+#ifndef _RDS_RDMA_H
+#define _RDS_RDMA_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
+
+#include "rds.h"
+
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+struct rds_rdma_op {
+ u32 r_key;
+ u64 r_remote_addr;
+ unsigned int r_write:1;
+ unsigned int r_fence:1;
+ unsigned int r_notify:1;
+ unsigned int r_recverr:1;
+ unsigned int r_mapped:1;
+ struct rds_notifier *r_notifier;
+ unsigned int r_bytes;
+ unsigned int r_nents;
+ unsigned int r_count;
+ struct scatterlist r_sg[0];
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
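As an illustration of the layout these helpers assume (values are arbitrary): rds_rdma_make_cookie(0x1234, 0x56) produces 0x0000005600001234, rds_rdma_cookie_key() truncates that back to 0x1234, and rds_rdma_cookie_offset() recovers 0x56 from the upper 32 bits.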
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
+
+#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 00000000000..7b19024f970
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2009 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <rdma/rdma_cm.h>
+
+#include "rdma_transport.h"
+
+static struct rdma_cm_id *rds_iw_listen_id;
+
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ /* this can be null in the listening path */
+ struct rds_connection *conn = cm_id->context;
+ struct rds_transport *trans;
+ int ret = 0;
+
+ rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
+ event->event);
+
+ if (cm_id->device->node_type == RDMA_NODE_RNIC)
+ trans = &rds_iw_transport;
+ else
+ trans = &rds_ib_transport;
+
+ /* Prevent shutdown from tearing down the connection
+ * while we're executing. */
+ if (conn) {
+ mutex_lock(&conn->c_cm_lock);
+
+ /* If the connection is being shut down, bail out
+ * right away. We return 0 so cm_id doesn't get
+ * destroyed prematurely */
+ if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+ /* Reject incoming connections while we're tearing
+ * down an existing one. */
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ ret = 1;
+ goto out;
+ }
+ }
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ ret = trans->cm_handle_connect(cm_id, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ /* XXX do we need to clean up if this fails? */
+ ret = rdma_resolve_route(cm_id,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ /* XXX worry about racing with listen acceptance */
+ ret = trans->cm_initiate_connect(cm_id);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ trans->cm_connect_complete(conn, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ if (conn)
+ rds_conn_drop(conn);
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
+ "%pI4->%pI4\n", &conn->c_laddr,
+ &conn->c_faddr);
+ rds_conn_drop(conn);
+ break;
+
+ default:
+ /* things like device disconnect? */
+ printk(KERN_ERR "unknown event %u\n", event->event);
+ BUG();
+ break;
+ }
+
+out:
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+
+ rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+
+ return ret;
+}
+
+static int __init rds_rdma_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_create_id() returned %d\n", ret);
+ goto out;
+ }
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+ sin.sin_port = (__force u16)htons(RDS_PORT);
+
+ /*
+ * XXX I bet this binds the cm_id to a device. If we want to support
+ * fail-over we'll have to take this into consideration.
+ */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ if (ret) {
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_bind_addr() returned %d\n", ret);
+ goto out;
+ }
+
+ ret = rdma_listen(cm_id, 128);
+ if (ret) {
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_listen() returned %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
+
+ rds_iw_listen_id = cm_id;
+ cm_id = NULL;
+out:
+ if (cm_id)
+ rdma_destroy_id(cm_id);
+ return ret;
+}
+
+static void rds_rdma_listen_stop(void)
+{
+ if (rds_iw_listen_id) {
+ rdsdebug("cm %p\n", rds_iw_listen_id);
+ rdma_destroy_id(rds_iw_listen_id);
+ rds_iw_listen_id = NULL;
+ }
+}
+
+int __init rds_rdma_init(void)
+{
+ int ret;
+
+ ret = rds_rdma_listen_init();
+ if (ret)
+ goto out;
+
+ ret = rds_iw_init();
+ if (ret)
+ goto err_iw_init;
+
+ ret = rds_ib_init();
+ if (ret)
+ goto err_ib_init;
+
+ goto out;
+
+err_ib_init:
+ rds_iw_exit();
+err_iw_init:
+ rds_rdma_listen_stop();
+out:
+ return ret;
+}
+
+void rds_rdma_exit(void)
+{
+ /* stop listening first to ensure no new connections are attempted */
+ rds_rdma_listen_stop();
+ rds_ib_exit();
+ rds_iw_exit();
+}
+
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 00000000000..2f2c7d976c2
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,28 @@
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+
+/* from rdma_transport.c */
+int rds_rdma_init(void);
+void rds_rdma_exit(void);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+/* from iw.c */
+extern struct rds_transport rds_iw_transport;
+int rds_iw_init(void);
+void rds_iw_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 00000000000..06040070497
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,686 @@
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0 0x0300
+#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
+#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
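+/*
+ * Worked example (illustrative): RDS_PROTOCOL(3, 1) == 0x0301, and for
+ * that value RDS_PROTOCOL_MAJOR() recovers 3 while RDS_PROTOCOL_MINOR()
+ * recovers 1.
+ */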
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * # 18464-18768 Unassigned
+ * We should do better. We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ */
+#define RDS_PORT 18634
+
+#ifdef DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* sigh, pr_debug() causes unused variable warnings */
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(char *fmt, ...)
+{
+}
+#endif
+
+/* XXX is there one of these somewhere? */
+#define ceil(x, y) \
+ ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+#define RDS_FRAG_SHIFT 12
+#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
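+/*
+ * Worked example (illustrative): RDS_FRAG_SIZE is 4096 bytes here, so
+ * ceil(8193, RDS_FRAG_SIZE) evaluates to 3, i.e. an 8193 byte payload
+ * spans three fragments.
+ */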
+
+#define RDS_CONG_MAP_BYTES (65536 / 8)
+#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
+#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
+#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
+
+struct rds_cong_map {
+ struct rb_node m_rb_node;
+ __be32 m_addr;
+ wait_queue_head_t m_waitq;
+ struct list_head m_conn_list;
+ unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+ RDS_CONN_DOWN = 0,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DISCONNECTING,
+ RDS_CONN_UP,
+ RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDS_LL_SEND_FULL 0
+#define RDS_RECONNECT_PENDING 1
+
+struct rds_connection {
+ struct hlist_node c_hash_node;
+ __be32 c_laddr;
+ __be32 c_faddr;
+ unsigned int c_loopback:1;
+ struct rds_connection *c_passive;
+
+ struct rds_cong_map *c_lcong;
+ struct rds_cong_map *c_fcong;
+
+ struct mutex c_send_lock; /* protect send ring */
+ struct rds_message *c_xmit_rm;
+ unsigned long c_xmit_sg;
+ unsigned int c_xmit_hdr_off;
+ unsigned int c_xmit_data_off;
+ unsigned int c_xmit_rdma_sent;
+
+ spinlock_t c_lock; /* protect msg queues */
+ u64 c_next_tx_seq;
+ struct list_head c_send_queue;
+ struct list_head c_retrans;
+
+ u64 c_next_rx_seq;
+
+ struct rds_transport *c_trans;
+ void *c_transport_data;
+
+ atomic_t c_state;
+ unsigned long c_flags;
+ unsigned long c_reconnect_jiffies;
+ struct delayed_work c_send_w;
+ struct delayed_work c_recv_w;
+ struct delayed_work c_conn_w;
+ struct work_struct c_down_w;
+ struct mutex c_cm_lock; /* protect conn state & cm */
+
+ struct list_head c_map_item;
+ unsigned long c_map_queued;
+ unsigned long c_map_offset;
+ unsigned long c_map_bytes;
+
+ unsigned int c_unacked_packets;
+ unsigned int c_unacked_bytes;
+
+ /* Protocol version */
+ unsigned int c_version;
+};
+
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_MAX_ADV_CREDIT 127
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE 16
+
+struct rds_header {
+ __be64 h_sequence;
+ __be64 h_ack;
+ __be32 h_len;
+ __be16 h_sport;
+ __be16 h_dport;
+ u8 h_flags;
+ u8 h_credit;
+ u8 h_padding[4];
+ __sum16 h_csum;
+
+ u8 h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE 0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION 1
+struct rds_ext_header_version {
+ __be32 h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA 2
+struct rds_ext_header_rdma {
+ __be32 h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST 3
+struct rds_ext_header_rdma_dest {
+ __be32 h_rdma_rkey;
+ __be32 h_rdma_offset;
+};
+
+#define __RDS_EXTHDR_MAX 16 /* for now */
+
+struct rds_incoming {
+ atomic_t i_refcount;
+ struct list_head i_item;
+ struct rds_connection *i_conn;
+ struct rds_header i_hdr;
+ unsigned long i_rx_jiffies;
+ __be32 i_saddr;
+
+ rds_rdma_cookie_t i_rdma_cookie;
+};
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock. m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * Messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path. It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue. m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting. As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly. That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate. They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK 1
+#define RDS_MSG_ON_CONN 2
+#define RDS_MSG_HAS_ACK_SEQ 3
+#define RDS_MSG_ACK_REQUIRED 4
+#define RDS_MSG_RETRANSMITTED 5
+#define RDS_MSG_MAPPED 6
+#define RDS_MSG_PAGEVEC 7
+
+struct rds_message {
+ atomic_t m_refcount;
+ struct list_head m_sock_item;
+ struct list_head m_conn_item;
+ struct rds_incoming m_inc;
+ u64 m_ack_seq;
+ __be32 m_daddr;
+ unsigned long m_flags;
+
+ /* Never access m_rs without holding m_rs_lock.
+ * Lock nesting is
+ * rm->m_rs_lock
+ * -> rs->rs_lock
+ */
+ spinlock_t m_rs_lock;
+ struct rds_sock *m_rs;
+ struct rds_rdma_op *m_rdma_op;
+ rds_rdma_cookie_t m_rdma_cookie;
+ struct rds_mr *m_rdma_mr;
+ unsigned int m_nents;
+ unsigned int m_count;
+ struct scatterlist m_sg[0];
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rds_notifier {
+ struct list_head n_list;
+ uint64_t n_user_token;
+ int n_status;
+};
+
+/**
+ * struct rds_transport - transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
+ * it returns the connection can not call rds_recv_incoming().
+ * This will only be called once after conn_connect returns
+ * non-zero success. The caller serializes this with
+ * the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rds_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ * given connection. XXX get a better story about the bitmap
+ * flag and header.
+ */
+
+struct rds_transport {
+ char t_name[TRANSNAMSIZ];
+ struct list_head t_item;
+ struct module *t_owner;
+ unsigned int t_prefer_loopback:1;
+
+ int (*laddr_check)(__be32 addr);
+ int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+ void (*conn_free)(void *data);
+ int (*conn_connect)(struct rds_connection *conn);
+ void (*conn_shutdown)(struct rds_connection *conn);
+ void (*xmit_prepare)(struct rds_connection *conn);
+ void (*xmit_complete)(struct rds_connection *conn);
+ int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+ int (*xmit_cong_map)(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+ int (*recv)(struct rds_connection *conn);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+ void (*inc_purge)(struct rds_incoming *inc);
+ void (*inc_free)(struct rds_incoming *inc);
+
+ int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ void (*cm_connect_complete)(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+ unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+ unsigned int avail);
+ void (*exit)(void);
+ void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+ struct rds_sock *rs, u32 *key_ret);
+ void (*sync_mr)(void *trans_private, int direction);
+ void (*free_mr)(void *trans_private, int invalidate);
+ void (*flush_mrs)(void);
+};
+
+struct rds_sock {
+ struct sock rs_sk;
+
+ u64 rs_user_addr;
+ u64 rs_user_bytes;
+
+ /*
+ * bound_addr used for both incoming and outgoing, no INADDR_ANY
+ * support.
+ */
+ struct rb_node rs_bound_node;
+ __be32 rs_bound_addr;
+ __be32 rs_conn_addr;
+ __be16 rs_bound_port;
+ __be16 rs_conn_port;
+
+ /*
+ * This is only used to communicate the transport between bind and
+ * connection initiation. All other transport use is referenced
+ * through the connection.
+ */
+ struct rds_transport *rs_transport;
+
+ /*
+ * rds_sendmsg caches the conn it used the last time around.
+ * This helps avoid costly lookups.
+ */
+ struct rds_connection *rs_conn;
+
+ /* flag indicating we were congested or not */
+ int rs_congested;
+
+ /* rs_lock protects all these adjacent members before the newline */
+ spinlock_t rs_lock;
+ struct list_head rs_send_queue;
+ u32 rs_snd_bytes;
+ int rs_rcv_bytes;
+ struct list_head rs_notify_queue; /* currently used for failed RDMAs */
+
+ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+ * to decide whether the application should be woken up.
+ * If not set, we use rs_cong_track to find out whether a cong map
+ * update arrived.
+ */
+ uint64_t rs_cong_mask;
+ uint64_t rs_cong_notify;
+ struct list_head rs_cong_list;
+ unsigned long rs_cong_track;
+
+ /*
+ * rs_recv_lock protects the receive queue, and is
+ * used to serialize with rds_release.
+ */
+ rwlock_t rs_recv_lock;
+ struct list_head rs_recv_queue;
+
+ /* just for stats reporting */
+ struct list_head rs_item;
+
+ /* these have their own lock */
+ spinlock_t rs_rdma_lock;
+ struct rb_root rs_rdma_keys;
+
+ /* Socket options - in case there will be more */
+ unsigned char rs_recverr,
+ rs_cong_monitor;
+};
+
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+ return container_of(sk, struct rds_sock, rs_sk);
+}
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+ return &rs->rs_sk;
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead. We don't account for overhead, we just apply
+ * the number of payload bytes to the specified value.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_sndbuf / 2;
+}
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
+}
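+
+/*
+ * Worked example (illustrative): an application that sets SO_SNDBUF to
+ * 131072 (assuming that is within the wmem_max sysctl limit) ends up
+ * with sk_sndbuf == 262144 in the core socket code, so rds_sk_sndbuf()
+ * reports the original 131072 bytes of payload space.
+ */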
+
+struct rds_statistics {
+ uint64_t s_conn_reset;
+ uint64_t s_recv_drop_bad_checksum;
+ uint64_t s_recv_drop_old_seq;
+ uint64_t s_recv_drop_no_sock;
+ uint64_t s_recv_drop_dead_sock;
+ uint64_t s_recv_deliver_raced;
+ uint64_t s_recv_delivered;
+ uint64_t s_recv_queued;
+ uint64_t s_recv_immediate_retry;
+ uint64_t s_recv_delayed_retry;
+ uint64_t s_recv_ack_required;
+ uint64_t s_recv_rdma_bytes;
+ uint64_t s_recv_ping;
+ uint64_t s_send_queue_empty;
+ uint64_t s_send_queue_full;
+ uint64_t s_send_sem_contention;
+ uint64_t s_send_sem_queue_raced;
+ uint64_t s_send_immediate_retry;
+ uint64_t s_send_delayed_retry;
+ uint64_t s_send_drop_acked;
+ uint64_t s_send_ack_required;
+ uint64_t s_send_queued;
+ uint64_t s_send_rdma;
+ uint64_t s_send_rdma_bytes;
+ uint64_t s_send_pong;
+ uint64_t s_page_remainder_hit;
+ uint64_t s_page_remainder_miss;
+ uint64_t s_copy_to_user;
+ uint64_t s_copy_from_user;
+ uint64_t s_cong_update_queued;
+ uint64_t s_cong_update_received;
+ uint64_t s_cong_send_error;
+ uint64_t s_cong_send_blocked;
+};
+
+/* af_rds.c */
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+ wait_queue_head_t *waitq = sk->sk_sleep;
+
+ if (!sock_flag(sk, SOCK_DEAD) && waitq)
+ wake_up(waitq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* conn.c */
+int __init rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_reset(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+ __attribute__ ((format (printf, 2, 3)));
+#define rds_conn_error(conn, fmt...) \
+ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
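+/*
+ * Illustrative call (assumption, not taken verbatim from a caller):
+ *	rds_conn_error(conn, "conn to %pI4 reset\n", &conn->c_faddr);
+ * The macro prepends the KERN_WARNING "RDS: " prefix itself.
+ */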
+
+static inline int
+rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+ return atomic_cmpxchg(&conn->c_state, old, new) == old;
+}
+
+static inline int
+rds_conn_state(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state);
+}
+
+static inline int
+rds_conn_up(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_UP;
+}
+
+static inline int
+rds_conn_connecting(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size);
+void rds_message_inc_purge(struct rds_incoming *inc);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+ hdr->h_csum = 0;
+ hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
+}
+
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
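+ /* A zero h_csum means "no checksum supplied" and is accepted as valid. */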
+ return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp);
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user);
+#define rds_page_copy_to_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 1)
+#define rds_page_copy_from_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 0)
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr);
+void rds_inc_addref(struct rds_incoming *inc);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip);
+
+/* send.c */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len);
+void rds_send_reset(struct rds_connection *conn);
+int rds_send_xmit(struct rds_connection *conn);
+struct sockaddr_in;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked);
+int rds_send_acked_before(struct rds_connection *conn, u64 seq);
+void rds_send_remove_from_sock(struct list_head *messages, int status);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
+struct rds_message *rds_send_get_message(struct rds_connection *,
+ struct rds_rdma_op *);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+
+/* stats.c */
+DECLARE_PER_CPU(struct rds_statistics, rds_stats);
+#define rds_stats_inc_which(which, member) do { \
+ per_cpu(which, get_cpu()).member++; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+#define rds_stats_add_which(which, member, count) do { \
+ per_cpu(which, get_cpu()).member += count; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
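+/*
+ * Typical usage (illustrative; mirrors callers elsewhere in RDS):
+ *	rds_stats_inc(s_recv_ping);
+ *	rds_stats_add(s_copy_to_user, nbytes);
+ * get_cpu()/put_cpu() pin the caller to a CPU for the update, so the
+ * per-cpu counters need no extra locking.
+ */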
+int __init rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr);
+
+/* sysctl.c */
+int __init rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int rds_sysctl_max_unacked_packets;
+extern unsigned int rds_sysctl_max_unacked_bytes;
+extern unsigned int rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int rds_sysctl_trace_level;
+
+/* threads.c */
+int __init rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_complete(struct rds_connection *conn);
+
+/* transport.c */
+int rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(__be32 addr);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+int __init rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 00000000000..f2118c51cfa
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr)
+{
+ atomic_set(&inc->i_refcount, 1);
+ INIT_LIST_HEAD(&inc->i_item);
+ inc->i_conn = conn;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+}
+
+void rds_inc_addref(struct rds_incoming *inc)
+{
+ rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ atomic_inc(&inc->i_refcount);
+}
+
+void rds_inc_put(struct rds_incoming *inc)
+{
+ rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ if (atomic_dec_and_test(&inc->i_refcount)) {
+ BUG_ON(!list_empty(&inc->i_item));
+
+ inc->i_conn->c_trans->inc_free(inc);
+ }
+}
+
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+ struct rds_cong_map *map,
+ int delta, __be16 port)
+{
+ int now_congested;
+
+ if (delta == 0)
+ return;
+
+ rs->rs_rcv_bytes += delta;
+ now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+ rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+ "now_cong %d delta %d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+ rds_sk_rcvbuf(rs), now_congested, delta);
+
+ /* wasn't -> am congested */
+ if (!rs->rs_congested && now_congested) {
+ rs->rs_congested = 1;
+ rds_cong_set_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+ /* was -> aren't congested */
+ /* Require more free space before reporting uncongested to prevent
+ bouncing cong/uncong state too often */
+ else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
+ rs->rs_congested = 0;
+ rds_cong_clear_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+
+ /* do nothing if no change in cong state */
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+ struct rds_header *hdr = &inc->i_hdr;
+ unsigned int pos = 0, type, len;
+ union {
+ struct rds_ext_header_version version;
+ struct rds_ext_header_rdma rdma;
+ struct rds_ext_header_rdma_dest rdma_dest;
+ } buffer;
+
+ while (1) {
+ len = sizeof(buffer);
+ type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+ if (type == RDS_EXTHDR_NONE)
+ break;
+ /* Process extension header here */
+ switch (type) {
+ case RDS_EXTHDR_RDMA:
+ rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+ break;
+
+ case RDS_EXTHDR_RDMA_DEST:
+ /* We ignore the size for now. We could stash it
+ * somewhere and use it for error checking. */
+ inc->i_rdma_cookie = rds_rdma_make_cookie(
+ be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+ be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+
+ break;
+ }
+ }
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time. This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival. It does mean
+ * that small messages will wait behind large ones. Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn. This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km)
+{
+ struct rds_sock *rs = NULL;
+ struct sock *sk;
+ unsigned long flags;
+
+ inc->i_conn = conn;
+ inc->i_rx_jiffies = jiffies;
+
+ rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+ "flags 0x%x rx_jiffies %lu\n", conn,
+ (unsigned long long)conn->c_next_rx_seq,
+ inc,
+ (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+ be32_to_cpu(inc->i_hdr.h_len),
+ be16_to_cpu(inc->i_hdr.h_sport),
+ be16_to_cpu(inc->i_hdr.h_dport),
+ inc->i_hdr.h_flags,
+ inc->i_rx_jiffies);
+
+ /*
+ * Sequence numbers should only increase. Messages get their
+ * sequence number as they're queued in a sending conn. They
+ * can be dropped, though, if the sending socket is closed before
+ * they hit the wire. So sequence numbers can skip forward
+ * under normal operation. They can also drop back in the conn
+ * failover case as previously sent messages are resent down the
+ * new instance of a conn. We drop those, otherwise we have
+ * to assume that the next valid seq does not come after a
+ * hole in the fragment stream.
+ *
+ * The headers don't give us a way to realize if fragments of
+ * a message have been dropped. We assume that frags that arrive
+ * to a flow are part of the current message on the flow that is
+ * being reassembled. This means that senders can't drop messages
+ * from the sending conn until all their frags are sent.
+ *
+ * XXX we could spend more on the wire to get more robust failure
+ * detection, arguably worth it to avoid data corruption.
+ */
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+ && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ rds_stats_inc(s_recv_drop_old_seq);
+ goto out;
+ }
+ conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+ if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ rds_stats_inc(s_recv_ping);
+ rds_send_pong(conn, inc->i_hdr.h_sport);
+ goto out;
+ }
+
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+ if (rs == NULL) {
+ rds_stats_inc(s_recv_drop_no_sock);
+ goto out;
+ }
+
+ /* Process extension headers */
+ rds_recv_incoming_exthdrs(inc, rs);
+
+ /* We can be racing with rds_release() which marks the socket dead. */
+ sk = rds_rs_to_sk(rs);
+
+ /* serialize with rds_release -> sock_orphan */
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+ rds_stats_inc(s_recv_queued);
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ rds_inc_addref(inc);
+ list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+ __rds_wake_sk_sleep(sk);
+ } else {
+ rds_stats_inc(s_recv_drop_dead_sock);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+ if (rs)
+ rds_sock_put(rs);
+}
+
+/*
+ * Be very careful here. This is being called as the condition in
+ * wait_event_*() and needs to cope with being called many times.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+ unsigned long flags;
+
+ if (*inc == NULL) {
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&rs->rs_recv_queue)) {
+ *inc = list_entry(rs->rs_recv_queue.next,
+ struct rds_incoming,
+ i_item);
+ rds_inc_addref(*inc);
+ }
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+ }
+
+ return *inc != NULL;
+}
+
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+ int drop)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ int ret = 0;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&inc->i_item)) {
+ ret = 1;
+ if (drop) {
+ /* XXX make sure this i_conn is reliable */
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+ return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ struct rds_notifier *notifier;
+ struct rds_rdma_notify cmsg;
+ unsigned int count = 0, max_messages = ~0U;
+ unsigned long flags;
+ LIST_HEAD(copy);
+ int err = 0;
+
+
+ /* put_cmsg copies to user space and thus may sleep. We can't do this
+ * with rs_lock held, so first grab as many notifications as we can stuff
+ * in the user provided cmsg buffer. We don't try to copy more, to avoid
+ * losing notifications - except when the buffer is so small that it wouldn't
+ * even hold a single notification. Then we give him as much of this single
+ * msg as we can squeeze in, and set MSG_CTRUNC.
+ */
+ if (msghdr) {
+ max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+ if (!max_messages)
+ max_messages = 1;
+ }
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+ notifier = list_entry(rs->rs_notify_queue.next,
+ struct rds_notifier, n_list);
+ list_move(&notifier->n_list, &copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (!count)
+ return 0;
+
+ while (!list_empty(&copy)) {
+ notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+ if (msghdr) {
+ cmsg.user_token = notifier->n_user_token;
+ cmsg.status = notifier->n_status;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+ sizeof(cmsg), &cmsg);
+ if (err)
+ break;
+ }
+
+ list_del_init(&notifier->n_list);
+ kfree(notifier);
+ }
+
+ /* If we bailed out because of an error in put_cmsg,
+ * we may be left with one or more notifications that we
+ * didn't process. Return them to the head of the list. */
+ if (!list_empty(&copy)) {
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ list_splice(&copy, &rs->rs_notify_queue);
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+ }
+
+ return err;
+}
+
+/*
+ * Queue a congestion notification
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ uint64_t notify = rs->rs_cong_notify;
+ unsigned long flags;
+ int err;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+ sizeof(notify), &notify);
+ if (err)
+ return err;
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_notify &= ~notify;
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ return 0;
+}
+
+/*
+ * Receive any control messages.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+ int ret = 0;
+
+ if (inc->i_rdma_cookie) {
+ ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+ sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ long timeo;
+ int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ struct sockaddr_in *sin;
+ struct rds_incoming *inc = NULL;
+
+ /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+ if (msg_flags & MSG_OOB)
+ goto out;
+
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ goto out;
+ }
+
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ goto out;
+ }
+
+ while (1) {
+ if (!rds_next_incoming(rs, &inc)) {
+ if (nonblock) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_next_incoming(rs, &inc),
+ timeo);
+ rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+ timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+ &inc->i_conn->c_faddr,
+ ntohs(inc->i_hdr.h_sport));
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+ size);
+ if (ret < 0)
+ break;
+
+ /*
+ * if the message we just copied isn't at the head of the
+ * recv queue then someone else raced us to return it, try
+ * to get the next message.
+ */
+ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+ rds_inc_put(inc);
+ inc = NULL;
+ rds_stats_inc(s_recv_deliver_raced);
+ continue;
+ }
+
+ if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+ if (msg_flags & MSG_TRUNC)
+ ret = be32_to_cpu(inc->i_hdr.h_len);
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (rds_cmsg_recv(inc, msg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_stats_inc(s_recv_delivered);
+
+ sin = (struct sockaddr_in *)msg->msg_name;
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr = inc->i_saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+ break;
+ }
+
+ if (inc)
+ rds_inc_put(inc);
+
+out:
+ return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg. The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ struct rds_incoming *inc, *tmp;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip)
+{
+ struct rds_info_message minfo;
+
+ minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+ minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo.laddr = daddr;
+ minfo.faddr = saddr;
+ minfo.lport = inc->i_hdr.h_dport;
+ minfo.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo.laddr = saddr;
+ minfo.faddr = daddr;
+ minfo.lport = inc->i_hdr.h_sport;
+ minfo.fport = inc->i_hdr.h_dport;
+ }
+
+ rds_info_copy(iter, &minfo, sizeof(minfo));
+}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 00000000000..1b37364656f
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the softlockup watchdog
+ * will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
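+/*
+ * The parameter is read-only at runtime (mode 0444), so it can only be
+ * set at load time, e.g. "modprobe rds send_batch_count=0" (module name
+ * assumed) to restore the old drain-until-empty behaviour.
+ */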
+
+/*
+ * Reset the send state. Caller must hold c_send_lock when calling here.
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ if (conn->c_xmit_rm) {
+ /* Tell the user the RDMA op is no longer mapped by the
+ * transport. This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory */
+ rds_message_unmapped(conn->c_xmit_rm);
+ rds_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
+ }
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ conn->c_map_queued = 0;
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+ }
+ list_splice_init(&conn->c_retrans, &conn->c_send_queue);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
+ struct scatterlist *sg;
+ int ret = 0;
+ int was_empty = 0;
+ LIST_HEAD(to_be_dropped);
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ *
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
+ */
+ if (!mutex_trylock(&conn->c_send_lock)) {
+ rds_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_prepare)
+ conn->c_trans->xmit_prepare(conn);
+
+ /*
+ * spin trying to push headers and data down the connection until
+ * the connection doesn't make forward progress.
+ */
+ while (--send_quota) {
+ /*
+ * See if we need to send a congestion map update if we're
+ * between sending messages. The send_sem protects our sole
+ * use of c_map_offset and _bytes.
+ * Note this is used only by transports that define a special
+ * xmit_cong_map function. For all others, we allocate
+ * a cong_map message and treat it just like any other send.
+ */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+
+ /* If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
+ rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ /* Release the reference to the previous message. */
+ rds_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so.
+ */
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes = sizeof(struct rds_header) +
+ RDS_CONG_MAP_BYTES;
+ continue;
+ }
+
+ rm = rds_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Grab the next message from the send queue, if there is one.
+ *
+ * c_xmit_rm holds a ref while we're sending this message down
+ * the connection. We can use this ref while holding the
+ * send_sem.. rds_send_reset() is serialized with it.
+ */
+ if (rm == NULL) {
+ unsigned int len;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ if (!list_empty(&conn->c_send_queue)) {
+ rm = list_entry(conn->c_send_queue.next,
+ struct rds_message,
+ m_conn_item);
+ rds_message_addref(rm);
+
+ /*
+ * Move the message from the send queue to the retransmit
+ * list right away.
+ */
+ list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+ }
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ if (rm == NULL) {
+ was_empty = 1;
+ break;
+ }
+
+ /* Unfortunately, the way Infiniband deals with
+ * RDMA to a bad MR key is by moving the entire
+ * queue pair to error state. We could possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with RDMA ops.
+ */
+ if (rm->m_rdma_op
+ && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ list_move(&rm->m_conn_item, &to_be_dropped);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ rds_message_put(rm);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (conn->c_unacked_packets == 0
+ || conn->c_unacked_bytes < len) {
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ rds_stats_inc(s_send_ack_required);
+ } else {
+ conn->c_unacked_bytes -= len;
+ conn->c_unacked_packets--;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+ if (ret)
+ break;
+ conn->c_xmit_rdma_sent = 1;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ }
+
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+ conn->c_xmit_sg < rm->m_nents) {
+ ret = conn->c_trans->xmit(conn, rm,
+ conn->c_xmit_hdr_off,
+ conn->c_xmit_sg,
+ conn->c_xmit_data_off);
+ if (ret <= 0)
+ break;
+
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+ tmp = min_t(int, ret,
+ sizeof(struct rds_header) -
+ conn->c_xmit_hdr_off);
+ conn->c_xmit_hdr_off += tmp;
+ ret -= tmp;
+ }
+
+ sg = &rm->m_sg[conn->c_xmit_sg];
+ while (ret) {
+ tmp = min_t(int, ret, sg->length -
+ conn->c_xmit_data_off);
+ conn->c_xmit_data_off += tmp;
+ ret -= tmp;
+ if (conn->c_xmit_data_off == sg->length) {
+ conn->c_xmit_data_off = 0;
+ sg++;
+ conn->c_xmit_sg++;
+ BUG_ON(ret != 0 &&
+ conn->c_xmit_sg == rm->m_nents);
+ }
+ }
+ }
+ }
+
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped))
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
+
+ /*
+ * We might be racing with another sender who queued a message but
+ * backed off on noticing that we held the c_send_lock. If we check
+ * for queued messages after dropping the sem then either we'll
+ * see the queued message or the queuer will get the sem. If we
+ * notice the queued message then we trigger an immediate retry.
+ *
+ * We need to be careful only to do this when we stopped processing
+ * the send queue because it was empty. It's the only way we
+ * stop processing the loop when the transport hasn't taken
+ * responsibility for forward progress.
+ */
+ mutex_unlock(&conn->c_send_lock);
+
+ if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+ /* We exhausted the send quota, but there's work left to
+ * do. Return and (re-)schedule the send worker.
+ */
+ ret = -EAGAIN;
+ }
+
+ if (ret == 0 && was_empty) {
+ /* A simple bit test would be way faster than taking the
+ * spin lock */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (!list_empty(&conn->c_send_queue)) {
+ rds_stats_inc(s_send_sem_queue_raced);
+ ret = -EAGAIN;
+ }
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ }
+out:
+ return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+ u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ assert_spin_locked(&rs->rs_lock);
+
+ BUG_ON(rs->rs_snd_bytes < len);
+ rs->rs_snd_bytes -= len;
+
+ if (rs->rs_snd_bytes == 0)
+ rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+ is_acked_func is_acked)
+{
+ if (is_acked)
+ return is_acked(rm, ack);
+ return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+}
+
+/*
+ * Returns true if there are no messages on the send and retransmit queues
+ * which have a sequence number greater than or equal to the given sequence
+ * number.
+ */
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+{
+ struct rds_message *rm, *tmp;
+ int ret = 1;
+
+ spin_lock(&conn->c_lock);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
+
+ spin_unlock(&conn->c_lock);
+
+ return ret;
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rds_rdma_op *ro;
+ struct rds_notifier *notifier;
+
+ spin_lock(&rm->m_rs_lock);
+
+ ro = rm->m_rdma_op;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+ && ro && ro->r_notify && ro->r_notifier) {
+ notifier = ro->r_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs));
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ro->r_notifier = NULL;
+ }
+
+ spin_unlock(&rm->m_rs_lock);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+ struct rds_rdma_op *ro;
+
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_notify && ro->r_notifier) {
+ ro->r_notifier->n_status = status;
+ list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+ ro->r_notifier = NULL;
+ }
+
+ /* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+ struct rds_rdma_op *op)
+{
+ struct rds_message *rm, *tmp, *found = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ goto out;
+ }
+ }
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ break;
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ return found;
+}
+
+/*
+ * This removes messages from the socket's list if they're on it. The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks. The messages must have a reference held for their
+ * position on the list. This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ */
+void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+ unsigned long flags = 0; /* silence gcc :P */
+ struct rds_sock *rs = NULL;
+ struct rds_message *rm;
+
+ local_irq_save(flags);
+ while (!list_empty(messages)) {
+ rm = list_entry(messages->next, struct rds_message,
+ m_conn_item);
+ list_del_init(&rm->m_conn_item);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the sock. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ *
+ * The message spinlock makes sure nobody clears rm->m_rs
+ * while we're messing with it. It does not prevent the
+ * message from being removed from the socket, though.
+ */
+ spin_lock(&rm->m_rs_lock);
+ if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+ goto unlock_and_drop;
+
+ if (rs != rm->m_rs) {
+ if (rs) {
+ spin_unlock(&rs->rs_lock);
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ rs = rm->m_rs;
+ spin_lock(&rs->rs_lock);
+ sock_hold(rds_rs_to_sk(rs));
+ }
+
+ if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+ struct rds_rdma_op *ro = rm->m_rdma_op;
+ struct rds_notifier *notifier;
+
+ list_del_init(&rm->m_sock_item);
+ rds_send_sndbuf_remove(rs, rm);
+
+ if (ro && ro->r_notifier
+ && (status || ro->r_notify)) {
+ notifier = ro->r_notifier;
+ list_add_tail(&notifier->n_list,
+ &rs->rs_notify_queue);
+ if (!notifier->n_status)
+ notifier->n_status = status;
+ rm->m_rdma_op->r_notifier = NULL;
+ }
+ rds_message_put(rm);
+ rm->m_rs = NULL;
+ }
+
+unlock_and_drop:
+ spin_unlock(&rm->m_rs_lock);
+ rds_message_put(rm);
+ }
+
+ if (rs) {
+ spin_unlock(&rs->rs_lock);
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number. Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction. Maybe it should bail if it sees SOCK_DEAD.
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (!rds_send_is_acked(rm, ack, is_acked))
+ break;
+
+ list_move(&rm->m_conn_item, &list);
+ clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ }
+
+ /* order flag updates with spin locks */
+ if (!list_empty(&list))
+ smp_mb__after_clear_bit();
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ /* now remove the messages from the sock list as needed */
+ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
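
The is_acked callback passed in above is transport specific; tcp_is_acked is mentioned but not shown in this hunk. As a rough sketch only - the is_acked_func signature, the m_ack_seq field and the RDS_MSG_HAS_ACK_SEQ flag are assumed here from their use above, not quoted from rds.h - a sequence-number based callback could look like:

/* Sketch, not part of this patch: decide whether the peer's cumulative ack
 * covers this message.  Messages that were never assigned an ack sequence
 * must be treated as unacked. */
static int example_is_acked(struct rds_message *rm, u64 ack)
{
	if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
		return 0;
	return rm->m_ack_seq <= ack;
}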
+
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+{
+ struct rds_message *rm, *tmp;
+ struct rds_connection *conn;
+ unsigned long flags;
+ LIST_HEAD(list);
+ int wake = 0;
+
+ /* get all the messages we're dropping under the rs lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+ if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+ dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ continue;
+
+ wake = 1;
+ list_move(&rm->m_sock_item, &list);
+ rds_send_sndbuf_remove(rs, rm);
+ clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+
+ /* If this is a RDMA operation, notify the app. */
+ __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ }
+
+ /* order flag updates with the rs lock */
+ if (wake)
+ smp_mb__after_clear_bit();
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (wake)
+ rds_wake_sk_sleep(rs);
+
+ conn = NULL;
+
+ /* now remove the messages from the conn list as needed */
+ list_for_each_entry(rm, &list, m_sock_item) {
+ /* We do this here rather than in the loop above, so that
+ * we don't have to nest m_rs_lock under rs->rs_lock */
+ spin_lock(&rm->m_rs_lock);
+ rm->m_rs = NULL;
+ spin_unlock(&rm->m_rs_lock);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the conn. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ */
+ if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ continue;
+
+ if (conn != rm->m_inc.i_conn) {
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ conn = rm->m_inc.i_conn;
+ spin_lock_irqsave(&conn->c_lock, flags);
+ }
+
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ list_del_init(&rm->m_conn_item);
+ rds_message_put(rm);
+ }
+ }
+
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ while (!list_empty(&list)) {
+ rm = list_entry(list.next, struct rds_message, m_sock_item);
+ list_del_init(&rm->m_sock_item);
+
+ rds_message_wait(rm);
+ rds_message_put(rm);
+ }
+}
+
+/*
+ * We only want this to fire once, so we use the caller's 'queued'. It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_message *rm, __be16 sport,
+ __be16 dport, int *queued)
+{
+ unsigned long flags;
+ u32 len;
+
+ if (*queued)
+ goto out;
+
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ /* this is the only place which holds both the socket's rs_lock
+ * and the connection's c_lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ /*
+ * If there is a little space in sndbuf, we don't queue anything,
+ * and userspace gets -EAGAIN. But poll() indicates there's send
+ * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+ * freed up by incoming acks. So we check the *old* value of
+ * rs_snd_bytes here to allow the last msg to exceed the buffer,
+ * and poll() now knows no more data can be sent.
+ */
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+ rs->rs_snd_bytes += len;
+
+ /* let recv side know we are close to send space exhaustion.
+ * This is probably not the optimal way to do it, as this
+ * means we set the flag on *all* messages as soon as the
+ * queued bytes cross half of the send buffer.
+ */
+ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+ set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_rs = rs;
+
+ /* The code ordering is a little weird, but we're
+ trying to minimize the time we hold c_lock */
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+ rm->m_inc.i_conn = conn;
+ rds_message_addref(rm);
+
+ spin_lock(&conn->c_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ spin_unlock(&conn->c_lock);
+
+ rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+ rm, len, rs, rs->rs_snd_bytes,
+ (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+ *queued = 1;
+ }
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+ return *queued;
+}
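
Read on its own, the sndbuf rule in the comment above amounts to the small sketch below (a hypothetical helper, not part of the patch): admission is checked against the old byte count, so exactly one message may push the count past the limit, after which both this path and poll() report no send room until acks free bytes again.

/* Sketch of the admission policy described above. */
static int example_sndbuf_admit(u32 *snd_bytes, u32 len, u32 sndbuf)
{
	if (*snd_bytes >= sndbuf)
		return 0;		/* caller blocks or gets -EAGAIN */
	*snd_bytes += len;		/* may exceed sndbuf by one message */
	return 1;
}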
+
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+ struct msghdr *msg, int *allocated_mr)
+{
+ struct cmsghdr *cmsg;
+ int ret = 0;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ /* As a side effect, RDMA_DEST and RDMA_MAP will set
+ * rm->m_rdma_cookie and rm->m_rdma_mr.
+ */
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_MAP:
+ ret = rds_cmsg_rdma_map(rs, rm, cmsg);
+ if (!ret)
+ *allocated_mr = 1;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ __be32 daddr;
+ __be16 dport;
+ struct rds_message *rm = NULL;
+ struct rds_connection *conn;
+ int ret = 0;
+ int queued = 0, allocated_mr = 0;
+ int nonblock = msg->msg_flags & MSG_DONTWAIT;
+ long timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* Mirror Linux UDP's handling of BSD error message compatibility */
+ /* XXX: Perhaps MSG_MORE someday */
+ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+ printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (msg->msg_namelen) {
+ /* XXX fail non-unicast destination IPs? */
+ if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+ ret = -EINVAL;
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ } else {
+ /* We only care about consistency with ->connect() */
+ lock_sock(sk);
+ daddr = rs->rs_conn_addr;
+ dport = rs->rs_conn_port;
+ release_sock(sk);
+ }
+
+ /* racing with another thread binding seems ok here */
+ if (daddr == 0 || rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ rm = NULL;
+ goto out;
+ }
+
+ rm->m_daddr = daddr;
+
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
+ /* rds_conn_create has a spinlock that runs with IRQ off.
+ * Caching the conn in the socket helps a lot. */
+ if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+ conn = rs->rs_conn;
+ else {
+ conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ rs->rs_transport,
+ sock->sk->sk_allocation);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+ rs->rs_conn = conn;
+ }
+
+ if ((rm->m_rdma_cookie || rm->m_rdma_op)
+ && conn->c_trans->xmit_rdma == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ rm->m_rdma_op, conn->c_trans->xmit_rdma);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+ if (ret)
+ goto out;
+
+ while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ dport, &queued)) {
+ rds_stats_inc(s_send_queue_full);
+ /* XXX make sure this is reasonable */
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ if (nonblock) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_send_queue_rm(rs, conn, rm,
+ rs->rs_bound_port,
+ dport,
+ &queued),
+ timeo);
+ rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
+ /*
+ * By now we've committed to the send. We reuse rds_send_worker()
+ * to retry sends in the rds thread if the transport asks us to.
+ */
+ rds_stats_inc(s_send_queued);
+
+ if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ rds_send_worker(&conn->c_send_w.work);
+
+ rds_message_put(rm);
+ return payload_len;
+
+out:
+ /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+ * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+ * or in any other way, we need to destroy the MR again */
+ if (allocated_mr)
+ rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
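
For context, a minimal userspace caller of this path might look like the sketch below; it is not part of the patch and assumes the PF_RDS constant from the userspace headers. As the code above requires, the socket must already be bound to a local address, and the destination is taken from msg_name when the socket is not connected.

/* Hypothetical userspace sketch. */
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

static ssize_t example_rds_send(int fd, struct sockaddr_in *dest,
				const void *buf, size_t len)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name	= dest,
		.msg_namelen	= sizeof(*dest),
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
	};

	/* fd is assumed to be socket(PF_RDS, SOCK_SEQPACKET, 0), already
	 * bound with bind() to a local IPv4 address and port. */
	return sendmsg(fd, &msg, 0);
}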
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ int ret = 0;
+
+ rm = rds_message_alloc(0, GFP_ATOMIC);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = conn->c_faddr;
+
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_inc.i_conn = conn;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+ conn->c_next_tx_seq);
+ conn->c_next_tx_seq++;
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ rds_stats_inc(s_send_queued);
+ rds_stats_inc(s_send_pong);
+
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ rds_message_put(rm);
+ return 0;
+
+out:
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 00000000000..637146893cf
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static char *rds_stat_names[] = {
+ "conn_reset",
+ "recv_drop_bad_checksum",
+ "recv_drop_old_seq",
+ "recv_drop_no_sock",
+ "recv_drop_dead_sock",
+ "recv_deliver_raced",
+ "recv_delivered",
+ "recv_queued",
+ "recv_immediate_retry",
+ "recv_delayed_retry",
+ "recv_ack_required",
+ "recv_rdma_bytes",
+ "recv_ping",
+ "send_queue_empty",
+ "send_queue_full",
+ "send_sem_contention",
+ "send_sem_queue_raced",
+ "send_immediate_retry",
+ "send_delayed_retry",
+ "send_drop_acked",
+ "send_ack_required",
+ "send_queued",
+ "send_rdma",
+ "send_rdma_bytes",
+ "send_pong",
+ "page_remainder_hit",
+ "page_remainder_miss",
+ "copy_to_user",
+ "copy_from_user",
+ "cong_update_queued",
+ "cong_update_received",
+ "cong_send_error",
+ "cong_send_blocked",
+};
+
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr)
+{
+ struct rds_info_counter ctr;
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+ strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+ ctr.value = values[i];
+
+ rds_info_copy(iter, &ctr, sizeof(ctr));
+ }
+}
+
+/*
+ * This gives global counters across all the transports. The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting. Some are pretty implementation dependent
+ * and may change over time. That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte-granular
+ * length in userspace. It converts it to the number of stat entries that the
+ * rest of the functions operate on.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+ unsigned int avail;
+
+ avail = len / sizeof(struct rds_info_counter);
+
+ if (avail < ARRAY_SIZE(rds_stat_names)) {
+ avail = 0;
+ goto trans;
+ }
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
+ ARRAY_SIZE(rds_stat_names));
+ avail -= ARRAY_SIZE(rds_stat_names);
+
+trans:
+ lens->each = sizeof(struct rds_info_counter);
+ lens->nr = rds_trans_stats_info_copy(iter, avail) +
+ ARRAY_SIZE(rds_stat_names);
+}
+
+void rds_stats_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+int __init rds_stats_init(void)
+{
+ rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+ return 0;
+}
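
The counters summed here are presumably bumped on the hot paths through a per-CPU increment helper (rds_stats_inc() is used elsewhere in this patch; its definition lives in rds.h and is not shown). The usual pattern, sketched with an assumed name, is:

/* Sketch only; the real helper is defined in rds.h. */
#define example_stats_inc(member)			\
do {							\
	per_cpu(rds_stats, get_cpu()).member++;		\
	put_cpu();					\
} while (0)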
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 00000000000..307dc5c1be1
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int rds_sysctl_max_unacked_packets = 8;
+unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rds_sysctl_ping_enable = 1;
+
+static ctl_table rds_sysctl_rds_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "reconnect_min_delay_ms",
+ .data = &rds_sysctl_reconnect_min_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min,
+ .extra2 = &rds_sysctl_reconnect_max_jiffies,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "reconnect_max_delay_ms",
+ .data = &rds_sysctl_reconnect_max_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min_jiffies,
+ .extra2 = &rds_sysctl_reconnect_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unacked_packets",
+ .data = &rds_sysctl_max_unacked_packets,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unacked_bytes",
+ .data = &rds_sysctl_max_unacked_bytes,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ping_enable",
+ .data = &rds_sysctl_ping_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+
+void rds_sysctl_exit(void)
+{
+ if (rds_sysctl_reg_table)
+ unregister_sysctl_table(rds_sysctl_reg_table);
+}
+
+int __init rds_sysctl_init(void)
+{
+ rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+ rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+ rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+ if (rds_sysctl_reg_table == NULL)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 00000000000..828a1bf9ea9
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!), so we have
+ * a thread around that can block while allocating if the receive fast path
+ * sees an allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ * ANY -> ERROR
+ * UP -> DISCONNECTING
+ * ERROR -> DISCONNECTING
+ * DISCONNECTING -> DOWN
+ * DOWN -> CONNECTING
+ * CONNECTING -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ * - Inside the shutdown worker; synchronizes with xmit path
+ * through c_send_lock, and with connection management callbacks
+ * via c_cm_lock.
+ *
+ * For receive callbacks, we rely on the underlying transport
+ * (TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq;
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ printk(KERN_WARNING "%s: Cannot transition to state UP, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
+ return;
+ }
+
+ rdsdebug("conn %p for %pI4 to %pI4 complete\n",
+ conn, &conn->c_laddr, &conn->c_faddr);
+
+ conn->c_reconnect_jiffies = 0;
+ set_bit(0, &conn->c_map_queued);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
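
rds_conn_transition(), used throughout this file, is declared in rds.h and not shown in this patch. The state machine in the comment above suggests it is an atomic compare-and-swap on c_state; a sketch of that assumption:

/* Sketch: move from 'old' to 'new' only if the connection is still in 'old',
 * returning whether the transition happened. */
static inline int example_conn_transition(struct rds_connection *conn,
					  int old, int new)
{
	return atomic_cmpxchg(&conn->c_state, old, new) == old;
}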
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again. Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects. This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+static void rds_queue_reconnect(struct rds_connection *conn)
+{
+ unsigned long rand;
+
+ rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
+ conn, &conn->c_laddr, &conn->c_faddr,
+ conn->c_reconnect_jiffies);
+
+ set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (conn->c_reconnect_jiffies == 0) {
+ conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ return;
+ }
+
+ get_random_bytes(&rand, sizeof(rand));
+ rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+ rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ conn, &conn->c_laddr, &conn->c_faddr);
+ queue_delayed_work(rds_wq, &conn->c_conn_w,
+ rand % conn->c_reconnect_jiffies);
+
+ conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ rds_sysctl_reconnect_max_jiffies);
+}
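
Read in isolation, the schedule above behaves like this small sketch (hypothetical helper): the first retry fires immediately and seeds the ceiling at the sysctl minimum, and each later retry picks a uniform delay below a ceiling that doubles until it reaches the sysctl maximum.

/* Sketch of the reconnect backoff: returns the delay for this attempt and
 * advances the ceiling for the next one. */
static unsigned long example_backoff(unsigned long *ceiling, unsigned long rnd,
				     unsigned long min_j, unsigned long max_j)
{
	unsigned long delay;

	if (*ceiling == 0) {
		*ceiling = min_j;
		return 0;
	}
	delay = rnd % *ceiling;
	*ceiling = min(*ceiling * 2, max_j);
	return delay;
}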
+
+void rds_connect_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+ int ret;
+
+ clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ ret = conn->c_trans->conn_connect(conn);
+ rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
+ conn, &conn->c_laddr, &conn->c_faddr, ret);
+
+ if (ret) {
+ if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+ rds_queue_reconnect(conn);
+ else
+ rds_conn_error(conn, "RDS: connect failed\n");
+ }
+ }
+}
+
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+ * deadlocking with the CM handler. Instead, the CM event
+ * handler is supposed to check for state DISCONNECTING
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ mutex_lock(&conn->c_send_lock);
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+ mutex_unlock(&conn->c_send_lock);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - eg when we're in the middle of tearing
+ * down the connection, and someone unloads the rds module.
+ * Quite reproducible with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work(&conn->c_conn_w);
+ if (!hlist_unhashed(&conn->c_hash_node))
+ rds_queue_reconnect(conn);
+}
+
+void rds_send_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = rds_send_xmit(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_send_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_send_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_recv_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = conn->c_trans->recv(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_recv_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_recv_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_threads_exit(void)
+{
+ destroy_workqueue(rds_wq);
+}
+
+int __init rds_threads_init(void)
+{
+ rds_wq = create_singlethread_workqueue("krdsd");
+ if (rds_wq == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 00000000000..767da61ad2f
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static LIST_HEAD(rds_transports);
+static DECLARE_RWSEM(rds_trans_sem);
+
+int rds_trans_register(struct rds_transport *trans)
+{
+ BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
+
+ down_write(&rds_trans_sem);
+
+ list_add_tail(&trans->t_item, &rds_transports);
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+
+ up_write(&rds_trans_sem);
+
+ return 0;
+}
+
+void rds_trans_unregister(struct rds_transport *trans)
+{
+ down_write(&rds_trans_sem);
+
+ list_del_init(&trans->t_item);
+ printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
+
+ up_write(&rds_trans_sem);
+}
+
+struct rds_transport *rds_trans_get_preferred(__be32 addr)
+{
+ struct rds_transport *trans;
+ struct rds_transport *ret = NULL;
+
+ if (IN_LOOPBACK(ntohl(addr)))
+ return &rds_loop_transport;
+
+ down_read(&rds_trans_sem);
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->laddr_check(addr) == 0) {
+ ret = trans;
+ break;
+ }
+ }
+ up_read(&rds_trans_sem);
+
+ return ret;
+}
+
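A transport becomes eligible for this preference walk by registering a struct rds_transport whose laddr_check() claims the local address (returning 0, as the loop above expects). A minimal, hypothetical registration using only the fields this file touches - a real transport also fills in the connect/xmit/recv hooks declared in rds.h - might look like:

/* Hypothetical transport, illustration only. */
static int example_laddr_check(__be32 addr)
{
	return 0;	/* claim every address; return an error to decline */
}

static struct rds_transport example_transport = {
	.t_name		= "example",
	.laddr_check	= example_laddr_check,
};

/* Called from the transport module's init:
 * rds_trans_register(&example_transport); */
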
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them. The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_transport *trans;
+ unsigned int total = 0;
+ unsigned int part;
+
+ rds_info_iter_unmap(iter);
+ down_read(&rds_trans_sem);
+
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->stats_info_copy == NULL)
+ continue;
+
+ part = trans->stats_info_copy(iter, avail);
+ avail -= min(avail, part);
+ total += part;
+ }
+
+ up_read(&rds_trans_sem);
+
+ return total;
+}
+
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5c72a116b1a..f8f047b6124 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -183,13 +183,6 @@ override:
if (R_tab == NULL)
goto failure;
- if (!est && (ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
- err = -EINVAL;
- goto failure;
- }
-
if (parm->peakrate.rate) {
P_tab = qdisc_get_rtab(&parm->peakrate,
tb[TCA_POLICE_PEAKRATE]);
@@ -205,6 +198,12 @@ override:
&police->tcf_lock, est);
if (err)
goto failure_unlock;
+ } else if (tb[TCA_POLICE_AVRATE] &&
+ (ret == ACT_P_CREATED ||
+ !gen_estimator_active(&police->tcf_bstats,
+ &police->tcf_rate_est))) {
+ err = -EINVAL;
+ goto failure_unlock;
}
/* No failure allowed after this point */
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 9e43ed94916..d728d811173 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1960,8 +1960,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
cbq_rmprio(q, cl);
sch_tree_unlock(sch);
- if (--cl->refcnt == 0)
- cbq_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
return 0;
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index f6b4fa97df7..7597fe14686 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -66,11 +66,15 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DRR_MAX + 1];
u32 quantum;
int err;
- err = nla_parse_nested(tb, TCA_DRR_MAX, tca[TCA_OPTIONS], drr_policy);
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
if (err < 0)
return err;
@@ -151,8 +155,11 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
drr_purge_queue(cl);
qdisc_class_hash_remove(&q->clhash, &cl->common);
- if (--cl->refcnt == 0)
- drr_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 74226b26552..5022f9c1f34 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1139,8 +1139,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
hfsc_purge_queue(sch, cl);
qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
- if (--cl->refcnt == 0)
- hfsc_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 355974f610c..88cd0262662 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1275,8 +1275,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
- if (--cl->refcnt == 0)
- htb_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a2f93c09f3c..e22dfe85e43 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -236,7 +236,6 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
struct tc_tbf_qopt *qopt;
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
- struct qdisc_rate_table *tmp;
struct Qdisc *child = NULL;
int max_size,n;
@@ -295,13 +294,9 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
q->tokens = q->buffer;
q->ptokens = q->mtu;
- tmp = q->R_tab;
- q->R_tab = rtab;
- rtab = tmp;
+ swap(q->R_tab, rtab);
+ swap(q->P_tab, ptab);
- tmp = q->P_tab;
- q->P_tab = ptab;
- ptab = tmp;
sch_tree_unlock(sch);
err = 0;
done:
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 67715f4eb84..7ff548a30cf 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -86,6 +86,9 @@ const char *sctp_cname(const sctp_subtype_t cid)
case SCTP_CID_FWD_TSN:
return "FWD_TSN";
+ case SCTP_CID_AUTH:
+ return "AUTH";
+
default:
break;
}
@@ -135,6 +138,7 @@ static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = {
"PRIMITIVE_ABORT",
"PRIMITIVE_SEND",
"PRIMITIVE_REQUESTHEARTBEAT",
+ "PRIMITIVE_ASCONF",
};
/* Lookup primitive debug name. */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 4c8d9f45ce0..905fda582b9 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -111,7 +111,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
if (sctp_addip_enable) {
auth_chunks->chunks[0] = SCTP_CID_ASCONF;
auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
- auth_chunks->param_hdr.length += htons(2);
+ auth_chunks->param_hdr.length =
+ htons(sizeof(sctp_paramhdr_t) + 2);
}
}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 07d58903a74..7d08f522ec8 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -49,13 +49,10 @@
#include <linux/ipv6.h>
#include <linux/init.h>
#include <net/inet_ecn.h>
+#include <net/ip.h>
#include <net/icmp.h>
#include <net/net_namespace.h>
-#ifndef TEST_FRAME
-#include <net/tcp.h>
-#endif /* TEST_FRAME (not defined) */
-
#include <linux/socket.h> /* for sa_family_t */
#include <net/sock.h>
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bc411c89621..d765fc53e74 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -428,7 +428,8 @@ void sctp_retransmit_mark(struct sctp_outq *q,
* retransmitting due to T3 timeout.
*/
if (reason == SCTP_RTXR_T3_RTX &&
- (jiffies - chunk->sent_at) < transport->last_rto)
+ time_before(jiffies, chunk->sent_at +
+ transport->last_rto))
continue;
/* RFC 2960 6.2.1 Processing a Received SACK
@@ -1757,6 +1758,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
struct sctp_chunk *chunk;
struct list_head *lchunk, *temp;
+ if (!asoc->peer.prsctp_capable)
+ return;
+
/* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
* received SACK.
*
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index c1e316ee715..cb198af8887 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -692,15 +692,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
static int sctp_ctl_sock_init(void)
{
int err;
- sa_family_t family;
+ sa_family_t family = PF_INET;
if (sctp_get_pf_specific(PF_INET6))
family = PF_INET6;
- else
- family = PF_INET;
err = inet_ctl_sock_create(&sctp_ctl_sock, family,
SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
+
+ /* If IPv6 socket could not be created, try the IPv4 socket */
+ if (err < 0 && family == PF_INET6)
+ err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET,
+ SOCK_SEQPACKET, IPPROTO_SCTP,
+ &init_net);
+
if (err < 0) {
printk(KERN_ERR
"SCTP: Failed to create the SCTP control socket.\n");
@@ -1297,9 +1302,8 @@ SCTP_STATIC __init int sctp_init(void)
out:
return status;
err_v6_add_protocol:
- sctp_v6_del_protocol();
-err_add_protocol:
sctp_v4_del_protocol();
+err_add_protocol:
inet_ctl_sock_destroy(sctp_ctl_sock);
err_ctl_sock_init:
sctp_v6_protosw_exit();
@@ -1310,7 +1314,6 @@ err_protosw_init:
sctp_v4_pf_exit();
sctp_v6_pf_exit();
sctp_sysctl_unregister();
- list_del(&sctp_af_inet.list);
free_pages((unsigned long)sctp_port_hashtable,
get_order(sctp_port_hashsize *
sizeof(struct sctp_bind_hashbucket)));
@@ -1358,7 +1361,6 @@ SCTP_STATIC __exit void sctp_exit(void)
sctp_v4_pf_exit();
sctp_sysctl_unregister();
- list_del(&sctp_af_inet.list);
free_pages((unsigned long)sctp_assoc_hashtable,
get_order(sctp_assoc_hashsize *
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index b40e95f9851..6851ee94e97 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -224,7 +224,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_ext += 2;
}
- chunksize += sizeof(aiparam);
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
+
chunksize += vparam_len;
/* Account for AUTH related parameters */
@@ -304,10 +306,12 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
if (sctp_prsctp_enable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
- aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
- aiparam.param_hdr.length = htons(sizeof(aiparam));
- aiparam.adaptation_ind = htonl(sp->adaptation_ind);
- sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
/* Add SCTP-AUTH chunks to the parameter list */
if (sctp_auth_enable) {
@@ -332,6 +336,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
sctp_inithdr_t initack;
struct sctp_chunk *retval;
union sctp_params addrs;
+ struct sctp_sock *sp;
int addrs_len;
sctp_cookie_param_t *cookie;
int cookie_len;
@@ -366,22 +371,24 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
/* Calculate the total size of allocation, include the reserved
* space for reporting unknown parameters if it is specified.
*/
+ sp = sctp_sk(asoc->base.sk);
chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;
/* Tell peer that we'll do ECN only if peer advertised such cap. */
if (asoc->peer.ecn_capable)
chunksize += sizeof(ecap_param);
- if (sctp_prsctp_enable)
+ if (asoc->peer.prsctp_capable)
chunksize += sizeof(prsctp_param);
- if (sctp_addip_enable) {
+ if (asoc->peer.asconf_capable) {
extensions[num_ext] = SCTP_CID_ASCONF;
extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
num_ext += 2;
}
- chunksize += sizeof(aiparam);
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
if (asoc->peer.auth_capable) {
auth_random = (sctp_paramhdr_t *)asoc->c.auth_random;
@@ -432,10 +439,12 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
if (asoc->peer.prsctp_capable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
- aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
- aiparam.param_hdr.length = htons(sizeof(aiparam));
- aiparam.adaptation_ind = htonl(sctp_sk(asoc->base.sk)->adaptation_ind);
- sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
if (asoc->peer.auth_capable) {
sctp_addto_chunk(retval, ntohs(auth_random->length),
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 0146cfb1f18..e2020eb2c8c 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -434,7 +434,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
*
*/
static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
- struct sctp_transport *transport)
+ struct sctp_transport *transport,
+ int is_hb)
{
/* The check for association's overall error counter exceeding the
* threshold is done in the state function.
@@ -466,7 +467,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
* The first unacknowleged HB triggers it. We do this with a flag
* that indicates that we have an outstanding HB.
*/
- if (transport->hb_sent) {
+ if (!is_hb || transport->hb_sent) {
transport->last_rto = transport->rto;
transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
}
@@ -657,20 +658,6 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
sctp_transport_hold(t);
}
-/* Helper function to do a transport reset at the expiry of the hearbeat
- * timer.
- */
-static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds,
- struct sctp_association *asoc,
- struct sctp_transport *t)
-{
- sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
-
- /* Mark one strike against a transport. */
- sctp_do_8_2_transport_strike(asoc, t);
-
- t->hb_sent = 1;
-}
/* Helper function to process the process SACK command. */
static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds,
@@ -800,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
struct sctp_association *asoc,
struct sctp_chunk *chunk)
{
- struct sctp_operr_chunk *operr_chunk;
struct sctp_errhdr *err_hdr;
+ struct sctp_ulpevent *ev;
- operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr;
- err_hdr = &operr_chunk->err_hdr;
+ while (chunk->chunk_end > chunk->skb->data) {
+ err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
- switch (err_hdr->cause) {
- case SCTP_ERROR_UNKNOWN_CHUNK:
- {
- struct sctp_chunkhdr *unk_chunk_hdr;
+ ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
+ GFP_ATOMIC);
+ if (!ev)
+ return;
- unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable;
- switch (unk_chunk_hdr->type) {
- /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an
- * ERROR chunk reporting that it did not recognized the ASCONF
- * chunk type, the sender of the ASCONF MUST NOT send any
- * further ASCONF chunks and MUST stop its T-4 timer.
- */
- case SCTP_CID_ASCONF:
- asoc->peer.asconf_capable = 0;
- sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+
+ switch (err_hdr->cause) {
+ case SCTP_ERROR_UNKNOWN_CHUNK:
+ {
+ sctp_chunkhdr_t *unk_chunk_hdr;
+
+ unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
+ switch (unk_chunk_hdr->type) {
+ /* ADDIP 4.1 A9) If the peer responds to an ASCONF with
+ * an ERROR chunk reporting that it did not recognize
+ * the ASCONF chunk type, the sender of the ASCONF MUST
+ * NOT send any further ASCONF chunks and MUST stop its
+ * T-4 timer.
+ */
+ case SCTP_CID_ASCONF:
+ if (asoc->peer.asconf_capable == 0)
+ break;
+
+ asoc->peer.asconf_capable = 0;
+ sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ break;
+ default:
+ break;
+ }
break;
+ }
default:
break;
}
- break;
- }
- default:
- break;
}
}
@@ -1459,12 +1458,19 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_STRIKE:
/* Mark one strike against a transport. */
- sctp_do_8_2_transport_strike(asoc, cmd->obj.transport);
+ sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
+ 0);
+ break;
+
+ case SCTP_CMD_TRANSPORT_IDLE:
+ t = cmd->obj.transport;
+ sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
break;
- case SCTP_CMD_TRANSPORT_RESET:
+ case SCTP_CMD_TRANSPORT_HB_SENT:
t = cmd->obj.transport;
- sctp_cmd_transport_reset(commands, asoc, t);
+ sctp_do_8_2_transport_strike(asoc, t, 1);
+ t->hb_sent = 1;
break;
case SCTP_CMD_TRANSPORT_ON:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3a0cd075914..55a61aa6966 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -988,7 +988,9 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
/* Set transport error counter and association error counter
* when sending heartbeat.
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET,
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
+ SCTP_TRANSPORT(transport));
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
SCTP_TRANSPORT(transport));
}
sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
@@ -3163,7 +3165,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
sctp_cmd_seq_t *commands)
{
struct sctp_chunk *chunk = arg;
- struct sctp_ulpevent *ev;
if (!sctp_vtag_verify(chunk, asoc))
return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
@@ -3173,21 +3174,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
return sctp_sf_violation_chunklen(ep, asoc, type, arg,
commands);
- while (chunk->chunk_end > chunk->skb->data) {
- ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
- GFP_ATOMIC);
- if (!ev)
- goto nomem;
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
+ SCTP_CHUNK(chunk));
- sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
- SCTP_ULPEVENT(ev));
- sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
- SCTP_CHUNK(chunk));
- }
return SCTP_DISPOSITION_CONSUME;
-
-nomem:
- return SCTP_DISPOSITION_NOMEM;
}
/*
@@ -4967,7 +4957,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat(
* to that address and not acknowledged within one RTO.
*
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET,
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
SCTP_TRANSPORT(arg));
return SCTP_DISPOSITION_CONSUME;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index dea864f5de5..5fb3a8c9792 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3069,9 +3069,6 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
int val;
int assoc_id = 0;
- if (optlen < sizeof(int))
- return -EINVAL;
-
if (optlen == sizeof(int)) {
printk(KERN_WARNING
"SCTP: Use of int in max_burst socket option deprecated\n");
@@ -5283,16 +5280,14 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
struct sctp_sock *sp;
struct sctp_association *asoc;
- if (len < sizeof(int))
- return -EINVAL;
-
if (len == sizeof(int)) {
printk(KERN_WARNING
"SCTP: Use of int in max_burst socket option deprecated\n");
printk(KERN_WARNING
"SCTP: Use struct sctp_assoc_value instead\n");
params.assoc_id = 0;
- } else if (len == sizeof (struct sctp_assoc_value)) {
+ } else if (len >= sizeof(struct sctp_assoc_value)) {
+ len = sizeof(struct sctp_assoc_value);
if (copy_from_user(&params, optval, len))
return -EFAULT;
} else
@@ -5848,37 +5843,28 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
}
/*
- * 3.1.3 listen() - UDP Style Syntax
- *
- * By default, new associations are not accepted for UDP style sockets.
- * An application uses listen() to mark a socket as being able to
- * accept new associations.
+ * Move a socket to LISTENING state.
*/
-SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
+SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
{
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
+ struct crypto_hash *tfm = NULL;
- /* Only UDP style sockets that are not peeled off are allowed to
- * listen().
- */
- if (!sctp_style(sk, UDP))
- return -EINVAL;
-
- /* If backlog is zero, disable listening. */
- if (!backlog) {
- if (sctp_sstate(sk, CLOSED))
- return 0;
-
- sctp_unhash_endpoint(ep);
- sk->sk_state = SCTP_SS_CLOSED;
- return 0;
+ /* Allocate HMAC for generating cookie. */
+ if (!sctp_sk(sk)->hmac && sctp_hmac_alg) {
+ tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ if (net_ratelimit()) {
+ printk(KERN_INFO
+ "SCTP: failed to load transform for %s: %ld\n",
+ sctp_hmac_alg, PTR_ERR(tfm));
+ }
+ return -ENOSYS;
+ }
+ sctp_sk(sk)->hmac = tfm;
}
- /* Return if we are already listening. */
- if (sctp_sstate(sk, LISTENING))
- return 0;
-
/*
* If a bind() or sctp_bindx() is not called prior to a listen()
* call that allows new associations to be accepted, the system
@@ -5889,7 +5875,6 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
* extensions draft, but follows the practice as seen in TCP
* sockets.
*
- * Additionally, turn off fastreuse flag since we are not listening
*/
sk->sk_state = SCTP_SS_LISTENING;
if (!ep->base.bind_addr.port) {
@@ -5900,113 +5885,71 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
sk->sk_state = SCTP_SS_CLOSED;
return -EADDRINUSE;
}
- sctp_sk(sk)->bind_hash->fastreuse = 0;
}
- sctp_hash_endpoint(ep);
- return 0;
-}
-
-/*
- * 4.1.3 listen() - TCP Style Syntax
- *
- * Applications uses listen() to ready the SCTP endpoint for accepting
- * inbound associations.
- */
-SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog)
-{
- struct sctp_sock *sp = sctp_sk(sk);
- struct sctp_endpoint *ep = sp->ep;
-
- /* If backlog is zero, disable listening. */
- if (!backlog) {
- if (sctp_sstate(sk, CLOSED))
- return 0;
-
- sctp_unhash_endpoint(ep);
- sk->sk_state = SCTP_SS_CLOSED;
- return 0;
- }
-
- if (sctp_sstate(sk, LISTENING))
- return 0;
-
- /*
- * If a bind() or sctp_bindx() is not called prior to a listen()
- * call that allows new associations to be accepted, the system
- * picks an ephemeral port and will choose an address set equivalent
- * to binding with a wildcard address.
- *
- * This is not currently spelled out in the SCTP sockets
- * extensions draft, but follows the practice as seen in TCP
- * sockets.
- */
- sk->sk_state = SCTP_SS_LISTENING;
- if (!ep->base.bind_addr.port) {
- if (sctp_autobind(sk))
- return -EAGAIN;
- } else
- sctp_sk(sk)->bind_hash->fastreuse = 0;
-
sk->sk_max_ack_backlog = backlog;
sctp_hash_endpoint(ep);
return 0;
}
/*
+ * 4.1.3 / 5.1.3 listen()
+ *
+ * By default, new associations are not accepted for UDP style sockets.
+ * An application uses listen() to mark a socket as being able to
+ * accept new associations.
+ *
+ * On TCP style sockets, applications use listen() to ready the SCTP
+ * endpoint for accepting inbound associations.
+ *
+ * On both types of endpoints a backlog of '0' disables listening.
+ *
* Move a socket to LISTENING state.
*/
int sctp_inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
- struct crypto_hash *tfm = NULL;
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
int err = -EINVAL;
if (unlikely(backlog < 0))
- goto out;
+ return err;
sctp_lock_sock(sk);
+ /* Peeled-off sockets are not allowed to listen(). */
+ if (sctp_style(sk, UDP_HIGH_BANDWIDTH))
+ goto out;
+
if (sock->state != SS_UNCONNECTED)
goto out;
- /* Allocate HMAC for generating cookie. */
- if (!sctp_sk(sk)->hmac && sctp_hmac_alg) {
- tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- if (net_ratelimit()) {
- printk(KERN_INFO
- "SCTP: failed to load transform for %s: %ld\n",
- sctp_hmac_alg, PTR_ERR(tfm));
- }
- err = -ENOSYS;
+ /* If backlog is zero, disable listening. */
+ if (!backlog) {
+ if (sctp_sstate(sk, CLOSED))
goto out;
- }
- }
- switch (sock->type) {
- case SOCK_SEQPACKET:
- err = sctp_seqpacket_listen(sk, backlog);
- break;
- case SOCK_STREAM:
- err = sctp_stream_listen(sk, backlog);
- break;
- default:
- break;
+ err = 0;
+ sctp_unhash_endpoint(ep);
+ sk->sk_state = SCTP_SS_CLOSED;
+ if (sk->sk_reuse)
+ sctp_sk(sk)->bind_hash->fastreuse = 1;
+ goto out;
}
- if (err)
- goto cleanup;
+ /* If we are already listening, just update the backlog */
+ if (sctp_sstate(sk, LISTENING))
+ sk->sk_max_ack_backlog = backlog;
+ else {
+ err = sctp_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
- /* Store away the transform reference. */
- if (!sctp_sk(sk)->hmac)
- sctp_sk(sk)->hmac = tfm;
+ err = 0;
out:
sctp_release_sock(sk);
return err;
-cleanup:
- crypto_free_hash(tfm);
- goto out;
}
/*
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 5c29b14ee9a..e5dde45c79d 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -543,8 +543,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* congestion indications more than once every window of
* data (or more loosely more than once every round-trip time).
*/
- if ((jiffies - transport->last_time_ecne_reduced) >
- transport->rtt) {
+ if (time_after(jiffies, transport->last_time_ecne_reduced +
+ transport->rtt)) {
transport->ssthresh = max(transport->cwnd/2,
4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
@@ -561,7 +561,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
	 * to be done every RTO interval, we do it every heartbeat
* interval.
*/
- if ((jiffies - transport->last_time_used) > transport->rto)
+ if (time_after(jiffies, transport->last_time_used +
+ transport->rto))
transport->cwnd = max(transport->cwnd/2,
4*transport->asoc->pathmtu);
break;
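The two hunks above replace open-coded "(jiffies - last) > interval" tests with time_after(), which stays correct when the jiffies counter wraps. A minimal user-space sketch of the underlying idiom follows (illustrative only; the macro mirrors the comparison done by include/linux/jiffies.h, minus its type checking).

	/*
	 * Illustrative only: signed subtraction of unsigned counters keeps
	 * the "is now later than last + delta" test correct across wraparound.
	 */
	#include <stdio.h>
	#include <limits.h>

	#define time_after(a, b)	((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long last = ULONG_MAX - 16;	/* just before the counter wraps */
		unsigned long now  = 16;		/* 33 ticks later, after wrapping  */

		printf("naive:      %d\n", now > last);			/* 0 -- wrong   */
		printf("time_after: %d\n", time_after(now, last));	/* 1 -- correct */
		return 0;
	}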
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 3ddaff42d1b..a3bfd406491 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -119,7 +119,7 @@ static struct bclink *bclink = NULL;
static struct link *bcl = NULL;
static DEFINE_SPINLOCK(bc_lock);
-char tipc_bclink_name[] = "multicast-link";
+const char tipc_bclink_name[] = "multicast-link";
static u32 buf_seqno(struct sk_buff *buf)
@@ -800,7 +800,7 @@ int tipc_bclink_init(void)
tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
bcl->b_ptr = &bcbearer->bearer;
bcl->state = WORKING_WORKING;
- sprintf(bcl->name, tipc_bclink_name);
+ strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
if (BCLINK_LOG_BUF_SIZE) {
char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 2f2d731bc1c..4c1771e95c9 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -70,7 +70,7 @@ struct port_list {
struct tipc_node;
-extern char tipc_bclink_name[];
+extern const char tipc_bclink_name[];
/**
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 29ecae85166..1885a7edb0c 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -258,7 +258,7 @@ void tipc_printf(struct print_buf *pb, const char *fmt, ...)
}
if (pb->echo)
- printk(print_string);
+ printk("%s", print_string);
spin_unlock_bh(&print_lock);
}
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 20d98c56e15..2c24e7d6d95 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -703,7 +703,7 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
link_info.dest = htonl(tipc_own_addr & 0xfffff00);
link_info.up = htonl(1);
- sprintf(link_info.str, tipc_bclink_name);
+ strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME);
tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
/* Add TLVs for any other links in scope */
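Both TIPC fixes above have the same root cause: a data string was passed where a format string is expected (sprintf(dst, src), printk(src)), so any '%' in the data would be interpreted as a conversion specification. A short user-space sketch of the safe pattern (illustrative only; snprintf() stands in here for the kernel's strlcpy()):

	/*
	 * Illustrative only: never pass data as the format argument.
	 */
	#include <stdio.h>

	int main(void)
	{
		char name[32];
		const char *src = "link-100%-loaded";	/* note the '%' in the data */

		/* Bad: '%' in src would be treated as a conversion. */
		/* sprintf(name, src); */

		/* Good: copy with an explicit bound... */
		snprintf(name, sizeof(name), "%s", src);
		/* ...and print data through an explicit "%s" format. */
		printf("%s\n", name);
		return 0;
	}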
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d1b89820ab4..baac91049b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1178,8 +1178,7 @@ out_unlock:
unix_state_unlock(other);
out:
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
if (newsk)
unix_release_sock(newsk, 0);
if (other)
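The af_unix cleanup above relies on kfree_skb() being a no-op for a NULL pointer, so the explicit "if (skb)" guard is redundant. It is the same convention as user-space free(), shown here purely for illustration:

	/*
	 * Illustrative only: like kfree_skb(NULL), free(NULL) is a well-defined no-op.
	 */
	#include <stdlib.h>

	int main(void)
	{
		char *buf = NULL;
		free(buf);	/* no guard needed */
		return 0;
	}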
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 39701dec1db..466e2d22d25 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -86,8 +86,10 @@ static int wanrouter_device_del_if(struct wan_device *wandev,
static struct wan_device *wanrouter_find_device(char *name);
static int wanrouter_delete_interface(struct wan_device *wandev, char *name);
-static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags);
-static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags);
+static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __acquires(lock);
+static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __releases(lock);
@@ -763,12 +765,14 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name)
}
static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __acquires(lock)
{
spin_lock_irqsave(lock, *smp_flags);
}
static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __releases(lock)
{
spin_unlock_irqrestore(lock, *smp_flags);
}
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
index 267f7ff4982..c44d96b3a43 100644
--- a/net/wanrouter/wanproc.c
+++ b/net/wanrouter/wanproc.c
@@ -80,6 +80,7 @@ static struct proc_dir_entry *proc_router;
* Iterator
*/
static void *r_start(struct seq_file *m, loff_t *pos)
+ __acquires(kernel_lock)
{
struct wan_device *wandev;
loff_t l = *pos;
@@ -101,6 +102,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
}
static void r_stop(struct seq_file *m, void *v)
+ __releases(kernel_lock)
{
unlock_kernel();
}
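The wanrouter changes above add only sparse annotations and do not alter behaviour. Roughly (this is a simplified sketch of the definitions in the kernel's include/linux/compiler.h), the annotations expand to nothing in a normal build and to context-tracking attributes when sparse defines __CHECKER__, which lets "make C=1" flag functions that acquire a lock without releasing it on every path.

	/*
	 * Illustrative only: stripped-down sparse lock annotations. Under a
	 * normal compile they vanish; under sparse (__CHECKER__) they become
	 * context-tracking attributes so unbalanced acquire/release paths warn.
	 */
	#ifdef __CHECKER__
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	#else
	# define __acquires(x)
	# define __releases(x)
	#endif

The kernel-lock annotations added to r_start()/r_stop() in wanproc.c use the same mechanism for lock_kernel()/unlock_kernel().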
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index e28e2b8fa43..092ae6faccc 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -102,3 +102,13 @@ config LIB80211_CRYPT_CCMP
config LIB80211_CRYPT_TKIP
tristate
+
+config LIB80211_DEBUG
+ bool "lib80211 debugging messages"
+ depends on LIB80211
+ default n
+ ---help---
+ You can enable this if you want verbose debugging messages
+ from lib80211.
+
+ If unsure, say N.
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 0668b2bfc1d..17fe3904974 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -7,7 +7,6 @@
#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
-#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/nl80211.h>
#include <linux/debugfs.h>
@@ -31,18 +30,29 @@ MODULE_DESCRIPTION("wireless configuration support");
* only read the list, and that can happen quite
* often because we need to do it for each command */
LIST_HEAD(cfg80211_drv_list);
-DEFINE_MUTEX(cfg80211_drv_mutex);
+
+/*
+ * This is used to protect the cfg80211_drv_list, cfg80211_regdomain,
+ * country_ie_regdomain, the reg_beacon_list and the last regulatory
+ * request receipt (last_request).
+ */
+DEFINE_MUTEX(cfg80211_mutex);
/* for debugfs */
static struct dentry *ieee80211_debugfs_dir;
-/* requires cfg80211_drv_mutex to be held! */
-static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy)
+/* requires cfg80211_mutex to be held! */
+struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx)
{
struct cfg80211_registered_device *result = NULL, *drv;
+ if (!wiphy_idx_valid(wiphy_idx))
+ return NULL;
+
+ assert_cfg80211_lock();
+
list_for_each_entry(drv, &cfg80211_drv_list, list) {
- if (drv->idx == wiphy) {
+ if (drv->wiphy_idx == wiphy_idx) {
result = drv;
break;
}
@@ -51,17 +61,44 @@ static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy)
return result;
}
+int get_wiphy_idx(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *drv;
+ if (!wiphy)
+ return WIPHY_IDX_STALE;
+ drv = wiphy_to_dev(wiphy);
+ return drv->wiphy_idx;
+}
+
/* requires cfg80211_drv_mutex to be held! */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
+{
+ struct cfg80211_registered_device *drv;
+
+ if (!wiphy_idx_valid(wiphy_idx))
+ return NULL;
+
+ assert_cfg80211_lock();
+
+ drv = cfg80211_drv_by_wiphy_idx(wiphy_idx);
+ if (!drv)
+ return NULL;
+ return &drv->wiphy;
+}
+
+/* requires cfg80211_mutex to be held! */
static struct cfg80211_registered_device *
__cfg80211_drv_from_info(struct genl_info *info)
{
int ifindex;
- struct cfg80211_registered_device *bywiphy = NULL, *byifidx = NULL;
+ struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL;
struct net_device *dev;
int err = -EINVAL;
+ assert_cfg80211_lock();
+
if (info->attrs[NL80211_ATTR_WIPHY]) {
- bywiphy = cfg80211_drv_by_wiphy(
+ bywiphyidx = cfg80211_drv_by_wiphy_idx(
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY]));
err = -ENODEV;
}
@@ -78,14 +115,14 @@ __cfg80211_drv_from_info(struct genl_info *info)
err = -ENODEV;
}
- if (bywiphy && byifidx) {
- if (bywiphy != byifidx)
+ if (bywiphyidx && byifidx) {
+ if (bywiphyidx != byifidx)
return ERR_PTR(-EINVAL);
else
- return bywiphy; /* == byifidx */
+ return bywiphyidx; /* == byifidx */
}
- if (bywiphy)
- return bywiphy;
+ if (bywiphyidx)
+ return bywiphyidx;
if (byifidx)
return byifidx;
@@ -98,7 +135,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
{
struct cfg80211_registered_device *drv;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
drv = __cfg80211_drv_from_info(info);
/* if it is not an error we grab the lock on
@@ -107,7 +144,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
if (!IS_ERR(drv))
mutex_lock(&drv->mtx);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return drv;
}
@@ -118,7 +155,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV);
struct net_device *dev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
dev = dev_get_by_index(&init_net, ifindex);
if (!dev)
goto out;
@@ -129,7 +166,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
drv = ERR_PTR(-ENODEV);
dev_put(dev);
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return drv;
}
@@ -143,16 +180,16 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
char *newname)
{
struct cfg80211_registered_device *drv;
- int idx, taken = -1, result, digits;
+ int wiphy_idx, taken = -1, result, digits;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
/* prohibit calling the thing phy%d when %d is not its number */
- sscanf(newname, PHY_NAME "%d%n", &idx, &taken);
- if (taken == strlen(newname) && idx != rdev->idx) {
- /* count number of places needed to print idx */
+ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
+ if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
+ /* count number of places needed to print wiphy_idx */
digits = 1;
- while (idx /= 10)
+ while (wiphy_idx /= 10)
digits++;
/*
* deny the name if it is phy<idx> where <idx> is printed
@@ -193,7 +230,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
result = 0;
out_unlock:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
if (result == 0)
nl80211_notify_dev_rename(rdev);
@@ -220,22 +257,22 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv)
drv->ops = ops;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
- drv->idx = wiphy_counter++;
+ drv->wiphy_idx = wiphy_counter++;
- if (unlikely(drv->idx < 0)) {
+ if (unlikely(!wiphy_idx_valid(drv->wiphy_idx))) {
wiphy_counter--;
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
/* ugh, wrapped! */
kfree(drv);
return NULL;
}
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
/* give it a proper name */
- dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->idx);
+ dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->wiphy_idx);
mutex_init(&drv->mtx);
mutex_init(&drv->devlist_mtx);
@@ -310,10 +347,10 @@ int wiphy_register(struct wiphy *wiphy)
/* check and set up bitrates */
ieee80211_set_bitrate_flags(wiphy);
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
/* set up regulatory info */
- wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE);
+ wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
res = device_add(&drv->wiphy.dev);
if (res)
@@ -328,9 +365,20 @@ int wiphy_register(struct wiphy *wiphy)
if (IS_ERR(drv->wiphy.debugfsdir))
drv->wiphy.debugfsdir = NULL;
+ if (wiphy->custom_regulatory) {
+ struct regulatory_request request;
+
+ request.wiphy_idx = get_wiphy_idx(wiphy);
+ request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
+ request.alpha2[0] = '9';
+ request.alpha2[1] = '9';
+
+ nl80211_send_reg_change_event(&request);
+ }
+
res = 0;
out_unlock:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return res;
}
EXPORT_SYMBOL(wiphy_register);
@@ -340,7 +388,7 @@ void wiphy_unregister(struct wiphy *wiphy)
struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy);
/* protect the device list */
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
BUG_ON(!list_empty(&drv->netdev_list));
@@ -366,7 +414,7 @@ void wiphy_unregister(struct wiphy *wiphy)
device_del(&drv->wiphy.dev);
debugfs_remove(drv->wiphy.debugfsdir);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
}
EXPORT_SYMBOL(wiphy_unregister);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index e29ad4cd464..6acd483a61f 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -10,6 +10,7 @@
#include <linux/netdevice.h>
#include <linux/kref.h>
#include <linux/rbtree.h>
+#include <linux/mutex.h>
#include <net/genetlink.h>
#include <net/wireless.h>
#include <net/cfg80211.h>
@@ -37,7 +38,7 @@ struct cfg80211_registered_device {
enum environment_cap env;
/* wiphy index, internal only */
- int idx;
+ int wiphy_idx;
/* associate netdev list */
struct mutex devlist_mtx;
@@ -49,6 +50,7 @@ struct cfg80211_registered_device {
struct rb_root bss_tree;
u32 bss_generation;
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
+ unsigned long suspend_at;
/* must be last because of the way we do wiphy_priv(),
* and it should at least be aligned to NETDEV_ALIGN */
@@ -62,9 +64,27 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
return container_of(wiphy, struct cfg80211_registered_device, wiphy);
}
-extern struct mutex cfg80211_drv_mutex;
+/* Note 0 is valid, hence phy0 */
+static inline
+bool wiphy_idx_valid(int wiphy_idx)
+{
+ return (wiphy_idx >= 0);
+}
+
+extern struct mutex cfg80211_mutex;
extern struct list_head cfg80211_drv_list;
+static inline void assert_cfg80211_lock(void)
+{
+ WARN_ON(!mutex_is_locked(&cfg80211_mutex));
+}
+
+/*
+ * You can use this to mark a wiphy_idx as not having an associated wiphy.
+ * It guarantees cfg80211_drv_by_wiphy_idx(wiphy_idx) will return NULL
+ */
+#define WIPHY_IDX_STALE -1
+
struct cfg80211_internal_bss {
struct list_head list;
struct rb_node rbn;
@@ -74,6 +94,9 @@ struct cfg80211_internal_bss {
struct cfg80211_bss pub;
};
+struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx);
+int get_wiphy_idx(struct wiphy *wiphy);
+
/*
* This function returns a pointer to the driver
* that the genl_info item that is passed refers to.
@@ -81,13 +104,13 @@ struct cfg80211_internal_bss {
* the driver's mutex!
*
* This means that you need to call cfg80211_put_dev()
- * before being allowed to acquire &cfg80211_drv_mutex!
+ * before being allowed to acquire &cfg80211_mutex!
*
* This is necessary because we need to lock the global
* mutex to get an item off the list safely, and then
* we lock the drv mutex so it doesn't go away under us.
*
- * We don't want to keep cfg80211_drv_mutex locked
+ * We don't want to keep cfg80211_mutex locked
* for all the time in order to allow requests on
* other interfaces to go through at the same time.
*
@@ -97,6 +120,9 @@ struct cfg80211_internal_bss {
extern struct cfg80211_registered_device *
cfg80211_get_dev_from_info(struct genl_info *info);
+/* requires cfg80211_mutex to be held! */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
+
/* identical to cfg80211_get_dev_from_info but only operate on ifindex */
extern struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(int ifindex);
@@ -110,8 +136,11 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv,
char *newname);
void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
-void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby);
+void wiphy_update_regulatory(struct wiphy *wiphy,
+ enum nl80211_reg_initiator setby);
void cfg80211_bss_expire(struct cfg80211_registered_device *dev);
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+ unsigned long age_secs);
#endif /* __NET_WIRELESS_CORE_H */
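core.h now pairs the renamed cfg80211_mutex with assert_cfg80211_lock(), so functions that require the mutex can cheaply document and check that requirement. A user-space analogue of the idiom follows (illustrative only; pthread mutexes have no mutex_is_locked(), so this sketch tracks the owner explicitly, whereas the kernel helper simply WARN_ON()s).

	/*
	 * Illustrative only: an "assert the lock is held" helper.
	 */
	#include <assert.h>
	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t cfg_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_t cfg_mutex_owner;
	static bool cfg_mutex_held;

	static void cfg_lock(void)
	{
		pthread_mutex_lock(&cfg_mutex);
		cfg_mutex_owner = pthread_self();
		cfg_mutex_held = true;
	}

	static void cfg_unlock(void)
	{
		cfg_mutex_held = false;
		pthread_mutex_unlock(&cfg_mutex);
	}

	static void assert_cfg_lock(void)
	{
		/* Mirrors assert_cfg80211_lock(): complain if the caller forgot to lock. */
		assert(cfg_mutex_held && pthread_equal(cfg_mutex_owner, pthread_self()));
	}

	int main(void)
	{
		cfg_lock();
		assert_cfg_lock();	/* fine: we hold the lock */
		cfg_unlock();
		return 0;
	}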
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index db428194c16..2301dc1edc4 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -337,6 +337,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
pos += 8;
if (ccmp_replay_check(pn, key->rx_pn)) {
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "CCMP: replay detected: STA=%pM "
"previous PN %02x%02x%02x%02x%02x%02x "
@@ -346,6 +347,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
key->rx_pn[3], key->rx_pn[4], key->rx_pn[5],
pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
}
+#endif
key->dot11RSNAStatsCCMPReplays++;
return -4;
}
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 7e8e22bfed9..c36287399d7 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -465,12 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
pos += 8;
if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "TKIP: replay detected: STA=%pM"
" previous TSC %08x%04x received TSC "
"%08x%04x\n", hdr->addr2,
tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
}
+#endif
tkey->dot11RSNAStatsTKIPReplays++;
return -4;
}
@@ -505,10 +507,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
* it needs to be recalculated for the next packet. */
tkey->rx_phase1_done = 0;
}
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "TKIP: ICV error detected: STA="
"%pM\n", hdr->addr2);
}
+#endif
tkey->dot11RSNAStatsTKIPICVErrors++;
return -5;
}
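The lib80211 hunks above, together with the new LIB80211_DEBUG Kconfig option, follow the usual compile-time debug gate: the option becomes a CONFIG_LIB80211_DEBUG preprocessor symbol and the verbose replay/ICV messages are compiled in only when it is set, while the statistics counters are updated either way. A self-contained sketch of the pattern (illustrative only; build with -DCONFIG_LIB80211_DEBUG to enable the message):

	/*
	 * Illustrative only: a Kconfig-gated debug message.
	 */
	#include <stdio.h>

	static int replay_check_failed(void)
	{
	#ifdef CONFIG_LIB80211_DEBUG
		fprintf(stderr, "CCMP: replay detected\n");
	#endif
		return -4;	/* the statistic/return path is unchanged either way */
	}

	int main(void)
	{
		return replay_check_failed() == -4 ? 0 : 1;
	}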
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 298a4de5994..ab9d8f14e15 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7,7 +7,6 @@
#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
-#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
@@ -142,7 +141,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
if (!hdr)
return -1;
- NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx);
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx);
NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy));
NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
dev->wiphy.max_scan_ssids);
@@ -256,7 +255,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
int start = cb->args[0];
struct cfg80211_registered_device *dev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
list_for_each_entry(dev, &cfg80211_drv_list, list) {
if (++idx <= start)
continue;
@@ -267,7 +266,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
break;
}
}
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
cb->args[0] = idx;
@@ -470,7 +469,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
struct cfg80211_registered_device *dev;
struct wireless_dev *wdev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
list_for_each_entry(dev, &cfg80211_drv_list, list) {
if (wp_idx < wp_start) {
wp_idx++;
@@ -497,7 +496,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
wp_idx++;
}
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
cb->args[0] = wp_idx;
cb->args[1] = if_idx;
@@ -1206,6 +1205,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
nla_nest_end(msg, txrate);
}
+ if (sinfo->filled & STATION_INFO_RX_PACKETS)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS,
+ sinfo->rx_packets);
+ if (sinfo->filled & STATION_INFO_TX_PACKETS)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
+ sinfo->tx_packets);
nla_nest_end(msg, sinfoattr);
return genlmsg_end(msg, hdr);
@@ -1900,6 +1905,19 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
int r;
char *data = NULL;
+ /*
+	 * You should only get this when cfg80211 hasn't yet been fully
+	 * initialized, i.e. when built into the kernel, in the window
+	 * between nl80211_init() and regulatory_init(), if that is
+	 * even possible.
+ */
+ mutex_lock(&cfg80211_mutex);
+ if (unlikely(!cfg80211_regdomain)) {
+ mutex_unlock(&cfg80211_mutex);
+ return -EINPROGRESS;
+ }
+ mutex_unlock(&cfg80211_mutex);
+
if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
return -EINVAL;
@@ -1910,14 +1928,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
if (is_world_regdom(data))
return -EINVAL;
#endif
- mutex_lock(&cfg80211_drv_mutex);
- r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY);
- mutex_unlock(&cfg80211_drv_mutex);
- /* This means the regulatory domain was already set, however
- * we don't want to confuse userspace with a "successful error"
- * message so lets just treat it as a success */
- if (r == -EALREADY)
- r = 0;
+
+ r = regulatory_hint_user(data);
+
return r;
}
@@ -1937,6 +1950,11 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
if (err)
return err;
+ if (!drv->ops->get_mesh_params) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
/* Get the mesh params */
rtnl_lock();
err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params);
@@ -2046,6 +2064,11 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
+ if (!drv->ops->set_mesh_params) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
/* This makes sure that there aren't more than 32 mesh config
* parameters (otherwise our bitfield scheme would not work.) */
BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
@@ -2090,6 +2113,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask);
rtnl_unlock();
+ out:
/* cleanup */
cfg80211_put_dev(drv);
dev_put(dev);
@@ -2106,7 +2130,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
unsigned int i;
int err = -EINVAL;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
if (!cfg80211_regdomain)
goto out;
@@ -2169,7 +2193,7 @@ nla_put_failure:
genlmsg_cancel(msg, hdr);
err = -EMSGSIZE;
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return err;
}
@@ -2228,9 +2252,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
BUG_ON(rule_idx != num_rules);
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
r = set_regdom(rd);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return r;
bad_reg:
@@ -2286,6 +2310,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
struct wiphy *wiphy;
int err, tmp, n_ssids = 0, n_channels = 0, i;
enum ieee80211_band band;
+ size_t ie_len;
err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
if (err)
@@ -2327,9 +2352,15 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
goto out_unlock;
}
+ if (info->attrs[NL80211_ATTR_IE])
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ else
+ ie_len = 0;
+
request = kzalloc(sizeof(*request)
+ sizeof(*ssid) * n_ssids
- + sizeof(channel) * n_channels, GFP_KERNEL);
+ + sizeof(channel) * n_channels
+ + ie_len, GFP_KERNEL);
if (!request) {
err = -ENOMEM;
goto out_unlock;
@@ -2340,6 +2371,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
if (n_ssids)
request->ssids = (void *)(request->channels + n_channels);
request->n_ssids = n_ssids;
+ if (ie_len) {
+ if (request->ssids)
+ request->ie = (void *)(request->ssids + n_ssids);
+ else
+ request->ie = (void *)(request->channels + n_channels);
+ }
if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
/* user specified, bail out if channel not found */
@@ -2380,6 +2417,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (info->attrs[NL80211_ATTR_IE]) {
+ request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ memcpy(request->ie, nla_data(info->attrs[NL80211_ATTR_IE]),
+ request->ie_len);
+ }
+
request->ifidx = dev->ifindex;
request->wiphy = &drv->wiphy;
@@ -2432,7 +2475,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability);
NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq);
- switch (res->signal_type) {
+ switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal);
break;
@@ -2601,7 +2644,6 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_get_station,
.dumpit = nl80211_dump_station,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
},
{
.cmd = NL80211_CMD_SET_STATION,
@@ -2708,6 +2750,9 @@ static struct genl_multicast_group nl80211_config_mcgrp = {
static struct genl_multicast_group nl80211_scan_mcgrp = {
.name = "scan",
};
+static struct genl_multicast_group nl80211_regulatory_mcgrp = {
+ .name = "regulatory",
+};
/* notification functions */
@@ -2739,7 +2784,7 @@ static int nl80211_send_scan_donemsg(struct sk_buff *msg,
if (!hdr)
return -1;
- NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->idx);
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
/* XXX: we should probably bounce back the request? */
@@ -2787,6 +2832,61 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL);
}
+/*
+ * This can happen on global regulatory changes or device specific settings
+ * based on custom world regulatory domains.
+ */
+void nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+	/* Userspace can always count on this attribute being set */
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator);
+
+ if (request->alpha2[0] == '0' && request->alpha2[1] == '0')
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_WORLD);
+ else if (request->alpha2[0] == '9' && request->alpha2[1] == '9')
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_CUSTOM_WORLD);
+ else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
+ request->intersect)
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_INTERSECTION);
+ else {
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_COUNTRY);
+ NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2);
+ }
+
+ if (wiphy_idx_valid(request->wiphy_idx))
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx);
+
+ if (genlmsg_end(msg, hdr) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL);
+
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
/* initialisation/exit functions */
int nl80211_init(void)
@@ -2811,6 +2911,10 @@ int nl80211_init(void)
if (err)
goto err_out;
+ err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp);
+ if (err)
+ goto err_out;
+
return 0;
err_out:
genl_unregister_family(&nl80211_fam);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index b565a5f84e9..e65a3c38c52 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -11,6 +11,7 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
struct net_device *netdev);
extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
struct net_device *netdev);
+extern void nl80211_send_reg_change_event(struct regulatory_request *request);
#else
static inline int nl80211_init(void)
{
@@ -27,6 +28,14 @@ static inline void
nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
struct net_device *netdev)
{}
+static inline void nl80211_send_scan_aborted(
+ struct cfg80211_registered_device *rdev,
+ struct net_device *netdev)
+{}
+static inline void
+nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+}
#endif /* CONFIG_NL80211 */
#endif /* __NET_WIRELESS_NL80211_H */
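The nl80211.h hunk adds the usual configured-out stubs: when CONFIG_NL80211 is not set, each event sender collapses to an empty static inline so that callers such as reg.c compile unchanged. A minimal sketch of the pattern (illustrative only; the names below are stand-ins, and with the symbol defined you would link against a real implementation):

	/*
	 * Illustrative only: feature-gated prototype vs. empty inline stub.
	 */
	#include <stdio.h>

	struct regulatory_request;	/* opaque to callers in this sketch */

	#ifdef CONFIG_NL80211
	void send_reg_change_event(struct regulatory_request *request);
	#else
	static inline void send_reg_change_event(struct regulatory_request *request)
	{
		/* feature disabled: deliberately a no-op */
	}
	#endif

	int main(void)
	{
		send_reg_change_event(NULL);	/* always compiles, may be a no-op */
		return 0;
	}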
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2323644330c..eb8b8ed1615 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -41,6 +41,7 @@
#include <net/cfg80211.h>
#include "core.h"
#include "reg.h"
+#include "nl80211.h"
/* Receipt of information from last regulatory request */
static struct regulatory_request *last_request;
@@ -54,22 +55,63 @@ static u32 supported_bandwidths[] = {
MHZ_TO_KHZ(20),
};
-/* Central wireless core regulatory domains, we only need two,
+/*
+ * Central wireless core regulatory domains, we only need two,
* the current one and a world regulatory domain in case we have no
- * information to give us an alpha2 */
+ * information to give us an alpha2
+ */
const struct ieee80211_regdomain *cfg80211_regdomain;
-/* We use this as a place for the rd structure built from the
+/*
+ * We use this as a place for the rd structure built from the
* last parsed country IE to rest until CRDA gets back to us with
- * what it thinks should apply for the same country */
+ * what it thinks should apply for the same country
+ */
static const struct ieee80211_regdomain *country_ie_regdomain;
+/* Used to queue up regulatory hints */
+static LIST_HEAD(reg_requests_list);
+static spinlock_t reg_requests_lock;
+
+/* Used to queue up beacon hints for review */
+static LIST_HEAD(reg_pending_beacons);
+static spinlock_t reg_pending_beacons_lock;
+
+/* Used to keep track of processed beacon hints */
+static LIST_HEAD(reg_beacon_list);
+
+struct reg_beacon {
+ struct list_head list;
+ struct ieee80211_channel chan;
+};
+
/* We keep a static world regulatory domain in case of the absence of CRDA */
static const struct ieee80211_regdomain world_regdom = {
- .n_reg_rules = 1,
+ .n_reg_rules = 5,
.alpha2 = "00",
.reg_rules = {
- REG_RULE(2412-10, 2462+10, 40, 6, 20,
+ /* IEEE 802.11b/g, channels 1..11 */
+ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
+ /* IEEE 802.11b/g, channels 12..13. No HT40
+ * channel fits here. */
+ REG_RULE(2467-10, 2472+10, 20, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS),
+ /* IEEE 802.11 channel 14 - Only JP enables
+ * this and for 802.11b only */
+ REG_RULE(2484-10, 2484+10, 20, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS |
+ NL80211_RRF_NO_OFDM),
+ /* IEEE 802.11a, channel 36..48 */
+ REG_RULE(5180-10, 5240+10, 40, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS),
+
+	/* NB: 5260 MHz - 5700 MHz requires DFS */
+
+ /* IEEE 802.11a, channel 149..165 */
+ REG_RULE(5745-10, 5825+10, 40, 6, 20,
NL80211_RRF_PASSIVE_SCAN |
NL80211_RRF_NO_IBSS),
}
@@ -83,9 +125,11 @@ static char *ieee80211_regdom = "US";
module_param(ieee80211_regdom, charp, 0444);
MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
-/* We assume 40 MHz bandwidth for the old regulatory work.
+/*
+ * We assume 40 MHz bandwidth for the old regulatory work.
* We make emphasis we are using the exact same frequencies
- * as before */
+ * as before
+ */
static const struct ieee80211_regdomain us_regdom = {
.n_reg_rules = 6,
@@ -124,8 +168,10 @@ static const struct ieee80211_regdomain jp_regdom = {
static const struct ieee80211_regdomain eu_regdom = {
.n_reg_rules = 6,
- /* This alpha2 is bogus, we leave it here just for stupid
- * backward compatibility */
+ /*
+ * This alpha2 is bogus, we leave it here just for stupid
+ * backward compatibility
+ */
.alpha2 = "EU",
.reg_rules = {
/* IEEE 802.11b/g, channels 1..13 */
@@ -194,8 +240,10 @@ static void reset_regdomains(void)
cfg80211_regdomain = NULL;
}
-/* Dynamic world regulatory domain requested by the wireless
- * core upon initialization */
+/*
+ * Dynamic world regulatory domain requested by the wireless
+ * core upon initialization
+ */
static void update_world_regdomain(const struct ieee80211_regdomain *rd)
{
BUG_ON(!last_request);
@@ -236,8 +284,10 @@ static bool is_unknown_alpha2(const char *alpha2)
{
if (!alpha2)
return false;
- /* Special case where regulatory domain was built by driver
- * but a specific alpha2 cannot be determined */
+ /*
+ * Special case where regulatory domain was built by driver
+ * but a specific alpha2 cannot be determined
+ */
if (alpha2[0] == '9' && alpha2[1] == '9')
return true;
return false;
@@ -247,9 +297,11 @@ static bool is_intersected_alpha2(const char *alpha2)
{
if (!alpha2)
return false;
- /* Special case where regulatory domain is the
+ /*
+ * Special case where regulatory domain is the
* result of an intersection between two regulatory domain
- * structures */
+ * structures
+ */
if (alpha2[0] == '9' && alpha2[1] == '8')
return true;
return false;
@@ -274,8 +326,10 @@ static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
return false;
}
-static bool regdom_changed(const char *alpha2)
+static bool regdom_changes(const char *alpha2)
{
+ assert_cfg80211_lock();
+
if (!cfg80211_regdomain)
return true;
if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
@@ -302,8 +356,10 @@ static bool country_ie_integrity_changes(u32 checksum)
return false;
}
-/* This lets us keep regulatory code which is updated on a regulatory
- * basis in userspace. */
+/*
+ * This lets us keep regulatory code which is updated on a regulatory
+ * basis in userspace.
+ */
static int call_crda(const char *alpha2)
{
char country_env[9 + 2] = "COUNTRY=";
@@ -348,7 +404,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
- if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff)
+ if (freq_range->end_freq_khz <= freq_range->start_freq_khz ||
+ freq_range->max_bandwidth_khz > freq_diff)
return false;
return true;
@@ -414,10 +471,12 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
#undef ONE_GHZ_IN_KHZ
}
-/* Converts a country IE to a regulatory domain. A regulatory domain
+/*
+ * Converts a country IE to a regulatory domain. A regulatory domain
* structure has a lot of information which the IE doesn't yet have,
* so for the other values we use upper max values as we will intersect
- * with our userspace regulatory agent to get lower bounds. */
+ * with our userspace regulatory agent to get lower bounds.
+ */
static struct ieee80211_regdomain *country_ie_2_rd(
u8 *country_ie,
u8 country_ie_len,
@@ -462,9 +521,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
*checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8);
- /* We need to build a reg rule for each triplet, but first we must
+ /*
+ * We need to build a reg rule for each triplet, but first we must
* calculate the number of reg rules we will need. We will need one
- * for each channel subband */
+ * for each channel subband
+ */
while (country_ie_len >= 3) {
int end_channel = 0;
struct ieee80211_country_ie_triplet *triplet =
@@ -502,9 +563,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
if (cur_sub_max_channel < cur_channel)
return NULL;
- /* Do not allow overlapping channels. Also channels
+ /*
+ * Do not allow overlapping channels. Also channels
* passed in each subband must be monotonically
- * increasing */
+ * increasing
+ */
if (last_sub_max_channel) {
if (cur_channel <= last_sub_max_channel)
return NULL;
@@ -512,10 +575,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
return NULL;
}
- /* When dot11RegulatoryClassesRequired is supported
+ /*
+ * When dot11RegulatoryClassesRequired is supported
* we can throw ext triplets as part of this soup,
* for now we don't care when those change as we
- * don't support them */
+ * don't support them
+ */
*checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) |
((cur_sub_max_channel ^ cur_sub_max_channel) << 16) |
((triplet->chans.max_power ^ cur_sub_max_channel) << 24);
@@ -526,8 +591,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
country_ie_len -= 3;
num_rules++;
- /* Note: this is not a IEEE requirement but
- * simply a memory requirement */
+ /*
+ * Note: this is not a IEEE requirement but
+ * simply a memory requirement
+ */
if (num_rules > NL80211_MAX_SUPP_REG_RULES)
return NULL;
}
@@ -555,8 +622,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
struct ieee80211_freq_range *freq_range = NULL;
struct ieee80211_power_rule *power_rule = NULL;
- /* Must parse if dot11RegulatoryClassesRequired is true,
- * we don't support this yet */
+ /*
+ * Must parse if dot11RegulatoryClassesRequired is true,
+ * we don't support this yet
+ */
if (triplet->ext.reg_extension_id >=
IEEE80211_COUNTRY_EXTENSION_ID) {
country_ie += 3;
@@ -578,10 +647,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
end_channel = triplet->chans.first_channel +
(4 * (triplet->chans.num_channels - 1));
- /* The +10 is since the regulatory domain expects
+ /*
+ * The +10 is since the regulatory domain expects
* the actual band edge, not the center of freq for
* its start and end freqs, assuming 20 MHz bandwidth on
- * the channels passed */
+ * the channels passed
+ */
freq_range->start_freq_khz =
MHZ_TO_KHZ(ieee80211_channel_to_frequency(
triplet->chans.first_channel) - 10);
@@ -589,9 +660,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
MHZ_TO_KHZ(ieee80211_channel_to_frequency(
end_channel) + 10);
- /* Large arbitrary values, we intersect later */
- /* Increment this if we ever support >= 40 MHz channels
- * in IEEE 802.11 */
+ /*
+ * These are large arbitrary values we use to intersect later.
+ * Increment this if we ever support >= 40 MHz channels
+ * in IEEE 802.11
+ */
freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40);
power_rule->max_antenna_gain = DBI_TO_MBI(100);
power_rule->max_eirp = DBM_TO_MBM(100);
@@ -607,8 +680,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
}
-/* Helper for regdom_intersect(), this does the real
- * mathematical intersection fun */
+/*
+ * Helper for regdom_intersect(), this does the real
+ * mathematical intersection fun
+ */
static int reg_rules_intersect(
const struct ieee80211_reg_rule *rule1,
const struct ieee80211_reg_rule *rule2,
@@ -686,11 +761,13 @@ static struct ieee80211_regdomain *regdom_intersect(
if (!rd1 || !rd2)
return NULL;
- /* First we get a count of the rules we'll need, then we actually
+ /*
+ * First we get a count of the rules we'll need, then we actually
+	 * build them. This is so we can malloc() and free() a
* regdomain once. The reason we use reg_rules_intersect() here
* is it will return -EINVAL if the rule computed makes no sense.
- * All rules that do check out OK are valid. */
+ * All rules that do check out OK are valid.
+ */
for (x = 0; x < rd1->n_reg_rules; x++) {
rule1 = &rd1->reg_rules[x];
@@ -718,14 +795,18 @@ static struct ieee80211_regdomain *regdom_intersect(
rule1 = &rd1->reg_rules[x];
for (y = 0; y < rd2->n_reg_rules; y++) {
rule2 = &rd2->reg_rules[y];
- /* This time around instead of using the stack lets
+ /*
+ * This time around instead of using the stack lets
* write to the target rule directly saving ourselves
- * a memcpy() */
+ * a memcpy()
+ */
intersected_rule = &rd->reg_rules[rule_idx];
r = reg_rules_intersect(rule1, rule2,
intersected_rule);
- /* No need to memset here the intersected rule here as
- * we're not using the stack anymore */
+ /*
+	 * No need to memset the intersected rule here as
+ * we're not using the stack anymore
+ */
if (r)
continue;
rule_idx++;
@@ -744,8 +825,10 @@ static struct ieee80211_regdomain *regdom_intersect(
return rd;
}
-/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
- * want to just have the channel structure use these */
+/*
+ * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
+ * want to just have the channel structure use these
+ */
static u32 map_regdom_flags(u32 rd_flags)
{
u32 channel_flags = 0;
@@ -771,10 +854,12 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
regd = custom_regd ? custom_regd : cfg80211_regdomain;
- /* Follow the driver's regulatory domain, if present, unless a country
- * IE has been processed or a user wants to help complaince further */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE &&
- last_request->initiator != REGDOM_SET_BY_USER &&
+ /*
+ * Follow the driver's regulatory domain, if present, unless a country
+	 * IE has been processed or a user wants to help compliance further
+ */
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ last_request->initiator != NL80211_REGDOM_SET_BY_USER &&
wiphy->regd)
regd = wiphy->regd;
@@ -790,9 +875,11 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
fr = &rr->freq_range;
pr = &rr->power_rule;
- /* We only need to know if one frequency rule was
+ /*
+	 * We only need to know if one frequency rule was
+	 * in center_freq's band, that's enough, so let's
- * not overwrite it once found */
+ * not overwrite it once found
+ */
if (!band_rule_found)
band_rule_found = freq_in_rule_band(fr, center_freq);
@@ -829,6 +916,11 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
const struct ieee80211_power_rule *power_rule = NULL;
struct ieee80211_supported_band *sband;
struct ieee80211_channel *chan;
+ struct wiphy *request_wiphy = NULL;
+
+ assert_cfg80211_lock();
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
sband = wiphy->bands[band];
BUG_ON(chan_idx >= sband->n_channels);
@@ -840,7 +932,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
&max_bandwidth, &reg_rule);
if (r) {
- /* This means no regulatory rule was found in the country IE
+ /*
+ * This means no regulatory rule was found in the country IE
* with a frequency range on the center_freq's band, since
* IEEE-802.11 allows for a country IE to have a subset of the
* regulatory information provided in a country we ignore
@@ -851,7 +944,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
* http://tinyurl.com/11d-clarification
*/
if (r == -ERANGE &&
- last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
+ last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
#ifdef CONFIG_CFG80211_REG_DEBUG
printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz "
"intact on %s - no rule found in band on "
@@ -859,10 +953,13 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
chan->center_freq, wiphy_name(wiphy));
#endif
} else {
- /* In this case we know the country IE has at least one reg rule
- * for the band so we respect its band definitions */
+ /*
+ * In this case we know the country IE has at least one reg rule
+ * for the band so we respect its band definitions
+ */
#ifdef CONFIG_CFG80211_REG_DEBUG
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE)
printk(KERN_DEBUG "cfg80211: Disabling "
"channel %d MHz on %s due to "
"Country IE\n",
@@ -876,12 +973,14 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
power_rule = &reg_rule->power_rule;
- if (last_request->initiator == REGDOM_SET_BY_DRIVER &&
- last_request->wiphy && last_request->wiphy == wiphy &&
- last_request->wiphy->strict_regulatory) {
- /* This gaurantees the driver's requested regulatory domain
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->strict_regulatory) {
+ /*
+	 * This guarantees the driver's requested regulatory domain
* will always be used as a base for further regulatory
- * settings */
+ * settings
+ */
chan->flags = chan->orig_flags =
map_regdom_flags(reg_rule->flags);
chan->max_antenna_gain = chan->orig_mag =
@@ -915,39 +1014,147 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band)
handle_channel(wiphy, band, i);
}
-static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby)
+static bool ignore_reg_update(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
{
if (!last_request)
return true;
- if (setby == REGDOM_SET_BY_CORE &&
+ if (initiator == NL80211_REGDOM_SET_BY_CORE &&
wiphy->custom_regulatory)
return true;
- /* wiphy->regd will be set once the device has its own
- * desired regulatory domain set */
+ /*
+ * wiphy->regd will be set once the device has its own
+ * desired regulatory domain set
+ */
if (wiphy->strict_regulatory && !wiphy->regd &&
!is_world_regdom(last_request->alpha2))
return true;
return false;
}
-static void update_all_wiphy_regulatory(enum reg_set_by setby)
+static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
{
struct cfg80211_registered_device *drv;
list_for_each_entry(drv, &cfg80211_drv_list, list)
- wiphy_update_regulatory(&drv->wiphy, setby);
+ wiphy_update_regulatory(&drv->wiphy, initiator);
}
-void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby)
+static void handle_reg_beacon(struct wiphy *wiphy,
+ unsigned int chan_idx,
+ struct reg_beacon *reg_beacon)
{
- enum ieee80211_band band;
+#ifdef CONFIG_CFG80211_REG_DEBUG
+#define REG_DEBUG_BEACON_FLAG(desc) \
+ printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \
+ "frequency: %d MHz (Ch %d) on %s\n", \
+ reg_beacon->chan.center_freq, \
+ ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \
+ wiphy_name(wiphy));
+#else
+#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0)
+#endif
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+
+ assert_cfg80211_lock();
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+ chan = &sband->channels[chan_idx];
+
+ if (likely(chan->center_freq != reg_beacon->chan.center_freq))
+ return;
+
+ if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) {
+ chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN;
+ REG_DEBUG_BEACON_FLAG("active scanning");
+ }
+
+ if (chan->flags & IEEE80211_CHAN_NO_IBSS) {
+ chan->flags &= ~IEEE80211_CHAN_NO_IBSS;
+ REG_DEBUG_BEACON_FLAG("beaconing");
+ }
+
+ chan->beacon_found = true;
+#undef REG_DEBUG_BEACON_FLAG
+}
+
+/*
+ * Called when a scan on a wiphy finds a beacon on
+ * new channel
+ */
+static void wiphy_update_new_beacon(struct wiphy *wiphy,
+ struct reg_beacon *reg_beacon)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+
+ assert_cfg80211_lock();
- if (ignore_reg_update(wiphy, setby))
+ if (!wiphy->bands[reg_beacon->chan.band])
return;
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+}
+
+/*
+ * Called upon reg changes or when a new wiphy is added
+ */
+static void wiphy_update_beacon_reg(struct wiphy *wiphy)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+ struct reg_beacon *reg_beacon;
+
+ assert_cfg80211_lock();
+
+ if (list_empty(&reg_beacon_list))
+ return;
+
+ list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
+ if (!wiphy->bands[reg_beacon->chan.band])
+ continue;
+ sband = wiphy->bands[reg_beacon->chan.band];
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+ }
+}
+
+static bool reg_is_world_roaming(struct wiphy *wiphy)
+{
+ if (is_world_regdom(cfg80211_regdomain->alpha2) ||
+ (wiphy->regd && is_world_regdom(wiphy->regd->alpha2)))
+ return true;
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->custom_regulatory)
+ return true;
+ return false;
+}
+
+/* Reap the advantages of previously found beacons */
+static void reg_process_beacons(struct wiphy *wiphy)
+{
+ if (!reg_is_world_roaming(wiphy))
+ return;
+ wiphy_update_beacon_reg(wiphy);
+}
+
+void wiphy_update_regulatory(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
+{
+ enum ieee80211_band band;
+
+ if (ignore_reg_update(wiphy, initiator))
+ goto out;
for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
if (wiphy->bands[band])
handle_band(wiphy, band);
}
+out:
+ reg_process_beacons(wiphy);
if (wiphy->reg_notifier)
wiphy->reg_notifier(wiphy, last_request);
}
@@ -1033,81 +1240,98 @@ static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd,
return 0;
}
-/* Return value which can be used by ignore_request() to indicate
- * it has been determined we should intersect two regulatory domains */
+/*
+ * Return value which can be used by ignore_request() to indicate
+ * it has been determined we should intersect two regulatory domains
+ */
#define REG_INTERSECT 1
/* This has the logic which determines when a new request
* should be ignored. */
-static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2)
+static int ignore_request(struct wiphy *wiphy,
+ struct regulatory_request *pending_request)
{
+ struct wiphy *last_wiphy = NULL;
+
+ assert_cfg80211_lock();
+
/* All initial requests are respected */
if (!last_request)
return 0;
- switch (set_by) {
- case REGDOM_SET_BY_INIT:
+ switch (pending_request->initiator) {
+ case NL80211_REGDOM_SET_BY_CORE:
return -EINVAL;
- case REGDOM_SET_BY_CORE:
- /*
- * Always respect new wireless core hints, should only happen
- * when updating the world regulatory domain at init.
- */
- return 0;
- case REGDOM_SET_BY_COUNTRY_IE:
- if (unlikely(!is_an_alpha2(alpha2)))
+ case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+
+ last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+ if (unlikely(!is_an_alpha2(pending_request->alpha2)))
return -EINVAL;
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
- if (last_request->wiphy != wiphy) {
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ if (last_wiphy != wiphy) {
/*
				 * Two cards with two APs claiming different
				 * Country IE alpha2s. We could
* intersect them, but that seems unlikely
* to be correct. Reject second one for now.
*/
- if (!alpha2_equal(alpha2,
- cfg80211_regdomain->alpha2))
+ if (regdom_changes(pending_request->alpha2))
return -EOPNOTSUPP;
return -EALREADY;
}
- /* Two consecutive Country IE hints on the same wiphy.
- * This should be picked up early by the driver/stack */
- if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2,
- alpha2)))
+ /*
+ * Two consecutive Country IE hints on the same wiphy.
+ * This should be picked up early by the driver/stack
+ */
+ if (WARN_ON(regdom_changes(pending_request->alpha2)))
return 0;
return -EALREADY;
}
return REG_INTERSECT;
- case REGDOM_SET_BY_DRIVER:
- if (last_request->initiator == REGDOM_SET_BY_CORE) {
+ case NL80211_REGDOM_SET_BY_DRIVER:
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) {
if (is_old_static_regdom(cfg80211_regdomain))
return 0;
- if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+ if (regdom_changes(pending_request->alpha2))
return 0;
return -EALREADY;
}
+
+ /*
+ * This would happen if you unplug and plug your card
+	 * back in or if you add a new device whose driver agrees on
+	 * the regulatory domain already set by a previously loaded card.
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ !regdom_changes(pending_request->alpha2))
+ return -EALREADY;
+
return REG_INTERSECT;
- case REGDOM_SET_BY_USER:
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+ case NL80211_REGDOM_SET_BY_USER:
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)
return REG_INTERSECT;
- /* If the user knows better the user should set the regdom
- * to their country before the IE is picked up */
- if (last_request->initiator == REGDOM_SET_BY_USER &&
+ /*
+ * If the user knows better the user should set the regdom
+ * to their country before the IE is picked up
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_USER &&
last_request->intersect)
return -EOPNOTSUPP;
- /* Process user requests only after previous user/driver/core
- * requests have been processed */
- if (last_request->initiator == REGDOM_SET_BY_CORE ||
- last_request->initiator == REGDOM_SET_BY_DRIVER ||
- last_request->initiator == REGDOM_SET_BY_USER) {
- if (!alpha2_equal(last_request->alpha2,
- cfg80211_regdomain->alpha2))
+ /*
+ * Process user requests only after previous user/driver/core
+ * requests have been processed
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE ||
+ last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
+ last_request->initiator == NL80211_REGDOM_SET_BY_USER) {
+ if (regdom_changes(last_request->alpha2))
return -EAGAIN;
}
if (!is_old_static_regdom(cfg80211_regdomain) &&
- alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+ !regdom_changes(pending_request->alpha2))
return -EALREADY;
return 0;
@@ -1116,59 +1340,80 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
return -EINVAL;
}
-/* Caller must hold &cfg80211_drv_mutex */
-int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2,
- u32 country_ie_checksum,
- enum environment_cap env)
+/**
+ * __regulatory_hint - hint to the wireless core a regulatory domain
+ * @wiphy: if the hint comes from country information from an AP, this
+ * is required to be set to the wiphy that received the information
+ * @pending_request: the regulatory request currently being processed
+ *
+ * The Wireless subsystem can use this function to hint to the wireless core
+ * what it believes should be the current regulatory domain.
+ *
+ * Returns zero if all went fine, %-EALREADY if a regulatory domain had
+ * already been set or other standard error codes.
+ *
+ * Caller must hold &cfg80211_mutex
+ */
+static int __regulatory_hint(struct wiphy *wiphy,
+ struct regulatory_request *pending_request)
{
- struct regulatory_request *request;
bool intersect = false;
int r = 0;
- r = ignore_request(wiphy, set_by, alpha2);
+ assert_cfg80211_lock();
+
+ r = ignore_request(wiphy, pending_request);
if (r == REG_INTERSECT) {
- if (set_by == REGDOM_SET_BY_DRIVER) {
+ if (pending_request->initiator ==
+ NL80211_REGDOM_SET_BY_DRIVER) {
r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
- if (r)
+ if (r) {
+ kfree(pending_request);
return r;
+ }
}
intersect = true;
} else if (r) {
- /* If the regulatory domain being requested by the
+ /*
+ * If the regulatory domain being requested by the
* driver has already been set just copy it to the
- * wiphy */
- if (r == -EALREADY && set_by == REGDOM_SET_BY_DRIVER) {
+ * wiphy
+ */
+ if (r == -EALREADY &&
+ pending_request->initiator ==
+ NL80211_REGDOM_SET_BY_DRIVER) {
r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
- if (r)
+ if (r) {
+ kfree(pending_request);
return r;
+ }
r = -EALREADY;
goto new_request;
}
+ kfree(pending_request);
return r;
}
new_request:
- request = kzalloc(sizeof(struct regulatory_request),
- GFP_KERNEL);
- if (!request)
- return -ENOMEM;
+ kfree(last_request);
- request->alpha2[0] = alpha2[0];
- request->alpha2[1] = alpha2[1];
- request->initiator = set_by;
- request->wiphy = wiphy;
- request->intersect = intersect;
- request->country_ie_checksum = country_ie_checksum;
- request->country_ie_env = env;
+ last_request = pending_request;
+ last_request->intersect = intersect;
- kfree(last_request);
- last_request = request;
+ pending_request = NULL;
/* When r == REG_INTERSECT we do need to call CRDA */
- if (r < 0)
+ if (r < 0) {
+ /*
+		 * Since CRDA will not be called in this case, as we have
+		 * already applied the requested regulatory domain, just
+		 * inform userspace that we have processed the request
+ */
+ if (r == -EALREADY)
+ nl80211_send_reg_change_event(last_request);
return r;
+ }
/*
* Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled
@@ -1180,34 +1425,194 @@ new_request:
*
* to intersect with the static rd
*/
- return call_crda(alpha2);
+ return call_crda(last_request->alpha2);
}
-void regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+/* This currently only processes user and driver regulatory hints */
+static void reg_process_hint(struct regulatory_request *reg_request)
{
- int r;
- BUG_ON(!alpha2);
+ int r = 0;
+ struct wiphy *wiphy = NULL;
+
+ BUG_ON(!reg_request->alpha2);
+
+ mutex_lock(&cfg80211_mutex);
+
+ if (wiphy_idx_valid(reg_request->wiphy_idx))
+ wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
+
+ if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ !wiphy) {
+ kfree(reg_request);
+ goto out;
+ }
- mutex_lock(&cfg80211_drv_mutex);
- r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER,
- alpha2, 0, ENVIRON_ANY);
+ r = __regulatory_hint(wiphy, reg_request);
/* This is required so that the orig_* parameters are saved */
- if (r == -EALREADY && wiphy->strict_regulatory)
- wiphy_update_regulatory(wiphy, REGDOM_SET_BY_DRIVER);
- mutex_unlock(&cfg80211_drv_mutex);
+ if (r == -EALREADY && wiphy && wiphy->strict_regulatory)
+ wiphy_update_regulatory(wiphy, reg_request->initiator);
+out:
+ mutex_unlock(&cfg80211_mutex);
+}
+
+/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */
+static void reg_process_pending_hints(void)
+{
+ struct regulatory_request *reg_request;
+
+ spin_lock(&reg_requests_lock);
+ while (!list_empty(&reg_requests_list)) {
+ reg_request = list_first_entry(&reg_requests_list,
+ struct regulatory_request,
+ list);
+ list_del_init(&reg_request->list);
+
+ spin_unlock(&reg_requests_lock);
+ reg_process_hint(reg_request);
+ spin_lock(&reg_requests_lock);
+ }
+ spin_unlock(&reg_requests_lock);
+}
+
+/* Processes beacon hints -- this has nothing to do with country IEs */
+static void reg_process_pending_beacon_hints(void)
+{
+ struct cfg80211_registered_device *drv;
+ struct reg_beacon *pending_beacon, *tmp;
+
+ mutex_lock(&cfg80211_mutex);
+
+ /* This goes through the _pending_ beacon list */
+ spin_lock_bh(&reg_pending_beacons_lock);
+
+ if (list_empty(&reg_pending_beacons)) {
+ spin_unlock_bh(&reg_pending_beacons_lock);
+ goto out;
+ }
+
+ list_for_each_entry_safe(pending_beacon, tmp,
+ &reg_pending_beacons, list) {
+
+ list_del_init(&pending_beacon->list);
+
+ /* Applies the beacon hint to current wiphys */
+ list_for_each_entry(drv, &cfg80211_drv_list, list)
+ wiphy_update_new_beacon(&drv->wiphy, pending_beacon);
+
+ /* Remembers the beacon hint for new wiphys or reg changes */
+ list_add_tail(&pending_beacon->list, &reg_beacon_list);
+ }
+
+ spin_unlock_bh(&reg_pending_beacons_lock);
+out:
+ mutex_unlock(&cfg80211_mutex);
+}
+
+static void reg_todo(struct work_struct *work)
+{
+ reg_process_pending_hints();
+ reg_process_pending_beacon_hints();
+}
+
+static DECLARE_WORK(reg_work, reg_todo);
+
+static void queue_regulatory_request(struct regulatory_request *request)
+{
+ spin_lock(&reg_requests_lock);
+ list_add_tail(&request->list, &reg_requests_list);
+ spin_unlock(&reg_requests_lock);
+
+ schedule_work(&reg_work);
+}
+
+/* Core regulatory hint -- happens once during cfg80211_init() */
+static int regulatory_hint_core(const char *alpha2)
+{
+ struct regulatory_request *request;
+
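+ /* No other hint can have been processed before the core hint */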
+ BUG_ON(last_request);
+
+ request = kzalloc(sizeof(struct regulatory_request),
+ GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_CORE;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+/* User hints */
+int regulatory_hint_user(const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ BUG_ON(!alpha2);
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
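+ /* User hints are not tied to any specific wiphy */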
+ request->wiphy_idx = WIPHY_IDX_STALE;
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_USER;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+/* Driver hints */
+int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ BUG_ON(!alpha2);
+ BUG_ON(!wiphy);
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+
+ /* Must have registered wiphy first */
+ BUG_ON(!wiphy_idx_valid(request->wiphy_idx));
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
+
+ queue_regulatory_request(request);
+
+ return 0;
}
EXPORT_SYMBOL(regulatory_hint);
static bool reg_same_country_ie_hint(struct wiphy *wiphy,
u32 country_ie_checksum)
{
- if (!last_request->wiphy)
+ struct wiphy *request_wiphy;
+
+ assert_cfg80211_lock();
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+ if (!request_wiphy)
return false;
- if (likely(last_request->wiphy != wiphy))
+
+ if (likely(request_wiphy != wiphy))
return !country_ie_integrity_changes(country_ie_checksum);
- /* We should not have let these through at this point, they
+ /*
+ * We should not have let these through at this point, they
* should have been picked up earlier by the first alpha2 check
- * on the device */
+ * on the device
+ */
if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum)))
return true;
return false;
@@ -1221,11 +1626,14 @@ void regulatory_hint_11d(struct wiphy *wiphy,
char alpha2[2];
u32 checksum = 0;
enum environment_cap env = ENVIRON_ANY;
+ struct regulatory_request *request;
- if (!last_request)
- return;
+ mutex_lock(&cfg80211_mutex);
- mutex_lock(&cfg80211_drv_mutex);
+ if (unlikely(!last_request)) {
+ mutex_unlock(&cfg80211_mutex);
+ return;
+ }
/* IE len must be evenly divisible by 2 */
if (country_ie_len & 0x01)
@@ -1234,9 +1642,11 @@ void regulatory_hint_11d(struct wiphy *wiphy,
if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
goto out;
- /* Pending country IE processing, this can happen after we
+ /*
+ * Pending country IE processing, this can happen after we
* call CRDA and wait for a response if a beacon was received before
- * we were able to process the last regulatory_hint_11d() call */
+ * we were able to process the last regulatory_hint_11d() call
+ */
if (country_ie_regdomain)
goto out;
@@ -1248,33 +1658,44 @@ void regulatory_hint_11d(struct wiphy *wiphy,
else if (country_ie[2] == 'O')
env = ENVIRON_OUTDOOR;
- /* We will run this for *every* beacon processed for the BSSID, so
+ /*
+ * We will run this for *every* beacon processed for the BSSID, so
* we optimize an early check to exit out early if we don't have to
- * do anything */
- if (likely(last_request->wiphy)) {
+ * do anything
+ */
+ if (likely(wiphy_idx_valid(last_request->wiphy_idx))) {
struct cfg80211_registered_device *drv_last_ie;
- drv_last_ie = wiphy_to_dev(last_request->wiphy);
+ drv_last_ie =
+ cfg80211_drv_by_wiphy_idx(last_request->wiphy_idx);
- /* Lets keep this simple -- we trust the first AP
- * after we intersect with CRDA */
- if (likely(last_request->wiphy == wiphy)) {
- /* Ignore IEs coming in on this wiphy with
- * the same alpha2 and environment cap */
+ /*
+ * Let's keep this simple -- we trust the first AP
+ * after we intersect with CRDA
+ */
+ if (likely(&drv_last_ie->wiphy == wiphy)) {
+ /*
+ * Ignore IEs coming in on this wiphy with
+ * the same alpha2 and environment cap
+ */
if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
alpha2) &&
env == drv_last_ie->env)) {
goto out;
}
- /* the wiphy moved on to another BSSID or the AP
+ /*
+ * the wiphy moved on to another BSSID or the AP
* was reconfigured. XXX: We need to deal with the
* case where the user suspends and goes to goes
* to another country, and then gets IEs from an
- * AP with different settings */
+ * AP with different settings
+ */
goto out;
} else {
- /* Ignore IEs coming in on two separate wiphys with
- * the same alpha2 and environment cap */
+ /*
+ * Ignore IEs coming in on two separate wiphys with
+ * the same alpha2 and environment cap
+ */
if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
alpha2) &&
env == drv_last_ie->env)) {
@@ -1289,28 +1710,97 @@ void regulatory_hint_11d(struct wiphy *wiphy,
if (!rd)
goto out;
- /* This will not happen right now but we leave it here for the
+ /*
+ * This will not happen right now but we leave it here for
* the future when we want to add suspend/resume support and having
* the user move to another country after doing so, or having the user
- * move to another AP. Right now we just trust the first AP. This is why
- * this is marked as likley(). If we hit this before we add this support
- * we want to be informed of it as it would indicate a mistake in the
- * current design */
- if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum))))
- goto out;
+ * move to another AP. Right now we just trust the first AP.
+ *
+ * If we hit this before we add this support we want to be informed of
+ * it as it would indicate a mistake in the current design
+ */
+ if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))
+ goto free_rd_out;
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ goto free_rd_out;
- /* We keep this around for when CRDA comes back with a response so
- * we can intersect with that */
+ /*
+ * We keep this around for when CRDA comes back with a response so
+ * we can intersect with that
+ */
country_ie_regdomain = rd;
- __regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE,
- country_ie_regdomain->alpha2, checksum, env);
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+ request->alpha2[0] = rd->alpha2[0];
+ request->alpha2[1] = rd->alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
+ request->country_ie_checksum = checksum;
+ request->country_ie_env = env;
+ mutex_unlock(&cfg80211_mutex);
+
+ queue_regulatory_request(request);
+
+ return;
+
+free_rd_out:
+ kfree(rd);
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
}
EXPORT_SYMBOL(regulatory_hint_11d);
+static bool freq_is_chan_12_13_14(u16 freq)
+{
+ if (freq == ieee80211_channel_to_frequency(12) ||
+ freq == ieee80211_channel_to_frequency(13) ||
+ freq == ieee80211_channel_to_frequency(14))
+ return true;
+ return false;
+}
+
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp)
+{
+ struct reg_beacon *reg_beacon;
+
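+ /*
+ * Nothing to do for channels we already found a beacon on, for
+ * radar channels, or for 2 GHz channels other than 12, 13 and 14
+ */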
+ if (likely((beacon_chan->beacon_found ||
+ (beacon_chan->flags & IEEE80211_CHAN_RADAR) ||
+ (beacon_chan->band == IEEE80211_BAND_2GHZ &&
+ !freq_is_chan_12_13_14(beacon_chan->center_freq)))))
+ return 0;
+
+ reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp);
+ if (!reg_beacon)
+ return -ENOMEM;
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+ printk(KERN_DEBUG "cfg80211: Found new beacon on "
+ "frequency: %d MHz (Ch %d) on %s\n",
+ beacon_chan->center_freq,
+ ieee80211_frequency_to_channel(beacon_chan->center_freq),
+ wiphy_name(wiphy));
+#endif
+ memcpy(&reg_beacon->chan, beacon_chan,
+ sizeof(struct ieee80211_channel));
+
+ /*
+ * Since we can be called from BH or non-BH context
+ * we must use spin_lock_bh()
+ */
+ spin_lock_bh(&reg_pending_beacons_lock);
+ list_add_tail(&reg_beacon->list, &reg_pending_beacons);
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ schedule_work(&reg_work);
+
+ return 0;
+}
+
static void print_rd_rules(const struct ieee80211_regdomain *rd)
{
unsigned int i;
@@ -1326,8 +1816,10 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
freq_range = &reg_rule->freq_range;
power_rule = &reg_rule->power_rule;
- /* There may not be documentation for max antenna gain
- * in certain regions */
+ /*
+ * There may not be documentation for max antenna gain
+ * in certain regions
+ */
if (power_rule->max_antenna_gain)
printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), "
"(%d mBi, %d mBm)\n",
@@ -1350,13 +1842,13 @@ static void print_regdomain(const struct ieee80211_regdomain *rd)
{
if (is_intersected_alpha2(rd->alpha2)) {
- struct wiphy *wiphy = NULL;
- struct cfg80211_registered_device *drv;
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
- if (last_request->wiphy) {
- wiphy = last_request->wiphy;
- drv = wiphy_to_dev(wiphy);
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ struct cfg80211_registered_device *drv;
+ drv = cfg80211_drv_by_wiphy_idx(
+ last_request->wiphy_idx);
+ if (drv) {
printk(KERN_INFO "cfg80211: Current regulatory "
"domain updated by AP to: %c%c\n",
drv->country_ie_alpha2[0],
@@ -1422,7 +1914,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
{
const struct ieee80211_regdomain *intersected_rd = NULL;
struct cfg80211_registered_device *drv = NULL;
- struct wiphy *wiphy = NULL;
+ struct wiphy *request_wiphy;
/* Some basic sanity checks first */
if (is_world_regdom(rd->alpha2)) {
@@ -1439,23 +1931,27 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
if (!last_request)
return -EINVAL;
- /* Lets only bother proceeding on the same alpha2 if the current
+ /*
+ * Let's only bother proceeding on the same alpha2 if the current
* rd is non static (it means CRDA was present and was used last)
- * and the pending request came in from a country IE */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) {
- /* If someone else asked us to change the rd lets only bother
- * checking if the alpha2 changes if CRDA was already called */
+ * and the pending request came in from a country IE
+ */
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ /*
+ * If someone else asked us to change the rd, let's only bother
+ * checking if the alpha2 changes if CRDA was already called
+ */
if (!is_old_static_regdom(cfg80211_regdomain) &&
- !regdom_changed(rd->alpha2))
+ !regdom_changes(rd->alpha2))
return -EINVAL;
}
- wiphy = last_request->wiphy;
-
- /* Now lets set the regulatory domain, update all driver channels
+ /*
+ * Now let's set the regulatory domain, update all driver channels
* and finally inform them of what we have done, in case they want
* to review or adjust their own settings based on their own
- * internal EEPROM data */
+ * internal EEPROM data
+ */
if (WARN_ON(!reg_is_valid_request(rd->alpha2)))
return -EINVAL;
@@ -1467,21 +1963,25 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
return -EINVAL;
}
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
if (!last_request->intersect) {
int r;
- if (last_request->initiator != REGDOM_SET_BY_DRIVER) {
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) {
reset_regdomains();
cfg80211_regdomain = rd;
return 0;
}
- /* For a driver hint, lets copy the regulatory domain the
- * driver wanted to the wiphy to deal with conflicts */
+ /*
+ * For a driver hint, let's copy the regulatory domain the
+ * driver wanted to the wiphy to deal with conflicts
+ */
- BUG_ON(last_request->wiphy->regd);
+ BUG_ON(request_wiphy->regd);
- r = reg_copy_regd(&last_request->wiphy->regd, rd);
+ r = reg_copy_regd(&request_wiphy->regd, rd);
if (r)
return r;
@@ -1492,17 +1992,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
/* Intersection requires a bit more work */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) {
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
if (!intersected_rd)
return -EINVAL;
- /* We can trash what CRDA provided now.
+ /*
+ * We can trash what CRDA provided now.
* However if a driver requested this specific regulatory
- * domain we keep it for its private use */
- if (last_request->initiator == REGDOM_SET_BY_DRIVER)
- last_request->wiphy->regd = rd;
+ * domain we keep it for its private use
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER)
+ request_wiphy->regd = rd;
else
kfree(rd);
@@ -1522,8 +2024,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
BUG_ON(!country_ie_regdomain);
if (rd != country_ie_regdomain) {
- /* Intersect what CRDA returned and our what we
- * had built from the Country IE received */
+ /*
+ * Intersect what CRDA returned with what we
+ * had built from the Country IE received
+ */
intersected_rd = regdom_intersect(rd, country_ie_regdomain);
@@ -1533,16 +2037,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
kfree(country_ie_regdomain);
country_ie_regdomain = NULL;
} else {
- /* This would happen when CRDA was not present and
+ /*
+ * This would happen when CRDA was not present and
* OLD_REGULATORY was enabled. We intersect our Country
- * IE rd and what was set on cfg80211 originally */
+ * IE rd and what was set on cfg80211 originally
+ */
intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
}
if (!intersected_rd)
return -EINVAL;
- drv = wiphy_to_dev(wiphy);
+ drv = wiphy_to_dev(request_wiphy);
drv->country_ie_alpha2[0] = rd->alpha2[0];
drv->country_ie_alpha2[1] = rd->alpha2[1];
@@ -1560,13 +2066,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
}
-/* Use this call to set the current regulatory domain. Conflicts with
+/*
+ * Use this call to set the current regulatory domain. Conflicts with
* multiple drivers can be ironed out later. Caller must've already
- * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */
+ * kmalloc'd the rd structure. Caller must hold cfg80211_mutex
+ */
int set_regdom(const struct ieee80211_regdomain *rd)
{
int r;
+ assert_cfg80211_lock();
+
/* Note that this doesn't update the wiphys, this is done below */
r = __set_regdom(rd);
if (r) {
@@ -1583,57 +2093,87 @@ int set_regdom(const struct ieee80211_regdomain *rd)
print_regdomain(cfg80211_regdomain);
+ nl80211_send_reg_change_event(last_request);
+
return r;
}
-/* Caller must hold cfg80211_drv_mutex */
+/* Caller must hold cfg80211_mutex */
void reg_device_remove(struct wiphy *wiphy)
{
+ struct wiphy *request_wiphy;
+
+ assert_cfg80211_lock();
+
kfree(wiphy->regd);
+
+ if (!last_request)
+ return;
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
- if (!last_request || !last_request->wiphy)
+ if (!request_wiphy)
return;
- if (last_request->wiphy != wiphy)
+ if (request_wiphy != wiphy)
return;
- last_request->wiphy = NULL;
+ last_request->wiphy_idx = WIPHY_IDX_STALE;
last_request->country_ie_env = ENVIRON_ANY;
}
int regulatory_init(void)
{
- int err;
+ int err = 0;
reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
if (IS_ERR(reg_pdev))
return PTR_ERR(reg_pdev);
+ spin_lock_init(&reg_requests_lock);
+ spin_lock_init(&reg_pending_beacons_lock);
+
#ifdef CONFIG_WIRELESS_OLD_REGULATORY
cfg80211_regdomain = static_regdom(ieee80211_regdom);
printk(KERN_INFO "cfg80211: Using static regulatory domain info\n");
print_regdomain_info(cfg80211_regdomain);
- /* The old code still requests for a new regdomain and if
+ /*
+ * The old code still requests for a new regdomain and if
* you have CRDA you get it updated, otherwise you get
* stuck with the static values. We ignore "EU" code as
- * that is not a valid ISO / IEC 3166 alpha2 */
+ * that is not a valid ISO / IEC 3166 alpha2
+ */
if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U')
- err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
- ieee80211_regdom, 0, ENVIRON_ANY);
+ err = regulatory_hint_core(ieee80211_regdom);
#else
cfg80211_regdomain = cfg80211_world_regdom;
- err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY);
- if (err)
- printk(KERN_ERR "cfg80211: calling CRDA failed - "
- "unable to update world regulatory domain, "
- "using static definition\n");
+ err = regulatory_hint_core("00");
#endif
+ if (err) {
+ if (err == -ENOMEM)
+ return err;
+ /*
+ * N.B. kobject_uevent_env() can fail mainly when we're out of
+ * memory, which is handled and propagated appropriately above,
+ * but it can also fail during a netlink_broadcast() or during
+ * early boot from call_usermodehelper(). For now treat these
+ * errors as non-fatal.
+ */
+ printk(KERN_ERR "cfg80211: kobject_uevent_env() was unable "
+ "to call CRDA during init");
+#ifdef CONFIG_CFG80211_REG_DEBUG
+ /* We want to find out exactly why when debugging */
+ WARN_ON(err);
+#endif
+ }
return 0;
}
void regulatory_exit(void)
{
- mutex_lock(&cfg80211_drv_mutex);
+ struct regulatory_request *reg_request, *tmp;
+ struct reg_beacon *reg_beacon, *btmp;
+
+ cancel_work_sync(&reg_work);
+
+ mutex_lock(&cfg80211_mutex);
reset_regdomains();
@@ -1644,5 +2184,33 @@ void regulatory_exit(void)
platform_device_unregister(reg_pdev);
- mutex_unlock(&cfg80211_drv_mutex);
+ spin_lock_bh(&reg_pending_beacons_lock);
+ if (!list_empty(&reg_pending_beacons)) {
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_pending_beacons, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ }
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ if (!list_empty(&reg_beacon_list)) {
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_beacon_list, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ }
+
+ spin_lock(&reg_requests_lock);
+ if (!list_empty(&reg_requests_list)) {
+ list_for_each_entry_safe(reg_request, tmp,
+ &reg_requests_list, list) {
+ list_del(&reg_request->list);
+ kfree(reg_request);
+ }
+ }
+ spin_unlock(&reg_requests_lock);
+
+ mutex_unlock(&cfg80211_mutex);
}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index fe8c83f34fb..e37829a49dc 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -6,6 +6,8 @@ extern const struct ieee80211_regdomain *cfg80211_regdomain;
bool is_world_regdom(const char *alpha2);
bool reg_is_valid_request(const char *alpha2);
+int regulatory_hint_user(const char *alpha2);
+
void reg_device_remove(struct wiphy *wiphy);
int regulatory_init(void);
@@ -14,26 +16,24 @@ void regulatory_exit(void);
int set_regdom(const struct ieee80211_regdomain *rd);
/**
- * __regulatory_hint - hint to the wireless core a regulatory domain
- * @wiphy: if the hint comes from country information from an AP, this
- * is required to be set to the wiphy that received the information
- * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain
- * should be in.
- * @country_ie_checksum: checksum of processed country IE, set this to 0
- * if the hint did not come from a country IE
- * @country_ie_env: the environment the IE told us we are in, %ENVIRON_*
- *
- * The Wireless subsystem can use this function to hint to the wireless core
- * what it believes should be the current regulatory domain by giving it an
- * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be
- * in.
+ * regulatory_hint_found_beacon - hints a beacon was found on a channel
+ * @wiphy: the wireless device on which the beacon was found
+ * @beacon_chan: the channel on which the beacon was found
+ * @gfp: context flags
*
- * Returns zero if all went fine, %-EALREADY if a regulatory domain had
- * already been set or other standard error codes.
+ * This informs the wireless core that a beacon from an AP was found on
+ * the channel provided. This allows the wireless core to make educated
+ * guesses on regulatory to help with world roaming. This is only used for
+ * world roaming -- when we do not know our current location. This is
+ * only useful on channels 12, 13 and 14 of the 2 GHz band (channels
+ * 1-11 are already enabled by the world regulatory domain) and on
+ * non-radar 5 GHz channels.
*
+ * Drivers do not need to call this, cfg80211 will do it for them after a scan
+ * on a newly found BSS.
*/
-extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2, u32 country_ie_checksum,
- enum environment_cap country_ie_env);
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp);
#endif /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b1893c863b9..280dbcd02c1 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -62,6 +62,18 @@ static void bss_release(struct kref *ref)
}
/* must hold dev->bss_lock! */
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+ unsigned long age_secs)
+{
+ struct cfg80211_internal_bss *bss;
+ unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC);
+
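+ /* Backdate each timestamp so the entries appear age_secs older */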
+ list_for_each_entry(bss, &dev->bss_list, list) {
+ bss->ts -= age_jiffies;
+ }
+}
+
+/* must hold dev->bss_lock! */
void cfg80211_bss_expire(struct cfg80211_registered_device *dev)
{
struct cfg80211_internal_bss *bss, *tmp;
@@ -358,7 +370,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
found->pub.beacon_interval = res->pub.beacon_interval;
found->pub.tsf = res->pub.tsf;
found->pub.signal = res->pub.signal;
- found->pub.signal_type = res->pub.signal_type;
found->pub.capability = res->pub.capability;
found->ts = res->ts;
kref_put(&res->ref, bss_release);
@@ -380,8 +391,7 @@ struct cfg80211_bss *
cfg80211_inform_bss_frame(struct wiphy *wiphy,
struct ieee80211_channel *channel,
struct ieee80211_mgmt *mgmt, size_t len,
- s32 signal, enum cfg80211_signal_type sigtype,
- gfp_t gfp)
+ s32 signal, gfp_t gfp)
{
struct cfg80211_internal_bss *res;
size_t ielen = len - offsetof(struct ieee80211_mgmt,
@@ -389,7 +399,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
bool overwrite;
size_t privsz = wiphy->bss_priv_size;
- if (WARN_ON(sigtype == NL80211_BSS_SIGNAL_UNSPEC &&
+ if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC &&
(signal < 0 || signal > 100)))
return NULL;
@@ -403,7 +413,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN);
res->pub.channel = channel;
- res->pub.signal_type = sigtype;
res->pub.signal = signal;
res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
@@ -421,6 +430,9 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
if (!res)
return NULL;
+ if (res->pub.capability & WLAN_CAPABILITY_ESS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+
/* cfg80211_bss_update gives us a referenced result */
return &res->pub;
}
@@ -584,16 +596,25 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info,
}
}
+static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
+{
+ unsigned long end = jiffies;
+
+ if (end >= start)
+ return jiffies_to_msecs(end - start);
+
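+ /* jiffies wrapped around since start */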
+ return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1);
+}
static char *
-ieee80211_bss(struct iw_request_info *info,
- struct cfg80211_internal_bss *bss,
- char *current_ev, char *end_buf)
+ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
+ struct cfg80211_internal_bss *bss, char *current_ev,
+ char *end_buf)
{
struct iw_event iwe;
u8 *buf, *cfg, *p;
u8 *ie = bss->pub.information_elements;
- int rem = bss->pub.len_information_elements, i;
- int rem = bss->pub.len_information_elements, i;
+ int rem = bss->pub.len_information_elements, i, sig;
bool ismesh = false;
memset(&iwe, 0, sizeof(iwe));
@@ -617,19 +638,28 @@ ieee80211_bss(struct iw_request_info *info,
current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
IW_EV_FREQ_LEN);
- if (bss->pub.signal_type != CFG80211_SIGNAL_TYPE_NONE) {
+ if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = IWEVQUAL;
iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
IW_QUAL_NOISE_INVALID |
- IW_QUAL_QUAL_INVALID;
- switch (bss->pub.signal_type) {
+ IW_QUAL_QUAL_UPDATED;
+ switch (wiphy->signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
- iwe.u.qual.level = bss->pub.signal / 100;
+ sig = bss->pub.signal / 100;
+ iwe.u.qual.level = sig;
iwe.u.qual.updated |= IW_QUAL_DBM;
+ if (sig < -110) /* rather bad */
+ sig = -110;
+ else if (sig > -40) /* perfect */
+ sig = -40;
+ /* will give a range of 0 .. 70 */
+ iwe.u.qual.qual = sig + 110;
break;
case CFG80211_SIGNAL_TYPE_UNSPEC:
iwe.u.qual.level = bss->pub.signal;
+ /* will give range 0 .. 100 */
+ iwe.u.qual.qual = bss->pub.signal;
break;
default:
/* not reached */
@@ -763,8 +793,8 @@ ieee80211_bss(struct iw_request_info *info,
&iwe, buf);
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = IWEVCUSTOM;
- sprintf(buf, " Last beacon: %dms ago",
- jiffies_to_msecs(jiffies - bss->ts));
+ sprintf(buf, " Last beacon: %ums ago",
+ elapsed_jiffies_msecs(bss->ts));
iwe.u.data.length = strlen(buf);
current_ev = iwe_stream_add_point(info, current_ev,
end_buf, &iwe, buf);
@@ -793,8 +823,8 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *dev,
spin_unlock_bh(&dev->bss_lock);
return -E2BIG;
}
- current_ev = ieee80211_bss(info, bss,
- current_ev, end_buf);
+ current_ev = ieee80211_bss(&dev->wiphy, info, bss,
+ current_ev, end_buf);
}
spin_unlock_bh(&dev->bss_lock);
return current_ev - buf;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 26a72b0797a..efe3c5c92b2 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -31,7 +31,7 @@ static ssize_t name ## _show(struct device *dev, \
return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
}
-SHOW_FMT(index, "%d", idx);
+SHOW_FMT(index, "%d", wiphy_idx);
SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
static struct device_attribute ieee80211_dev_attrs[] = {
@@ -60,6 +60,8 @@ static int wiphy_suspend(struct device *dev, pm_message_t state)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
+ rdev->suspend_at = get_seconds();
+
if (rdev->ops->suspend) {
rtnl_lock();
ret = rdev->ops->suspend(&rdev->wiphy);
@@ -74,6 +76,11 @@ static int wiphy_resume(struct device *dev)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
+ /* Age scan results with time spent in suspend */
+ spin_lock_bh(&rdev->bss_lock);
+ cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+ spin_unlock_bh(&rdev->bss_lock);
+
if (rdev->ops->resume) {
rtnl_lock();
ret = rdev->ops->resume(&rdev->wiphy);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 58e489fd4ae..b84a9b4fe96 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -137,3 +137,100 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
return 0;
}
EXPORT_SYMBOL(cfg80211_wext_giwmode);
+
+int cfg80211_wext_giwrange(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct iw_range *range = (struct iw_range *) extra;
+ enum ieee80211_band band;
+ int c = 0;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ data->length = sizeof(struct iw_range);
+ memset(range, 0, sizeof(struct iw_range));
+
+ range->we_version_compiled = WIRELESS_EXT;
+ range->we_version_source = 21;
+ range->retry_capa = IW_RETRY_LIMIT;
+ range->retry_flags = IW_RETRY_LIMIT;
+ range->min_retry = 0;
+ range->max_retry = 255;
+ range->min_rts = 0;
+ range->max_rts = 2347;
+ range->min_frag = 256;
+ range->max_frag = 2346;
+
+ range->encoding_size[0] = 5;
+ range->encoding_size[1] = 13;
+ range->num_encoding_sizes = 2;
+ range->max_encoding_tokens = 4;
+
+ range->max_qual.updated = IW_QUAL_NOISE_INVALID;
+
+ switch (wdev->wiphy->signal_type) {
+ case CFG80211_SIGNAL_TYPE_NONE:
+ break;
+ case CFG80211_SIGNAL_TYPE_MBM:
+ range->max_qual.level = -110;
+ range->max_qual.qual = 70;
+ range->avg_qual.qual = 35;
+ range->max_qual.updated |= IW_QUAL_DBM;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ range->max_qual.level = 100;
+ range->max_qual.qual = 100;
+ range->avg_qual.qual = 50;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ }
+
+ range->avg_qual.level = range->max_qual.level / 2;
+ range->avg_qual.noise = range->max_qual.noise / 2;
+ range->avg_qual.updated = range->max_qual.updated;
+
+ range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
+ IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ int i;
+ struct ieee80211_supported_band *sband;
+
+ sband = wdev->wiphy->bands[band];
+
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
+ struct ieee80211_channel *chan = &sband->channels[i];
+
+ if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
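+ /* center_freq is in MHz; exponent 6 scales it to Hz */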
+ range->freq[c].i =
+ ieee80211_frequency_to_channel(
+ chan->center_freq);
+ range->freq[c].m = chan->center_freq;
+ range->freq[c].e = 6;
+ c++;
+ }
+ }
+ }
+ range->num_channels = c;
+ range->num_frequency = c;
+
+ IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
+
+ range->scan_capa |= IW_SCAN_CAPA_ESSID;
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_wext_giwrange);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8f76f4009c2..9ca17b1ce52 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -951,10 +951,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
/*
* Incoming Call User Data.
*/
- if (skb->len >= 0) {
- skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
- makex25->calluserdata.cudlength = skb->len;
- }
+ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
+ makex25->calluserdata.cudlength = skb->len;
sk->sk_ack_backlog++;
@@ -1122,8 +1120,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
if (msg->msg_flags & MSG_OOB)
skb_queue_tail(&x25->interrupt_out_queue, skb);
else {
- len = x25_output(sk, skb);
- if (len < 0)
+ rc = x25_output(sk, skb);
+ len = rc;
+ if (rc < 0)
kfree_skb(skb);
else if (x25->qbitincl)
len++;
@@ -1608,7 +1607,7 @@ static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
SOCKOPS_WRAP(x25_proto, AF_X25);
-static struct packet_type x25_packet_type = {
+static struct packet_type x25_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_X25),
.func = x25_lapb_receive_frame,
};
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e25ff62ab2a..62a5425cc6a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -748,12 +748,51 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
schedule_work(&net->xfrm.state_hash_work);
}
+static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
+ struct flowi *fl, unsigned short family,
+ xfrm_address_t *daddr, xfrm_address_t *saddr,
+ struct xfrm_state **best, int *acq_in_progress,
+ int *error)
+{
+ /* Resolution logic:
+ * 1. There is a valid state with matching selector. Done.
+ * 2. Valid state with inappropriate selector. Skip.
+ *
+ * Entering area of "sysdeps".
+ *
+ * 3. If state is not valid, selector is temporary, it selects
+ * only session which triggered previous resolution. Key
+ * manager will do something to install a state with proper
+ * selector.
+ */
+ if (x->km.state == XFRM_STATE_VALID) {
+ if ((x->sel.family &&
+ !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
+ !security_xfrm_state_pol_flow_match(x, pol, fl))
+ return;
+
+ if (!*best ||
+ (*best)->km.dying > x->km.dying ||
+ ((*best)->km.dying == x->km.dying &&
+ (*best)->curlft.add_time < x->curlft.add_time))
+ *best = x;
+ } else if (x->km.state == XFRM_STATE_ACQ) {
+ *acq_in_progress = 1;
+ } else if (x->km.state == XFRM_STATE_ERROR ||
+ x->km.state == XFRM_STATE_EXPIRED) {
+ if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
+ security_xfrm_state_pol_flow_match(x, pol, fl))
+ *error = -ESRCH;
+ }
+}
+
struct xfrm_state *
xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
unsigned short family)
{
+ static xfrm_address_t saddr_wildcard = { };
struct net *net = xp_net(pol);
unsigned int h;
struct hlist_node *entry;
@@ -773,40 +812,27 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
xfrm_state_addr_check(x, daddr, saddr, family) &&
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
- (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) {
- /* Resolution logic:
- 1. There is a valid state with matching selector.
- Done.
- 2. Valid state with inappropriate selector. Skip.
-
- Entering area of "sysdeps".
-
- 3. If state is not valid, selector is temporary,
- it selects only session which triggered
- previous resolution. Key manager will do
- something to install a state with proper
- selector.
- */
- if (x->km.state == XFRM_STATE_VALID) {
- if ((x->sel.family && !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
- !security_xfrm_state_pol_flow_match(x, pol, fl))
- continue;
- if (!best ||
- best->km.dying > x->km.dying ||
- (best->km.dying == x->km.dying &&
- best->curlft.add_time < x->curlft.add_time))
- best = x;
- } else if (x->km.state == XFRM_STATE_ACQ) {
- acquire_in_progress = 1;
- } else if (x->km.state == XFRM_STATE_ERROR ||
- x->km.state == XFRM_STATE_EXPIRED) {
- if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
- security_xfrm_state_pol_flow_match(x, pol, fl))
- error = -ESRCH;
- }
- }
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ &best, &acquire_in_progress, &error);
+ }
+ if (best)
+ goto found;
+
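+ /* Retry the lookup with a wildcard source address */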
+ h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
+ hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.family == family &&
+ x->props.reqid == tmpl->reqid &&
+ !(x->props.flags & XFRM_STATE_WILDRECV) &&
+ xfrm_state_addr_check(x, daddr, saddr, family) &&
+ tmpl->mode == x->props.mode &&
+ tmpl->id.proto == x->id.proto &&
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ &best, &acquire_in_progress, &error);
}
+found:
x = best;
if (!x && !error && !acquire_in_progress) {
if (tmpl->id.spi &&