diff options
Diffstat (limited to 'net')
219 files changed, 21713 insertions, 3691 deletions
diff --git a/net/802/psnap.c b/net/802/psnap.c index 6ed711748f2..6fea0750662 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -29,7 +29,7 @@ static struct llc_sap *snap_sap; /* * Find a snap client by matching the 5 bytes. */ -static struct datalink_proto *find_snap_client(unsigned char *desc) +static struct datalink_proto *find_snap_client(const unsigned char *desc) { struct datalink_proto *proto = NULL, *p; @@ -95,15 +95,16 @@ static int snap_request(struct datalink_proto *dl, EXPORT_SYMBOL(register_snap_client); EXPORT_SYMBOL(unregister_snap_client); -static char snap_err_msg[] __initdata = +static const char snap_err_msg[] __initconst = KERN_CRIT "SNAP - unable to register with 802.2\n"; static int __init snap_init(void) { snap_sap = llc_sap_open(0xAA, snap_rcv); - - if (!snap_sap) + if (!snap_sap) { printk(snap_err_msg); + return -EBUSY; + } return 0; } @@ -121,7 +122,7 @@ module_exit(snap_exit); /* * Register SNAP clients. We don't yet use this for IP. */ -struct datalink_proto *register_snap_client(unsigned char *desc, +struct datalink_proto *register_snap_client(const unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, @@ -136,7 +137,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc, proto = kmalloc(sizeof(*proto), GFP_ATOMIC); if (proto) { - memcpy(proto->type, desc,5); + memcpy(proto->type, desc, 5); proto->rcvfunc = rcvfunc; proto->header_length = 5 + 3; /* snap + 802.2 */ proto->request = snap_request; diff --git a/net/802/tr.c b/net/802/tr.c index 158150fee46..e7eb13084d7 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -486,6 +486,7 @@ static struct rif_cache *rif_get_idx(loff_t pos) } static void *rif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&rif_lock) { spin_lock_irq(&rif_lock); @@ -517,6 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void rif_seq_stop(struct seq_file *seq, void *v) + __releases(&rif_lock) { 
spin_unlock_irq(&rif_lock); } @@ -668,3 +670,5 @@ module_init(rif_init); EXPORT_SYMBOL(tr_type_trans); EXPORT_SYMBOL(alloc_trdev); + +MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 4163ea65bf4..2b7390e377b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -51,7 +51,7 @@ const char vlan_version[] = DRV_VERSION; static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>"; static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>"; -static struct packet_type vlan_packet_type = { +static struct packet_type vlan_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_8021Q), .func = vlan_skb_recv, /* VLAN receive method */ }; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 70435af153f..654e45f5719 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -1,12 +1,16 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> +#include <linux/netpoll.h> #include "vlan.h" /* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). 
*/ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, u16 vlan_tci, int polling) { + if (netpoll_rx(skb)) + return NET_RX_DROP; + if (skb_bond_should_drop(skb)) goto drop; @@ -94,12 +98,15 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, return dev_gro_receive(napi, skb); drop: - return 2; + return GRO_DROP; } int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { + if (netpoll_rx_on(skb)) + return vlan_hwaccel_receive_skb(skb, grp, vlan_tci); + skb_gro_reset_offset(skb); return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); @@ -114,6 +121,9 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, if (!skb) return NET_RX_DROP; + if (netpoll_rx_on(skb)) + return vlan_hwaccel_receive_skb(skb, grp, vlan_tci); + return napi_frags_finish(napi, skb, vlan_gro_common(napi, grp, vlan_tci, skb)); } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4a19acd3a32..1b34135cf99 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) - err = ops->ndo_neigh_setup(dev, pa); + err = ops->ndo_neigh_setup(real_dev, pa); return err; } @@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; dev->netdev_ops = &vlan_netdev_ops; } + netdev_resync_ops(dev); if (is_vlan_dev(real_dev)) subclass = 1; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1df0356f242..c613ed08a5e 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -417,7 +417,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len) oldfs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos); + ret = 
vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos); set_fs(oldfs); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) diff --git a/net/Kconfig b/net/Kconfig index a12bae0e3fe..93998a9c39c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -24,9 +24,6 @@ if NET menu "Networking options" -config COMPAT_NET_DEV_OPS - def_bool y - source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" @@ -171,6 +168,7 @@ endif source "net/dccp/Kconfig" source "net/sctp/Kconfig" +source "net/rds/Kconfig" source "net/tipc/Kconfig" source "net/atm/Kconfig" source "net/802/Kconfig" @@ -221,6 +219,17 @@ config NET_TCPPROBE To compile this code as a module, choose M here: the module will be called tcp_probe. +config NET_DROP_MONITOR + boolean "Network packet drop alerting service" + depends on INET && EXPERIMENTAL && TRACEPOINTS + ---help--- + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. 
+ endmenu endmenu diff --git a/net/Makefile b/net/Makefile index 0fcce89d716..9e00a55a901 100644 --- a/net/Makefile +++ b/net/Makefile @@ -49,6 +49,7 @@ obj-y += 8021q/ endif obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ +obj-$(CONFIG_RDS) += rds/ obj-y += wireless/ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_TIPC) += tipc/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 510a6782da8..3e0671df3a3 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1860,12 +1860,12 @@ static struct notifier_block ddp_notifier = { .notifier_call = ddp_device_event, }; -static struct packet_type ltalk_packet_type = { +static struct packet_type ltalk_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_LOCALTALK), .func = ltalk_rcv, }; -static struct packet_type ppptalk_packet_type = { +static struct packet_type ppptalk_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_PPPTALK), .func = atalk_rcv, }; @@ -1877,7 +1877,7 @@ EXPORT_SYMBOL(aarp_send_ddp); EXPORT_SYMBOL(atrtr_get_dev); EXPORT_SYMBOL(atalk_find_dev_addr); -static char atalk_err_snap[] __initdata = +static const char atalk_err_snap[] __initconst = KERN_CRIT "Unable to register DDP with SNAP.\n"; /* Called by proto.c on kernel start up */ diff --git a/net/atm/clip.c b/net/atm/clip.c index da42fd06b61..3dc0a3a42a5 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -552,10 +552,13 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) return error; } +static const struct net_device_ops clip_netdev_ops = { + .ndo_start_xmit = clip_start_xmit, +}; + static void clip_setup(struct net_device *dev) { - dev->hard_start_xmit = clip_start_xmit; - /* sg_xmit ... 
*/ + dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; @@ -615,7 +618,7 @@ static int clip_device_event(struct notifier_block *this, unsigned long event, } /* ignore non-CLIP devices */ - if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit) + if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { diff --git a/net/atm/lec.c b/net/atm/lec.c index c0cba9a037e..199b6bb79f4 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -502,7 +502,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; - if (dev->change_mtu(dev, mesg->content.config.mtu)) + if (dev_set_mtu(dev, mesg->content.config.mtu)) printk("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); priv->is_proxy = mesg->content.config.is_proxy; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 039d5cc72c3..e5bf11453a1 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -286,33 +286,32 @@ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev) { dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name); - if (dev->hard_start_xmit == NULL) { - printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n", - dev->name); - return; + if (!dev->netdev_ops) + printk("mpoa: (%s) start_mpc not starting\n", dev->name); + else { + mpc->old_ops = dev->netdev_ops; + mpc->new_ops = *mpc->old_ops; + mpc->new_ops.ndo_start_xmit = mpc_send_packet; + dev->netdev_ops = &mpc->new_ops; } - mpc->old_hard_start_xmit = dev->hard_start_xmit; - dev->hard_start_xmit = mpc_send_packet; - - return; } static void stop_mpc(struct mpoa_client *mpc) { - + struct net_device *dev = mpc->dev; dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name); /* Lets not nullify lec device's dev->hard_start_xmit */ - if (mpc->dev->hard_start_xmit != mpc_send_packet) { + if (dev->netdev_ops != 
&mpc->new_ops) { dprintk(" mpc already stopped, not fatal\n"); return; } dprintk("\n"); - mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit; - mpc->old_hard_start_xmit = NULL; - /* close_shortcuts(mpc); ??? FIXME */ - return; + dev->netdev_ops = mpc->old_ops; + mpc->old_ops = NULL; + + /* close_shortcuts(mpc); ??? FIXME */ } static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); @@ -531,7 +530,6 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) */ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) { - int retval; struct mpoa_client *mpc; struct ethhdr *eth; int i = 0; @@ -561,9 +559,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) } non_ip: - retval = mpc->old_hard_start_xmit(skb,dev); - - return retval; + return mpc->old_ops->ndo_start_xmit(skb,dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/atm/mpc.h b/net/atm/mpc.h index 24c386c35f5..0919a88bbc7 100644 --- a/net/atm/mpc.h +++ b/net/atm/mpc.h @@ -15,7 +15,7 @@ struct mpoa_client { struct mpoa_client *next; struct net_device *dev; /* lec in question */ int dev_num; /* e.g. 
2 for lec2 */ - int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + struct atm_vcc *mpoad_vcc; /* control channel to mpoad */ uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */ uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ @@ -31,6 +31,9 @@ struct mpoa_client { uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ int number_of_mps_macs; /* number of the above MAC addresses */ struct mpc_parameters parameters; /* parameters for this client */ + + const struct net_device_ops *old_ops; + struct net_device_ops new_ops; }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d127fd3ba5c..7da5ebb84e9 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1435,6 +1435,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, size_t size; int lv, err, addr_len = msg->msg_namelen; + /* AX.25 empty data frame has no meaning : don't send */ + if (len == 0) { + return (0); + } + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; @@ -1529,10 +1534,8 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, dp = ax25->digipeat; } - SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n"); - /* Build a packet */ - SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n"); + SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. 
Building packet.\n"); /* Assume the worst case */ size = len + ax25->ax25_dev->dev->hard_header_len; @@ -1636,6 +1639,13 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); copied = skb->len; + /* AX.25 empty data frame has no meaning : ignore it */ + if (copied == 0) { + err = copied; + skb_free_datagram(sk, skb); + goto out; + } + if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; @@ -1985,9 +1995,8 @@ static const struct proto_ops ax25_proto_ops = { /* * Called by socket.c on kernel start up */ -static struct packet_type ax25_packet_type = { +static struct packet_type ax25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_AX25), - .dev = NULL, /* All devices */ .func = ax25_kiss_rcv, }; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 744ed3f07ef..02b9baa1930 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -41,14 +41,13 @@ #include <net/bluetooth/bluetooth.h> -#define VERSION "2.14" +#define VERSION "2.15" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 static struct net_proto_family *bt_proto[BT_MAX_PROTO]; static DEFINE_RWLOCK(bt_proto_lock); -#ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; static const char *bt_key_strings[BT_MAX_PROTO] = { "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", @@ -86,11 +85,6 @@ static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } -#else -static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) -{ -} -#endif int bt_sock_register(int proto, struct net_proto_family *ops) { @@ -217,7 +211,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) continue; } - if (sk->sk_state == BT_CONNECTED || !newsock) { + if (sk->sk_state == BT_CONNECTED || !newsock || + bt_sk(parent)->defer_setup) { bt_accept_unlink(sk); if (newsock) 
sock_graft(sk, newsock); @@ -232,7 +227,7 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) EXPORT_SYMBOL(bt_accept_dequeue); int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) + struct msghdr *msg, size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -277,7 +272,9 @@ static inline unsigned int bt_accept_poll(struct sock *parent) list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); - if (sk->sk_state == BT_CONNECTED) + if (sk->sk_state == BT_CONNECTED || + (bt_sk(parent)->defer_setup && + sk->sk_state == BT_CONNECT2)) return POLLIN | POLLRDNORM; } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index c9cac7719ef..0073ec8495d 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -126,8 +126,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const session->reassembly[id] = nskb; - if (skb) - kfree_skb(skb); + kfree_skb(skb); } static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a4a789f24c8..1181db08d9d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -123,6 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle) conn->state = BT_CONNECT; conn->out = 1; + conn->attempt++; + cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); @@ -139,6 +141,8 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle) conn->state = BT_CONNECT; conn->out = 1; + conn->attempt++; + cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); @@ -155,6 +159,7 @@ static void hci_conn_timeout(unsigned long arg) { struct hci_conn *conn = (void *) arg; struct hci_dev *hdev = conn->hdev; + __u8 reason; BT_DBG("conn %p state %d", conn, conn->state); @@ -173,7 +178,8 @@ static 
void hci_conn_timeout(unsigned long arg) break; case BT_CONFIG: case BT_CONNECTED: - hci_acl_disconn(conn, 0x13); + reason = hci_proto_disconn_ind(conn); + hci_acl_disconn(conn, reason); break; default: conn->state = BT_CLOSED; @@ -216,12 +222,13 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) break; case SCO_LINK: if (lmp_esco_capable(hdev)) - conn->pkt_type = hdev->esco_type & SCO_ESCO_MASK; + conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | + (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; break; case ESCO_LINK: - conn->pkt_type = hdev->esco_type; + conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; break; } @@ -280,6 +287,8 @@ int hci_conn_del(struct hci_conn *conn) skb_queue_purge(&conn->data_q); + hci_conn_del_sysfs(conn); + return 0; } @@ -325,7 +334,7 @@ EXPORT_SYMBOL(hci_get_route); /* Create SCO or ACL connection. * Device _must_ be locked */ -struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type) +struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 sec_level, __u8 auth_type) { struct hci_conn *acl; struct hci_conn *sco; @@ -340,6 +349,7 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 hci_conn_hold(acl); if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + acl->sec_level = sec_level; acl->auth_type = auth_type; hci_acl_connect(acl); } @@ -385,51 +395,59 @@ int hci_conn_check_link_mode(struct hci_conn *conn) EXPORT_SYMBOL(hci_conn_check_link_mode); /* Authenticate remote device */ -int hci_conn_auth(struct hci_conn *conn) +static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("conn %p", conn); - if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) { - if (!(conn->auth_type & 0x01)) { - conn->auth_type |= 0x01; - conn->link_mode &= ~HCI_LM_AUTH; - } - } - - if (conn->link_mode & HCI_LM_AUTH) + if (sec_level > conn->sec_level) + 
conn->sec_level = sec_level; + else if (conn->link_mode & HCI_LM_AUTH) return 1; + conn->auth_type = auth_type; + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); } + return 0; } -EXPORT_SYMBOL(hci_conn_auth); -/* Enable encryption */ -int hci_conn_encrypt(struct hci_conn *conn) +/* Enable security */ +int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("conn %p", conn); + if (sec_level == BT_SECURITY_SDP) + return 1; + + if (sec_level == BT_SECURITY_LOW) { + if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) + return hci_conn_auth(conn, sec_level, auth_type); + else + return 1; + } + if (conn->link_mode & HCI_LM_ENCRYPT) - return hci_conn_auth(conn); + return hci_conn_auth(conn, sec_level, auth_type); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) return 0; - if (hci_conn_auth(conn)) { + if (hci_conn_auth(conn, sec_level, auth_type)) { struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 1; hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); } + return 0; } -EXPORT_SYMBOL(hci_conn_encrypt); +EXPORT_SYMBOL(hci_conn_security); /* Change link key */ int hci_conn_change_link_key(struct hci_conn *conn) @@ -442,12 +460,13 @@ int hci_conn_change_link_key(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp); } + return 0; } EXPORT_SYMBOL(hci_conn_change_link_key); /* Switch role */ -int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) +int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("conn %p", conn); @@ -460,6 +479,7 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) cp.role = role; hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); } + return 0; } EXPORT_SYMBOL(hci_conn_switch_role); @@ -542,9 +562,7 @@ void hci_conn_hash_flush(struct 
hci_dev *hdev) c->state = BT_CLOSED; - hci_conn_del_sysfs(c); - - hci_proto_disconn_ind(c, 0x16); + hci_proto_disconn_cfm(c, 0x16); hci_conn_del(c); } } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ba78cc1eb8d..cd061510b6b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1565,8 +1565,7 @@ static void hci_cmd_task(unsigned long arg) /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) { - if (hdev->sent_cmd) - kfree_skb(hdev->sent_cmd); + kfree_skb(hdev->sent_cmd); if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) { atomic_dec(&hdev->cmd_cnt); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f91ba690f5d..55534244c3a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -484,6 +484,15 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb if (hdev->features[4] & LMP_EV5) hdev->esco_type |= (ESCO_EV5); + if (hdev->features[5] & LMP_EDR_ESCO_2M) + hdev->esco_type |= (ESCO_2EV3); + + if (hdev->features[5] & LMP_EDR_ESCO_3M) + hdev->esco_type |= (ESCO_3EV3); + + if (hdev->features[5] & LMP_EDR_3S_ESCO) + hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); + BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, hdev->features[0], hdev->features[1], hdev->features[2], hdev->features[3], @@ -914,7 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (ev->status) { hci_proto_connect_cfm(conn, ev->status); hci_conn_del(conn); - } + } else if (ev->link_type != ACL_LINK) + hci_proto_connect_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); @@ -1009,9 +1019,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff if (conn) { conn->state = BT_CLOSED; - hci_conn_del_sysfs(conn); - - hci_proto_disconn_ind(conn, ev->reason); + hci_proto_disconn_cfm(conn, ev->reason); hci_conn_del(conn); } @@ -1600,7 +1608,8 @@ static inline void 
hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b if (conn->state == BT_CONFIG) { if (!ev->status && hdev->ssp_mode > 0 && - conn->ssp_mode > 0 && conn->out) { + conn->ssp_mode > 0 && conn->out && + conn->sec_level != BT_SECURITY_SDP) { struct hci_cp_auth_requested cp; cp.handle = ev->handle; hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, @@ -1637,6 +1646,13 @@ static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_bu conn->type = SCO_LINK; } + if (conn->out && ev->status == 0x1c && conn->attempt < 2) { + conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | + (hdev->esco_type & EDR_ESCO_MASK); + hci_setup_sync(conn, conn->link->handle); + goto unlock; + } + if (!ev->status) { conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index b93748e224f..ca4d3b40d5c 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -50,9 +50,10 @@ #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> -#define VERSION "2.11" +#define VERSION "2.13" -static u32 l2cap_feat_mask = 0x0000; +static u32 l2cap_feat_mask = 0x0080; +static u8 l2cap_fixed_chan[8] = { 0x02, }; static const struct proto_ops l2cap_sock_ops; @@ -77,9 +78,10 @@ static void l2cap_sock_timeout(unsigned long arg) bh_lock_sock(sk); - if (sk->sk_state == BT_CONNECT && - (l2cap_pi(sk)->link_mode & (L2CAP_LM_AUTH | - L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE))) + if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONFIG) + reason = ECONNREFUSED; + else if (sk->sk_state == BT_CONNECT && + l2cap_pi(sk)->sec_level != BT_SECURITY_SDP) reason = ECONNREFUSED; else reason = ETIMEDOUT; @@ -204,6 +206,8 @@ static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct so BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid); + conn->disc_reason = 0x13; + l2cap_pi(sk)->conn = conn; if (sk->sk_type == SOCK_SEQPACKET) { @@ -259,18 +263,35 @@ static void 
l2cap_chan_del(struct sock *sk, int err) } /* Service level security */ -static inline int l2cap_check_link_mode(struct sock *sk) +static inline int l2cap_check_security(struct sock *sk) { struct l2cap_conn *conn = l2cap_pi(sk)->conn; + __u8 auth_type; - if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || - (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) - return hci_conn_encrypt(conn->hcon); + if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) + auth_type = HCI_AT_NO_BONDING_MITM; + else + auth_type = HCI_AT_NO_BONDING; - if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH) - return hci_conn_auth(conn->hcon); + if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; + } else { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } + } - return 1; + return hci_conn_security(conn->hcon, l2cap_pi(sk)->sec_level, + auth_type); } static inline u8 l2cap_get_ident(struct l2cap_conn *conn) @@ -312,7 +333,10 @@ static void l2cap_do_start(struct sock *sk) struct l2cap_conn *conn = l2cap_pi(sk)->conn; if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (l2cap_check_link_mode(sk)) { + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) + return; + + if (l2cap_check_security(sk)) { struct l2cap_conn_req req; req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; @@ -356,7 +380,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) } if (sk->sk_state == BT_CONNECT) { - if (l2cap_check_link_mode(sk)) { + if (l2cap_check_security(sk)) { struct l2cap_conn_req req; req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; @@ -371,10 +395,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn) rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); rsp.dcid = 
cpu_to_le16(l2cap_pi(sk)->scid); - if (l2cap_check_link_mode(sk)) { - sk->sk_state = BT_CONFIG; - rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + if (l2cap_check_security(sk)) { + if (bt_sk(sk)->defer_setup) { + struct sock *parent = bt_sk(sk)->parent; + rsp.result = cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND); + parent->sk_data_ready(parent, 0); + + } else { + sk->sk_state = BT_CONFIG; + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + } } else { rsp.result = cpu_to_le16(L2CAP_CR_PEND); rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND); @@ -426,7 +458,7 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) read_lock(&l->lock); for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE) + if (l2cap_pi(sk)->force_reliable) sk->sk_err = err; } @@ -437,6 +469,7 @@ static void l2cap_info_timeout(unsigned long arg) { struct l2cap_conn *conn = (void *) arg; + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; l2cap_conn_start(conn); @@ -470,6 +503,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) spin_lock_init(&conn->lock); rwlock_init(&conn->chan_list.lock); + conn->disc_reason = 0x13; + return conn; } @@ -483,8 +518,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); - if (conn->rx_skb) - kfree_skb(conn->rx_skb); + kfree_skb(conn->rx_skb); /* Kill channels */ while ((sk = conn->chan_list.head)) { @@ -608,7 +642,6 @@ static void __l2cap_sock_close(struct sock *sk, int reason) case BT_CONNECTED: case BT_CONFIG: - case BT_CONNECT2: if (sk->sk_type == SOCK_SEQPACKET) { struct l2cap_conn *conn = l2cap_pi(sk)->conn; struct l2cap_disconn_req req; @@ -624,6 +657,27 @@ static void __l2cap_sock_close(struct sock *sk, int reason) l2cap_chan_del(sk, reason); break; + case BT_CONNECT2: + if 
(sk->sk_type == SOCK_SEQPACKET) { + struct l2cap_conn *conn = l2cap_pi(sk)->conn; + struct l2cap_conn_rsp rsp; + __u16 result; + + if (bt_sk(sk)->defer_setup) + result = L2CAP_CR_SEC_BLOCK; + else + result = L2CAP_CR_BAD_PSM; + + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + } else + l2cap_chan_del(sk, reason); + break; + case BT_CONNECT: case BT_DISCONN: l2cap_chan_del(sk, reason); @@ -653,13 +707,19 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) if (parent) { sk->sk_type = parent->sk_type; + bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup; + pi->imtu = l2cap_pi(parent)->imtu; pi->omtu = l2cap_pi(parent)->omtu; - pi->link_mode = l2cap_pi(parent)->link_mode; + pi->sec_level = l2cap_pi(parent)->sec_level; + pi->role_switch = l2cap_pi(parent)->role_switch; + pi->force_reliable = l2cap_pi(parent)->force_reliable; } else { pi->imtu = L2CAP_DEFAULT_MTU; pi->omtu = 0; - pi->link_mode = 0; + pi->sec_level = BT_SECURITY_LOW; + pi->role_switch = 0; + pi->force_reliable = 0; } /* Default config options */ @@ -723,17 +783,24 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol) return 0; } -static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { - struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; - int err = 0; + struct sockaddr_l2 la; + int len, err = 0; - BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm); + BT_DBG("sk %p", sk); if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid) + return -EINVAL; + lock_sock(sk); if 
(sk->sk_state != BT_OPEN) { @@ -741,7 +808,7 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_ goto done; } - if (la->l2_psm && btohs(la->l2_psm) < 0x1001 && + if (la.l2_psm && btohs(la.l2_psm) < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) { err = -EACCES; goto done; @@ -749,14 +816,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_ write_lock_bh(&l2cap_sk_list.lock); - if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) { + if (la.l2_psm && __l2cap_get_sock_by_addr(la.l2_psm, &la.l2_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&bt_sk(sk)->src, &la->l2_bdaddr); - l2cap_pi(sk)->psm = la->l2_psm; - l2cap_pi(sk)->sport = la->l2_psm; + bacpy(&bt_sk(sk)->src, &la.l2_bdaddr); + l2cap_pi(sk)->psm = la.l2_psm; + l2cap_pi(sk)->sport = la.l2_psm; sk->sk_state = BT_BOUND; + + if (btohs(la.l2_psm) == 0x0001 || btohs(la.l2_psm) == 0x0003) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; } write_unlock_bh(&l2cap_sk_list.lock); @@ -776,7 +846,8 @@ static int l2cap_do_connect(struct sock *sk) __u8 auth_type; int err = 0; - BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm); + BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), + l2cap_pi(sk)->psm); if (!(hdev = hci_get_route(dst, src))) return -EHOSTUNREACH; @@ -785,21 +856,42 @@ static int l2cap_do_connect(struct sock *sk) err = -ENOMEM; - if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH || - l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT || - l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) { - if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) + if (sk->sk_type == SOCK_RAW) { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_DEDICATED_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_DEDICATED_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } + } else if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) { + if 
(l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) auth_type = HCI_AT_NO_BONDING_MITM; else - auth_type = HCI_AT_GENERAL_BONDING_MITM; - } else { - if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) auth_type = HCI_AT_NO_BONDING; - else + + if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; + } else { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } } - hcon = hci_connect(hdev, ACL_LINK, dst, auth_type); + hcon = hci_connect(hdev, ACL_LINK, dst, + l2cap_pi(sk)->sec_level, auth_type); if (!hcon) goto done; @@ -835,20 +927,25 @@ done: static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; - int err = 0; - - lock_sock(sk); + struct sockaddr_l2 la; + int len, err = 0; BT_DBG("sk %p", sk); - if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) { - err = -EINVAL; - goto done; - } + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid) + return -EINVAL; + + lock_sock(sk); - if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) { + if (sk->sk_type == SOCK_SEQPACKET && !la.l2_psm) { err = -EINVAL; goto done; } @@ -875,8 +972,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al } /* Set destination address and psm */ - bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr); - l2cap_pi(sk)->psm = la->l2_psm; + bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr); + l2cap_pi(sk)->psm = la.l2_psm; if ((err = l2cap_do_connect(sk))) goto done; @@ -1000,12 +1097,16 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l addr->sa_family = AF_BLUETOOTH; *len = 
sizeof(struct sockaddr_l2); - if (peer) + if (peer) { + la->l2_psm = l2cap_pi(sk)->psm; bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst); - else + la->l2_cid = htobs(l2cap_pi(sk)->dcid); + } else { + la->l2_psm = l2cap_pi(sk)->sport; bacpy(&la->l2_bdaddr, &bt_sk(sk)->src); + la->l2_cid = htobs(l2cap_pi(sk)->scid); + } - la->l2_psm = l2cap_pi(sk)->psm; return 0; } @@ -1106,11 +1207,38 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms return err; } -static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) { + struct l2cap_conn_rsp rsp; + + sk->sk_state = BT_CONFIG; + + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + release_sock(sk); + return 0; + } + + release_sock(sk); + + return bt_sock_recvmsg(iocb, sock, msg, len, flags); +} + +static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen) { struct sock *sk = sock->sk; struct l2cap_options opts; - int err = 0, len; + int len, err = 0; u32 opt; BT_DBG("sk %p", sk); @@ -1140,7 +1268,15 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch break; } - l2cap_pi(sk)->link_mode = opt; + if (opt & L2CAP_LM_AUTH) + l2cap_pi(sk)->sec_level = BT_SECURITY_LOW; + if (opt & L2CAP_LM_ENCRYPT) + l2cap_pi(sk)->sec_level = BT_SECURITY_MEDIUM; + if (opt & L2CAP_LM_SECURE) + l2cap_pi(sk)->sec_level = BT_SECURITY_HIGH; + + l2cap_pi(sk)->role_switch = (opt & L2CAP_LM_MASTER); + l2cap_pi(sk)->force_reliable = (opt & 
L2CAP_LM_RELIABLE); break; default: @@ -1152,12 +1288,77 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch return err; } -static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level < BT_SECURITY_LOW || + sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + l2cap_pi(sk)->sec_level = sec.level; + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2cap_options opts; struct l2cap_conninfo cinfo; int len, err = 0; + u32 opt; BT_DBG("sk %p", sk); @@ -1180,12 +1381,36 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch break; case L2CAP_LM: - if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval)) + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_LOW: + opt = L2CAP_LM_AUTH; + break; + case 
BT_SECURITY_MEDIUM: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | + L2CAP_LM_SECURE; + break; + default: + opt = 0; + break; + } + + if (l2cap_pi(sk)->role_switch) + opt |= L2CAP_LM_MASTER; + + if (l2cap_pi(sk)->force_reliable) + opt |= L2CAP_LM_RELIABLE; + + if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case L2CAP_CONNINFO: - if (sk->sk_state != BT_CONNECTED) { + if (sk->sk_state != BT_CONNECTED && + !(sk->sk_state == BT_CONNECT2 && + bt_sk(sk)->defer_setup)) { err = -ENOTCONN; break; } @@ -1208,6 +1433,60 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch return err; } +static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) { + err = -EINVAL; + break; + } + + sec.level = l2cap_pi(sk)->sec_level; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int l2cap_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; @@ -1270,11 +1549,6 @@ static void l2cap_chan_ready(struct sock *sk) */ parent->sk_data_ready(parent, 0); } - - if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) 
{ - struct l2cap_conn *conn = l2cap_pi(sk)->conn; - hci_conn_change_link_key(conn->hcon); - } } /* Copy frame to all raw sockets on that connection */ @@ -1549,8 +1823,11 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && cmd->ident == conn->info_ident) { - conn->info_ident = 0; del_timer(&conn->info_timer); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + l2cap_conn_start(conn); } @@ -1580,6 +1857,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd /* Check if the ACL is secure enough (if not SDP) */ if (psm != cpu_to_le16(0x0001) && !hci_conn_check_link_mode(conn->hcon)) { + conn->disc_reason = 0x05; result = L2CAP_CR_SEC_BLOCK; goto response; } @@ -1621,11 +1899,18 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd l2cap_pi(sk)->ident = cmd->ident; - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (l2cap_check_link_mode(sk)) { - sk->sk_state = BT_CONFIG; - result = L2CAP_CR_SUCCESS; - status = L2CAP_CS_NO_INFO; + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) { + if (l2cap_check_security(sk)) { + if (bt_sk(sk)->defer_setup) { + sk->sk_state = BT_CONNECT2; + result = L2CAP_CR_PEND; + status = L2CAP_CS_AUTHOR_PEND; + parent->sk_data_ready(parent, 0); + } else { + sk->sk_state = BT_CONFIG; + result = L2CAP_CR_SUCCESS; + status = L2CAP_CS_NO_INFO; + } } else { sk->sk_state = BT_CONNECT2; result = L2CAP_CR_PEND; @@ -1695,11 +1980,14 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd l2cap_pi(sk)->dcid = dcid; l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; + l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND; + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, l2cap_build_conf_req(sk, req), req); break; case L2CAP_CR_PEND: + l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND; break; default: @@ -1908,6 +2196,14 @@ 
static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf), buf); + } else if (type == L2CAP_IT_FIXED_CHAN) { + u8 buf[12]; + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf; + rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); + memcpy(buf + 4, l2cap_fixed_chan, 8); + l2cap_send_cmd(conn, cmd->ident, + L2CAP_INFO_RSP, sizeof(buf), buf); } else { struct l2cap_info_rsp rsp; rsp.type = cpu_to_le16(type); @@ -1929,14 +2225,31 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm BT_DBG("type 0x%4.4x result 0x%2.2x", type, result); - conn->info_ident = 0; - del_timer(&conn->info_timer); - if (type == L2CAP_IT_FEAT_MASK) + if (type == L2CAP_IT_FEAT_MASK) { conn->feat_mask = get_unaligned_le32(rsp->data); - l2cap_conn_start(conn); + if (conn->feat_mask & 0x0080) { + struct l2cap_info_req req; + req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + + conn->info_ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, conn->info_ident, + L2CAP_INFO_REQ, sizeof(req), &req); + } else { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } + } else if (type == L2CAP_IT_FIXED_CHAN) { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } return 0; } @@ -2143,10 +2456,15 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) continue; if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) { - lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); + lm1 |= HCI_LM_ACCEPT; + if (l2cap_pi(sk)->role_switch) + lm1 |= HCI_LM_MASTER; exact++; - } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) - lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); + } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm2 |= HCI_LM_ACCEPT; + if 
(l2cap_pi(sk)->role_switch) + lm2 |= HCI_LM_MASTER; + } } read_unlock(&l2cap_sk_list.lock); @@ -2172,89 +2490,48 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status) return 0; } -static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason) +static int l2cap_disconn_ind(struct hci_conn *hcon) { - BT_DBG("hcon %p reason %d", hcon, reason); + struct l2cap_conn *conn = hcon->l2cap_data; - if (hcon->type != ACL_LINK) - return 0; + BT_DBG("hcon %p", hcon); - l2cap_conn_del(hcon, bt_err(reason)); + if (hcon->type != ACL_LINK || !conn) + return 0x13; - return 0; + return conn->disc_reason; } -static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) +static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { - struct l2cap_chan_list *l; - struct l2cap_conn *conn = hcon->l2cap_data; - struct sock *sk; + BT_DBG("hcon %p reason %d", hcon, reason); - if (!conn) + if (hcon->type != ACL_LINK) return 0; - l = &conn->chan_list; - - BT_DBG("conn %p", conn); - - read_lock(&l->lock); - - for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - struct l2cap_pinfo *pi = l2cap_pi(sk); - - bh_lock_sock(sk); - - if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && - !(hcon->link_mode & HCI_LM_ENCRYPT) && - !status) { - bh_unlock_sock(sk); - continue; - } - - if (sk->sk_state == BT_CONNECT) { - if (!status) { - struct l2cap_conn_req req; - req.scid = cpu_to_le16(l2cap_pi(sk)->scid); - req.psm = l2cap_pi(sk)->psm; - - l2cap_pi(sk)->ident = l2cap_get_ident(conn); - - l2cap_send_cmd(conn, l2cap_pi(sk)->ident, - L2CAP_CONN_REQ, sizeof(req), &req); - } else { - l2cap_sock_clear_timer(sk); - l2cap_sock_set_timer(sk, HZ / 10); - } - } else if (sk->sk_state == BT_CONNECT2) { - struct l2cap_conn_rsp rsp; - __u16 result; + l2cap_conn_del(hcon, bt_err(reason)); - if (!status) { - sk->sk_state = BT_CONFIG; - result = L2CAP_CR_SUCCESS; - } else { - sk->sk_state = BT_DISCONN; - l2cap_sock_set_timer(sk, HZ / 10); - result = L2CAP_CR_SEC_BLOCK; - } + return 0; +} - rsp.scid = 
cpu_to_le16(l2cap_pi(sk)->dcid); - rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); - rsp.result = cpu_to_le16(result); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - l2cap_send_cmd(conn, l2cap_pi(sk)->ident, - L2CAP_CONN_RSP, sizeof(rsp), &rsp); - } +static inline void l2cap_check_encryption(struct sock *sk, u8 encrypt) +{ + if (sk->sk_type != SOCK_SEQPACKET) + return; - bh_unlock_sock(sk); + if (encrypt == 0x00) { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) { + l2cap_sock_clear_timer(sk); + l2cap_sock_set_timer(sk, HZ * 5); + } else if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) + __l2cap_sock_close(sk, ECONNREFUSED); + } else { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) + l2cap_sock_clear_timer(sk); } - - read_unlock(&l->lock); - - return 0; } -static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) +static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) { struct l2cap_chan_list *l; struct l2cap_conn *conn = hcon->l2cap_data; @@ -2270,15 +2547,16 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) read_lock(&l->lock); for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - struct l2cap_pinfo *pi = l2cap_pi(sk); - bh_lock_sock(sk); - if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && - (sk->sk_state == BT_CONNECTED || - sk->sk_state == BT_CONFIG) && - !status && encrypt == 0x00) { - __l2cap_sock_close(sk, ECONNREFUSED); + if (l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND) { + bh_unlock_sock(sk); + continue; + } + + if (!status && (sk->sk_state == BT_CONNECTED || + sk->sk_state == BT_CONFIG)) { + l2cap_check_encryption(sk, encrypt); bh_unlock_sock(sk); continue; } @@ -2376,7 +2654,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl goto drop; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); + skb->len); conn->rx_len = len - skb->len; } else { BT_DBG("Cont: frag len %d (expecting %d)", skb->len, 
conn->rx_len); @@ -2398,7 +2676,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); + skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { @@ -2424,10 +2702,10 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) sk_for_each(sk, node, &l2cap_sk_list.head) { struct l2cap_pinfo *pi = l2cap_pi(sk); - str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n", batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid, - pi->imtu, pi->omtu, pi->link_mode); + pi->imtu, pi->omtu, pi->sec_level); } read_unlock_bh(&l2cap_sk_list.lock); @@ -2447,7 +2725,7 @@ static const struct proto_ops l2cap_sock_ops = { .accept = l2cap_sock_accept, .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, - .recvmsg = bt_sock_recvmsg, + .recvmsg = l2cap_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, @@ -2469,8 +2747,8 @@ static struct hci_proto l2cap_hci_proto = { .connect_ind = l2cap_connect_ind, .connect_cfm = l2cap_connect_cfm, .disconn_ind = l2cap_disconn_ind, - .auth_cfm = l2cap_auth_cfm, - .encrypt_cfm = l2cap_encrypt_cfm, + .disconn_cfm = l2cap_disconn_cfm, + .security_cfm = l2cap_security_cfm, .recv_acldata = l2cap_recv_acldata }; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index acd84fd524b..1d0fb0f23c6 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -46,7 +46,7 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> -#define VERSION "1.10" +#define VERSION "1.11" static int disable_cfc = 0; static int channel_mtu = -1; @@ -223,19 +223,25 @@ static int rfcomm_l2sock_create(struct socket **sock) return err; } -static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d) +static inline int rfcomm_check_security(struct rfcomm_dlc *d) { 
struct sock *sk = d->session->sock->sk; + __u8 auth_type; - if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) { - if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon)) - return 1; - } else if (d->link_mode & RFCOMM_LM_AUTH) { - if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon)) - return 1; + switch (d->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; } - return 0; + return hci_conn_security(l2cap_pi(sk)->conn->hcon, d->sec_level, + auth_type); } /* ---- RFCOMM DLCs ---- */ @@ -388,10 +394,10 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; if (s->state == BT_CONNECTED) { - if (rfcomm_check_link_mode(d)) - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - else + if (rfcomm_check_security(d)) rfcomm_send_pn(s, 1, d); + else + set_bit(RFCOMM_AUTH_PENDING, &d->flags); } rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); @@ -421,9 +427,16 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) d, d->state, d->dlci, err, s); switch (d->state) { - case BT_CONNECTED: - case BT_CONFIG: case BT_CONNECT: + case BT_CONFIG: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(RFCOMM_SCHED_AUTH); + break; + } + /* Fall through */ + + case BT_CONNECTED: d->state = BT_DISCONN; if (skb_queue_empty(&d->tx_queue)) { rfcomm_send_disc(s, d->dlci); @@ -434,6 +447,15 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) } break; + case BT_OPEN: + case BT_CONNECT2: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(RFCOMM_SCHED_AUTH); + break; + } + /* Fall through */ + default: rfcomm_dlc_clear_timer(d); @@ -636,6 +658,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t 
*dst bacpy(&addr.l2_bdaddr, src); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = 0; + addr.l2_cid = 0; *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (*err < 0) goto failed; @@ -657,6 +680,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst bacpy(&addr.l2_bdaddr, dst); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = htobs(RFCOMM_PSM); + addr.l2_cid = 0; *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; @@ -1162,7 +1186,7 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) return 0; } -static void rfcomm_dlc_accept(struct rfcomm_dlc *d) +void rfcomm_dlc_accept(struct rfcomm_dlc *d) { struct sock *sk = d->session->sock->sk; @@ -1175,12 +1199,31 @@ static void rfcomm_dlc_accept(struct rfcomm_dlc *d) d->state_change(d, 0); rfcomm_dlc_unlock(d); - if (d->link_mode & RFCOMM_LM_MASTER) + if (d->role_switch) hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00); rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); } +static void rfcomm_check_accept(struct rfcomm_dlc *d) +{ + if (rfcomm_check_security(d)) { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); + } else { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + } +} + static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) { struct rfcomm_dlc *d; @@ -1203,11 +1246,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) if (d) { if (d->state == BT_OPEN) { /* DLC was previously opened by PN request */ - if (rfcomm_check_link_mode(d)) { - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_dlc_accept(d); + rfcomm_check_accept(d); } return 0; } @@ -1219,11 +1258,7 @@ 
static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) d->addr = __addr(s->initiator, dlci); rfcomm_dlc_link(s, d); - if (rfcomm_check_link_mode(d)) { - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_dlc_accept(d); + rfcomm_check_accept(d); } else { rfcomm_send_dm(s, dlci); } @@ -1637,11 +1672,12 @@ static void rfcomm_process_connect(struct rfcomm_session *s) d = list_entry(p, struct rfcomm_dlc, list); if (d->state == BT_CONFIG) { d->mtu = s->mtu; - if (rfcomm_check_link_mode(d)) { + if (rfcomm_check_security(d)) { + rfcomm_send_pn(s, 1, d); + } else { set_bit(RFCOMM_AUTH_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_send_pn(s, 1, d); + } } } } @@ -1717,11 +1753,17 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s) if (d->out) { rfcomm_send_pn(s, 1, d); rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); - } else - rfcomm_dlc_accept(d); - if (d->link_mode & RFCOMM_LM_SECURE) { - struct sock *sk = s->sock->sk; - hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon); + } else { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); } continue; } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { @@ -1734,6 +1776,9 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s) continue; } + if (test_bit(RFCOMM_SEC_PENDING, &d->flags)) + continue; + if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) continue; @@ -1876,6 +1921,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) bacpy(&addr.l2_bdaddr, ba); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = htobs(RFCOMM_PSM); + addr.l2_cid = 0; err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); @@ -1947,42 +1993,7 @@ static int rfcomm_run(void *unused) return 
0; } -static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status) -{ - struct rfcomm_session *s; - struct rfcomm_dlc *d; - struct list_head *p, *n; - - BT_DBG("conn %p status 0x%02x", conn, status); - - s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); - if (!s) - return; - - rfcomm_session_hold(s); - - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); - - if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && - !(conn->link_mode & HCI_LM_ENCRYPT) && !status) - continue; - - if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) - continue; - - if (!status) - set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); - else - set_bit(RFCOMM_AUTH_REJECT, &d->flags); - } - - rfcomm_session_put(s); - - rfcomm_schedule(RFCOMM_SCHED_AUTH); -} - -static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) +static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; struct rfcomm_dlc *d; @@ -1999,18 +2010,29 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) list_for_each_safe(p, n, &s->dlcs) { d = list_entry(p, struct rfcomm_dlc, list); - if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && - (d->state == BT_CONNECTED || - d->state == BT_CONFIG) && - !status && encrypt == 0x00) { - __rfcomm_dlc_close(d, ECONNREFUSED); - continue; + if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) { + rfcomm_dlc_clear_timer(d); + if (status || encrypt == 0x00) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + } + + if (d->state == BT_CONNECTED && !status && encrypt == 0x00) { + if (d->sec_level == BT_SECURITY_MEDIUM) { + set_bit(RFCOMM_SEC_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + continue; + } else if (d->sec_level == BT_SECURITY_HIGH) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } } if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) continue; - if (!status && encrypt) + if (!status) 
set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); else set_bit(RFCOMM_AUTH_REJECT, &d->flags); @@ -2023,8 +2045,7 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) static struct hci_cb rfcomm_cb = { .name = "RFCOMM", - .auth_cfm = rfcomm_auth_cfm, - .encrypt_cfm = rfcomm_encrypt_cfm + .security_cfm = rfcomm_security_cfm }; static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index d3fc6fca38d..7f482784e9f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -261,12 +261,19 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent) if (parent) { sk->sk_type = parent->sk_type; - pi->link_mode = rfcomm_pi(parent)->link_mode; + pi->dlc->defer_setup = bt_sk(parent)->defer_setup; + + pi->sec_level = rfcomm_pi(parent)->sec_level; + pi->role_switch = rfcomm_pi(parent)->role_switch; } else { - pi->link_mode = 0; + pi->dlc->defer_setup = 0; + + pi->sec_level = BT_SECURITY_LOW; + pi->role_switch = 0; } - pi->dlc->link_mode = pi->link_mode; + pi->dlc->sec_level = pi->sec_level; + pi->dlc->role_switch = pi->role_switch; } static struct proto rfcomm_proto = { @@ -406,7 +413,8 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr); rfcomm_pi(sk)->channel = sa->rc_channel; - d->link_mode = rfcomm_pi(sk)->link_mode; + d->sec_level = rfcomm_pi(sk)->sec_level; + d->role_switch = rfcomm_pi(sk)->role_switch; err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel); if (!err) @@ -554,6 +562,9 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; int sent = 0; + if (test_bit(RFCOMM_DEFER_SETUP, &d->flags)) + return -ENOTCONN; + if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -570,8 +581,11 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, 
msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) + if (!skb) { + if (sent == 0) + sent = err; break; + } skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -630,10 +644,16 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; int err = 0; size_t target, copied = 0; long timeo; + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + rfcomm_dlc_accept(d); + return 0; + } + if (flags & MSG_OOB) return -EOPNOTSUPP; @@ -710,7 +730,7 @@ out: return copied ? : err; } -static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -727,7 +747,14 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c break; } - rfcomm_pi(sk)->link_mode = opt; + if (opt & RFCOMM_LM_AUTH) + rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW; + if (opt & RFCOMM_LM_ENCRYPT) + rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM; + if (opt & RFCOMM_LM_SECURE) + rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH; + + rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER); break; default: @@ -739,12 +766,76 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c return err; } -static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return 
-ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + rfcomm_pi(sk)->sec_level = sec.level; + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sock *l2cap_sk; struct rfcomm_conninfo cinfo; int len, err = 0; + u32 opt; BT_DBG("sk %p", sk); @@ -755,12 +846,32 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c switch (optname) { case RFCOMM_LM: - if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval)) + switch (rfcomm_pi(sk)->sec_level) { + case BT_SECURITY_LOW: + opt = RFCOMM_LM_AUTH; + break; + case BT_SECURITY_MEDIUM: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | + RFCOMM_LM_SECURE; + break; + default: + opt = 0; + break; + } + + if (rfcomm_pi(sk)->role_switch) + opt |= RFCOMM_LM_MASTER; + + if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case RFCOMM_CONNINFO: - if (sk->sk_state != BT_CONNECTED) { + if (sk->sk_state != BT_CONNECTED && + !rfcomm_pi(sk)->dlc->defer_setup) { err = -ENOTCONN; break; } @@ -785,6 +896,60 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c return err; } +static int rfcomm_sock_getsockopt(struct socket *sock, int level, int 
optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = rfcomm_pi(sk)->sec_level; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk __maybe_unused = sock->sk; @@ -888,6 +1053,10 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * done: bh_unlock_sock(parent); + + if (bt_sk(parent)->defer_setup) + parent->sk_state_change(parent); + return result; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 46fd8bf9a69..51ae0c3e470 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -195,7 +195,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING); + hcon = hci_connect(hdev, type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); if (!hcon) goto done; @@ -668,7 +668,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char return err; } -static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int sco_sock_getsockopt_old(struct socket *sock, int optname, char 
__user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; @@ -723,6 +723,31 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char return err; } +static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_SCO) + return sco_sock_getsockopt_old(sock, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int sco_sock_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -832,10 +857,30 @@ done: /* ----- SCO interface with lower layer (HCI) ----- */ static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) { + register struct sock *sk; + struct hlist_node *node; + int lm = 0; + + if (type != SCO_LINK && type != ESCO_LINK) + return 0; + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); - /* Always accept connection */ - return HCI_LM_ACCEPT; + /* Find listening sockets */ + read_lock(&sco_sk_list.lock); + sk_for_each(sk, node, &sco_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) || + !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm |= HCI_LM_ACCEPT; + break; + } + } + read_unlock(&sco_sk_list.lock); + + return lm; } static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) @@ -857,7 +902,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) return 0; } -static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason) +static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { BT_DBG("hcon %p reason %d", hcon, reason); @@ -940,7 +985,7 @@ static struct hci_proto sco_hci_proto = { .id = HCI_PROTO_SCO, .connect_ind = sco_connect_ind, .connect_cfm = sco_connect_cfm, - .disconn_ind = 
sco_disconn_ind, + .disconn_cfm = sco_disconn_cfm, .recv_scodata = sco_recv_scodata }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ba7be195803..fcffb3fb117 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/can/af_can.c b/net/can/af_can.c index d90e8dd975f..547bafc79e2 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -273,8 +273,7 @@ int can_send(struct sk_buff *skb, int loop) err = net_xmit_errno(err); if (err) { - if (newskb) - kfree_skb(newskb); + kfree_skb(newskb); return err; } diff --git a/net/core/Makefile b/net/core/Makefile index 26a37cb3192..796f46eece5 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -17,3 +17,6 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o +obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o + diff --git a/net/core/datagram.c b/net/core/datagram.c index 5e2ac0c4b07..d0de644b378 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { - kfree_skb(skb); + consume_skb(skb); sk_mem_reclaim_partial(sk); } diff --git a/net/core/dev.c b/net/core/dev.c index d393fc997cd..052dd478d3e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,14 +135,6 @@ /* This should be increased if a protocol with a bigger head is added. 
*/ #define GRO_MAX_HEAD (MAX_HEADER + 128) -enum { - GRO_MERGED, - GRO_MERGED_FREE, - GRO_HELD, - GRO_NORMAL, - GRO_DROP, -}; - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -1672,23 +1664,12 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } -static void tstamp_tx(struct sk_buff *skb) -{ - union skb_shared_tx *shtx = - skb_tx(skb); - if (unlikely(shtx->software && - !shtx->in_progress)) { - skb_tstamp_tx(skb, NULL); - } -} - int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; int rc; - prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); @@ -1715,8 +1696,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * the skb destructor before the call and restoring it * afterwards, then doing the skb_orphan() ourselves? */ - if (likely(!rc)) - tstamp_tx(skb); return rc; } @@ -1732,7 +1711,6 @@ gso: skb->next = nskb; return rc; } - tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -1745,17 +1723,11 @@ out_kfree_skb: } static u32 skb_tx_hashrnd; -static int skb_tx_hashrnd_initialized = 0; -static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; - if (unlikely(!skb_tx_hashrnd_initialized)) { - get_random_bytes(&skb_tx_hashrnd, 4); - skb_tx_hashrnd_initialized = 1; - } - if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); } else if (skb->sk && skb->sk->sk_hash) { @@ -1767,6 +1739,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } +EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) @@ -2273,12 
+2246,6 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); - /* Don't receive packets in an exiting network namespace */ - if (!net_alive(dev_net(skb->dev))) { - kfree_skb(skb); - goto out; - } - #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -2499,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + for (p = napi->gro_list; p; p = p->next) { NAPI_GRO_CB(p)->same_flow = !compare_ether_header( skb_mac_header(p), skb_gro_mac_header(skb)); @@ -2657,9 +2627,9 @@ static int process_backlog(struct napi_struct *napi, int quota) local_irq_disable(); skb = __skb_dequeue(&queue->input_pkt_queue); if (!skb) { - __napi_complete(napi); local_irq_enable(); - break; + napi_complete(napi); + goto out; } local_irq_enable(); @@ -2668,6 +2638,7 @@ static int process_backlog(struct napi_struct *napi, int quota) napi_gro_flush(napi); +out: return work; } @@ -2741,7 +2712,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree(napi->skb); + kfree_skb(napi->skb); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; @@ -4355,6 +4326,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/* Some devices need to (re-)set their netdev_ops inside + * ->init() or similar. If that happens, we have to setup + * the compat pointers again. 
+ */ +void netdev_resync_ops(struct net_device *dev) +{ +#ifdef CONFIG_COMPAT_NET_DEV_OPS + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif +#endif +} +EXPORT_SYMBOL(netdev_resync_ops); + /** * register_netdevice - register a network device * @dev: device to register @@ -4399,27 +4403,7 @@ int register_netdevice(struct net_device *dev) * This is temporary until all network devices are converted. 
*/ if (dev->netdev_ops) { - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif + netdev_resync_ops(dev); } else { char drivername[64]; pr_info("%s (%s): not using net_device_ops yet\n", @@ -5291,6 +5275,14 @@ out: subsys_initcall(net_dev_init); +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); + EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 00000000000..9fd0dc3cca9 --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,263 @@ +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include 
<linux/net_dropmon.h> +#include <linux/percpu.h> +#include <linux/timer.h> +#include <linux/bitops.h> +#include <net/genetlink.h> + +#include <trace/skb.h> + +#include <asm/unaligned.h> + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +static void send_dm_alert(struct work_struct *unused); + + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +struct sock *dm_sock; + +struct per_cpu_dm_data { + struct work_struct dm_alert_work; + struct sk_buff *skb; + atomic_t dm_hit_count; + struct timer_list send_timer; +}; + +static struct genl_family net_drop_monitor_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "NET_DM", + .version = 1, + .maxattr = NET_DM_CMD_MAX, +}; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; + + +static void reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + data->skb = genlmsg_new(al, GFP_KERNEL); + genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg)); + memset(msg, 0, al); + atomic_set(&data->dm_hit_count, dm_hit_limit); +} + +static void send_dm_alert(struct work_struct *unused) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + /* + * Grab the skb we're about to send + */ + skb = data->skb; + + /* + * Replace it with a new one + */ + reset_per_cpu_data(data); + + /* + * Ship it! + */ + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. 
Note that it operates under the timer interrupt + * so we don't need to disable preemption here + */ +static void sched_send_work(unsigned long unused) +{ + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + schedule_work(&data->dm_alert_work); +} + +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct nlmsghdr *nlh; + int i; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + + if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { + /* + * we're already at zero, discard this hit + */ + goto out; + } + + nlh = (struct nlmsghdr *)data->skb->data; + msg = genlmsg_data(nlmsg_data(nlh)); + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { + msg->points[i].count++; + goto out; + } + } + + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer_on(&data->send_timer, smp_processor_id()); + } + +out: + return; +} + +static int set_all_monitor_traces(int state) +{ + int rc = 0; + + switch (state) { + case TRACE_ON: + rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + break; + case TRACE_OFF: + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + + tracepoint_synchronize_unregister(); + break; + default: + rc = 1; + break; + } + + if (rc) + return -EINPROGRESS; + return rc; +} + + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return set_all_monitor_traces(TRACE_ON); + break; + case NET_DM_CMD_STOP: + return set_all_monitor_traces(TRACE_OFF); + break; + } + 
+ return -ENOTSUPP; +} + + +static struct genl_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .doit = net_dm_cmd_config, + }, + { + .cmd = NET_DM_CMD_START, + .doit = net_dm_cmd_trace, + }, + { + .cmd = NET_DM_CMD_STOP, + .doit = net_dm_cmd_trace, + }, +}; + +static int __init init_net_drop_monitor(void) +{ + int cpu; + int rc, i, ret; + struct per_cpu_dm_data *data; + printk(KERN_INFO "Initalizing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + if (genl_register_family(&net_drop_monitor_family) < 0) { + printk(KERN_ERR "Could not create drop monitor netlink family\n"); + return -EFAULT; + } + + rc = -EFAULT; + + for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) { + ret = genl_register_ops(&net_drop_monitor_family, + &dropmon_ops[i]); + if (ret) { + printk(KERN_CRIT "failed to register operation %d\n", + dropmon_ops[i].cmd); + goto out_unreg; + } + } + + rc = 0; + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); + reset_per_cpu_data(data); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); + data->send_timer.data = cpu; + data->send_timer.function = sched_send_work; + } + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +late_initcall(init_net_drop_monitor); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 947710a36ce..244ca56dffa 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc cmd; - if (!dev->ethtool_ops->set_rxhash) + if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 
return -EFAULT; - return dev->ethtool_ops->set_rxhash(dev, &cmd); + return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc info; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; - if (!dev->ethtool_ops->get_rxhash) + if (!ops->get_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; - dev->ethtool_ops->get_rxhash(dev, &info); + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + rule_buf = kmalloc(info.rule_cnt * sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; - return 0; + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + if (rule_buf) + kfree(rule_buf); + + return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) @@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: - rc = ethtool_get_rxhash(dev, useraddr); + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, useraddr); break; case ETHTOOL_SRXFH: - rc = ethtool_set_rxhash(dev, useraddr); + case ETHTOOL_SRXCLSRLDEL: + case 
ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, useraddr); break; case ETHTOOL_GGRO: rc = ethtool_get_gro(dev, useraddr); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 32b3a0152d7..98691e1466b 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, goto errout; } - err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 278a142d104..a1cbce7fdae 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg) write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); - if (skb) - kfree_skb(skb); + kfree_skb(skb); } else { out: write_unlock(&neigh->lock); @@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - if (skb) - kfree_skb(skb); + kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1656,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); goto out_dev_put; } @@ -2534,7 +2536,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 
6ac29a46e23..2da59a0ac4a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (endp == buf) goto err; - rtnl_lock(); + if (!rtnl_trylock()) + return -ERESTARTSYS; + if (dev_isalive(net)) { if ((ret = (*set)(net, new)) == 0) ret = len; @@ -496,7 +498,7 @@ int netdev_register_kobject(struct net_device *net) dev->groups = groups; BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); - dev_set_name(dev, net->name); + dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS *groups++ = &netstat_group; diff --git a/net/core/net-traces.c b/net/core/net-traces.c new file mode 100644 index 00000000000..c8fb45665e4 --- /dev/null +++ b/net/core/net-traces.c @@ -0,0 +1,29 @@ +/* + * consolidates trace point definitions + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <trace/skb.h> + +#include <asm/unaligned.h> +#include <asm/bitops.h> + + +DEFINE_TRACE(kfree_skb); +EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 55151faaf90..e3bebd36f05 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -32,24 +32,14 @@ static __net_init int setup_net(struct net *net) { /* Must be called with net_mutex held */ struct pernet_operations *ops; - int error; - struct net_generic *ng; + int error = 0; atomic_set(&net->count, 1); + #ifdef NETNS_REFCNT_DEBUG atomic_set(&net->use_count, 0); #endif - error = -ENOMEM; - ng = kzalloc(sizeof(struct net_generic) + - 
INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL); - if (ng == NULL) - goto out; - - ng->len = INITIAL_NET_GEN_PTRS; - rcu_assign_pointer(net->gen, ng); - - error = 0; list_for_each_entry(ops, &pernet_list, list) { if (ops->init) { error = ops->init(net); @@ -70,24 +60,50 @@ out_undo: } rcu_barrier(); - kfree(ng); goto out; } +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = sizeof(struct net_generic) + + INITIAL_NET_GEN_PTRS * sizeof(void *); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = INITIAL_NET_GEN_PTRS; + + return ng; +} + #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { - return kmem_cache_zalloc(net_cachep, GFP_KERNEL); + struct net *net = NULL; + struct net_generic *ng; + + ng = net_alloc_generic(); + if (!ng) + goto out; + + net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); + if (!net) + goto out_free; + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +out_free: + kfree(ng); + goto out; } static void net_free(struct net *net) { - if (!net) - return; - #ifdef NETNS_REFCNT_DEBUG if (unlikely(atomic_read(&net->use_count) != 0)) { printk(KERN_EMERG "network namespace not free! 
Usage: %d\n", @@ -112,27 +128,28 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) err = -ENOMEM; new_net = net_alloc(); if (!new_net) - goto out; + goto out_err; mutex_lock(&net_mutex); err = setup_net(new_net); - if (err) - goto out_unlock; - - rtnl_lock(); - list_add_tail(&new_net->list, &net_namespace_list); - rtnl_unlock(); - - -out_unlock: + if (!err) { + rtnl_lock(); + list_add_tail(&new_net->list, &net_namespace_list); + rtnl_unlock(); + } mutex_unlock(&net_mutex); + + if (err) + goto out_free; out: put_net(old_net); - if (err) { - net_free(new_net); - new_net = ERR_PTR(err); - } return new_net; + +out_free: + net_free(new_net); +out_err: + new_net = ERR_PTR(err); + goto out; } static void cleanup_net(struct work_struct *work) @@ -140,9 +157,6 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; - /* Be very certain incoming network packets will not find us */ - rcu_barrier(); - net = container_of(work, struct net, work); mutex_lock(&net_mutex); @@ -188,6 +202,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) static int __init net_ns_init(void) { + struct net_generic *ng; int err; printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); @@ -202,6 +217,12 @@ static int __init net_ns_init(void) panic("Could not create netns workq"); #endif + ng = net_alloc_generic(); + if (!ng) + panic("Could not allocate generic netns"); + + rcu_assign_pointer(init_net.gen, ng); + mutex_lock(&net_mutex); err = setup_net(&init_net); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 65498483325..32d419f5ac9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3275,8 +3275,7 @@ static void pktgen_stop(struct pktgen_thread *t) list_for_each_entry(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; } @@ -3303,8 +3302,7 @@ static void pktgen_rem_one_if(struct 
pktgen_thread *t) if (!cur->removal_mark) continue; - if (cur->skb) - kfree_skb(cur->skb); + kfree_skb(cur->skb); cur->skb = NULL; pktgen_remove_device(t, cur); @@ -3328,8 +3326,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); - if (cur->skb) - kfree_skb(cur->skb); + kfree_skb(cur->skb); cur->skb = NULL; pktgen_remove_device(t, cur); @@ -3393,8 +3390,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if (!netif_running(odev)) { pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; goto out; } @@ -3415,8 +3411,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if ((++pkt_dev->clone_count >= pkt_dev->clone_skb) || (!pkt_dev->skb)) { /* build a new pkt */ - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = fill_packet(odev, pkt_dev); if (pkt_dev->skb == NULL) { @@ -3498,8 +3493,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) /* Done with this */ pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; } out:; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 790dd205bb5..d78030f88bd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } -int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; int report = 0; @@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, if (nlh) report = nlmsg_report(nlh); - return nlmsg_notify(rtnl, skb, pid, group, report, flags); + 
nlmsg_notify(rtnl, skb, pid, group, report, flags); } void rtnl_set_sk_err(struct net *net, u32 group, int error) @@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e5a8351ff12..6acbf9e79eb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,6 +65,7 @@ #include <asm/uaccess.h> #include <asm/system.h> +#include <trace/skb.h> #include "kmap_skb.h" @@ -146,14 +147,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) } EXPORT_SYMBOL(skb_under_panic); -void skb_truesize_bug(struct sk_buff *skb) -{ - WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) " - "len=%u, sizeof(sk_buff)=%Zd\n", - skb->truesize, skb->len, sizeof(struct sk_buff)); -} -EXPORT_SYMBOL(skb_truesize_bug); - /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. 
@@ -450,11 +443,32 @@ void kfree_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb); /** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + +/** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer * @skb_size: minimum receive buffer size @@ -1216,8 +1230,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } if (!pskb_pull(list, eat)) { - if (clone) - kfree_skb(clone); + kfree_skb(clone); return NULL; } break; diff --git a/net/core/sock.c b/net/core/sock.c index 40887e76652..0620046e4eb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -150,7 +150,7 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , - "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , @@ -165,7 +165,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , "slock-AF_ASH" , 
"slock-AF_ECONET" , "slock-AF_ATMSVC" , - "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , @@ -180,7 +180,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , - "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , @@ -725,7 +725,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len < 0) return -EINVAL; - v.val = 0; + memset(&v, 0, sizeof(v)); switch(optname) { case SO_DEBUG: @@ -1185,7 +1185,6 @@ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; - skb_truesize_check(skb); atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(skb->sk, skb->truesize); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 83d3398559e..7db1de0497c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -11,6 +11,7 @@ #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/init.h> +#include <net/ip.h> #include <net/sock.h> static struct ctl_table net_core_table[] = { diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 45f95e55f87..7ea557b7c6b 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -20,6 +20,9 @@ /* We can spread an ack vector across multiple options */ #define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) +/* Estimated minimum average Ack Vector length - used for updating MPS */ +#define DCCPAV_MIN_OPTLEN 16 + #define DCCP_ACKVEC_STATE_RECEIVED 0 #define 
DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 08a569ff02d..d6bc47363b1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) +/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ +#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) + #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ diff --git a/net/dccp/output.c b/net/dccp/output.c index 22a618af489..36bcc00654d 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * FIXME: this should come from the CCID infrastructure, where, say, - * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets - * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED - * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to - * make it a multiple of 4 + * Leave enough headroom for common DCCP header options. 
+ * This only considers options which may appear on DCCP-Data packets, as + * per table 3 in RFC 4340, 5.8. When running out of space for other + * options (eg. Ack Vector which can take up to 255 bytes), it is better + * to schedule a separate Ack. Thus we leave headroom for the following: + * - 1 byte for Slow Receiver (11.6) + * - 6 bytes for Timestamp (13.1) + * - 10 bytes for Timestamp Echo (13.3) + * - 8 bytes for NDP count (7.7, when activated) + * - 6 bytes for Data Checksum (9.3) + * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) */ - - cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4); + cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -270,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block) const int len = skb->len; if (sk->sk_state == DCCP_PARTOPEN) { - /* See 8.1.5. Handshake Completion */ + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. 
+ */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 12bf7d4c16c..9647d911f91 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1246,11 +1246,12 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: lock_sock(sk); - if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) { + skb = skb_peek(&scp->other_receive_queue); + if (skb) { amount = skb->len; } else { - struct sk_buff *skb = sk->sk_receive_queue.next; - for(;;) { + skb = sk->sk_receive_queue.next; + for (;;) { if (skb == (struct sk_buff *)&sk->sk_receive_queue) break; @@ -1579,16 +1580,16 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us default: #ifdef CONFIG_NETFILTER { - int val, len; + int ret, len; if(get_user(len, optlen)) return -EFAULT; - val = nf_getsockopt(sk, PF_DECnet, optname, + ret = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); - if (val >= 0) - val = put_user(len, optlen); - return val; + if (ret >= 0) + ret = put_user(len, optlen); + return ret; } #endif case DSO_STREAM: @@ -2071,8 +2072,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, } out: - if (skb) - kfree_skb(skb); + kfree_skb(skb); release_sock(sk); @@ -2112,9 +2112,8 @@ static struct notifier_block dn_dev_notifier = { extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); -static struct packet_type dn_dix_packet_type = { +static struct packet_type dn_dix_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DNA_RT), - .dev = NULL, /* All devices */ .func = dn_route_rcv, }; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index daf2b98b15f..1c6a5bb6f0c 100644 
--- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -684,7 +684,6 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -ENODEV; if ((dn_db = dev->dn_ptr) == NULL) { - int err; dn_db = dn_dev_create(dev, &err); if (!dn_db) return err; @@ -769,7 +768,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); @@ -1322,6 +1322,7 @@ static inline int is_dn_dev(struct net_device *dev) } static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&dev_base_lock) { int i; struct net_device *dev; @@ -1364,6 +1365,7 @@ static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void dn_dev_seq_stop(struct seq_file *seq, void *v) + __releases(&dev_base_lock) { read_unlock(&dev_base_lock); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 5130dee0b38..0cc4394117d 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -380,7 +380,6 @@ static int dn_return_short(struct sk_buff *skb) unsigned char *ptr; __le16 *src; __le16 *dst; - __le16 tmp; /* Add back headers */ skb_push(skb, skb->data - skb_network_header(skb)); @@ -399,10 +398,7 @@ static int dn_return_short(struct sk_buff *skb) ptr += 2; *ptr = 0; /* Zero hop count */ - /* Swap source and destination */ - tmp = *src; - *src = *dst; - *dst = tmp; + swap(*src, *dst); skb->pkt_type = PACKET_OUTGOING; dn_rt_finish_output(skb, NULL, NULL); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 69ad9280c69..67054b0d550 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 
&init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 965397af9a8..5bcd592ae6d 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -179,7 +179,7 @@ static int dn_node_address_handler(ctl_table *table, int write, } if (write) { - int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); + len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); if (copy_from_user(addr, buffer, len)) return -EFAULT; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 49211b35725..c51b55400dc 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -41,13 +41,13 @@ config NET_DSA_MV88E6XXX_NEED_PPU default n config NET_DSA_MV88E6131 - bool "Marvell 88E6131 ethernet switch chip support" + bool "Marvell 88E6095/6095F/6131 ethernet switch chip support" select NET_DSA_MV88E6XXX select NET_DSA_MV88E6XXX_NEED_PPU select NET_DSA_TAG_DSA ---help--- - This enables support for the Marvell 88E6131 ethernet switch - chip. + This enables support for the Marvell 88E6095/6095F/6131 + ethernet switch chips. 
config NET_DSA_MV88E6123_61_65 bool "Marvell 88E6123/6161/6165 ethernet switch chip support" diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 33e99462023..71489f69a42 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1,6 +1,6 @@ /* * net/dsa/dsa.c - Hardware switch handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,12 +67,13 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) /* basic switch operations **************************************************/ static struct dsa_switch * -dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, - struct mii_bus *bus, struct net_device *dev) +dsa_switch_setup(struct dsa_switch_tree *dst, int index, + struct device *parent, struct mii_bus *bus) { + struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_switch_driver *drv; struct dsa_switch *ds; int ret; - struct dsa_switch_driver *drv; char *name; int i; @@ -81,11 +82,12 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, */ drv = dsa_switch_probe(bus, pd->sw_addr, &name); if (drv == NULL) { - printk(KERN_ERR "%s: could not detect attached switch\n", - dev->name); + printk(KERN_ERR "%s[%d]: could not detect attached switch\n", + dst->master_netdev->name, index); return ERR_PTR(-EINVAL); } - printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name); + printk(KERN_INFO "%s[%d]: detected a %s switch\n", + dst->master_netdev->name, index, name); /* @@ -95,18 +97,16 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, if (ds == NULL) return ERR_PTR(-ENOMEM); - ds->pd = pd; - ds->master_netdev = dev; - ds->master_mii_bus = bus; - + ds->dst = dst; + ds->index = index; + ds->pd = dst->pd->chip + index; ds->drv = drv; - ds->tag_protocol = drv->tag_protocol; + ds->master_mii_bus = bus; /* * Validate 
supplied switch configuration. */ - ds->cpu_port = -1; for (i = 0; i < DSA_MAX_PORTS; i++) { char *name; @@ -115,32 +115,28 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, continue; if (!strcmp(name, "cpu")) { - if (ds->cpu_port != -1) { + if (dst->cpu_switch != -1) { printk(KERN_ERR "multiple cpu ports?!\n"); ret = -EINVAL; goto out; } - ds->cpu_port = i; + dst->cpu_switch = index; + dst->cpu_port = i; + } else if (!strcmp(name, "dsa")) { + ds->dsa_port_mask |= 1 << i; } else { - ds->valid_port_mask |= 1 << i; + ds->phys_port_mask |= 1 << i; } } - if (ds->cpu_port == -1) { - printk(KERN_ERR "no cpu port?!\n"); - ret = -EINVAL; - goto out; - } - /* - * If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. (Which will - * discard received packets until we set ds->ports[] below.) + * If the CPU connects to this switch, set the switch tree + * tagging protocol to the preferred tagging format of this + * switch. */ - wmb(); - dev->dsa_ptr = (void *)ds; + if (ds->dst->cpu_switch == index) + ds->dst->tag_protocol = drv->tag_protocol; /* @@ -150,7 +146,7 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, if (ret < 0) goto out; - ret = drv->set_addr(ds, dev->dev_addr); + ret = drv->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) goto out; @@ -169,18 +165,18 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, /* * Create network devices for physical switch ports. 
*/ - wmb(); for (i = 0; i < DSA_MAX_PORTS; i++) { struct net_device *slave_dev; - if (!(ds->valid_port_mask & (1 << i))) + if (!(ds->phys_port_mask & (1 << i))) continue; slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]); if (slave_dev == NULL) { - printk(KERN_ERR "%s: can't create dsa slave " - "device for port %d(%s)\n", - dev->name, i, pd->port_names[i]); + printk(KERN_ERR "%s[%d]: can't create dsa " + "slave device for port %d(%s)\n", + dst->master_netdev->name, + index, i, pd->port_names[i]); continue; } @@ -192,7 +188,6 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, out_free: mdiobus_free(ds->slave_mii_bus); out: - dev->dsa_ptr = NULL; kfree(ds); return ERR_PTR(ret); } @@ -212,35 +207,42 @@ static void dsa_switch_destroy(struct dsa_switch *ds) */ bool dsa_uses_dsa_tags(void *dsa_ptr) { - struct dsa_switch *ds = dsa_ptr; + struct dsa_switch_tree *dst = dsa_ptr; - return !!(ds->tag_protocol == htons(ETH_P_DSA)); + return !!(dst->tag_protocol == htons(ETH_P_DSA)); } bool dsa_uses_trailer_tags(void *dsa_ptr) { - struct dsa_switch *ds = dsa_ptr; + struct dsa_switch_tree *dst = dsa_ptr; - return !!(ds->tag_protocol == htons(ETH_P_TRAILER)); + return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); } /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) { - struct dsa_switch *ds; + struct dsa_switch_tree *dst; + int i; + + dst = container_of(ugly, struct dsa_switch_tree, link_poll_work); - ds = container_of(ugly, struct dsa_switch, link_poll_work); + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; - ds->drv->poll_link(ds); - mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ)); + if (ds != NULL && ds->drv->poll_link != NULL) + ds->drv->poll_link(ds); + } + + mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ)); } -static void dsa_link_poll_timer(unsigned long _ds) +static void 
dsa_link_poll_timer(unsigned long _dst) { - struct dsa_switch *ds = (void *)_ds; + struct dsa_switch_tree *dst = (void *)_dst; - schedule_work(&ds->link_poll_work); + schedule_work(&dst->link_poll_work); } @@ -303,18 +305,14 @@ static int dsa_probe(struct platform_device *pdev) static int dsa_version_printed; struct dsa_platform_data *pd = pdev->dev.platform_data; struct net_device *dev; - struct mii_bus *bus; - struct dsa_switch *ds; + struct dsa_switch_tree *dst; + int i; if (!dsa_version_printed++) printk(KERN_NOTICE "Distributed Switch Architecture " "driver version %s\n", dsa_driver_version); - if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL) - return -EINVAL; - - bus = dev_to_mii_bus(pd->mii_bus); - if (bus == NULL) + if (pd == NULL || pd->netdev == NULL) return -EINVAL; dev = dev_to_net_device(pd->netdev); @@ -326,36 +324,79 @@ static int dsa_probe(struct platform_device *pdev) return -EEXIST; } - ds = dsa_switch_setup(&pdev->dev, pd, bus, dev); - if (IS_ERR(ds)) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (dst == NULL) { dev_put(dev); - return PTR_ERR(ds); + return -ENOMEM; } - if (ds->drv->poll_link != NULL) { - INIT_WORK(&ds->link_poll_work, dsa_link_poll_work); - init_timer(&ds->link_poll_timer); - ds->link_poll_timer.data = (unsigned long)ds; - ds->link_poll_timer.function = dsa_link_poll_timer; - ds->link_poll_timer.expires = round_jiffies(jiffies + HZ); - add_timer(&ds->link_poll_timer); + platform_set_drvdata(pdev, dst); + + dst->pd = pd; + dst->master_netdev = dev; + dst->cpu_switch = -1; + dst->cpu_port = -1; + + for (i = 0; i < pd->nr_chips; i++) { + struct mii_bus *bus; + struct dsa_switch *ds; + + bus = dev_to_mii_bus(pd->chip[i].mii_bus); + if (bus == NULL) { + printk(KERN_ERR "%s[%d]: no mii bus found for " + "dsa switch\n", dev->name, i); + continue; + } + + ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + if (IS_ERR(ds)) { + printk(KERN_ERR "%s[%d]: couldn't create dsa switch " + "instance (error %ld)\n", dev->name, i, + 
PTR_ERR(ds)); + continue; + } + + dst->ds[i] = ds; + if (ds->drv->poll_link != NULL) + dst->link_poll_needed = 1; } - platform_set_drvdata(pdev, ds); + /* + * If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dev->dsa_ptr = (void *)dst; + + if (dst->link_poll_needed) { + INIT_WORK(&dst->link_poll_work, dsa_link_poll_work); + init_timer(&dst->link_poll_timer); + dst->link_poll_timer.data = (unsigned long)dst; + dst->link_poll_timer.function = dsa_link_poll_timer; + dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); + add_timer(&dst->link_poll_timer); + } return 0; } static int dsa_remove(struct platform_device *pdev) { - struct dsa_switch *ds = platform_get_drvdata(pdev); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i; - if (ds->drv->poll_link != NULL) - del_timer_sync(&ds->link_poll_timer); + if (dst->link_poll_needed) + del_timer_sync(&dst->link_poll_timer); flush_scheduled_work(); - dsa_switch_destroy(ds); + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + dsa_switch_destroy(ds); + } return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7063378a1eb..41055f33d28 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -1,6 +1,6 @@ /* * net/dsa/dsa_priv.h - Hardware switch handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,42 +19,107 @@ struct dsa_switch { /* - * Configuration data for the platform device that owns - * this dsa switch instance. + * Parent switch tree, and switch index. */ - struct dsa_platform_data *pd; + struct dsa_switch_tree *dst; + int index; /* - * References to network device and mii bus to use. 
+ * Configuration data for this switch. */ - struct net_device *master_netdev; - struct mii_bus *master_mii_bus; + struct dsa_chip_data *pd; /* - * The used switch driver and frame tagging type. + * The used switch driver. */ struct dsa_switch_driver *drv; - __be16 tag_protocol; + + /* + * Reference to mii bus to use. + */ + struct mii_bus *master_mii_bus; /* * Slave mii_bus and devices for the individual ports. */ - int cpu_port; - u32 valid_port_mask; - struct mii_bus *slave_mii_bus; - struct net_device *ports[DSA_MAX_PORTS]; + u32 dsa_port_mask; + u32 phys_port_mask; + struct mii_bus *slave_mii_bus; + struct net_device *ports[DSA_MAX_PORTS]; +}; + +struct dsa_switch_tree { + /* + * Configuration data for the platform device that owns + * this dsa switch tree instance. + */ + struct dsa_platform_data *pd; + + /* + * Reference to network device to use, and which tagging + * protocol to use. + */ + struct net_device *master_netdev; + __be16 tag_protocol; + + /* + * The switch and port to which the CPU is attached. + */ + s8 cpu_switch; + s8 cpu_port; /* * Link state polling. */ - struct work_struct link_poll_work; - struct timer_list link_poll_timer; + int link_poll_needed; + struct work_struct link_poll_work; + struct timer_list link_poll_timer; + + /* + * Data for the individual switch chips. + */ + struct dsa_switch *ds[DSA_MAX_SWITCHES]; }; +static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) +{ + return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port); +} + +static inline u8 dsa_upstream_port(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + /* + * If this is the root switch (i.e. the switch that connects + * to the CPU), return the cpu port number on this switch. + * Else return the (DSA) port number that connects to the + * switch that is one hop closer to the cpu. 
+ */ + if (dst->cpu_switch == ds->index) + return dst->cpu_port; + else + return ds->pd->rtable[dst->cpu_switch]; +} + struct dsa_slave_priv { + /* + * The linux network interface corresponding to this + * switch port. + */ struct net_device *dev; + + /* + * Which switch this port is a part of, and the port index + * for this port. + */ struct dsa_switch *parent; - int port; + u8 port; + + /* + * The phylib phy_device pointer for the PHY connected + * to this port. + */ struct phy_device *phy; }; diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c index 85081ae9fe8..83277f463af 100644 --- a/net/dsa/mv88e6060.c +++ b/net/dsa/mv88e6060.c @@ -1,6 +1,6 @@ /* * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -81,7 +81,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds) /* * Reset the switch. */ - REG_WRITE(REG_GLOBAL, 0x0A, 0xa130); + REG_WRITE(REG_GLOBAL, 0x0a, 0xa130); /* * Wait up to one second for reset to complete. @@ -128,7 +128,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) * state to Forwarding. Additionally, if this is the CPU * port, enable Ingress and Egress Trailer tagging mode. */ - REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003); + REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003); /* * Port based VLAN map: give each port its own address @@ -138,9 +138,9 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) */ REG_WRITE(addr, 0x06, ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? - ds->valid_port_mask : - (1 << ds->cpu_port))); + (dsa_is_cpu_port(ds, p) ? 
+ ds->phys_port_mask : + (1 << ds->dst->cpu_port))); /* * Port Association Vector: when learning source addresses diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c index 10031872221..52faaa21a4d 100644 --- a/net/dsa/mv88e6123_61_65.c +++ b/net/dsa/mv88e6123_61_65.c @@ -1,6 +1,6 @@ /* * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -98,17 +98,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) return ret; /* - * Configure the cpu port, and configure the cpu port as the - * port to which ingress and egress monitor frames are to be - * sent. + * Configure the upstream port, and configure the upstream + * port as the port to which ingress and egress monitor frames + * are to be sent. */ - REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110)); + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110)); /* * Disable remote management for now, and set the switch's - * DSA device number to zero. + * DSA device number. */ - REG_WRITE(REG_GLOBAL, 0x1c, 0x0000); + REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f); /* * Send all frames with destination addresses matching @@ -133,10 +133,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); /* - * Map all DSA device IDs to the CPU port. + * Program the DSA routing table. */ - for (i = 0; i < 32; i++) - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } /* * Clear all trunk masks. 
@@ -176,12 +183,18 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) { int addr = REG_PORT(p); + u16 val; /* * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values. + * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * full duplex. */ - REG_WRITE(addr, 0x01, 0x0003); + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + REG_WRITE(addr, 0x01, 0x003e); + else + REG_WRITE(addr, 0x01, 0x0003); /* * Do not limit the period of time that this port can be @@ -192,37 +205,50 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) /* * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock, - * configure the requested (DSA/EDSA) tagging mode if this is - * the CPU port, disable Header mode, enable IGMP/MLD snooping, - * disable VLAN tunneling, determine priority by looking at - * 802.1p and IP priority fields (IP prio has precedence), and - * set STP state to Forwarding. Finally, if this is the CPU - * port, additionally enable forwarding of unknown unicast and - * multicast addresses. - */ - REG_WRITE(addr, 0x04, - (p == ds->cpu_port) ? - (ds->tag_protocol == htons(ETH_P_DSA)) ? - 0x053f : 0x373f : - 0x0433); + * disable Header mode, enable IGMP/MLD snooping, disable VLAN + * tunneling, determine priority by looking at 802.1p and IP + * priority fields (IP prio has precedence), and set STP state + * to Forwarding. + * + * If this is the CPU link, use DSA or EDSA tagging depending + * on which tagging mode was configured. + * + * If this is a link to another switch, use DSA tagging mode. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts and multicasts. 
+ */ + val = 0x0433; + if (dsa_is_cpu_port(ds, p)) { + if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) + val |= 0x3300; + else + val |= 0x0100; + } + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + if (p == dsa_upstream_port(ds)) + val |= 0x000c; + REG_WRITE(addr, 0x04, val); /* * Port Control 1: disable trunking. Also, if this is the * CPU port, enable learn messages to be sent to this port. */ - REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); /* * Port based VLAN map: give each port its own address * database, allow the CPU port to talk to each of the 'real' * ports, and allow each of the 'real' ports to only talk to - * the CPU port. - */ - REG_WRITE(addr, 0x06, - ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? - ds->valid_port_mask : - (1 << ds->cpu_port))); + * the upstream port. + */ + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); /* * Default VLAN ID and priority: don't set a default VLAN diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c index 70fae2444cb..bb2b41bc854 100644 --- a/net/dsa/mv88e6131.c +++ b/net/dsa/mv88e6131.c @@ -1,6 +1,6 @@ /* - * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support - * Copyright (c) 2008 Marvell Semiconductor + * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,6 +21,8 @@ static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; + if (ret == 0x0950) + return "Marvell 88E6095/88E6095F"; if (ret == 0x1060) return "Marvell 88E6131"; } @@ -36,7 +38,7 @@ static int mv88e6131_switch_reset(struct dsa_switch *ds) /* * 
Set all ports to the disabled state. */ - for (i = 0; i < 8; i++) { + for (i = 0; i < 11; i++) { ret = REG_READ(REG_PORT(i), 0x04); REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); } @@ -100,17 +102,17 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL, 0x19, 0x8100); /* - * Disable ARP mirroring, and configure the cpu port as the - * port to which ingress and egress monitor frames are to be - * sent. + * Disable ARP mirroring, and configure the upstream port as + * the port to which ingress and egress monitor frames are to + * be sent. */ - REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0); + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0); /* * Disable cascade port functionality, and set the switch's - * DSA device number to zero. + * DSA device number. */ - REG_WRITE(REG_GLOBAL, 0x1c, 0xe000); + REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f)); /* * Send all frames with destination addresses matching @@ -127,16 +129,23 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); /* - * Map all DSA device IDs to the CPU port. + * Program the DSA routing table. */ - for (i = 0; i < 32; i++) - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } /* * Clear all trunk masks. */ for (i = 0; i < 8; i++) - REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff); + REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff); /* * Clear all trunk mappings. @@ -156,12 +165,18 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) static int mv88e6131_setup_port(struct dsa_switch *ds, int p) { int addr = REG_PORT(p); + u16 val; /* * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values. 
+ * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * full duplex. */ - REG_WRITE(addr, 0x01, 0x0003); + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + REG_WRITE(addr, 0x01, 0x003e); + else + REG_WRITE(addr, 0x01, 0x0003); /* * Port Control: disable Core Tag, disable Drop-on-Lock, @@ -169,29 +184,40 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN * tunneling, determine priority by looking at 802.1p and * IP priority fields (IP prio has precedence), and set STP - * state to Forwarding. Finally, if this is the CPU port, - * additionally enable DSA tagging and forwarding of unknown - * unicast addresses. + * state to Forwarding. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts, and enable DSA tagging + * mode. + * + * If this is the link to another switch, use DSA tagging + * mode, but do not enable forwarding of unknown unicasts. */ - REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433); + val = 0x0433; + if (p == dsa_upstream_port(ds)) + val |= 0x0104; + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + REG_WRITE(addr, 0x04, val); /* * Port Control 1: disable trunking. Also, if this is the * CPU port, enable learn messages to be sent to this port. */ - REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); /* * Port based VLAN map: give each port its own address * database, allow the CPU port to talk to each of the 'real' * ports, and allow each of the 'real' ports to only talk to - * the CPU port. + * the upstream port. */ - REG_WRITE(addr, 0x06, - ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? 
- ds->valid_port_mask : - (1 << ds->cpu_port))); + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); /* * Default VLAN ID and priority: don't set a default VLAN @@ -207,13 +233,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * untagged frames on this port, do a destination address * lookup on received packets as usual, don't send a copy * of all transmitted/received frames on this port to the - * CPU, and configure the CPU port number. Also, if this - * is the CPU port, enable forwarding of unknown multicast - * addresses. + * CPU, and configure the upstream port number. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown multicast addresses. */ - REG_WRITE(addr, 0x08, - ((p == ds->cpu_port) ? 0x00c0 : 0x0080) | - ds->cpu_port); + val = 0x0080 | dsa_upstream_port(ds); + if (p == dsa_upstream_port(ds)) + val |= 0x0040; + REG_WRITE(addr, 0x08, val); /* * Rate Control: disable ingress rate limiting. 
@@ -268,7 +296,7 @@ static int mv88e6131_setup(struct dsa_switch *ds) if (ret < 0) return ret; - for (i = 0; i < 6; i++) { + for (i = 0; i < 11; i++) { ret = mv88e6131_setup_port(ds, i); if (ret < 0) return ret; @@ -279,7 +307,7 @@ static int mv88e6131_setup(struct dsa_switch *ds) static int mv88e6131_port_to_phy_addr(int port) { - if (port >= 0 && port != 3 && port <= 7) + if (port >= 0 && port <= 11) return port; return -1; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a68fd79e9ec..ed131181215 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1,6 +1,6 @@ /* * net/dsa/slave.c - Slave device handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; - if (ds->valid_port_mask & (1 << addr)) + if (ds->phys_port_mask & (1 << addr)) return ds->drv->phy_read(ds, addr, reg); return 0xffff; @@ -29,7 +29,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; - if (ds->valid_port_mask & (1 << addr)) + if (ds->phys_port_mask & (1 << addr)) return ds->drv->phy_write(ds, addr, reg, val); return 0; @@ -43,15 +43,24 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x", ds->master_mii_bus->id, ds->pd->sw_addr); - ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev); + ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; } /* slave device handling ****************************************************/ +static int dsa_slave_init(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + dev->iflink = p->parent->dst->master_netdev->ifindex; + + return 0; +} + static 
int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; int err; if (!(master->flags & IFF_UP)) @@ -89,7 +98,7 @@ out: static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; dev_mc_unsync(master, dev); dev_unicast_unsync(master, dev); @@ -107,7 +116,7 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -118,7 +127,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) static void dsa_slave_set_rx_mode(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; dev_mc_sync(master, dev); dev_unicast_sync(master, dev); @@ -127,7 +136,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev) static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; struct sockaddr *addr = a; int err; @@ -288,6 +297,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { #ifdef CONFIG_NET_DSA_TAG_DSA static const struct net_device_ops dsa_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = dsa_xmit, @@ -300,6 +310,7 @@ static const struct net_device_ops 
dsa_netdev_ops = { #endif #ifdef CONFIG_NET_DSA_TAG_EDSA static const struct net_device_ops edsa_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = edsa_xmit, @@ -312,6 +323,7 @@ static const struct net_device_ops edsa_netdev_ops = { #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER static const struct net_device_ops trailer_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = trailer_xmit, @@ -328,7 +340,7 @@ struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) { - struct net_device *master = ds->master_netdev; + struct net_device *master = ds->dst->master_netdev; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; @@ -343,7 +355,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN); slave_dev->tx_queue_len = 0; - switch (ds->tag_protocol) { + switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): slave_dev->netdev_ops = &dsa_netdev_ops; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 63e532a69fd..8fa25bafe6c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,7 +36,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct tagged FROM_CPU DSA tag from 802.1q tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60; + dsa_header[0] = 0x60 | p->parent->index; dsa_header[1] = p->port << 3; /* @@ -57,7 +57,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct untagged FROM_CPU DSA tag. 
*/ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40; + dsa_header[0] = 0x40 | p->parent->index; dsa_header[1] = p->port << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; @@ -65,7 +65,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = htons(ETH_P_DSA); - skb->dev = p->parent->master_netdev; + skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); return NETDEV_TX_OK; @@ -78,11 +78,13 @@ out_free: static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *dsa_header; + int source_device; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) goto out_drop; skb = skb_unshare(skb, GFP_ATOMIC); @@ -98,16 +100,24 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, dsa_header = skb->data - 2; /* - * Check that frame type is either TO_CPU or FORWARD, and - * that the source device is zero. + * Check that frame type is either TO_CPU or FORWARD. */ - if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0) + if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) goto out_drop; /* - * Check that the source port is a registered DSA port. + * Determine source device and port. */ + source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. 
+ */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; @@ -175,7 +185,7 @@ out: return 0; } -static struct packet_type dsa_packet_type = { +static struct packet_type dsa_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DSA), .func = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6197f9a7ef4..815607bd286 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_edsa.c - Ethertype DSA tagging - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,7 +45,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x60; + edsa_header[4] = 0x60 | p->parent->index; edsa_header[5] = p->port << 3; /* @@ -70,7 +70,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x40; + edsa_header[4] = 0x40 | p->parent->index; edsa_header[5] = p->port << 3; edsa_header[6] = 0x00; edsa_header[7] = 0x00; @@ -78,7 +78,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = htons(ETH_P_EDSA); - skb->dev = p->parent->master_netdev; + skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); return NETDEV_TX_OK; @@ -91,11 +91,13 @@ out_free: static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *edsa_header; + int source_device; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) 
goto out_drop; skb = skb_unshare(skb, GFP_ATOMIC); @@ -111,16 +113,24 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, edsa_header = skb->data + 2; /* - * Check that frame type is either TO_CPU or FORWARD, and - * that the source device is zero. + * Check that frame type is either TO_CPU or FORWARD. */ - if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0) + if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0) goto out_drop; /* - * Check that the source port is a registered DSA port. + * Determine source device and port. */ + source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. + */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; @@ -194,7 +204,7 @@ out: return 0; } -static struct packet_type edsa_packet_type = { +static struct packet_type edsa_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_EDSA), .func = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d7e7f424ff0..1c3e30c38b8 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_trailer.c - Trailer tag format handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -59,7 +59,7 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev) nskb->protocol = htons(ETH_P_TRAILER); - nskb->dev = p->parent->master_netdev; + nskb->dev = p->parent->dst->master_netdev; dev_queue_xmit(nskb); return NETDEV_TX_OK; @@ -68,12 +68,14 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev) static int trailer_rcv(struct sk_buff *skb, struct 
net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *trailer; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) goto out_drop; + ds = dst->ds[0]; skb = skb_unshare(skb, GFP_ATOMIC); if (skb == NULL) @@ -111,7 +113,7 @@ out: return 0; } -static struct packet_type trailer_packet_type = { +static struct packet_type trailer_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_TRAILER), .func = trailer_rcv, }; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 7bf35582f65..6f479fa522c 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1102,7 +1102,7 @@ drop: return NET_RX_DROP; } -static struct packet_type econet_packet_type = { +static struct packet_type econet_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ECONET), .func = econet_rcv, }; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 691268f3a35..b2cf91e4cca 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER at boot time after the /proc file system has been mounted. - If you turn on IP forwarding, you will also get the rp_filter, which + If you turn on IP forwarding, you should consider the rp_filter, which automatically rejects incoming packets if the routing table entry for their source address doesn't match the network interface they're arriving on. This has security advantages because it prevents the @@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter - or + and echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter + Note that some distributions enable it in startup scripts. + For details about rp_filter strict and loose mode read + <file:Documentation/networking/ip-sysctl.txt>. + If unsure, say N here. 
-choice +choice prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" depends on IP_ADVANCED_ROUTER default ASK_IP_FIB_HASH @@ -59,27 +63,29 @@ choice config ASK_IP_FIB_HASH bool "FIB_HASH" ---help--- - Current FIB is very proven and good enough for most users. + Current FIB is very proven and good enough for most users. config IP_FIB_TRIE bool "FIB_TRIE" ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ - + Use new experimental LC-trie as FIB lookup algorithm. + This improves lookup performance if you have a large + number of routes. + + LC-trie is a longest matching prefix lookup algorithm which + performs better than FIB_HASH for large routing tables. + But, it consumes more memory and is more complex. + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, + June 1999 + + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + endchoice config IP_FIB_HASH @@ -191,7 +197,7 @@ config IP_PNP_RARP <file:Documentation/filesystems/nfsroot.txt> for details. # not yet ready.. 
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +# bool ' IP: ARP support' CONFIG_IP_PNP_ARP config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL @@ -361,7 +367,7 @@ config INET_IPCOMP ---help--- Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. - + If unsure, say Y. config INET_XFRM_TUNNEL @@ -415,7 +421,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. - + If unsure, say Y. config INET_TCP_DIAG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 627be4dc7fb..d5aaabbb7cb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1500,7 +1500,7 @@ static int ipv4_proc_init(void); * IP protocol layer initialiser */ -static struct packet_type ip_packet_type = { +static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .gso_send_check = inet_gso_send_check, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3f6b7354699..f11931c1838 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb) * cache. */ - /* Special case: IPv4 duplicate address detection packet (RFC2131) */ - if (sip == 0) { + /* + * Special case: IPv4 duplicate address detection packet (RFC2131) + * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4) + */ + if (sip == 0 || tip == sip) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type(net, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) @@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } @@ -1225,7 +1228,7 @@ void arp_ifdown(struct net_device *dev) * Called once on startup. 
*/ -static struct packet_type arp_packet_type = { +static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6bb2635b5de..7bc992976d2 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -3,11 +3,16 @@ * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in - * FIPS-188, copies of both documents can be found in the Documentation - * directory. While CIPSO never became a full IETF RFC standard many vendors + * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * + * The CIPSO draft specification can be found in the kernel's Documentation + * directory as well as the following URL: + * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt + * The FIPS-188 specification can be found at the following URL: + * http://www.itl.nist.gov/fipspubs/fip188.htm + * * Author: Paul Moore <paul.moore@hp.com> * */ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d519a6a6672..126bb911880 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 741e4fa3e47..cafcc49d099 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fib_res_put(&res); if (no_addr) goto last_resort; - if (rpf) + if (rpf 
== 1) goto e_inval; fl.oif = dev->ifindex; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4817dea3bc7..f831df50090 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, - info->nlh, GFP_KERNEL); + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 382800a62b3..3f50807237e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1207,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = { int __init icmp_init(void) { - return register_pernet_device(&icmp_sk_ops); + return register_pernet_subsys(&icmp_sk_ops); } EXPORT_SYMBOL(icmp_err_convert); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6c52e08f786..eaf3e2c8646 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) + __releases(&f->lock) { struct inet_frag_queue *q; struct hlist_node *n; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 6659ac000ee..7985346653b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -463,6 +463,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev) { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; struct sk_buff *fp, *head = qp->q.fragments; int len; @@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - 
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 07a188afb3a..e62510d5ea5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -491,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -803,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) #endif if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 5079dfbc6f3..9054139795a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else @@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = { .priority = 1, }; -static char banner[] __initdata = +static const char banner[] __initconst = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; static void ipip_destroy_tunnels(struct ipip_net *ipn) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90b2f3c192f..2451aeb5ac2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -661,6 +661,47 @@ struct 
sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal, old_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } + } + + return max(xmit_size_goal, mss_now); +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (psize > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); @@ -761,8 +801,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -844,8 +883,7 @@ 
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (--iovlen >= 0) { int seglen = iov->iov_len; @@ -1007,8 +1045,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } @@ -1045,8 +1082,7 @@ out_err: */ static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags, - int *addr_len) + struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1661,7 +1697,7 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags); goto out; } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 7eb7636db0d..3b53fd1af23 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tcp_slow_start(tp); else { bictcp_update(ca, tp->snd_cwnd); - - /* In dangerous area, increase slowly. 
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= ca->cnt) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, ca->cnt); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4ec5b4e97c4..e92beb9e55e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp) } EXPORT_SYMBOL_GPL(tcp_slow_start); +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +{ + if (tp->snd_cwnd_cnt >= w) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); + /* * TCP Reno congestion control * This is special case used for fallback as well. @@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd++; } } else { - /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ee467ec40c4..71d5f2f29fa 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tcp_slow_start(tp); } else { bictcp_update(ca, tp->snd_cwnd); - - /* In dangerous area, increase slowly. 
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= ca->cnt) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, ca->cnt); } } diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 937549b8a92..26d5c7fc7de 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt return; /* achieved throughput calculations */ - if (icsk->icsk_ca_state != TCP_CA_Open && - icsk->icsk_ca_state != TCP_CA_Disorder) { + if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) { ca->packetcount = 0; ca->lasttime = now; return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a6961d75c7e..2bc8e27a163 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -64,6 +64,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/kernel.h> #include <net/dst.h> #include <net/tcp.h> #include <net/inet_common.h> @@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk) if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) continue; - if (after(received_upto, ack_seq) && - (tcp_is_fack(tp) || - !before(received_upto, - ack_seq + tp->reordering * tp->mss_cache))) { + /* TODO: We would like to get rid of tcp_is_fack(tp) only + * constraint here (see above) but figuring out that at + * least tp->reordering SACK blocks reside between ack_seq + * and received_upto is not easy task to do cheaply with + * the available datastructures. + * + * Whether FACK should check here for tp->reordering segs + * in-between one could argue for either way (it would be + * rather simple to implement as we could count fack_count + * during the walk and do tp->fackets_out - fack_count). 
+ */ + if (after(received_upto, ack_seq)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1374,7 +1383,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, - unsigned int pcount, int shifted, int mss) + unsigned int pcount, int shifted, int mss, + int dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); @@ -1410,7 +1420,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* We discard results */ - tcp_sacktag_one(skb, sk, state, 0, pcount); + tcp_sacktag_one(skb, sk, state, dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1561,7 +1571,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss)) + if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very @@ -1580,7 +1590,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss); + tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); } out: @@ -1793,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { - struct tcp_sack_block tmp; - - tmp = sp[j]; - sp[j] = sp[j + 1]; - sp[j + 1] = tmp; + swap(sp[j], sp[j + 1]); /* Track where the first SACK block goes to */ if (j == first_sack_index) @@ -2452,6 +2458,44 @@ static int 
tcp_time_to_recover(struct sock *sk) return 0; } +/* New heuristics: it is possible only after we switched to restart timer + * each time when something is ACKed. Hence, we can detect timed out packets + * during fast retransmit without falling to slow start. + * + * Usefulness of this as is very questionable, since we should know which of + * the segments is the next to timeout which is relatively expensive to find + * in general case unless we add some data structure just for that. The + * current approach certainly won't find the right one too often and when it + * finally does find _something_ it usually marks large part of the window + * right away (because a retransmission with a larger timestamp blocks the + * loop from advancing). -ij + */ +static void tcp_timeout_skbs(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) + return; + + skb = tp->scoreboard_skb_hint; + if (tp->scoreboard_skb_hint == NULL) + skb = tcp_write_queue_head(sk); + + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (!tcp_skb_timedout(sk, skb)) + break; + + tcp_skb_mark_lost(tp, skb); + } + + tp->scoreboard_skb_hint = skb; + + tcp_verify_left_out(tp); +} + /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ @@ -2524,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) tcp_mark_head_lost(sk, sacked_upto); } - /* New heuristics: it is possible only after we switched - * to restart timer each time when something is ACKed. - * Hence, we can detect timed out packets during fast - * retransmit without falling to slow start. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { - struct sk_buff *skb; - - skb = tp->scoreboard_skb_hint ? 
tp->scoreboard_skb_hint - : tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); - } + tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2812,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) +static void tcp_mtup_probe_success(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -2840,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { @@ -3177,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 end_seq; u32 acked_pcount; u8 sacked = scb->sacked; @@ -3192,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = 0; - end_seq = tp->snd_una; } else { acked_pcount = tcp_skb_pcount(skb); - end_seq = scb->end_seq; - } - - /* MTU probing checks */ - if (fully_acked && icsk->icsk_mtup.probe_size && - !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { - tcp_mtup_probe_success(sk, skb); } if (sacked & TCPCB_RETRANS) { @@ -3266,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + if (unlikely(icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { + tcp_mtup_probe_success(sk); + } + 
tcp_ack_update_rtt(sk, flag, seq_rtt); tcp_rearm_rto(sk); if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { + int delta; + /* Non-retransmitted hole got filled? That's reordering */ if (reord < prior_fackets) tcp_update_reordering(sk, tp->fackets_out - reord, 0); - /* No need to care for underflows here because - * the lost_skb_hint gets NULLed if we're past it - * (or something non-trivial happened) - */ - if (tcp_is_fack(tp)) - tp->lost_cnt_hint -= pkts_acked; - else - tp->lost_cnt_hint -= prior_sacked - tp->sacked_out; + delta = tcp_is_fack(tp) ? pkts_acked : + prior_sacked - tp->sacked_out; + tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } tp->fackets_out -= min(pkts_acked, tp->fackets_out); @@ -3395,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; @@ -3571,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) int prior_packets; int frto_cwnd = 0; - /* If the ack is newer than sent or older than previous acks + /* If the ack is older than previous acks * then we can probably ignore it. */ - if (after(ack, tp->snd_nxt)) - goto uninteresting_ack; - if (before(ack, prior_snd_una)) goto old_ack; + /* If the ack includes data we haven't sent yet, discard + * this segment (RFC793 Section 3.9). + */ + if (after(ack, tp->snd_nxt)) + goto invalid_ack; + if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; @@ -3600,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. 
*/ - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; @@ -3669,6 +3686,10 @@ no_queue: tcp_ack_probe(sk); return 1; +invalid_ack: + SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return -1; + old_ack: if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); @@ -3676,8 +3697,7 @@ old_ack: tcp_try_keep_open(sk); } -uninteresting_ack: - SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); return 0; } @@ -3865,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + if (tcp_paws_check(&tp->rx_opt, 0)) tcp_store_ts_recent(tp); } } @@ -3918,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); - return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(sk, skb)); + + return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && + !tcp_disordered_ack(sk, skb); } /* Check segment sequence number for validity. @@ -4078,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } @@ -4133,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. 
*/ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -4143,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static inline void tcp_sack_swap(struct tcp_sack_block *sack1, - struct tcp_sack_block *sack2) -{ - __u32 tmp; - - tmp = sack1->start_seq; - sack1->start_seq = sack2->start_seq; - sack2->start_seq = tmp; - - tmp = sack1->end_seq; - sack1->end_seq = sack2->end_seq; - sack2->end_seq = tmp; -} - static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4171,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) if (tcp_sack_extend(sp, seq, end_seq)) { /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) - tcp_sack_swap(sp, sp - 1); + swap(*sp, *(sp - 1)); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; @@ -4197,7 +4199,6 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -4211,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
*/ if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; } @@ -4232,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) this_sack++; sp++; } - if (num_sacks != tp->rx_opt.num_sacks) { - tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; - } + tp->rx_opt.num_sacks = num_sacks; } /* This one checks to see if we can put data from the @@ -4312,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) TCP_ECN_accept_cwr(tp, skb); - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } + tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -4434,8 +4427,6 @@ drop: /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = 1; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; @@ -5156,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + TCP_SKB_CB(skb)->seq == tp->rcv_nxt && + !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; /* Timestamp header prediction: tcp_header_len @@ -5309,8 +5301,8 @@ slow_path: return -res; step5: - if (th->ack) - tcp_ack(sk, skb, FLAG_SLOWPATH); + if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5408,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. 
*/ tp->snd_wnd = ntohs(th->window); - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5509,7 +5501,7 @@ discard: /* PAWS check. */ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && - tcp_paws_check(&tp->rx_opt, 0)) + tcp_paws_reject(&tp->rx_opt, 0)) goto discard_and_undo; if (th->syn) { @@ -5647,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* step 5: check the ACK field */ if (th->ack) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; switch (sk->sk_state) { case TCP_SYN_RECV: @@ -5669,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, - TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f6b962f56ab..d0a314879d8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); - if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { - /* Some OSes (unknown ones, but I see them on web server, which - * contains information interesting only for windows' - * users) do not send their stamp in SYN. It is easy case. - * We simply do not advertise TS support. 
- */ - tmp_opt.saw_tstamp = 0; - tmp_opt.tstamp_ok = 0; - } tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); @@ -2443,7 +2434,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { void __init tcp_v4_init(void) { inet_hashinfo_init(&tcp_hashinfo); - if (register_pernet_device(&tcp_sk_ops)) + if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f67effbb102..43bbba7926e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - paws_reject = tcp_paws_check(&tmp_opt, th->rst); + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_prequeue_init(newtp); - tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); + tcp_init_wl(newtp, treq->rcv_isn); newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; @@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.dsack = 0; - newtp->rx_opt.eff_sacks = 0; - newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; if (sock_flag(newsk, SOCK_KEEPOPEN)) @@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * from another data. 
*/ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); - paws_reject = tcp_paws_check(&tmp_opt, th->rst); + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dda42f0bd7a..c1f259d2d33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(sp[this_sack].end_seq); } - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } + tp->rx_opt.dsack = 0; } } @@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned size = 0; + unsigned int eff_sacks; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, size += TCPOLEN_TSTAMP_ALIGNED; } - if (unlikely(tp->rx_opt.eff_sacks)) { + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { const unsigned remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, tp->rx_opt.eff_sacks, + min_t(unsigned, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ - if (unlikely(tcp_urg_mode(tp) && - between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { - th->urg_ptr = htons(tp->snd_up - tcb->seq); - th->urg = 1; + if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { + if (before(tp->snd_up, tcb->seq + 0x10000)) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); + th->urg = 1; + } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { + th->urg_ptr 
= 0xFFFF; + th->urg = 1; + } } tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); @@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, struct sk_buff *buff; int nsize, old_factor; int nlen; - u16 flags; + u8 flags; BUG_ON(len > skb->len); - tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_verify_left_out(tp); } tcp_adjust_fackets_out(sk, skb, diff); + + if (tp->lost_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq) && + (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) + tp->lost_cnt_hint -= diff; } /* Link BUFF into the send queue. */ @@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) * factor and mss. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); return 0; } @@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -/* Bound MSS / TSO packet size with the half of the window */ -static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) -{ - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); - else - return pktsize; -} - /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. 
*/ -unsigned int tcp_current_mss(struct sock *sk, int large_allowed) +unsigned int tcp_current_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - u16 xmit_size_goal; - int doing_tso = 0; unsigned header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk)) - doing_tso = 1; - if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) @@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now -= delta; } - xmit_size_goal = mss_now; - - if (doing_tso) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); - } - tp->xmit_size_goal = xmit_size_goal; - return mss_now; } @@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? tp->nonagle : TCP_NAGLE_PUSH))); } @@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { struct sk_buff *buff; int nlen = skb->len - len; - u16 flags; + u8 flags; /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) @@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (limit >= sk->sk_gso_max_size) goto send_now; + /* Middle in queue won't get any more data, full sendable already? 
*/ + if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) + goto send_now; + if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tp->snd_cwnd < 11 || - tp->rx_opt.eff_sacks) + tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; /* Very simple search strategy: just double the MSS. */ - mss_now = tcp_current_mss(sk, 0); + mss_now = tcp_current_mss(sk); probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { @@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); int skb_size, next_skb_size; - u16 flags; skb_size = skb->len; next_skb_size = next_skb->len; - flags = TCP_SKB_CB(skb)->flags; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); @@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Merge over control information. This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. @@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. 
*/ - cur_mss = tcp_current_mss(sk, 0); + cur_mss = tcp_current_mss(sk); /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the @@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > cur_mss) { if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ + } else { + tcp_init_tso_segs(sk, skb, cur_mss); } tcp_retrans_try_collapse(sk, skb, cur_mss); @@ -2023,7 +2007,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - /* First pass: retransmit lost packets. */ tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; @@ -2062,7 +2045,7 @@ begin_fwd: goto begin_fwd; } else if (!(sacked & TCPCB_LOST)) { - if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; @@ -2101,7 +2084,7 @@ void tcp_send_fin(struct sock *sk) * unsent frames. But be careful about outgoing SACKS * and IP options. 
*/ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -2326,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk) sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; - tcp_init_wl(tp, tp->write_seq, 0); + tcp_init_wl(tp, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; @@ -2513,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 25524d4e372..59f5b5e7c56 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n) static ssize_t tcpprobe_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - int error = 0, cnt = 0; + int error = 0; + size_t cnt = 0; - if (!buf || len < 0) + if (!buf) return -EINVAL; while (cnt < len) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 2747ec7bfb6..a76513779e2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -1,6 +1,6 @@ /* Tom Kelly's Scalable TCP * - * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + * See http://www.deneholme.net/tom/scalable/ * * John Heffner <jheffner@sc.edu> */ @@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - else { - tp->snd_cwnd_cnt++; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - } + else + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); } static 
u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0170e914f1b..b144a26359b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { int mib_idx; - if (icsk->icsk_ca_state == TCP_CA_Disorder || - icsk->icsk_ca_state == TCP_CA_Recovery) { - if (tcp_is_sack(tp)) { - if (icsk->icsk_ca_state == TCP_CA_Recovery) - mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; - else - mib_idx = LINUX_MIB_TCPSACKFAILURES; - } else { - if (icsk->icsk_ca_state == TCP_CA_Recovery) - mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; - else - mib_idx = LINUX_MIB_TCPRENOFAILURES; - } + if (icsk->icsk_ca_state == TCP_CA_Disorder) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKFAILURES; + else + mib_idx = LINUX_MIB_TCPRENOFAILURES; + } else if (icsk->icsk_ca_state == TCP_CA_Recovery) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; + else + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else { diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index d08b2e855c2..e9bbff74648 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* In the "non-congestive state", increase cwnd * every rtt. */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } else { /* In the "congestive state", increase cwnd * every other rtt. 
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 9ec843a9bbb..66b6821b984 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) } else { /* Reno */ - - if (tp->snd_cwnd_cnt < tp->snd_cwnd) - tp->snd_cwnd_cnt++; - - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bd178a111d..05b7abb99f6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk = sknext; } while (sknext); } else - kfree_skb(skb); + consume_skb(skb); spin_unlock(&hslot->lock); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 03e2a1ad71e..8499da9e76a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -40,6 +40,7 @@ #include <linux/errno.h> #include <linux/types.h> +#include <linux/kernel.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> @@ -493,15 +494,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf) read_unlock(&dev_base_lock); } -static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) { struct net *net; net = (struct net *)table->extra2; if (p == &net->ipv6.devconf_dflt->forwarding) - return; + return 0; + + if (!rtnl_trylock()) + return -ERESTARTSYS; - rtnl_lock(); if (p == &net->ipv6.devconf_all->forwarding) { __s32 newf = net->ipv6.devconf_all->forwarding; net->ipv6.devconf_dflt->forwarding = newf; @@ -512,6 +515,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) if (*p) rt6_purge_dflt_routers(net); + return 1; } #endif @@ -587,6 +591,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct 
in6_addr *addr, int pfxlen, { struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; + struct net *net = dev_net(idev->dev); int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -603,6 +608,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out2; } + if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) { + err = -EACCES; + goto out2; + } + write_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ @@ -1206,16 +1216,12 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, } break; } else if (minihiscore < miniscore) { - struct ipv6_saddr_score *tmp; - if (hiscore->ifa) in6_ifa_put(hiscore->ifa); in6_ifa_hold(score->ifa); - tmp = hiscore; - hiscore = score; - score = tmp; + swap(hiscore, score); /* restore our iterator */ score->ifa = hiscore->ifa; @@ -1430,6 +1436,11 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp) void addrconf_dad_failure(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; + + if (net_ratelimit()) + printk(KERN_INFO "%s: IPv6 duplicate address detected!\n", + ifp->idev->dev->name); + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { struct in6_addr addr; @@ -1440,11 +1451,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ipv6_addr_equal(&ifp->addr, &addr)) { /* DAD failed for link-local based on MAC address */ idev->cnf.disable_ipv6 = 1; + + printk(KERN_INFO "%s: IPv6 being disabled!\n", + ifp->idev->dev->name); } } - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); addrconf_dad_stop(ifp); } @@ -2599,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - if ((dev->flags & IFF_LOOPBACK) && how == 1) - how = 0; - rt6_ifdown(net, dev); neigh_ifdown(&nd_tbl, dev); @@ -2823,11 +2832,6 @@ static void addrconf_dad_timer(unsigned long data) read_unlock_bh(&idev->lock); goto out; } - if (idev->cnf.accept_dad > 1 && 
idev->cnf.disable_ipv6) { - read_unlock_bh(&idev->lock); - addrconf_dad_failure(ifp); - return; - } spin_lock_bh(&ifp->lock); if (ifp->probes == 0) { /* @@ -3638,7 +3642,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3849,7 +3854,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3919,7 +3925,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); @@ -3974,7 +3981,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) - addrconf_fixup_forwarding(ctl, valp, val); + ret = addrconf_fixup_forwarding(ctl, valp, val); return ret; } @@ -4010,8 +4017,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table, } *valp = new; - addrconf_fixup_forwarding(table, valp, val); - return 1; + return addrconf_fixup_forwarding(table, valp, val); } static struct addrconf_sysctl_table @@ -4437,25 +4443,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_inet6addr_notifier); -static void addrconf_net_exit(struct net *net) -{ - struct net_device *dev; - - rtnl_lock(); - /* clean dev list */ - for_each_netdev(net, dev) { - if (__in6_dev_get(dev) == NULL) 
- continue; - addrconf_ifdown(dev, 1); - } - addrconf_ifdown(net->loopback_dev, 2); - rtnl_unlock(); -} - -static struct pernet_operations addrconf_net_ops = { - .exit = addrconf_net_exit, -}; - /* * Init / cleanup code */ @@ -4497,10 +4484,6 @@ int __init addrconf_init(void) if (err) goto errlo; - err = register_pernet_device(&addrconf_net_ops); - if (err) - return err; - register_netdevice_notifier(&ipv6_dev_notf); addrconf_verify(0); @@ -4530,15 +4513,22 @@ errlo: void addrconf_cleanup(void) { struct inet6_ifaddr *ifa; + struct net_device *dev; int i; unregister_netdevice_notifier(&ipv6_dev_notf); - unregister_pernet_device(&addrconf_net_ops); - unregister_pernet_subsys(&addrconf_ops); rtnl_lock(); + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (__in6_dev_get(dev) == NULL) + continue; + addrconf_ifdown(dev, 1); + } + addrconf_ifdown(init_net.loopback_dev, 2); + /* * Check hash table. */ @@ -4559,6 +4549,4 @@ void addrconf_cleanup(void) del_timer(&addr_chk_timer); rtnl_unlock(); - - unregister_pernet_subsys(&addrconf_net_ops); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fa2ac7ee662..fbf533cc9dc 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -72,6 +72,10 @@ MODULE_LICENSE("GPL"); static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); +static int disable_ipv6 = 0; +module_param_named(disable, disable_ipv6, int, 0); +MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional"); + static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -889,7 +893,7 @@ out_unlock: return err; } -static struct packet_type ipv6_packet_type = { +static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, .gso_send_check = ipv6_gso_send_check, @@ -1001,10 +1005,21 @@ static int __init inet6_init(void) { struct sk_buff *dummy_skb; struct list_head *r; - int err; 
+ int err = 0; BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_ipv6) { + printk(KERN_INFO + "IPv6: Loaded, but administratively disabled, " + "reboot required to enable\n"); + goto out; + } + err = proto_register(&tcpv6_prot, 1); if (err) goto out; @@ -1022,10 +1037,6 @@ static int __init inet6_init(void) goto out_unregister_udplite_proto; - /* Register the socket-side information for inet6_create. */ - for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) - INIT_LIST_HEAD(r); - /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ @@ -1191,6 +1202,9 @@ module_init(inet6_init); static void __exit inet6_exit(void) { + if (disable_ipv6) + return; + /* First of all disallow new sockets creation. */ sock_unregister(PF_INET6); /* Disallow any further netlink messages */ diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 8fe267feb81..1bcc3431859 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -258,11 +258,11 @@ unique: if (twp != NULL) { *twp = tw; - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw != NULL) { /* Silly. Should hash-dance instead... 
*/ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 40f324655e2..d31df0f4bc9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -218,8 +218,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (opt) sock_kfree_s(sk, opt, opt->tot_len); pktopt = xchg(&np->pktoptions, NULL); - if (pktopt) - kfree_skb(pktopt); + kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; /* diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3cd83b85e9e..9f061d1adbc 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) &ipv6_hdr(ra)->saddr); nlmsg_end(skb, nlh); - err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, - GFP_ATOMIC); - if (err < 0) - goto errout; - + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 165b256a6fa..41b8a956e1b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -205,8 +205,9 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed\n"); + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed "); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index ed4d79a9e4a..058a5e4a60c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ 
b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -528,14 +528,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) if (!ipv6_ext_hdr(nexthdr)) { return -1; } - if (len < (int)sizeof(struct ipv6_opt_hdr)) { - pr_debug("too short\n"); - return -1; - } if (nexthdr == NEXTHDR_NONE) { pr_debug("next header is none\n"); return -1; } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + pr_debug("too short\n"); + return -1; + } if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) BUG(); if (nexthdr == NEXTHDR_AUTH) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3c575118fca..e9ac7a12f59 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -452,6 +452,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { + struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; int payload_len; unsigned int nhoff; @@ -551,8 +552,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->csum); rcu_read_lock(); - IP6_INC_STATS_BH(dev_net(dev), - __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; return 1; @@ -566,8 +566,7 @@ out_oom: printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); - IP6_INC_STATS_BH(dev_net(dev), - __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); return -1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3d486a3eda..1394ddb6e35 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, 
gfp_any()); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d3467e563f0..664ab82e03b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -188,9 +188,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, } nt = netdev_priv(dev); - ipip6_tunnel_init(dev); nt->parms = *parms; + ipip6_tunnel_init(dev); if (parms->i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; @@ -454,7 +454,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -658,7 +658,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 00f1269e11e..4b5aa185426 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -533,8 +533,7 @@ static inline void syn_flood_warning(struct sk_buff *skb) static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (inet6_rsk(req)->pktopts) - kfree_skb(inet6_rsk(req)->pktopts); + kfree_skb(inet6_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG @@ -1611,8 +1610,7 @@ ipv6_pktoptions: } } - if (opt_skb) - kfree_skb(opt_skb); + kfree_skb(opt_skb); return 0; } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 0e685b05496..f417b77fa0e 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -69,7 +69,7 @@ __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) for (i = 0; i < n; i++) { dst[count[class[i] - 1]++] = src[i]; - src[i] = 0; + src[i] = NULL; } return 0; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c 
index 43d0ffc6d56..1627050e29f 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1958,12 +1958,12 @@ static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { SOCKOPS_WRAP(ipx_dgram, PF_IPX); -static struct packet_type ipx_8023_packet_type = { +static struct packet_type ipx_8023_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_3), .func = ipx_rcv, }; -static struct packet_type ipx_dix_packet_type = { +static struct packet_type ipx_dix_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPX), .func = ipx_rcv, }; @@ -1975,15 +1975,15 @@ static struct notifier_block ipx_dev_notifier = { extern struct datalink_proto *make_EII_client(void); extern void destroy_EII_client(struct datalink_proto *); -static unsigned char ipx_8022_type = 0xE0; -static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; -static char ipx_EII_err_msg[] __initdata = +static const unsigned char ipx_8022_type = 0xE0; +static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; +static const char ipx_EII_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with Ethernet II\n"; -static char ipx_8023_err_msg[] __initdata = +static const char ipx_8023_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with 802.3\n"; -static char ipx_llc_err_msg[] __initdata = +static const char ipx_llc_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with 802.2\n"; -static char ipx_snap_err_msg[] __initdata = +static const char ipx_snap_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with SNAP\n"; static int __init ipx_init(void) diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index ea319e3ddc1..bf92e147344 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -149,13 +149,14 @@ int irda_device_is_receiving(struct net_device *dev) IRDA_DEBUG(2, "%s()\n", __func__); - if (!dev->do_ioctl) { + if (!dev->netdev_ops->ndo_do_ioctl) { IRDA_ERROR("%s: do_ioctl not impl. 
by device driver\n", __func__); return -1; } - ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING); + ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req, + SIOCGRECEIVING); if (ret < 0) return ret; diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 05112be9956..724bcf951b8 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -45,6 +45,16 @@ static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); static void irlan_eth_set_multicast_list( struct net_device *dev); static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); +static const struct net_device_ops irlan_eth_netdev_ops = { + .ndo_open = irlan_eth_open, + .ndo_stop = irlan_eth_close, + .ndo_start_xmit = irlan_eth_xmit, + .ndo_get_stats = irlan_eth_get_stats, + .ndo_set_multicast_list = irlan_eth_set_multicast_list, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + /* * Function irlan_eth_setup (dev) * @@ -53,14 +63,11 @@ static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); */ static void irlan_eth_setup(struct net_device *dev) { - dev->open = irlan_eth_open; - dev->stop = irlan_eth_close; - dev->hard_start_xmit = irlan_eth_xmit; - dev->get_stats = irlan_eth_get_stats; - dev->set_multicast_list = irlan_eth_set_multicast_list; + ether_setup(dev); + + dev->netdev_ops = &irlan_eth_netdev_ops; dev->destructor = free_netdev; - ether_setup(dev); /* * Lets do all queueing in IrTTP instead of this device driver. diff --git a/net/irda/irmod.c b/net/irda/irmod.c index 1bb607f2f5c..303a68d9273 100644 --- a/net/irda/irmod.c +++ b/net/irda/irmod.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL(irda_debug); /* Packet type handler. * Tell the kernel how IrDA packets should be handled. 
*/ -static struct packet_type irda_packet_type = { +static struct packet_type irda_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IRDA), .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */ }; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index eb8a2a0b6eb..49e786535dc 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1171,8 +1171,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_unlock_irqrestore(&list->lock, flags); - if (this) - kfree_skb(this); + kfree_skb(this); } BUG_ON(!this); diff --git a/net/key/af_key.c b/net/key/af_key.c index 7dcbde3ea7d..643c1be2d02 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -313,8 +313,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, if (one_sk != NULL) err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); - if (skb2) - kfree_skb(skb2); + kfree_skb(skb2); kfree_skb(skb); return err; } @@ -3573,8 +3572,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb, out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; - if (skb) - kfree_skb(skb); + kfree_skb(skb); return err ? 
: len; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 56fd85ab358..febae702685 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1118,11 +1118,11 @@ static const struct proto_ops llc_ui_ops = { .sendpage = sock_no_sendpage, }; -static char llc_proc_err_msg[] __initdata = +static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; -static char llc_sysctl_err_msg[] __initdata = +static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; -static char llc_sock_err_msg[] __initdata = +static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5c6d89c6d51..3477624a490 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -332,8 +332,7 @@ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) for (i = 0; i < pdu_pos && i < q_len; i++) { skb = skb_dequeue(&llc->pdu_unack_q); - if (skb) - kfree_skb(skb); + kfree_skb(skb); nbr_acked++; } out: diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index a7fe1adc378..ff4c0ab96a6 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -147,12 +147,12 @@ void llc_sap_close(struct llc_sap *sap) kfree(sap); } -static struct packet_type llc_packet_type = { +static struct packet_type llc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_2), .func = llc_rcv, }; -static struct packet_type llc_tr_packet_type = { +static struct packet_type llc_tr_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_TR_802_2), .func = llc_rcv, }; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 3503a3d2131..0e3ab88bb70 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -9,6 +9,7 @@ mac80211-y := \ wpa.o \ scan.o \ ht.o agg-tx.o agg-rx.o \ + ibss.o \ mlme.o \ iface.o \ rate.o \ diff --git a/net/mac80211/agg-rx.c 
b/net/mac80211/agg-rx.c index 3112bfd441b..a95affc9462 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -129,7 +129,6 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; @@ -151,8 +150,9 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 1232d9f01ca..1df116d4d6e 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -49,7 +49,6 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, u16 agg_size, u16 timeout) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u16 capab; @@ -69,8 +68,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); @@ -132,9 +131,24 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, state = &sta->ampdu_mlme.tid_state_tx[tid]; - if 
(local->hw.ampdu_queues) - ieee80211_stop_queue(&local->hw, sta->tid_to_tx_q[tid]); + if (local->hw.ampdu_queues) { + if (initiator) { + /* + * Stop the AC queue to avoid issues where we send + * unaggregated frames already before the delba. + */ + ieee80211_stop_queue_by_reason(&local->hw, + local->hw.queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + /* + * Pretend the driver woke the queue, just in case + * it disabled it before the session was stopped. + */ + ieee80211_wake_queue( + &local->hw, local->hw.queues + sta->tid_to_tx_q[tid]); + } *state = HT_AGG_STATE_REQ_STOP_BA_MSK | (initiator << HT_AGG_STATE_INITIATOR_SHIFT); @@ -144,8 +158,6 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, /* HW shall not deny going back to legacy */ if (WARN_ON(ret)) { *state = HT_AGG_STATE_OPERATIONAL; - if (local->hw.ampdu_queues) - ieee80211_wake_queue(&local->hw, sta->tid_to_tx_q[tid]); } return ret; @@ -189,14 +201,19 @@ static void sta_addba_resp_timer_expired(unsigned long data) spin_unlock_bh(&sta->lock); } +static inline int ieee80211_ac_from_tid(int tid) +{ + return ieee802_1d_to_ac[tid & 7]; +} + int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta; struct ieee80211_sub_if_data *sdata; - u16 start_seq_num; u8 *state; - int ret = 0; + int i, qn = -1, ret = 0; + u16 start_seq_num; if (WARN_ON(!local->ops->ampdu_action)) return -EINVAL; @@ -209,6 +226,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) ra, tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ + if (hw->ampdu_queues && ieee80211_ac_from_tid(tid) == 0) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "rejecting on voice AC\n"); +#endif + return -EINVAL; + } + rcu_read_lock(); sta = sta_info_get(local, ra); @@ -217,7 +241,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) printk(KERN_DEBUG "Could not 
find the station\n"); #endif ret = -ENOENT; - goto exit; + goto unlock; } /* @@ -230,11 +254,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sta->sdata->vif.type != NL80211_IFTYPE_AP) { ret = -EINVAL; - goto exit; + goto unlock; } spin_lock_bh(&sta->lock); + sdata = sta->sdata; + /* we have tried too many times, receiver does not want A-MPDU */ if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) { ret = -EBUSY; @@ -252,6 +278,42 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) goto err_unlock_sta; } + if (hw->ampdu_queues) { + spin_lock(&local->queue_stop_reason_lock); + /* reserve a new queue for this session */ + for (i = 0; i < local->hw.ampdu_queues; i++) { + if (local->ampdu_ac_queue[i] < 0) { + qn = i; + local->ampdu_ac_queue[qn] = + ieee80211_ac_from_tid(tid); + break; + } + } + spin_unlock(&local->queue_stop_reason_lock); + + if (qn < 0) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - " + "queue unavailable for tid %d\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + ret = -ENOSPC; + goto err_unlock_sta; + } + + /* + * If we successfully allocate the session, we can't have + * anything going on on the queue this TID maps into, so + * stop it for now. This is a "virtual" stop using the same + * mechanism that drivers will use. + * + * XXX: queue up frames for this session in the sta_info + * struct instead to avoid hitting all other STAs. 
+ */ + ieee80211_stop_queue_by_reason( + &local->hw, hw->queues + qn, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + /* prepare A-MPDU MLME for Tx aggregation */ sta->ampdu_mlme.tid_tx[tid] = kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); @@ -262,8 +324,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) tid); #endif ret = -ENOMEM; - goto err_unlock_sta; + goto err_return_queue; } + /* Tx timer */ sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function = sta_addba_resp_timer_expired; @@ -271,49 +334,25 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) (unsigned long)&sta->timer_to_tid[tid]; init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); - if (hw->ampdu_queues) { - /* create a new queue for this aggregation */ - ret = ieee80211_ht_agg_queue_add(local, sta, tid); - - /* case no queue is available to aggregation - * don't switch to aggregation */ - if (ret) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "BA request denied - " - "queue unavailable for tid %d\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - goto err_unlock_queue; - } - } - sdata = sta->sdata; - /* Ok, the Addba frame hasn't been sent yet, but if the driver calls the * call back right away, it must see that the flow has begun */ *state |= HT_ADDBA_REQUESTED_MSK; - /* This is slightly racy because the queue isn't stopped */ start_seq_num = sta->tid_seq[tid]; ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START, &sta->sta, tid, &start_seq_num); if (ret) { - /* No need to requeue the packets in the agg queue, since we - * held the tx lock: no packet could be enqueued to the newly - * allocated queue */ - if (hw->ampdu_queues) - ieee80211_ht_agg_queue_remove(local, sta, tid, 0); #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "BA request denied - HW unavailable for" " tid %d\n", tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ *state = HT_AGG_STATE_IDLE; - goto err_unlock_queue; + goto err_free; } + 
sta->tid_to_tx_q[tid] = qn; - /* Will put all the packets in the new SW queue */ - if (hw->ampdu_queues) - ieee80211_requeue(local, ieee802_1d_to_ac[tid]); spin_unlock_bh(&sta->lock); /* send an addBA request */ @@ -322,7 +361,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) sta->ampdu_mlme.dialog_token_allocator; sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num; - ieee80211_send_addba_request(sta->sdata, ra, tid, sta->ampdu_mlme.tid_tx[tid]->dialog_token, sta->ampdu_mlme.tid_tx[tid]->ssn, @@ -334,15 +372,24 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid); #endif - goto exit; + goto unlock; -err_unlock_queue: + err_free: kfree(sta->ampdu_mlme.tid_tx[tid]); sta->ampdu_mlme.tid_tx[tid] = NULL; - ret = -EBUSY; -err_unlock_sta: + err_return_queue: + if (qn >= 0) { + /* We failed, so start queue again right away. */ + ieee80211_wake_queue_by_reason(hw, hw->queues + qn, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + /* give queue back to pool */ + spin_lock(&local->queue_stop_reason_lock); + local->ampdu_ac_queue[qn] = -1; + spin_unlock(&local->queue_stop_reason_lock); + } + err_unlock_sta: spin_unlock_bh(&sta->lock); -exit: + unlock: rcu_read_unlock(); return ret; } @@ -375,7 +422,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) state = &sta->ampdu_mlme.tid_state_tx[tid]; spin_lock_bh(&sta->lock); - if (!(*state & HT_ADDBA_REQUESTED_MSK)) { + if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) { #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "addBA was not requested yet, state is %d\n", *state); @@ -385,7 +432,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) return; } - WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK); + if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK)) + goto out; *state |= HT_ADDBA_DRV_READY_MSK; @@ -393,9 +441,18 @@ void 
ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid); #endif - if (hw->ampdu_queues) - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); + if (hw->ampdu_queues) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. + */ + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } } + + out: spin_unlock_bh(&sta->lock); rcu_read_unlock(); } @@ -485,7 +542,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid) struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta; u8 *state; - int agg_queue; if (tid >= STA_TID_NUM) { #ifdef CONFIG_MAC80211_HT_DEBUG @@ -527,19 +583,19 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid) ieee80211_send_delba(sta->sdata, ra, tid, WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); - if (hw->ampdu_queues) { - agg_queue = sta->tid_to_tx_q[tid]; - ieee80211_ht_agg_queue_remove(local, sta, tid, 1); + spin_lock_bh(&sta->lock); - /* We just requeued the all the frames that were in the - * removed queue, and since we might miss a softirq we do - * netif_schedule_queue. ieee80211_wake_queue is not used - * here as this queue is not necessarily stopped + if (*state & HT_AGG_STATE_INITIATOR_MSK && + hw->ampdu_queues) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. 
*/ - netif_schedule_queue(netdev_get_tx_queue(local->mdev, - agg_queue)); + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); } - spin_lock_bh(&sta->lock); + *state = HT_AGG_STATE_IDLE; sta->ampdu_mlme.addba_req_num[tid] = 0; kfree(sta->ampdu_mlme.tid_tx[tid]); @@ -613,12 +669,21 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, #endif /* CONFIG_MAC80211_HT_DEBUG */ if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) == WLAN_STATUS_SUCCESS) { + u8 curstate = *state; + *state |= HT_ADDBA_RECEIVED_MSK; - sta->ampdu_mlme.addba_req_num[tid] = 0; - if (*state == HT_AGG_STATE_OPERATIONAL && - local->hw.ampdu_queues) - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); + if (hw->ampdu_queues && *state != curstate && + *state == HT_AGG_STATE_OPERATIONAL) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. + */ + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + sta->ampdu_mlme.addba_req_num[tid] = 0; if (local->ops->ampdu_action) { (void)local->ops->ampdu_action(hw, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c8d969be440..58693e52d45 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -341,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled = STATION_INFO_INACTIVE_TIME | STATION_INFO_RX_BYTES | STATION_INFO_TX_BYTES | + STATION_INFO_RX_PACKETS | + STATION_INFO_TX_PACKETS | STATION_INFO_TX_BITRATE; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); sinfo->rx_bytes = sta->rx_bytes; sinfo->tx_bytes = sta->tx_bytes; + sinfo->rx_packets = sta->rx_packets; + sinfo->tx_packets = sta->tx_packets; if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { sinfo->filled |= STATION_INFO_SIGNAL; @@ -447,7 +451,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata, * This is a kludge. 
beacon interval should really be part * of the beacon information. */ - if (params->interval) { + if (params->interval && (sdata->local->hw.conf.beacon_int != + params->interval)) { sdata->local->hw.conf.beacon_int = params->interval; err = ieee80211_hw_config(sdata->local, IEEE80211_CONF_CHANGE_BEACON_INTERVAL); @@ -1180,45 +1185,45 @@ static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata, u8 subtype, u8 *ies, size_t ies_len) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; switch (subtype) { case IEEE80211_STYPE_PROBE_REQ >> 4: if (local->ops->hw_scan) break; - kfree(ifsta->ie_probereq); - ifsta->ie_probereq = ies; - ifsta->ie_probereq_len = ies_len; + kfree(ifmgd->ie_probereq); + ifmgd->ie_probereq = ies; + ifmgd->ie_probereq_len = ies_len; return 0; case IEEE80211_STYPE_PROBE_RESP >> 4: - kfree(ifsta->ie_proberesp); - ifsta->ie_proberesp = ies; - ifsta->ie_proberesp_len = ies_len; + kfree(ifmgd->ie_proberesp); + ifmgd->ie_proberesp = ies; + ifmgd->ie_proberesp_len = ies_len; return 0; case IEEE80211_STYPE_AUTH >> 4: - kfree(ifsta->ie_auth); - ifsta->ie_auth = ies; - ifsta->ie_auth_len = ies_len; + kfree(ifmgd->ie_auth); + ifmgd->ie_auth = ies; + ifmgd->ie_auth_len = ies_len; return 0; case IEEE80211_STYPE_ASSOC_REQ >> 4: - kfree(ifsta->ie_assocreq); - ifsta->ie_assocreq = ies; - ifsta->ie_assocreq_len = ies_len; + kfree(ifmgd->ie_assocreq); + ifmgd->ie_assocreq = ies; + ifmgd->ie_assocreq_len = ies_len; return 0; case IEEE80211_STYPE_REASSOC_REQ >> 4: - kfree(ifsta->ie_reassocreq); - ifsta->ie_reassocreq = ies; - ifsta->ie_reassocreq_len = ies_len; + kfree(ifmgd->ie_reassocreq); + ifmgd->ie_reassocreq = ies; + ifmgd->ie_reassocreq_len = ies_len; return 0; case IEEE80211_STYPE_DEAUTH >> 4: - kfree(ifsta->ie_deauth); - ifsta->ie_deauth = ies; - ifsta->ie_deauth_len = ies_len; + kfree(ifmgd->ie_deauth); + ifmgd->ie_deauth = ies; + 
ifmgd->ie_deauth_len = ies_len; return 0; case IEEE80211_STYPE_DISASSOC >> 4: - kfree(ifsta->ie_disassoc); - ifsta->ie_disassoc = ies; - ifsta->ie_disassoc_len = ies_len; + kfree(ifmgd->ie_disassoc); + ifmgd->ie_disassoc = ies; + ifmgd->ie_disassoc_len = ies_len; return 0; } @@ -1248,7 +1253,6 @@ static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy, switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: ret = set_mgmt_extra_ie_sta(sdata, params->subtype, ies, ies_len); break; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c5421930172..e3420329f4e 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -94,31 +94,31 @@ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC); IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC); -/* STA/IBSS attributes */ -IEEE80211_IF_FILE(state, u.sta.state, DEC); -IEEE80211_IF_FILE(bssid, u.sta.bssid, MAC); -IEEE80211_IF_FILE(prev_bssid, u.sta.prev_bssid, MAC); -IEEE80211_IF_FILE(ssid_len, u.sta.ssid_len, SIZE); -IEEE80211_IF_FILE(aid, u.sta.aid, DEC); -IEEE80211_IF_FILE(ap_capab, u.sta.ap_capab, HEX); -IEEE80211_IF_FILE(capab, u.sta.capab, HEX); -IEEE80211_IF_FILE(extra_ie_len, u.sta.extra_ie_len, SIZE); -IEEE80211_IF_FILE(auth_tries, u.sta.auth_tries, DEC); -IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC); -IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX); -IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC); -IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC); +/* STA attributes */ +IEEE80211_IF_FILE(state, u.mgd.state, DEC); +IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); +IEEE80211_IF_FILE(prev_bssid, u.mgd.prev_bssid, MAC); +IEEE80211_IF_FILE(ssid_len, u.mgd.ssid_len, SIZE); +IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(ap_capab, u.mgd.ap_capab, HEX); +IEEE80211_IF_FILE(capab, u.mgd.capab, HEX); 
+IEEE80211_IF_FILE(extra_ie_len, u.mgd.extra_ie_len, SIZE); +IEEE80211_IF_FILE(auth_tries, u.mgd.auth_tries, DEC); +IEEE80211_IF_FILE(assoc_tries, u.mgd.assoc_tries, DEC); +IEEE80211_IF_FILE(auth_algs, u.mgd.auth_algs, HEX); +IEEE80211_IF_FILE(auth_alg, u.mgd.auth_alg, DEC); +IEEE80211_IF_FILE(auth_transaction, u.mgd.auth_transaction, DEC); static ssize_t ieee80211_if_fmt_flags( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n", - sdata->u.sta.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", - sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", - sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", sdata->vif.bss_conf.use_cts_prot ? 
"CTS prot\n" : ""); } __IEEE80211_IF_FILE(flags); @@ -283,9 +283,11 @@ static void add_files(struct ieee80211_sub_if_data *sdata) #endif break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: add_sta_files(sdata); break; + case NL80211_IFTYPE_ADHOC: + /* XXX */ + break; case NL80211_IFTYPE_AP: add_ap_files(sdata); break; @@ -418,9 +420,11 @@ static void del_files(struct ieee80211_sub_if_data *sdata) #endif break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: del_sta_files(sdata); break; + case NL80211_IFTYPE_ADHOC: + /* XXX */ + break; case NL80211_IFTYPE_AP: del_ap_files(sdata); break; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 82ea0b63a38..4e3c72f20de 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -17,6 +17,7 @@ #include <net/wireless.h> #include <net/mac80211.h> #include "ieee80211_i.h" +#include "rate.h" void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, struct ieee80211_ht_cap *ht_cap_ie, @@ -93,7 +94,9 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_ht_conf ht; + struct sta_info *sta; u32 changed = 0; bool enable_ht = true, ht_changed; enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; @@ -136,6 +139,16 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, if (ht_changed) { /* channel_type change automatically detected */ ieee80211_hw_config(local, 0); + + rcu_read_lock(); + + sta = sta_info_get(local, ifmgd->bssid); + if (sta) + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_HT_CHANGED); + + rcu_read_unlock(); + } /* disable HT */ @@ -169,7 +182,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, u16 initiator, u16 reason_code) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct sk_buff *skb; struct 
ieee80211_mgmt *mgmt; u16 params; @@ -190,8 +202,9 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c new file mode 100644 index 00000000000..f4becc12904 --- /dev/null +++ b/net/mac80211/ibss.c @@ -0,0 +1,907 @@ +/* + * IBSS mode implementation + * Copyright 2003-2008, Jouni Malinen <j@w1.fi> + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/delay.h> +#include <linux/if_ether.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <net/mac80211.h> +#include <asm/unaligned.h> + +#include "ieee80211_i.h" +#include "rate.h" + +#define IEEE80211_SCAN_INTERVAL (2 * HZ) +#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) +#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) + +#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) +#define IEEE80211_IBSS_MERGE_DELAY 0x400000 +#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) + +#define IEEE80211_IBSS_MAX_STA_ENTRIES 128 + + +static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + u16 auth_alg, auth_transaction, status_code; + + if (len < 24 + 6) + return; + + auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); + auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); + status_code = le16_to_cpu(mgmt->u.auth.status_code); + + /* + * IEEE 802.11 standard does not require authentication in IBSS + * networks and most implementations do not seem to use it. + * However, try to reply to authentication attempts if someone + * has actually implemented this. + */ + if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1) + ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0, + sdata->u.ibss.bssid, 0); +} + +static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + const u8 *bssid, const int beacon_int, + const int freq, + const size_t supp_rates_len, + const u8 *supp_rates, + const u16 capability, u64 tsf) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int res = 0, rates, i, j; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos; + struct ieee80211_supported_band *sband; + union iwreq_data wrqu; + + if (local->ops->reset_tsf) { + /* Reset own TSF to allow time synchronization work. 
*/ + local->ops->reset_tsf(local_to_hw(local)); + } + + if ((ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) && + memcmp(ifibss->bssid, bssid, ETH_ALEN) == 0) + return res; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for probe " + "response\n", sdata->dev->name); + return -ENOMEM; + } + + if (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) { + /* Remove possible STA entries from other IBSS networks. */ + sta_info_flush_delayed(sdata); + } + + memcpy(ifibss->bssid, bssid, ETH_ALEN); + res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID); + if (res) + return res; + + local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10; + + sdata->drop_unencrypted = capability & + WLAN_CAPABILITY_PRIVACY ? 1 : 0; + + res = ieee80211_set_freq(sdata, freq); + + if (res) + return res; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + /* Build IBSS probe response */ + + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 24 + sizeof(mgmt->u.beacon)); + memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_RESP); + memset(mgmt->da, 0xff, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); + mgmt->u.beacon.beacon_int = + cpu_to_le16(local->hw.conf.beacon_int); + mgmt->u.beacon.timestamp = cpu_to_le64(tsf); + mgmt->u.beacon.capab_info = cpu_to_le16(capability); + + pos = skb_put(skb, 2 + ifibss->ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ifibss->ssid_len; + memcpy(pos, ifibss->ssid, ifibss->ssid_len); + + rates = supp_rates_len; + if (rates > 8) + rates = 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = rates; + memcpy(pos, supp_rates, rates); + + if (sband->band == IEEE80211_BAND_2GHZ) { + pos = skb_put(skb, 2 + 1); + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + 
*pos++ = ieee80211_frequency_to_channel(freq); + } + + pos = skb_put(skb, 2 + 2); + *pos++ = WLAN_EID_IBSS_PARAMS; + *pos++ = 2; + /* FIX: set ATIM window based on scan results */ + *pos++ = 0; + *pos++ = 0; + + if (supp_rates_len > 8) { + rates = supp_rates_len - 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = rates; + memcpy(pos, &supp_rates[8], rates); + } + + ifibss->probe_resp = skb; + + ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON | + IEEE80211_IFCC_BEACON_ENABLED); + + + rates = 0; + for (i = 0; i < supp_rates_len; i++) { + int bitrate = (supp_rates[i] & 0x7f) * 5; + for (j = 0; j < sband->n_bitrates; j++) + if (sband->bitrates[j].bitrate == bitrate) + rates |= BIT(j); + } + + ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates); + + ifibss->flags |= IEEE80211_IBSS_PREV_BSSID_SET; + ifibss->state = IEEE80211_IBSS_MLME_JOINED; + mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); + + memset(&wrqu, 0, sizeof(wrqu)); + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); + + return res; +} + +static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss *bss) +{ + return __ieee80211_sta_join_ibss(sdata, + bss->cbss.bssid, + bss->cbss.beacon_interval, + bss->cbss.channel->center_freq, + bss->supp_rates_len, bss->supp_rates, + bss->cbss.capability, + bss->cbss.tsf); +} + +static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems, + bool beacon) +{ + struct ieee80211_local *local = sdata->local; + int freq; + struct ieee80211_bss *bss; + struct sta_info *sta; + struct ieee80211_channel *channel; + u64 beacon_timestamp, rx_timestamp; + u32 supp_rates = 0; + enum ieee80211_band band = rx_status->band; + + if (elems->ds_params && elems->ds_params_len == 1) + freq = 
ieee80211_channel_to_frequency(elems->ds_params[0]); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + + if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && + memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) { + supp_rates = ieee80211_sta_get_rates(local, elems, band); + + rcu_read_lock(); + + sta = sta_info_get(local, mgmt->sa); + if (sta) { + u32 prev_rates; + + prev_rates = sta->sta.supp_rates[band]; + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + if (sta->sta.supp_rates[band] != prev_rates) + printk(KERN_DEBUG "%s: updated supp_rates set " + "for %pM based on beacon info (0x%llx | " + "0x%llx -> 0x%llx)\n", + sdata->dev->name, + sta->sta.addr, + (unsigned long long) prev_rates, + (unsigned long long) supp_rates, + (unsigned long long) sta->sta.supp_rates[band]); +#endif + } else + ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); + + rcu_read_unlock(); + } + + bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, + channel, beacon); + if (!bss) + return; + + /* was just updated in ieee80211_bss_info_update */ + beacon_timestamp = bss->cbss.tsf; + + /* check if we need to merge IBSS */ + + /* merge only on beacons (???) 
*/ + if (!beacon) + goto put_bss; + + /* we use a fixed BSSID */ + if (sdata->u.ibss.flags & IEEE80211_IBSS_BSSID_SET) + goto put_bss; + + /* not an IBSS */ + if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS)) + goto put_bss; + + /* different channel */ + if (bss->cbss.channel != local->oper_channel) + goto put_bss; + + /* different SSID */ + if (elems->ssid_len != sdata->u.ibss.ssid_len || + memcmp(elems->ssid, sdata->u.ibss.ssid, + sdata->u.ibss.ssid_len)) + goto put_bss; + + /* same BSSID */ + if (memcmp(bss->cbss.bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) + goto put_bss; + + if (rx_status->flag & RX_FLAG_TSFT) { + /* + * For correct IBSS merging we need mactime; since mactime is + * defined as the time the first data symbol of the frame hits + * the PHY, and the timestamp of the beacon is defined as "the + * time that the data symbol containing the first bit of the + * timestamp is transmitted to the PHY plus the transmitting + * STA's delays through its local PHY from the MAC-PHY + * interface to its interface with the WM" (802.11 11.1.2) + * - equals the time this bit arrives at the receiver - we have + * to take into account the offset between the two. + * + * E.g. at 1 MBit that means mactime is 192 usec earlier + * (=24 bytes * 8 usecs/byte) than the beacon timestamp. 
+ */ + int rate; + + if (rx_status->flag & RX_FLAG_HT) + rate = 65; /* TODO: HT rates */ + else + rate = local->hw.wiphy->bands[band]-> + bitrates[rx_status->rate_idx].bitrate; + + rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate); + } else if (local && local->ops && local->ops->get_tsf) + /* second best option: get current TSF */ + rx_timestamp = local->ops->get_tsf(local_to_hw(local)); + else + /* can't merge without knowing the TSF */ + rx_timestamp = -1LLU; + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "RX beacon SA=%pM BSSID=" + "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", + mgmt->sa, mgmt->bssid, + (unsigned long long)rx_timestamp, + (unsigned long long)beacon_timestamp, + (unsigned long long)(rx_timestamp - beacon_timestamp), + jiffies); +#endif + + /* give slow hardware some time to do the TSF sync */ + if (rx_timestamp < IEEE80211_IBSS_MERGE_DELAY) + goto put_bss; + + if (beacon_timestamp > rx_timestamp) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: beacon TSF higher than " + "local TSF - IBSS merge with BSSID %pM\n", + sdata->dev->name, mgmt->bssid); +#endif + ieee80211_sta_join_ibss(sdata, bss); + ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); + } + + put_bss: + ieee80211_rx_bss_put(local, bss); +} + +/* + * Add a new IBSS station, will also be called by the RX code when, + * in IBSS mode, receiving a frame from a yet-unknown station, hence + * must be callable in atomic context. + */ +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid,u8 *addr, u32 supp_rates) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + int band = local->hw.conf.channel->band; + + /* TODO: Could consider removing the least recently used entry and + * allow new one to be added. 
*/ + if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { + if (net_ratelimit()) { + printk(KERN_DEBUG "%s: No room for a new IBSS STA " + "entry %pM\n", sdata->dev->name, addr); + } + return NULL; + } + + if (compare_ether_addr(bssid, sdata->u.ibss.bssid)) + return NULL; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", + wiphy_name(local->hw.wiphy), addr, sdata->dev->name); +#endif + + sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); + if (!sta) + return NULL; + + set_sta_flags(sta, WLAN_STA_AUTHORIZED); + + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + + rate_control_rate_init(sta); + + if (sta_info_insert(sta)) + return NULL; + + return sta; +} + +static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + int active = 0; + struct sta_info *sta; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sta->sdata == sdata && + time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, + jiffies)) { + active++; + break; + } + } + + rcu_read_unlock(); + + return active; +} + + +static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); + + ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT); + if (ieee80211_sta_active_ibss(sdata)) + return; + + if ((ifibss->flags & IEEE80211_IBSS_BSSID_SET) && + (!(ifibss->flags & IEEE80211_IBSS_AUTO_CHANNEL_SEL))) + return; + + printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " + "IBSS networks with same SSID (merge)\n", sdata->dev->name); + + /* XXX maybe racy? 
*/ + if (sdata->local->scan_req) + return; + + memcpy(sdata->local->int_scan_req.ssids[0].ssid, + ifibss->ssid, IEEE80211_MAX_SSID_LEN); + sdata->local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len; + ieee80211_request_scan(sdata, &sdata->local->int_scan_req); +} + +static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + u8 *pos; + u8 bssid[ETH_ALEN]; + u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; + u16 capability; + int i; + + if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) { + memcpy(bssid, ifibss->bssid, ETH_ALEN); + } else { + /* Generate random, not broadcast, locally administered BSSID. Mix in + * own MAC address to make sure that devices that do not have proper + * random number generator get different BSSID. */ + get_random_bytes(bssid, ETH_ALEN); + for (i = 0; i < ETH_ALEN; i++) + bssid[i] ^= sdata->dev->dev_addr[i]; + bssid[0] &= ~0x01; + bssid[0] |= 0x02; + } + + printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n", + sdata->dev->name, bssid); + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + if (local->hw.conf.beacon_int == 0) + local->hw.conf.beacon_int = 100; + + capability = WLAN_CAPABILITY_IBSS; + + if (sdata->default_key) + capability |= WLAN_CAPABILITY_PRIVACY; + else + sdata->drop_unencrypted = 0; + + pos = supp_rates; + for (i = 0; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + + return __ieee80211_sta_join_ibss(sdata, + bssid, local->hw.conf.beacon_int, + local->hw.conf.channel->center_freq, + sband->n_bitrates, supp_rates, + capability, 0); +} + +static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct ieee80211_bss *bss; + const u8 *bssid = NULL; + int active_ibss; + + 
if (ifibss->ssid_len == 0) + return -EINVAL; + + active_ibss = ieee80211_sta_active_ibss(sdata); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n", + sdata->dev->name, active_ibss); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (active_ibss) + return 0; + + if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) + bssid = ifibss->bssid; + bss = (void *)cfg80211_get_bss(local->hw.wiphy, NULL, bssid, + ifibss->ssid, ifibss->ssid_len, + WLAN_CAPABILITY_IBSS, + WLAN_CAPABILITY_IBSS); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + if (bss) + printk(KERN_DEBUG " sta_find_ibss: selected %pM current " + "%pM\n", bss->cbss.bssid, ifibss->bssid); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (bss && + (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) || + memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN))) { + int ret; + + printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" + " based on configured SSID\n", + sdata->dev->name, bss->cbss.bssid); + + ret = ieee80211_sta_join_ibss(sdata, bss); + ieee80211_rx_bss_put(local, bss); + return ret; + } else if (bss) + ieee80211_rx_bss_put(local, bss); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG " did not try to join ibss\n"); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + /* Selected IBSS not found in current scan results - try to scan */ + if (ifibss->state == IEEE80211_IBSS_MLME_JOINED && + !ieee80211_sta_active_ibss(sdata)) { + mod_timer(&ifibss->timer, jiffies + + IEEE80211_IBSS_MERGE_INTERVAL); + } else if (time_after(jiffies, local->last_scan_completed + + IEEE80211_SCAN_INTERVAL)) { + printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " + "join\n", sdata->dev->name); + + /* XXX maybe racy? 
*/ + if (local->scan_req) + return -EBUSY; + + memcpy(local->int_scan_req.ssids[0].ssid, + ifibss->ssid, IEEE80211_MAX_SSID_LEN); + local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len; + return ieee80211_request_scan(sdata, &local->int_scan_req); + } else if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) { + int interval = IEEE80211_SCAN_INTERVAL; + + if (time_after(jiffies, ifibss->ibss_join_req + + IEEE80211_IBSS_JOIN_TIMEOUT)) { + if (!(local->oper_channel->flags & + IEEE80211_CHAN_NO_IBSS)) + return ieee80211_sta_create_ibss(sdata); + printk(KERN_DEBUG "%s: IBSS not allowed on" + " %d MHz\n", sdata->dev->name, + local->hw.conf.channel->center_freq); + + /* No IBSS found - decrease scan interval and continue + * scanning. */ + interval = IEEE80211_SCAN_INTERVAL_SLOW; + } + + ifibss->state = IEEE80211_IBSS_MLME_SEARCH; + mod_timer(&ifibss->timer, jiffies + interval); + return 0; + } + + return 0; +} + +static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int tx_last_beacon; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + u8 *pos, *end; + + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED || + len < 24 + 2 || !ifibss->probe_resp) + return; + + if (local->ops->tx_last_beacon) + tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local)); + else + tx_last_beacon = 1; + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM" + " (tx_last_beacon=%d)\n", + sdata->dev->name, mgmt->sa, mgmt->da, + mgmt->bssid, tx_last_beacon); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (!tx_last_beacon) + return; + + if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 && + memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0) + return; + + end = ((u8 *) mgmt) + len; + pos = mgmt->u.probe_req.variable; + if (pos[0] != WLAN_EID_SSID || + pos + 2 
+ pos[1] > end) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq " + "from %pM\n", + sdata->dev->name, mgmt->sa); +#endif + return; + } + if (pos[1] != 0 && + (pos[1] != ifibss->ssid_len || + memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len) != 0)) { + /* Ignore ProbeReq for foreign SSID */ + return; + } + + /* Reply with ProbeResp */ + skb = skb_copy(ifibss->probe_resp, GFP_KERNEL); + if (!skb) + return; + + resp = (struct ieee80211_mgmt *) skb->data; + memcpy(resp->da, mgmt->sa, ETH_ALEN); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n", + sdata->dev->name, resp->da); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + ieee80211_tx_skb(sdata, skb, 0); +} + +static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN)) + return; /* ignore ProbeResp to foreign address */ + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, + &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); +} + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true); +} + +static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + 
u16 fc; + + rx_status = (struct ieee80211_rx_status *) skb->cb; + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_REQ: + ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len); + break; + case IEEE80211_STYPE_PROBE_RESP: + ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_BEACON: + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_AUTH: + ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); + break; + } + + kfree_skb(skb); +} + +static void ieee80211_ibss_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, u.ibss.work); + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_ibss *ifibss; + struct sk_buff *skb; + + if (!netif_running(sdata->dev)) + return; + + if (local->sw_scanning || local->hw_scanning) + return; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC)) + return; + ifibss = &sdata->u.ibss; + + while ((skb = skb_dequeue(&ifibss->skb_queue))) + ieee80211_ibss_rx_queued_mgmt(sdata, skb); + + if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request)) + return; + + switch (ifibss->state) { + case IEEE80211_IBSS_MLME_SEARCH: + ieee80211_sta_find_ibss(sdata); + break; + case IEEE80211_IBSS_MLME_JOINED: + ieee80211_sta_merge_ibss(sdata); + break; + default: + WARN_ON(1); + break; + } +} + +static void ieee80211_ibss_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + + set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request); + queue_work(local->hw.workqueue, &ifibss->work); +} + +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + 
INIT_WORK(&ifibss->work, ieee80211_ibss_work); + setup_timer(&ifibss->timer, ieee80211_ibss_timer, + (unsigned long) sdata); + skb_queue_head_init(&ifibss->skb_queue); + + ifibss->flags |= IEEE80211_IBSS_AUTO_BSSID_SEL | + IEEE80211_IBSS_AUTO_CHANNEL_SEL; +} + +int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + ifibss->flags &= ~IEEE80211_IBSS_PREV_BSSID_SET; + + if (ifibss->ssid_len) + ifibss->flags |= IEEE80211_IBSS_SSID_SET; + else + ifibss->flags &= ~IEEE80211_IBSS_SSID_SET; + + ifibss->ibss_join_req = jiffies; + ifibss->state = IEEE80211_IBSS_MLME_SEARCH; + + return ieee80211_sta_find_ibss(sdata); +} + +int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + + if (ifibss->ssid_len != len || memcmp(ifibss->ssid, ssid, len) != 0) { + memset(ifibss->ssid, 0, sizeof(ifibss->ssid)); + memcpy(ifibss->ssid, ssid, len); + ifibss->ssid_len = len; + } + + return ieee80211_ibss_commit(sdata); +} + +int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + memcpy(ssid, ifibss->ssid, ifibss->ssid_len); + *len = ifibss->ssid_len; + + return 0; +} + +int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (is_valid_ether_addr(bssid)) { + memcpy(ifibss->bssid, bssid, ETH_ALEN); + ifibss->flags |= IEEE80211_IBSS_BSSID_SET; + } else { + memset(ifibss->bssid, 0, ETH_ALEN); + ifibss->flags &= ~IEEE80211_IBSS_BSSID_SET; + } + + if (netif_running(sdata->dev)) { + if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) { + printk(KERN_DEBUG "%s: Failed to config new BSSID to " + "the low-level driver\n", sdata->dev->name); + } + } + + return ieee80211_ibss_commit(sdata); +} + +/* scan finished 
notification */ +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata = local->scan_sdata; + struct ieee80211_if_ibss *ifibss; + + if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) { + ifibss = &sdata->u.ibss; + if ((!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) || + !ieee80211_sta_active_ibss(sdata)) + ieee80211_sta_find_ibss(sdata); + } +} + +ieee80211_rx_result +ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + u16 fc; + + if (skb->len < 24) + return RX_DROP_MONITOR; + + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_STYPE_BEACON: + memcpy(skb->cb, rx_status, sizeof(*rx_status)); + case IEEE80211_STYPE_PROBE_REQ: + case IEEE80211_STYPE_AUTH: + skb_queue_tail(&sdata->u.ibss.skb_queue, skb); + queue_work(local->hw.workqueue, &sdata->u.ibss.work); + return RX_QUEUED; + } + + return RX_DROP_MONITOR; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2cb743ed9f9..fbb91f1aebb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -239,7 +239,7 @@ struct mesh_preq_queue { u8 flags; }; -/* flags used in struct ieee80211_if_sta.flags */ +/* flags used in struct ieee80211_if_managed.flags */ #define IEEE80211_STA_SSID_SET BIT(0) #define IEEE80211_STA_BSSID_SET BIT(1) #define IEEE80211_STA_PREV_BSSID_SET BIT(2) @@ -262,31 +262,30 @@ struct mesh_preq_queue { #define IEEE80211_STA_REQ_AUTH 2 #define IEEE80211_STA_REQ_RUN 3 -/* STA/IBSS MLME states */ -enum ieee80211_sta_mlme_state { - IEEE80211_STA_MLME_DISABLED, - IEEE80211_STA_MLME_DIRECT_PROBE, - IEEE80211_STA_MLME_AUTHENTICATE, - IEEE80211_STA_MLME_ASSOCIATE, - IEEE80211_STA_MLME_ASSOCIATED, - IEEE80211_STA_MLME_IBSS_SEARCH, 
- IEEE80211_STA_MLME_IBSS_JOINED, -}; - /* bitfield of allowed auth algs */ #define IEEE80211_AUTH_ALG_OPEN BIT(0) #define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1) #define IEEE80211_AUTH_ALG_LEAP BIT(2) -struct ieee80211_if_sta { +struct ieee80211_if_managed { struct timer_list timer; struct timer_list chswitch_timer; struct work_struct work; struct work_struct chswitch_work; + u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; - enum ieee80211_sta_mlme_state state; size_t ssid_len; + + enum { + IEEE80211_STA_MLME_DISABLED, + IEEE80211_STA_MLME_DIRECT_PROBE, + IEEE80211_STA_MLME_AUTHENTICATE, + IEEE80211_STA_MLME_ASSOCIATE, + IEEE80211_STA_MLME_ASSOCIATED, + } state; + u16 aid; u16 ap_capab, capab; u8 *extra_ie; /* to be added to the end of AssocReq */ @@ -319,10 +318,6 @@ struct ieee80211_if_sta { IEEE80211_MFP_REQUIRED } mfp; /* management frame protection */ - unsigned long ibss_join_req; - struct sk_buff *probe_resp; /* ProbeResp template for IBSS */ - u32 supp_rates_bits[IEEE80211_NUM_BANDS]; - int wmm_last_param_set; /* Extra IE data for management frames */ @@ -342,6 +337,42 @@ struct ieee80211_if_sta { size_t ie_disassoc_len; }; +enum ieee80211_ibss_flags { + IEEE80211_IBSS_AUTO_CHANNEL_SEL = BIT(0), + IEEE80211_IBSS_AUTO_BSSID_SEL = BIT(1), + IEEE80211_IBSS_BSSID_SET = BIT(2), + IEEE80211_IBSS_PREV_BSSID_SET = BIT(3), + IEEE80211_IBSS_SSID_SET = BIT(4), +}; + +enum ieee80211_ibss_request { + IEEE80211_IBSS_REQ_RUN = 0, +}; + +struct ieee80211_if_ibss { + struct timer_list timer; + struct work_struct work; + + struct sk_buff_head skb_queue; + + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + + u32 flags; + + u8 bssid[ETH_ALEN]; + + unsigned long request; + + unsigned long ibss_join_req; + struct sk_buff *probe_resp; /* ProbeResp template for IBSS */ + + enum { + IEEE80211_IBSS_MLME_SEARCH, + IEEE80211_IBSS_MLME_JOINED, + } state; +}; + struct ieee80211_if_mesh { struct work_struct work; struct timer_list housekeeping_timer; @@ -445,7 
+476,8 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; struct ieee80211_if_vlan vlan; - struct ieee80211_if_sta sta; + struct ieee80211_if_managed mgd; + struct ieee80211_if_ibss ibss; #ifdef CONFIG_MAC80211_MESH struct ieee80211_if_mesh mesh; #endif @@ -564,12 +596,10 @@ enum { enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_DRIVER, IEEE80211_QUEUE_STOP_REASON_PS, - IEEE80211_QUEUE_STOP_REASON_CSA + IEEE80211_QUEUE_STOP_REASON_CSA, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION, }; -/* maximum number of hardware queues we support. */ -#define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES) - struct ieee80211_master_priv { struct ieee80211_local *local; }; @@ -582,9 +612,15 @@ struct ieee80211_local { const struct ieee80211_ops *ops; - unsigned long queue_pool[BITS_TO_LONGS(QD_MAX_QUEUES)]; - unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; + /* AC queue corresponding to each AMPDU queue */ + s8 ampdu_ac_queue[IEEE80211_MAX_AMPDU_QUEUES]; + unsigned int amdpu_ac_stop_refcnt[IEEE80211_MAX_AMPDU_QUEUES]; + + unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES + + IEEE80211_MAX_AMPDU_QUEUES]; + /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; + struct net_device *mdev; /* wmaster# - "master" 802.11 device */ int open_count; int monitors, cooked_mntrs; @@ -888,34 +924,41 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u32 changed); void ieee80211_configure_filter(struct ieee80211_local *local); +u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); /* wireless extensions */ extern const struct iw_handler_def ieee80211_iw_handler_def; -/* STA/IBSS code */ +/* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); -void ieee80211_scan_work(struct work_struct *work); -void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data 
*sdata, struct sk_buff *skb, - struct ieee80211_rx_status *rx_status); +ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_rx_status *rx_status); +int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata); int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len); int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len); int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid); -void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta); -struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, - u8 *bssid, u8 *addr, u32 supp_rates); +void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata); int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason); int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason); -u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); -u32 ieee80211_sta_get_rates(struct ieee80211_local *local, - struct ieee802_11_elems *elems, - enum ieee80211_band band); -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, - u8 *ssid, size_t ssid_len); void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); +/* IBSS code */ +int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata); +int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len); +int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len); +int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid); +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); +ieee80211_rx_result +ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, + struct 
ieee80211_rx_status *rx_status); +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid, u8 *addr, u32 supp_rates); + /* scan/BSS handling */ +void ieee80211_scan_work(struct work_struct *work); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); int ieee80211_scan_results(struct ieee80211_local *local, @@ -929,6 +972,7 @@ int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); +void ieee80211_scan_failed(struct ieee80211_local *local); int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, struct cfg80211_scan_request *req); struct ieee80211_bss * @@ -1042,6 +1086,25 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, enum queue_stop_reason reason); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, enum queue_stop_reason reason); +void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); +void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, + const u8 *bssid, int encrypt); +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + u8 *ssid, size_t ssid_len, + u8 *ie, size_t ie_len); + +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates); +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index df94b936526..f9f27b9cadb 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -236,7 +236,10 @@ static int ieee80211_open(struct 
net_device *dev) break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: - sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags &= ~IEEE80211_STA_PREV_BSSID_SET; + else + sdata->u.ibss.flags &= ~IEEE80211_IBSS_PREV_BSSID_SET; /* fall through */ default: conf.vif = &sdata->vif; @@ -321,11 +324,10 @@ static int ieee80211_open(struct net_device *dev) * yet be effective. Trigger execution of ieee80211_sta_work * to fix this. */ - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - queue_work(local->hw.workqueue, &ifsta->work); - } + if (sdata->vif.type == NL80211_IFTYPE_STATION) + queue_work(local->hw.workqueue, &sdata->u.mgd.work); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + queue_work(local->hw.workqueue, &sdata->u.ibss.work); netif_tx_start_all_queues(dev); @@ -368,6 +370,18 @@ static int ieee80211_stop(struct net_device *dev) rcu_read_unlock(); /* + * Announce that we are leaving the network, in case we are a + * station interface type. This must be done before removing + * all stations associated with sta_info_flush, otherwise STA + * information will be gone and no announce being done. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.state != IEEE80211_STA_MLME_DISABLED) + ieee80211_sta_deauthenticate(sdata, + WLAN_REASON_DEAUTH_LEAVING); + } + + /* * Remove all stations associated with this interface. * * This must be done before calling ops->remove_interface() @@ -452,15 +466,9 @@ static int ieee80211_stop(struct net_device *dev) netif_addr_unlock_bh(local->mdev); break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: - /* Announce that we are leaving the network. 
*/ - if (sdata->u.sta.state != IEEE80211_STA_MLME_DISABLED) - ieee80211_sta_deauthenticate(sdata, - WLAN_REASON_DEAUTH_LEAVING); - - memset(sdata->u.sta.bssid, 0, ETH_ALEN); - del_timer_sync(&sdata->u.sta.chswitch_timer); - del_timer_sync(&sdata->u.sta.timer); + memset(sdata->u.mgd.bssid, 0, ETH_ALEN); + del_timer_sync(&sdata->u.mgd.chswitch_timer); + del_timer_sync(&sdata->u.mgd.timer); /* * If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again @@ -468,8 +476,8 @@ static int ieee80211_stop(struct net_device *dev) * whether the interface is running, which, at this point, * it no longer is. */ - cancel_work_sync(&sdata->u.sta.work); - cancel_work_sync(&sdata->u.sta.chswitch_work); + cancel_work_sync(&sdata->u.mgd.work); + cancel_work_sync(&sdata->u.mgd.chswitch_work); /* * When we get here, the interface is marked down. * Call synchronize_rcu() to wait for the RX path @@ -477,13 +485,22 @@ static int ieee80211_stop(struct net_device *dev) * frames at this very time on another CPU. 
*/ synchronize_rcu(); - skb_queue_purge(&sdata->u.sta.skb_queue); + skb_queue_purge(&sdata->u.mgd.skb_queue); - sdata->u.sta.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED | + sdata->u.mgd.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED | IEEE80211_STA_TKIP_WEP_USED); - kfree(sdata->u.sta.extra_ie); - sdata->u.sta.extra_ie = NULL; - sdata->u.sta.extra_ie_len = 0; + kfree(sdata->u.mgd.extra_ie); + sdata->u.mgd.extra_ie = NULL; + sdata->u.mgd.extra_ie_len = 0; + /* fall through */ + case NL80211_IFTYPE_ADHOC: + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + memset(sdata->u.ibss.bssid, 0, ETH_ALEN); + del_timer_sync(&sdata->u.ibss.timer); + cancel_work_sync(&sdata->u.ibss.work); + synchronize_rcu(); + skb_queue_purge(&sdata->u.ibss.skb_queue); + } /* fall through */ case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -629,19 +646,20 @@ static void ieee80211_teardown_sdata(struct net_device *dev) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_rmc_free(sdata); break; - case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: - kfree(sdata->u.sta.extra_ie); - kfree(sdata->u.sta.assocreq_ies); - kfree(sdata->u.sta.assocresp_ies); - kfree_skb(sdata->u.sta.probe_resp); - kfree(sdata->u.sta.ie_probereq); - kfree(sdata->u.sta.ie_proberesp); - kfree(sdata->u.sta.ie_auth); - kfree(sdata->u.sta.ie_assocreq); - kfree(sdata->u.sta.ie_reassocreq); - kfree(sdata->u.sta.ie_deauth); - kfree(sdata->u.sta.ie_disassoc); + kfree_skb(sdata->u.ibss.probe_resp); + break; + case NL80211_IFTYPE_STATION: + kfree(sdata->u.mgd.extra_ie); + kfree(sdata->u.mgd.assocreq_ies); + kfree(sdata->u.mgd.assocresp_ies); + kfree(sdata->u.mgd.ie_probereq); + kfree(sdata->u.mgd.ie_proberesp); + kfree(sdata->u.mgd.ie_auth); + kfree(sdata->u.mgd.ie_assocreq); + kfree(sdata->u.mgd.ie_reassocreq); + kfree(sdata->u.mgd.ie_deauth); + kfree(sdata->u.mgd.ie_disassoc); break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: @@ -708,9 +726,11 @@ static void ieee80211_setup_sdata(struct 
ieee80211_sub_if_data *sdata, INIT_LIST_HEAD(&sdata->u.ap.vlans); break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: ieee80211_sta_setup_sdata(sdata); break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_setup_sdata(sdata); + break; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) ieee80211_mesh_init_sdata(sdata); @@ -798,6 +818,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); + ndev->features |= NETIF_F_NETNS_LOCAL; /* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */ sdata = netdev_priv(ndev); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 19b480de4bb..687acf23054 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -400,7 +400,7 @@ void ieee80211_key_link(struct ieee80211_key *key, */ /* same here, the AP could be using QoS */ - ap = sta_info_get(key->local, key->sdata->u.sta.bssid); + ap = sta_info_get(key->local, key->sdata->u.mgd.bssid); if (ap) { if (test_sta_flags(ap, WLAN_STA_WME)) key->conf.flags |= diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 5667f4e8067..f38db4d37e5 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -169,9 +169,10 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed) memset(&conf, 0, sizeof(conf)); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - conf.bssid = sdata->u.sta.bssid; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + conf.bssid = sdata->u.mgd.bssid; + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + conf.bssid = sdata->u.ibss.bssid; else if (sdata->vif.type == NL80211_IFTYPE_AP) conf.bssid = sdata->dev->dev_addr; else if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -210,7 +211,7 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed) !!rcu_dereference(sdata->u.ap.beacon); break; case NL80211_IFTYPE_ADHOC: 
- conf.enable_beacon = !!sdata->u.sta.probe_resp; + conf.enable_beacon = !!sdata->u.ibss.probe_resp; break; case NL80211_IFTYPE_MESH_POINT: conf.enable_beacon = true; @@ -705,7 +706,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { struct ieee80211_local *local; - int priv_size; + int priv_size, i; struct wiphy *wiphy; /* Ensure 32-byte alignment of our private data and hw private data. @@ -779,6 +780,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, setup_timer(&local->dynamic_ps_timer, ieee80211_dynamic_ps_timer, (unsigned long) local); + for (i = 0; i < IEEE80211_MAX_AMPDU_QUEUES; i++) + local->ampdu_ac_queue[i] = -1; + /* using an s8 won't work with more than that */ + BUILD_BUG_ON(IEEE80211_MAX_AMPDU_QUEUES > 127); + sta_info_init(local); tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, @@ -855,6 +861,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 always supports monitor */ local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + result = wiphy_register(local->hw.wiphy); if (result < 0) goto fail_wiphy_register; @@ -872,7 +883,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv), "wmaster%d", ieee80211_master_setup, - ieee80211_num_queues(hw)); + hw->queues); if (!mdev) goto fail_mdev_alloc; @@ -916,6 +927,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy)); + local->mdev->features |= NETIF_F_NETNS_LOCAL; result = register_netdevice(local->mdev); if (result < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 
fbb766afe59..841b8450b3d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -15,11 +15,8 @@ #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> -#include <linux/wireless.h> -#include <linux/random.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> -#include <net/iw_handler.h> #include <net/mac80211.h> #include <asm/unaligned.h> @@ -35,15 +32,6 @@ #define IEEE80211_MONITORING_INTERVAL (2 * HZ) #define IEEE80211_PROBE_INTERVAL (60 * HZ) #define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ) -#define IEEE80211_SCAN_INTERVAL (2 * HZ) -#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) -#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) - -#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) -#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) - -#define IEEE80211_IBSS_MAX_STA_ENTRIES 128 - /* utils */ static int ecw2cw(int ecw) @@ -92,43 +80,6 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss, return count; } -/* also used by mesh code */ -u32 ieee80211_sta_get_rates(struct ieee80211_local *local, - struct ieee802_11_elems *elems, - enum ieee80211_band band) -{ - struct ieee80211_supported_band *sband; - struct ieee80211_rate *bitrates; - size_t num_rates; - u32 supp_rates; - int i, j; - sband = local->hw.wiphy->bands[band]; - - if (!sband) { - WARN_ON(1); - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - } - - bitrates = sband->bitrates; - num_rates = sband->n_bitrates; - supp_rates = 0; - for (i = 0; i < elems->supp_rates_len + - elems->ext_supp_rates_len; i++) { - u8 rate = 0; - int own_rate; - if (i < elems->supp_rates_len) - rate = elems->supp_rates[i]; - else if (elems->ext_supp_rates) - rate = elems->ext_supp_rates - [i - elems->supp_rates_len]; - own_rate = 5 * (rate & 0x7f); - for (j = 0; j < num_rates; j++) - if (bitrates[j].bitrate == own_rate) - supp_rates |= BIT(j); - } - return supp_rates; -} - /* frame sending functions */ static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) @@ 
-137,113 +88,9 @@ static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) memcpy(skb_put(skb, ies_len), ies, ies_len); } -/* also used by scanning code */ -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, - u8 *ssid, size_t ssid_len) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u8 *pos, *supp_rates, *esupp_rates = NULL; - int i; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + - sdata->u.sta.ie_probereq_len); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for probe " - "request\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); - memset(mgmt, 0, 24); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_REQ); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - if (dst) { - memcpy(mgmt->da, dst, ETH_ALEN); - memcpy(mgmt->bssid, dst, ETH_ALEN); - } else { - memset(mgmt->da, 0xff, ETH_ALEN); - memset(mgmt->bssid, 0xff, ETH_ALEN); - } - pos = skb_put(skb, 2 + ssid_len); - *pos++ = WLAN_EID_SSID; - *pos++ = ssid_len; - memcpy(pos, ssid, ssid_len); - - supp_rates = skb_put(skb, 2); - supp_rates[0] = WLAN_EID_SUPP_RATES; - supp_rates[1] = 0; - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - - for (i = 0; i < sband->n_bitrates; i++) { - struct ieee80211_rate *rate = &sband->bitrates[i]; - if (esupp_rates) { - pos = skb_put(skb, 1); - esupp_rates[1]++; - } else if (supp_rates[1] == 8) { - esupp_rates = skb_put(skb, 3); - esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES; - esupp_rates[1] = 1; - pos = &esupp_rates[2]; - } else { - pos = skb_put(skb, 1); - supp_rates[1]++; - } - *pos = rate->bitrate / 5; - } - - add_extra_ies(skb, sdata->u.sta.ie_probereq, - sdata->u.sta.ie_probereq_len); - - ieee80211_tx_skb(sdata, skb, 0); -} - -static void 
ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - int transaction, u8 *extra, size_t extra_len, - int encrypt) -{ - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 6 + extra_len + - sdata->u.sta.ie_auth_len); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for auth " - "frame\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); - memset(mgmt, 0, 24 + 6); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_AUTH); - if (encrypt) - mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg); - mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); - ifsta->auth_transaction = transaction + 1; - mgmt->u.auth.status_code = cpu_to_le16(0); - if (extra) - memcpy(skb_put(skb, extra_len), extra, extra_len); - add_extra_ies(skb, sdata->u.sta.ie_auth, sdata->u.sta.ie_auth_len); - - ieee80211_tx_skb(sdata, skb, encrypt); -} - -static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; @@ -256,17 +103,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, u32 rates = 0; size_t e_ies_len; - if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) { - e_ies = sdata->u.sta.ie_reassocreq; - e_ies_len = sdata->u.sta.ie_reassocreq_len; + if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) { + e_ies = 
sdata->u.mgd.ie_reassocreq; + e_ies_len = sdata->u.mgd.ie_reassocreq_len; } else { - e_ies = sdata->u.sta.ie_assocreq; - e_ies_len = sdata->u.sta.ie_assocreq_len; + e_ies = sdata->u.mgd.ie_assocreq; + e_ies_len = sdata->u.mgd.ie_assocreq_len; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 200 + ifsta->extra_ie_len + - ifsta->ssid_len + e_ies_len); + sizeof(*mgmt) + 200 + ifmgd->extra_ie_len + + ifmgd->ssid_len + e_ies_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for assoc " "frame\n", sdata->dev->name); @@ -276,7 +123,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - capab = ifsta->capab; + capab = ifmgd->capab; if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) { if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) @@ -285,9 +132,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (bss) { if (bss->cbss.capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; @@ -312,18 +159,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN); memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN); - if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) { + if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) { skb_put(skb, 10); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ); mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab); 
mgmt->u.reassoc_req.listen_interval = cpu_to_le16(local->hw.conf.listen_interval); - memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid, + memcpy(mgmt->u.reassoc_req.current_ap, ifmgd->prev_bssid, ETH_ALEN); } else { skb_put(skb, 4); @@ -335,10 +182,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } /* SSID */ - ies = pos = skb_put(skb, 2 + ifsta->ssid_len); + ies = pos = skb_put(skb, 2 + ifmgd->ssid_len); *pos++ = WLAN_EID_SSID; - *pos++ = ifsta->ssid_len; - memcpy(pos, ifsta->ssid, ifsta->ssid_len); + *pos++ = ifmgd->ssid_len; + memcpy(pos, ifmgd->ssid, ifmgd->ssid_len); /* add all rates which were marked to be used above */ supp_rates_len = rates_len; @@ -393,12 +240,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } } - if (ifsta->extra_ie) { - pos = skb_put(skb, ifsta->extra_ie_len); - memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len); + if (ifmgd->extra_ie) { + pos = skb_put(skb, ifmgd->extra_ie_len); + memcpy(pos, ifmgd->extra_ie, ifmgd->extra_ie_len); } - if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) { + if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) { pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; *pos++ = 7; /* len */ @@ -418,11 +265,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, * mode (11a/b/g) if any one of these ciphers is * configured as pairwise. 
*/ - if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) && + if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) && sband->ht_cap.ht_supported && (ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) && ht_ie[1] >= sizeof(struct ieee80211_ht_info) && - (!(ifsta->flags & IEEE80211_STA_TKIP_WEP_USED))) { + (!(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))) { struct ieee80211_ht_info *ht_info = (struct ieee80211_ht_info *)(ht_ie + 2); u16 cap = sband->ht_cap.cap; @@ -459,11 +306,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, add_extra_ies(skb, e_ies, e_ies_len); - kfree(ifsta->assocreq_ies); - ifsta->assocreq_ies_len = (skb->data + skb->len) - ies; - ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL); - if (ifsta->assocreq_ies) - memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len); + kfree(ifmgd->assocreq_ies); + ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies; + ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL); + if (ifmgd->assocreq_ies) + memcpy(ifmgd->assocreq_ies, ies, ifmgd->assocreq_ies_len); ieee80211_tx_skb(sdata, skb, 0); } @@ -473,18 +320,18 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *ies; size_t ies_len; if (stype == IEEE80211_STYPE_DEAUTH) { - ies = sdata->u.sta.ie_deauth; - ies_len = sdata->u.sta.ie_deauth_len; + ies = sdata->u.mgd.ie_deauth; + ies_len = sdata->u.mgd.ie_deauth_len; } else { - ies = sdata->u.sta.ie_disassoc; - ies_len = sdata->u.sta.ie_disassoc_len; + ies = sdata->u.mgd.ie_disassoc; + ies_len = sdata->u.mgd.ie_disassoc_len; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + @@ -498,9 +345,9 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, 
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN); memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); skb_put(skb, 2); /* u.deauth.reason_code == u.disassoc.reason_code */ @@ -508,13 +355,13 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, add_extra_ies(skb, ies, ies_len); - ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED); + ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED); } void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_pspoll *pspoll; struct sk_buff *skb; u16 fc; @@ -531,43 +378,20 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, memset(pspoll, 0, sizeof(*pspoll)); fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM; pspoll->frame_control = cpu_to_le16(fc); - pspoll->aid = cpu_to_le16(ifsta->aid); + pspoll->aid = cpu_to_le16(ifmgd->aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); - memcpy(pspoll->bssid, ifsta->bssid, ETH_ALEN); + memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN); ieee80211_tx_skb(sdata, skb, 0); - - return; } /* MLME */ -static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, - const size_t supp_rates_len, - const u8 *supp_rates) -{ - struct ieee80211_local *local = sdata->local; - int i, have_higher_than_11mbit = 0; - - /* cf. 
IEEE 802.11 9.2.12 */ - for (i = 0; i < supp_rates_len; i++) - if ((supp_rates[i] & 0x7f) * 5 > 110) - have_higher_than_11mbit = 1; - - if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && - have_higher_than_11mbit) - sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; - else - sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; - - ieee80211_set_wmm_default(sdata); -} - static void ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_if_sta *ifsta, + struct ieee80211_if_managed *ifmgd, u8 *wmm_param, size_t wmm_param_len) { struct ieee80211_tx_queue_params params; @@ -575,7 +399,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, int count; u8 *pos; - if (!(ifsta->flags & IEEE80211_STA_WMM_ENABLED)) + if (!(ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) return; if (!wmm_param) @@ -584,18 +408,15 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return; count = wmm_param[6] & 0x0f; - if (count == ifsta->wmm_last_param_set) + if (count == ifmgd->wmm_last_param_set) return; - ifsta->wmm_last_param_set = count; + ifmgd->wmm_last_param_set = count; pos = wmm_param + 8; left = wmm_param_len - 8; memset(&params, 0, sizeof(params)); - if (!local->ops->conf_tx) - return; - local->wmm_acm = 0; for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; @@ -603,26 +424,26 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, int queue; switch (aci) { - case 1: + case 1: /* AC_BK */ queue = 3; if (acm) - local->wmm_acm |= BIT(0) | BIT(3); + local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ break; - case 2: + case 2: /* AC_VI */ queue = 1; if (acm) - local->wmm_acm |= BIT(4) | BIT(5); + local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ break; - case 3: + case 3: /* AC_VO */ queue = 0; if (acm) - local->wmm_acm |= BIT(6) | BIT(7); + local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ break; - case 0: + case 0: /* AC_BE */ default: queue = 2; if 
(acm) - local->wmm_acm |= BIT(1) | BIT(2); + local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ break; } @@ -636,9 +457,8 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, local->mdev->name, queue, aci, acm, params.aifs, params.cw_min, params.cw_max, params.txop); #endif - /* TODO: handle ACM (block TX, fallback to next lowest allowed - * AC for now) */ - if (local->ops->conf_tx(local_to_hw(local), queue, &params)) { + if (local->ops->conf_tx && + local->ops->conf_tx(local_to_hw(local), queue, &params)) { printk(KERN_DEBUG "%s: failed to set TX queue " "parameters for queue %d\n", local->mdev->name, queue); } @@ -671,7 +491,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, { struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; #endif u32 changed = 0; bool use_protection; @@ -694,7 +514,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n", sdata->dev->name, use_protection ? "enabled" : "disabled", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_cts_prot = use_protection; @@ -708,7 +528,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, " (BSSID=%pM)\n", sdata->dev->name, use_short_preamble ? "short" : "long", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_short_preamble = use_short_preamble; @@ -722,7 +542,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, " (BSSID=%pM)\n", sdata->dev->name, use_short_slot ? 
"short" : "long", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_short_slot = use_short_slot; @@ -732,57 +552,57 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, return changed; } -static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata) { union iwreq_data wrqu; + memset(&wrqu, 0, sizeof(wrqu)); - if (ifsta->flags & IEEE80211_STA_ASSOCIATED) - memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN); + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) + memcpy(wrqu.ap_addr.sa_data, sdata->u.mgd.bssid, ETH_ALEN); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); } -static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; char *buf; size_t len; int i; union iwreq_data wrqu; - if (!ifsta->assocreq_ies && !ifsta->assocresp_ies) + if (!ifmgd->assocreq_ies && !ifmgd->assocresp_ies) return; - buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len + - ifsta->assocresp_ies_len), GFP_KERNEL); + buf = kmalloc(50 + 2 * (ifmgd->assocreq_ies_len + + ifmgd->assocresp_ies_len), GFP_KERNEL); if (!buf) return; len = sprintf(buf, "ASSOCINFO("); - if (ifsta->assocreq_ies) { + if (ifmgd->assocreq_ies) { len += sprintf(buf + len, "ReqIEs="); - for (i = 0; i < ifsta->assocreq_ies_len; i++) { + for (i = 0; i < ifmgd->assocreq_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocreq_ies[i]); + ifmgd->assocreq_ies[i]); } } - if (ifsta->assocresp_ies) { - if (ifsta->assocreq_ies) + if (ifmgd->assocresp_ies) { + if (ifmgd->assocreq_ies) len += sprintf(buf + len, " "); len += sprintf(buf + len, "RespIEs="); - for (i = 0; i < ifsta->assocresp_ies_len; i++) { + for (i = 0; i < 
ifmgd->assocresp_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocresp_ies[i]); + ifmgd->assocresp_ies[i]); } } len += sprintf(buf + len, ")"); if (len > IW_CUSTOM_MAX) { len = sprintf(buf, "ASSOCRESPIE="); - for (i = 0; i < ifsta->assocresp_ies_len; i++) { + for (i = 0; i < ifmgd->assocresp_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocresp_ies[i]); + ifmgd->assocresp_ies[i]); } } @@ -797,20 +617,20 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata, static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, u32 bss_info_changed) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_conf *conf = &local_to_hw(local)->conf; struct ieee80211_bss *bss; bss_info_changed |= BSS_CHANGED_ASSOC; - ifsta->flags |= IEEE80211_STA_ASSOCIATED; + ifmgd->flags |= IEEE80211_STA_ASSOCIATED; - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, conf->channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (bss) { /* set timing information */ sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval; @@ -823,11 +643,11 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_rx_bss_put(local, bss); } - ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET; - memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN); - ieee80211_sta_send_associnfo(sdata, ifsta); + ifmgd->flags |= IEEE80211_STA_PREV_BSSID_SET; + memcpy(ifmgd->prev_bssid, sdata->u.mgd.bssid, ETH_ALEN); + ieee80211_sta_send_associnfo(sdata); - ifsta->last_probe = jiffies; + ifmgd->last_probe = jiffies; ieee80211_led_assoc(local, 1); sdata->vif.bss_conf.assoc = 1; @@ -856,70 +676,74 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, netif_tx_start_all_queues(sdata->dev); netif_carrier_on(sdata->dev); - 
ieee80211_sta_send_apinfo(sdata, ifsta); + ieee80211_sta_send_apinfo(sdata); } -static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata) { - ifsta->direct_probe_tries++; - if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->direct_probe_tries++; + if (ifmgd->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) { printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); /* * Most likely AP is not in the range so remove the * bss information associated to the AP */ - ieee80211_rx_bss_remove(sdata, ifsta->bssid, + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); return; } printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n", - sdata->dev->name, ifsta->bssid, - ifsta->direct_probe_tries); + sdata->dev->name, ifmgd->bssid, + ifmgd->direct_probe_tries); - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; + ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; - set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request); + set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifmgd->request); /* Direct probe is sent to broadcast address as some APs * will not answer to direct packet in unassociated state. 
*/ ieee80211_send_probe_req(sdata, NULL, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len, NULL, 0); - mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT); } -static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata) { - ifsta->auth_tries++; - if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->auth_tries++; + if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) { printk(KERN_DEBUG "%s: authentication with AP %pM" " timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); - ieee80211_rx_bss_remove(sdata, ifsta->bssid, + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); return; } - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; printk(KERN_DEBUG "%s: authenticate with AP %pM\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); - ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0); + ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0, + ifmgd->bssid, 0); + ifmgd->auth_transaction = 2; - mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT); } /* @@ -927,27 +751,28 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata, * if self disconnected or a reason code from the AP. 
*/ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, bool deauth, - bool self_disconnected, u16 reason) + bool deauth, bool self_disconnected, + u16 reason) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *sta; u32 changed = 0, config_changed = 0; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; } if (deauth) { - ifsta->direct_probe_tries = 0; - ifsta->auth_tries = 0; + ifmgd->direct_probe_tries = 0; + ifmgd->auth_tries = 0; } - ifsta->assoc_scan_tries = 0; - ifsta->assoc_tries = 0; + ifmgd->assoc_scan_tries = 0; + ifmgd->assoc_tries = 0; netif_tx_stop_all_queues(sdata->dev); netif_carrier_off(sdata->dev); @@ -963,20 +788,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, IEEE80211_STYPE_DISASSOC, reason); } - ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; + ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED; changed |= ieee80211_reset_erp_info(sdata); ieee80211_led_assoc(local, 0); changed |= BSS_CHANGED_ASSOC; sdata->vif.bss_conf.assoc = false; - ieee80211_sta_send_apinfo(sdata, ifsta); + ieee80211_sta_send_apinfo(sdata); if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) { - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_rx_bss_remove(sdata, ifsta->bssid, + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); } rcu_read_unlock(); @@ -999,7 +824,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; @@ -1020,27 +845,27 @@ static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata) 
return 1; } -static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; int bss_privacy; int wep_privacy; int privacy_invoked; - if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL)) + if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL)) return 0; - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (!bss) return 0; bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY); wep_privacy = !!ieee80211_sta_wep_configured(sdata); - privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED); + privacy_invoked = !!(ifmgd->flags & IEEE80211_STA_PRIVACY_INVOKED); ieee80211_rx_bss_put(local, bss); @@ -1050,41 +875,42 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata, return 1; } -static void ieee80211_associate(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_associate(struct ieee80211_sub_if_data *sdata) { - ifsta->assoc_tries++; - if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->assoc_tries++; + if (ifmgd->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) { printk(KERN_DEBUG "%s: association with AP %pM" " timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); - ieee80211_rx_bss_remove(sdata, ifsta->bssid, + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, - ifsta->ssid, 
ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); return; } - ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE; printk(KERN_DEBUG "%s: associate with AP %pM\n", - sdata->dev->name, ifsta->bssid); - if (ieee80211_privacy_mismatch(sdata, ifsta)) { + sdata->dev->name, ifmgd->bssid); + if (ieee80211_privacy_mismatch(sdata)) { printk(KERN_DEBUG "%s: mismatch in privacy configuration and " "mixed-cell disabled - abort association\n", sdata->dev->name); - ifsta->state = IEEE80211_STA_MLME_DISABLED; + ifmgd->state = IEEE80211_STA_MLME_DISABLED; return; } - ieee80211_send_assoc(sdata, ifsta); + ieee80211_send_assoc(sdata); - mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_ASSOC_TIMEOUT); } -static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_associated(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *sta; int disassoc; @@ -1094,38 +920,40 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, * for better APs. 
*/ /* TODO: remove expired BSSes */ - ifsta->state = IEEE80211_STA_MLME_ASSOCIATED; + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATED; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); disassoc = 1; } else { disassoc = 0; if (time_after(jiffies, sta->last_rx + IEEE80211_MONITORING_INTERVAL)) { - if (ifsta->flags & IEEE80211_STA_PROBEREQ_POLL) { + if (ifmgd->flags & IEEE80211_STA_PROBEREQ_POLL) { printk(KERN_DEBUG "%s: No ProbeResp from " "current AP %pM - assume out of " "range\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); disassoc = 1; } else - ieee80211_send_probe_req(sdata, ifsta->bssid, - ifsta->ssid, - ifsta->ssid_len); - ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL; + ieee80211_send_probe_req(sdata, ifmgd->bssid, + ifmgd->ssid, + ifmgd->ssid_len, + NULL, 0); + ifmgd->flags ^= IEEE80211_STA_PROBEREQ_POLL; } else { - ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL; - if (time_after(jiffies, ifsta->last_probe + + ifmgd->flags &= ~IEEE80211_STA_PROBEREQ_POLL; + if (time_after(jiffies, ifmgd->last_probe + IEEE80211_PROBE_INTERVAL)) { - ifsta->last_probe = jiffies; - ieee80211_send_probe_req(sdata, ifsta->bssid, - ifsta->ssid, - ifsta->ssid_len); + ifmgd->last_probe = jiffies; + ieee80211_send_probe_req(sdata, ifmgd->bssid, + ifmgd->ssid, + ifmgd->ssid_len, + NULL, 0); } } } @@ -1133,25 +961,25 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (disassoc) - ieee80211_set_disassoc(sdata, ifsta, true, true, + ieee80211_set_disassoc(sdata, true, true, WLAN_REASON_PREV_AUTH_NOT_VALID); else - mod_timer(&ifsta->timer, jiffies + + mod_timer(&ifmgd->timer, jiffies + IEEE80211_MONITORING_INTERVAL); } -static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void 
ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name); - ifsta->flags |= IEEE80211_STA_AUTHENTICATED; - ieee80211_associate(sdata, ifsta); + ifmgd->flags |= IEEE80211_STA_AUTHENTICATED; + ieee80211_associate(sdata); } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { @@ -1162,59 +990,37 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); if (!elems.challenge) return; - ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2, - elems.challenge_len + 2, 1); -} - -static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - struct ieee80211_mgmt *mgmt, - size_t len) -{ - u16 auth_alg, auth_transaction, status_code; - - if (len < 24 + 6) - return; - - auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); - auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); - status_code = le16_to_cpu(mgmt->u.auth.status_code); - - /* - * IEEE 802.11 standard does not require authentication in IBSS - * networks and most implementations do not seem to use it. - * However, try to reply to authentication attempts if someone - * has actually implemented this. 
- */ - if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1) - ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0); + ieee80211_send_auth(sdata, 3, sdata->u.mgd.auth_alg, + elems.challenge - 2, elems.challenge_len + 2, + sdata->u.mgd.bssid, 1); + sdata->u.mgd.auth_transaction = 4; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 auth_alg, auth_transaction, status_code; - if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE) + if (ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE) return; if (len < 24 + 6) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0) return; - if (memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); - if (auth_alg != ifsta->auth_alg || - auth_transaction != ifsta->auth_transaction) + if (auth_alg != ifmgd->auth_alg || + auth_transaction != ifmgd->auth_transaction) return; if (status_code != WLAN_STATUS_SUCCESS) { @@ -1223,15 +1029,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, const int num_algs = ARRAY_SIZE(algs); int i, pos; algs[0] = algs[1] = algs[2] = 0xff; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN) algs[0] = WLAN_AUTH_OPEN; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) algs[1] = WLAN_AUTH_SHARED_KEY; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP) algs[2] = WLAN_AUTH_LEAP; - if (ifsta->auth_alg == WLAN_AUTH_OPEN) + if (ifmgd->auth_alg == WLAN_AUTH_OPEN) pos = 0; - else if (ifsta->auth_alg == 
WLAN_AUTH_SHARED_KEY) + else if (ifmgd->auth_alg == WLAN_AUTH_SHARED_KEY) pos = 1; else pos = 2; @@ -1239,101 +1045,101 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, pos++; if (pos >= num_algs) pos = 0; - if (algs[pos] == ifsta->auth_alg || + if (algs[pos] == ifmgd->auth_alg || algs[pos] == 0xff) continue; if (algs[pos] == WLAN_AUTH_SHARED_KEY && !ieee80211_sta_wep_configured(sdata)) continue; - ifsta->auth_alg = algs[pos]; + ifmgd->auth_alg = algs[pos]; break; } } return; } - switch (ifsta->auth_alg) { + switch (ifmgd->auth_alg) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: - ieee80211_auth_completed(sdata, ifsta); + ieee80211_auth_completed(sdata); break; case WLAN_AUTH_SHARED_KEY: - if (ifsta->auth_transaction == 4) - ieee80211_auth_completed(sdata, ifsta); + if (ifmgd->auth_transaction == 4) + ieee80211_auth_completed(sdata); else - ieee80211_auth_challenge(sdata, ifsta, mgmt, len); + ieee80211_auth_challenge(sdata, mgmt, len); break; } } static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; if (len < 24 + 2) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN)) return; reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); - if (ifsta->flags & IEEE80211_STA_AUTHENTICATED) + if (ifmgd->flags & IEEE80211_STA_AUTHENTICATED) printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE || - ifsta->state == IEEE80211_STA_MLME_ASSOCIATE || - ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; - mod_timer(&ifsta->timer, jiffies + + if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + ifmgd->state = 
IEEE80211_STA_MLME_DIRECT_PROBE; + mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); } - ieee80211_set_disassoc(sdata, ifsta, true, false, 0); - ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED; + ieee80211_set_disassoc(sdata, true, false, 0); + ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED; } static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; if (len < 24 + 2) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN)) return; reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); - if (ifsta->flags & IEEE80211_STA_ASSOCIATED) + if (ifmgd->flags & IEEE80211_STA_ASSOCIATED) printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { - ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; - mod_timer(&ifsta->timer, jiffies + + if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE; + mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); } - ieee80211_set_disassoc(sdata, ifsta, false, false, reason_code); + ieee80211_set_disassoc(sdata, false, false, reason_code); } static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len, int reassoc) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; @@ -1350,13 +1156,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, /* AssocResp and ReassocResp have identical structure, so process both * of them in this function. 
*/ - if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE) + if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE) return; if (len < 24 + 6) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0) return; capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); @@ -1381,7 +1187,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, "comeback duration %u TU (%u ms)\n", sdata->dev->name, tu, ms); if (ms > IEEE80211_ASSOC_TIMEOUT) - mod_timer(&ifsta->timer, + mod_timer(&ifmgd->timer, jiffies + msecs_to_jiffies(ms)); return; } @@ -1392,7 +1198,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, /* if this was a reassociation, ensure we try a "full" * association next time. This works around some broken APs * which do not correctly reject reassociation requests. */ - ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; + ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET; return; } @@ -1408,23 +1214,23 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } printk(KERN_DEBUG "%s: associated\n", sdata->dev->name); - ifsta->aid = aid; - ifsta->ap_capab = capab_info; + ifmgd->aid = aid; + ifmgd->ap_capab = capab_info; - kfree(ifsta->assocresp_ies); - ifsta->assocresp_ies_len = len - (pos - (u8 *) mgmt); - ifsta->assocresp_ies = kmalloc(ifsta->assocresp_ies_len, GFP_KERNEL); - if (ifsta->assocresp_ies) - memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len); + kfree(ifmgd->assocresp_ies); + ifmgd->assocresp_ies_len = len - (pos - (u8 *) mgmt); + ifmgd->assocresp_ies = kmalloc(ifmgd->assocresp_ies_len, GFP_KERNEL); + if (ifmgd->assocresp_ies) + memcpy(ifmgd->assocresp_ies, pos, ifmgd->assocresp_ies_len); rcu_read_lock(); /* Add STA entry for the AP */ - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { newsta = true; - sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC); + sta = 
sta_info_alloc(sdata, ifmgd->bssid, GFP_ATOMIC); if (!sta) { printk(KERN_DEBUG "%s: failed to alloc STA entry for" " the AP\n", sdata->dev->name); @@ -1497,7 +1303,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, else sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; - if (elems.ht_cap_elem) + /* If TKIP/WEP is used, no need to parse AP's HT capabilities */ + if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) ieee80211_ht_cap_ie_to_sta_ht_cap(sband, elems.ht_cap_elem, &sta->sta.ht_cap); @@ -1505,7 +1312,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, rate_control_rate_init(sta); - if (ifsta->flags & IEEE80211_STA_MFP_ENABLED) + if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) set_sta_flags(sta, WLAN_STA_MFP); if (elems.wmm_param) @@ -1524,11 +1331,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (elems.wmm_param) - ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, + ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param, elems.wmm_param_len); if (elems.ht_info_elem && elems.wmm_param && - (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) + (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) && + !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem, ap_ht_cap_flags); @@ -1536,163 +1344,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * ieee80211_set_associated() will tell the driver */ bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; - ieee80211_set_associated(sdata, ifsta, changed); + ieee80211_set_associated(sdata, changed); - ieee80211_associated(sdata, ifsta); + ieee80211_associated(sdata); } -static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - const u8 *bssid, const int beacon_int, - const int freq, - const size_t supp_rates_len, - const u8 *supp_rates, - const u16 
capability) -{ - struct ieee80211_local *local = sdata->local; - int res = 0, rates, i, j; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u8 *pos; - struct ieee80211_supported_band *sband; - union iwreq_data wrqu; - - if (local->ops->reset_tsf) { - /* Reset own TSF to allow time synchronization work. */ - local->ops->reset_tsf(local_to_hw(local)); - } - - if ((ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) && - memcmp(ifsta->bssid, bssid, ETH_ALEN) == 0) - return res; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 + - sdata->u.sta.ie_proberesp_len); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for probe " - "response\n", sdata->dev->name); - return -ENOMEM; - } - - if (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) { - /* Remove possible STA entries from other IBSS networks. */ - sta_info_flush_delayed(sdata); - } - - memcpy(ifsta->bssid, bssid, ETH_ALEN); - res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID); - if (res) - return res; - - local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10; - - sdata->drop_unencrypted = capability & - WLAN_CAPABILITY_PRIVACY ? 
1 : 0; - - res = ieee80211_set_freq(sdata, freq); - - if (res) - return res; - - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - - /* Build IBSS probe response */ - - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) - skb_put(skb, 24 + sizeof(mgmt->u.beacon)); - memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_RESP); - memset(mgmt->da, 0xff, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - mgmt->u.beacon.beacon_int = - cpu_to_le16(local->hw.conf.beacon_int); - mgmt->u.beacon.capab_info = cpu_to_le16(capability); - - pos = skb_put(skb, 2 + ifsta->ssid_len); - *pos++ = WLAN_EID_SSID; - *pos++ = ifsta->ssid_len; - memcpy(pos, ifsta->ssid, ifsta->ssid_len); - - rates = supp_rates_len; - if (rates > 8) - rates = 8; - pos = skb_put(skb, 2 + rates); - *pos++ = WLAN_EID_SUPP_RATES; - *pos++ = rates; - memcpy(pos, supp_rates, rates); - - if (sband->band == IEEE80211_BAND_2GHZ) { - pos = skb_put(skb, 2 + 1); - *pos++ = WLAN_EID_DS_PARAMS; - *pos++ = 1; - *pos++ = ieee80211_frequency_to_channel(freq); - } - - pos = skb_put(skb, 2 + 2); - *pos++ = WLAN_EID_IBSS_PARAMS; - *pos++ = 2; - /* FIX: set ATIM window based on scan results */ - *pos++ = 0; - *pos++ = 0; - - if (supp_rates_len > 8) { - rates = supp_rates_len - 8; - pos = skb_put(skb, 2 + rates); - *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = rates; - memcpy(pos, &supp_rates[8], rates); - } - - add_extra_ies(skb, sdata->u.sta.ie_proberesp, - sdata->u.sta.ie_proberesp_len); - - ifsta->probe_resp = skb; - - ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON | - IEEE80211_IFCC_BEACON_ENABLED); - - - rates = 0; - for (i = 0; i < supp_rates_len; i++) { - int bitrate = (supp_rates[i] & 0x7f) * 5; - for (j = 0; j < sband->n_bitrates; j++) - if (sband->bitrates[j].bitrate == bitrate) - rates |= BIT(j); - } - 
ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates; - - ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates); - - ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET; - ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED; - mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); - - ieee80211_led_assoc(local, true); - - memset(&wrqu, 0, sizeof(wrqu)); - memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); - wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); - - return res; -} - -static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - struct ieee80211_bss *bss) -{ - return __ieee80211_sta_join_ibss(sdata, ifsta, - bss->cbss.bssid, - bss->cbss.beacon_interval, - bss->cbss.channel->center_freq, - bss->supp_rates_len, bss->supp_rates, - bss->cbss.capability); -} - static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, @@ -1703,11 +1360,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int freq; struct ieee80211_bss *bss; - struct sta_info *sta; struct ieee80211_channel *channel; - u64 beacon_timestamp, rx_timestamp; - u32 supp_rates = 0; - enum ieee80211_band band = rx_status->band; if (elems->ds_params && elems->ds_params_len == 1) freq = ieee80211_channel_to_frequency(elems->ds_params[0]); @@ -1719,133 +1372,18 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && - memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) { - supp_rates = ieee80211_sta_get_rates(local, elems, band); - - rcu_read_lock(); - - sta = sta_info_get(local, mgmt->sa); - if (sta) { - u32 prev_rates; - - prev_rates = sta->sta.supp_rates[band]; - /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - 
ieee80211_mandatory_rates(local, band); - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - if (sta->sta.supp_rates[band] != prev_rates) - printk(KERN_DEBUG "%s: updated supp_rates set " - "for %pM based on beacon info (0x%llx | " - "0x%llx -> 0x%llx)\n", - sdata->dev->name, - sta->sta.addr, - (unsigned long long) prev_rates, - (unsigned long long) supp_rates, - (unsigned long long) sta->sta.supp_rates[band]); -#endif - } else { - ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); - } - - rcu_read_unlock(); - } - bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, channel, beacon); if (!bss) return; if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) && - (memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0)) { + (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN) == 0)) { struct ieee80211_channel_sw_ie *sw_elem = (struct ieee80211_channel_sw_ie *)elems->ch_switch_elem; ieee80211_process_chanswitch(sdata, sw_elem, bss); } - /* was just updated in ieee80211_bss_info_update */ - beacon_timestamp = bss->cbss.tsf; - - if (sdata->vif.type != NL80211_IFTYPE_ADHOC) - goto put_bss; - - /* check if we need to merge IBSS */ - - /* merge only on beacons (???) 
*/ - if (!beacon) - goto put_bss; - - /* we use a fixed BSSID */ - if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) - goto put_bss; - - /* not an IBSS */ - if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS)) - goto put_bss; - - /* different channel */ - if (bss->cbss.channel != local->oper_channel) - goto put_bss; - - /* different SSID */ - if (elems->ssid_len != sdata->u.sta.ssid_len || - memcmp(elems->ssid, sdata->u.sta.ssid, - sdata->u.sta.ssid_len)) - goto put_bss; - - if (rx_status->flag & RX_FLAG_TSFT) { - /* - * For correct IBSS merging we need mactime; since mactime is - * defined as the time the first data symbol of the frame hits - * the PHY, and the timestamp of the beacon is defined as "the - * time that the data symbol containing the first bit of the - * timestamp is transmitted to the PHY plus the transmitting - * STA's delays through its local PHY from the MAC-PHY - * interface to its interface with the WM" (802.11 11.1.2) - * - equals the time this bit arrives at the receiver - we have - * to take into account the offset between the two. - * - * E.g. at 1 MBit that means mactime is 192 usec earlier - * (=24 bytes * 8 usecs/byte) than the beacon timestamp. 
- */ - int rate; - - if (rx_status->flag & RX_FLAG_HT) - rate = 65; /* TODO: HT rates */ - else - rate = local->hw.wiphy->bands[band]-> - bitrates[rx_status->rate_idx].bitrate; - - rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate); - } else if (local && local->ops && local->ops->get_tsf) - /* second best option: get current TSF */ - rx_timestamp = local->ops->get_tsf(local_to_hw(local)); - else - /* can't merge without knowing the TSF */ - rx_timestamp = -1LLU; - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "RX beacon SA=%pM BSSID=" - "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", - mgmt->sa, mgmt->bssid, - (unsigned long long)rx_timestamp, - (unsigned long long)beacon_timestamp, - (unsigned long long)(rx_timestamp - beacon_timestamp), - jiffies); -#endif - - if (beacon_timestamp > rx_timestamp) { -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: beacon TSF higher than " - "local TSF - IBSS merge with BSSID %pM\n", - sdata->dev->name, mgmt->bssid); -#endif - ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss); - ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); - } - - put_bss: ieee80211_rx_bss_put(local, bss); } @@ -1857,7 +1395,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, { size_t baselen; struct ieee802_11_elems elems; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN)) return; /* ignore ProbeResp to foreign address */ @@ -1873,20 +1410,19 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* direct probe may be part of the association flow */ if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE, - &ifsta->request)) { + &sdata->u.mgd.request)) { printk(KERN_DEBUG "%s direct probe responded\n", sdata->dev->name); - ieee80211_authenticate(sdata, ifsta); + ieee80211_authenticate(sdata); } } - static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, 
struct ieee80211_rx_status *rx_status) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; size_t baselen; struct ieee802_11_elems elems; struct ieee80211_local *local = sdata->local; @@ -1905,21 +1441,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_STATION) return; - ifsta = &sdata->u.sta; - if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED) || - memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) + ifmgd = &sdata->u.mgd; + + if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED) || + memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0) return; if (rx_status->freq != local->hw.conf.channel->center_freq) return; - ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, + ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param, elems.wmm_param_len); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK && - local->hw.conf.flags & IEEE80211_CONF_PS) { - directed_tim = ieee80211_check_tim(&elems, ifsta->aid); + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { + directed_tim = ieee80211_check_tim(&elems, ifmgd->aid); if (directed_tim) { if (local->hw.conf.dynamic_ps_timeout > 0) { @@ -1954,14 +1490,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, erp_valid, erp_value); - if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param) { + if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param && + !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) { struct sta_info *sta; struct ieee80211_supported_band *sband; u16 ap_ht_cap_flags; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; @@ -1997,85 +1534,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_bss_info_change_notify(sdata, changed); } - -static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - struct ieee80211_mgmt 
*mgmt, - size_t len) +ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_rx_status *rx_status) { struct ieee80211_local *local = sdata->local; - int tx_last_beacon; - struct sk_buff *skb; - struct ieee80211_mgmt *resp; - u8 *pos, *end; - - if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED || - len < 24 + 2 || !ifsta->probe_resp) - return; - - if (local->ops->tx_last_beacon) - tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local)); - else - tx_last_beacon = 1; - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM" - " (tx_last_beacon=%d)\n", - sdata->dev->name, mgmt->sa, mgmt->da, - mgmt->bssid, tx_last_beacon); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - if (!tx_last_beacon) - return; - - if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0 && - memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0) - return; - - end = ((u8 *) mgmt) + len; - pos = mgmt->u.probe_req.variable; - if (pos[0] != WLAN_EID_SSID || - pos + 2 + pos[1] > end) { -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq " - "from %pM\n", - sdata->dev->name, mgmt->sa); -#endif - return; - } - if (pos[1] != 0 && - (pos[1] != ifsta->ssid_len || - memcmp(pos + 2, ifsta->ssid, ifsta->ssid_len) != 0)) { - /* Ignore ProbeReq for foreign SSID */ - return; - } - - /* Reply with ProbeResp */ - skb = skb_copy(ifsta->probe_resp, GFP_KERNEL); - if (!skb) - return; - - resp = (struct ieee80211_mgmt *) skb->data; - memcpy(resp->da, mgmt->sa, ETH_ALEN); -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n", - sdata->dev->name, resp->da); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - ieee80211_tx_skb(sdata, skb, 0); -} - -void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - struct ieee80211_rx_status *rx_status) -{ - struct ieee80211_local *local = sdata->local; - struct 
ieee80211_if_sta *ifsta; struct ieee80211_mgmt *mgmt; u16 fc; if (skb->len < 24) - goto fail; - - ifsta = &sdata->u.sta; + return RX_DROP_MONITOR; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); @@ -2090,147 +1558,68 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff * case IEEE80211_STYPE_REASSOC_RESP: case IEEE80211_STYPE_DEAUTH: case IEEE80211_STYPE_DISASSOC: - skb_queue_tail(&ifsta->skb_queue, skb); - queue_work(local->hw.workqueue, &ifsta->work); - return; + skb_queue_tail(&sdata->u.mgd.skb_queue, skb); + queue_work(local->hw.workqueue, &sdata->u.mgd.work); + return RX_QUEUED; } - fail: - kfree_skb(skb); + return RX_DROP_MONITOR; } static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status; - struct ieee80211_if_sta *ifsta; struct ieee80211_mgmt *mgmt; u16 fc; - ifsta = &sdata->u.sta; - rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { - switch (fc & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_PROBE_REQ: - ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt, - skb->len); - break; - case IEEE80211_STYPE_PROBE_RESP: - ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, - rx_status); - break; - case IEEE80211_STYPE_BEACON: - ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, - rx_status); - break; - case IEEE80211_STYPE_AUTH: - ieee80211_rx_mgmt_auth_ibss(sdata, ifsta, mgmt, - skb->len); - break; - } - } else { /* NL80211_IFTYPE_STATION */ - switch (fc & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_PROBE_RESP: - ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, - rx_status); - break; - case IEEE80211_STYPE_BEACON: - ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, - rx_status); - break; - case IEEE80211_STYPE_AUTH: - ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len); - break; - case 
IEEE80211_STYPE_ASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, - skb->len, 0); - break; - case IEEE80211_STYPE_REASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, - skb->len, 1); - break; - case IEEE80211_STYPE_DEAUTH: - ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len); - break; - case IEEE80211_STYPE_DISASSOC: - ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt, - skb->len); - break; - } + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_RESP: + ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_BEACON: + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_AUTH: + ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len); + break; + case IEEE80211_STYPE_ASSOC_RESP: + ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 0); + break; + case IEEE80211_STYPE_REASSOC_RESP: + ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 1); + break; + case IEEE80211_STYPE_DEAUTH: + ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); + break; + case IEEE80211_STYPE_DISASSOC: + ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); + break; } kfree_skb(skb); } - -static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - int active = 0; - struct sta_info *sta; - - rcu_read_lock(); - - list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (sta->sdata == sdata && - time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, - jiffies)) { - active++; - break; - } - } - - rcu_read_unlock(); - - return active; -} - - -static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); - - ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT); - if (ieee80211_sta_active_ibss(sdata)) - return; - - if ((sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) && - (!(sdata->u.sta.flags & 
IEEE80211_STA_AUTO_CHANNEL_SEL))) - return; - - printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " - "IBSS networks with same SSID (merge)\n", sdata->dev->name); - - /* XXX maybe racy? */ - if (sdata->local->scan_req) - return; - - memcpy(sdata->local->int_scan_req.ssids[0].ssid, - ifsta->ssid, IEEE80211_MAX_SSID_LEN); - sdata->local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; - ieee80211_request_scan(sdata, &sdata->local->int_scan_req); -} - - static void ieee80211_sta_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); } -static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; if (local->ops->reset_tsf) { @@ -2238,191 +1627,39 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata, local->ops->reset_tsf(local_to_hw(local)); } - ifsta->wmm_last_param_set = -1; /* allow any WMM update */ + ifmgd->wmm_last_param_set = -1; /* allow any WMM update */ - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) - ifsta->auth_alg = WLAN_AUTH_OPEN; - else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) - ifsta->auth_alg = WLAN_AUTH_SHARED_KEY; - else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) - ifsta->auth_alg = WLAN_AUTH_LEAP; + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN) + ifmgd->auth_alg = WLAN_AUTH_OPEN; + else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) + ifmgd->auth_alg = 
WLAN_AUTH_SHARED_KEY; + else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP) + ifmgd->auth_alg = WLAN_AUTH_LEAP; else - ifsta->auth_alg = WLAN_AUTH_OPEN; - ifsta->auth_transaction = -1; - ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; - ifsta->assoc_scan_tries = 0; - ifsta->direct_probe_tries = 0; - ifsta->auth_tries = 0; - ifsta->assoc_tries = 0; + ifmgd->auth_alg = WLAN_AUTH_OPEN; + ifmgd->auth_transaction = -1; + ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED; + ifmgd->assoc_scan_tries = 0; + ifmgd->direct_probe_tries = 0; + ifmgd->auth_tries = 0; + ifmgd->assoc_tries = 0; netif_tx_stop_all_queues(sdata->dev); netif_carrier_off(sdata->dev); } -static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - u8 *pos; - u8 bssid[ETH_ALEN]; - u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; - u16 capability; - int i; - - if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) { - memcpy(bssid, ifsta->bssid, ETH_ALEN); - } else { - /* Generate random, not broadcast, locally administered BSSID. Mix in - * own MAC address to make sure that devices that do not have proper - * random number generator get different BSSID. 
*/ - get_random_bytes(bssid, ETH_ALEN); - for (i = 0; i < ETH_ALEN; i++) - bssid[i] ^= sdata->dev->dev_addr[i]; - bssid[0] &= ~0x01; - bssid[0] |= 0x02; - } - - printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n", - sdata->dev->name, bssid); - - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - - if (local->hw.conf.beacon_int == 0) - local->hw.conf.beacon_int = 100; - - capability = WLAN_CAPABILITY_IBSS; - - if (sdata->default_key) - capability |= WLAN_CAPABILITY_PRIVACY; - else - sdata->drop_unencrypted = 0; - - pos = supp_rates; - for (i = 0; i < sband->n_bitrates; i++) { - int rate = sband->bitrates[i].bitrate; - *pos++ = (u8) (rate / 5); - } - - return __ieee80211_sta_join_ibss(sdata, ifsta, - bssid, local->hw.conf.beacon_int, - local->hw.conf.channel->center_freq, - sband->n_bitrates, supp_rates, - capability); -} - - -static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_bss *bss; - int active_ibss; - - if (ifsta->ssid_len == 0) - return -EINVAL; - - active_ibss = ieee80211_sta_active_ibss(sdata); -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n", - sdata->dev->name, active_ibss); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - if (active_ibss) - return 0; - - if (ifsta->flags & IEEE80211_STA_BSSID_SET) - bss = ieee80211_rx_bss_get(local, ifsta->bssid, 0, - ifsta->ssid, ifsta->ssid_len); - else - bss = (void *)cfg80211_get_ibss(local->hw.wiphy, - NULL, - ifsta->ssid, ifsta->ssid_len); - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - if (bss) - printk(KERN_DEBUG " sta_find_ibss: selected %pM current " - "%pM\n", bss->cbss.bssid, ifsta->bssid); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - if (bss && - (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) || - memcmp(ifsta->bssid, bss->cbss.bssid, ETH_ALEN))) { - int ret; - - printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" - " based on 
configured SSID\n", - sdata->dev->name, bss->cbss.bssid); - - ret = ieee80211_sta_join_ibss(sdata, ifsta, bss); - ieee80211_rx_bss_put(local, bss); - return ret; - } else if (bss) - ieee80211_rx_bss_put(local, bss); - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG " did not try to join ibss\n"); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - /* Selected IBSS not found in current scan results - try to scan */ - if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED && - !ieee80211_sta_active_ibss(sdata)) { - mod_timer(&ifsta->timer, jiffies + - IEEE80211_IBSS_MERGE_INTERVAL); - } else if (time_after(jiffies, local->last_scan_completed + - IEEE80211_SCAN_INTERVAL)) { - printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " - "join\n", sdata->dev->name); - - /* XXX maybe racy? */ - if (local->scan_req) - return -EBUSY; - - memcpy(local->int_scan_req.ssids[0].ssid, - ifsta->ssid, IEEE80211_MAX_SSID_LEN); - local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; - return ieee80211_request_scan(sdata, &local->int_scan_req); - } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) { - int interval = IEEE80211_SCAN_INTERVAL; - - if (time_after(jiffies, ifsta->ibss_join_req + - IEEE80211_IBSS_JOIN_TIMEOUT)) { - if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) && - (!(local->oper_channel->flags & - IEEE80211_CHAN_NO_IBSS))) - return ieee80211_sta_create_ibss(sdata, ifsta); - if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) { - printk(KERN_DEBUG "%s: IBSS not allowed on" - " %d MHz\n", sdata->dev->name, - local->hw.conf.channel->center_freq); - } - - /* No IBSS found - decrease scan interval and continue - * scanning. 
*/ - interval = IEEE80211_SCAN_INTERVAL_SLOW; - } - - ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH; - mod_timer(&ifsta->timer, jiffies + interval); - return 0; - } - - return 0; -} - - -static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; - u8 *bssid = ifsta->bssid, *ssid = ifsta->ssid; - u8 ssid_len = ifsta->ssid_len; + u8 *bssid = ifmgd->bssid, *ssid = ifmgd->ssid; + u8 ssid_len = ifmgd->ssid_len; u16 capa_mask = WLAN_CAPABILITY_ESS; u16 capa_val = WLAN_CAPABILITY_ESS; struct ieee80211_channel *chan = local->oper_channel; - if (ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL | + if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL | IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL)) { capa_mask |= WLAN_CAPABILITY_PRIVACY; @@ -2430,13 +1667,13 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, capa_val |= WLAN_CAPABILITY_PRIVACY; } - if (ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) + if (ifmgd->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) chan = NULL; - if (ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) + if (ifmgd->flags & IEEE80211_STA_AUTO_BSSID_SEL) bssid = NULL; - if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) { + if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) { ssid = NULL; ssid_len = 0; } @@ -2447,16 +1684,16 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, if (bss) { ieee80211_set_freq(sdata, bss->cbss.channel->center_freq); - if (!(ifsta->flags & IEEE80211_STA_SSID_SET)) + if (!(ifmgd->flags & IEEE80211_STA_SSID_SET)) ieee80211_sta_set_ssid(sdata, bss->ssid, bss->ssid_len); ieee80211_sta_set_bssid(sdata, bss->cbss.bssid); ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len, bss->supp_rates); - if (sdata->u.sta.mfp == IEEE80211_MFP_REQUIRED) - 
sdata->u.sta.flags |= IEEE80211_STA_MFP_ENABLED; + if (sdata->u.mgd.mfp == IEEE80211_MFP_REQUIRED) + sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED; else - sdata->u.sta.flags &= ~IEEE80211_STA_MFP_ENABLED; + sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED; /* Send out direct probe if no probe resp was received or * the one we have is outdated @@ -2464,31 +1701,34 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, if (!bss->last_probe_resp || time_after(jiffies, bss->last_probe_resp + IEEE80211_SCAN_RESULT_EXPIRE)) - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; + ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; else - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; ieee80211_rx_bss_put(local, bss); - ieee80211_sta_reset_auth(sdata, ifsta); + ieee80211_sta_reset_auth(sdata); return 0; } else { - if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { - ifsta->assoc_scan_tries++; + if (ifmgd->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { + ifmgd->assoc_scan_tries++; /* XXX maybe racy? 
*/ if (local->scan_req) return -1; memcpy(local->int_scan_req.ssids[0].ssid, - ifsta->ssid, IEEE80211_MAX_SSID_LEN); - if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) + ifmgd->ssid, IEEE80211_MAX_SSID_LEN); + if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) local->int_scan_req.ssids[0].ssid_len = 0; else - local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; - ieee80211_start_scan(sdata, &local->int_scan_req); - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; - set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); + local->int_scan_req.ssids[0].ssid_len = ifmgd->ssid_len; + + if (ieee80211_start_scan(sdata, &local->int_scan_req)) + ieee80211_scan_failed(local); + + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; + set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); } else { - ifsta->assoc_scan_tries = 0; - ifsta->state = IEEE80211_STA_MLME_DISABLED; + ifmgd->assoc_scan_tries = 0; + ifmgd->state = IEEE80211_STA_MLME_DISABLED; } } return -1; @@ -2498,9 +1738,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, static void ieee80211_sta_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, u.sta.work); + container_of(work, struct ieee80211_sub_if_data, u.mgd.work); struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; struct sk_buff *skb; if (!netif_running(sdata->dev)) @@ -2509,60 +1749,60 @@ static void ieee80211_sta_work(struct work_struct *work) if (local->sw_scanning || local->hw_scanning) return; - if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC)) + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - ifsta = &sdata->u.sta; + ifmgd = &sdata->u.mgd; - while ((skb = skb_dequeue(&ifsta->skb_queue))) + while ((skb = skb_dequeue(&ifmgd->skb_queue))) ieee80211_sta_rx_queued_mgmt(sdata, skb); - if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE && - 
ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE && - ifsta->state != IEEE80211_STA_MLME_ASSOCIATE && - test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) { - ieee80211_start_scan(sdata, local->scan_req); + if (ifmgd->state != IEEE80211_STA_MLME_DIRECT_PROBE && + ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE && + ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE && + test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request)) { + /* + * The call to ieee80211_start_scan can fail but ieee80211_request_scan + * (which queued ieee80211_sta_work) did not return an error. Thus, call + * ieee80211_scan_failed here if ieee80211_start_scan fails in order to + * notify the scan requester. + */ + if (ieee80211_start_scan(sdata, local->scan_req)) + ieee80211_scan_failed(local); return; } - if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) { - if (ieee80211_sta_config_auth(sdata, ifsta)) + if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request)) { + if (ieee80211_sta_config_auth(sdata)) return; - clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); - } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request)) + clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request); + } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request)) return; - switch (ifsta->state) { + switch (ifmgd->state) { case IEEE80211_STA_MLME_DISABLED: break; case IEEE80211_STA_MLME_DIRECT_PROBE: - ieee80211_direct_probe(sdata, ifsta); + ieee80211_direct_probe(sdata); break; case IEEE80211_STA_MLME_AUTHENTICATE: - ieee80211_authenticate(sdata, ifsta); + ieee80211_authenticate(sdata); break; case IEEE80211_STA_MLME_ASSOCIATE: - ieee80211_associate(sdata, ifsta); + ieee80211_associate(sdata); break; case IEEE80211_STA_MLME_ASSOCIATED: - ieee80211_associated(sdata, ifsta); - break; - case IEEE80211_STA_MLME_IBSS_SEARCH: - ieee80211_sta_find_ibss(sdata, ifsta); - break; - case IEEE80211_STA_MLME_IBSS_JOINED: - ieee80211_sta_merge_ibss(sdata, ifsta); + 
ieee80211_associated(sdata); break; default: WARN_ON(1); break; } - if (ieee80211_privacy_mismatch(sdata, ifsta)) { + if (ieee80211_privacy_mismatch(sdata)) { printk(KERN_DEBUG "%s: privacy configuration mismatch and " "mixed-cell disabled - disassociate\n", sdata->dev->name); - ieee80211_set_disassoc(sdata, ifsta, false, true, + ieee80211_set_disassoc(sdata, false, true, WLAN_REASON_UNSPECIFIED); } } @@ -2571,155 +1811,106 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_STATION) queue_work(sdata->local->hw.workqueue, - &sdata->u.sta.work); + &sdata->u.mgd.work); } /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; - ifsta = &sdata->u.sta; - INIT_WORK(&ifsta->work, ieee80211_sta_work); - INIT_WORK(&ifsta->chswitch_work, ieee80211_chswitch_work); - setup_timer(&ifsta->timer, ieee80211_sta_timer, + ifmgd = &sdata->u.mgd; + INIT_WORK(&ifmgd->work, ieee80211_sta_work); + INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); + setup_timer(&ifmgd->timer, ieee80211_sta_timer, (unsigned long) sdata); - setup_timer(&ifsta->chswitch_timer, ieee80211_chswitch_timer, + setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, (unsigned long) sdata); - skb_queue_head_init(&ifsta->skb_queue); + skb_queue_head_init(&ifmgd->skb_queue); - ifsta->capab = WLAN_CAPABILITY_ESS; - ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN | + ifmgd->capab = WLAN_CAPABILITY_ESS; + ifmgd->auth_algs = IEEE80211_AUTH_ALG_OPEN | IEEE80211_AUTH_ALG_SHARED_KEY; - ifsta->flags |= IEEE80211_STA_CREATE_IBSS | + ifmgd->flags |= IEEE80211_STA_CREATE_IBSS | IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL; if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4) - ifsta->flags |= IEEE80211_STA_WMM_ENABLED; -} - -/* - * Add a new IBSS station, will also be called by the RX code when, - * in IBSS mode, receiving 
a frame from a yet-unknown station, hence - * must be callable in atomic context. - */ -struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, - u8 *bssid,u8 *addr, u32 supp_rates) -{ - struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - int band = local->hw.conf.channel->band; - - /* TODO: Could consider removing the least recently used entry and - * allow new one to be added. */ - if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { - if (net_ratelimit()) { - printk(KERN_DEBUG "%s: No room for a new IBSS STA " - "entry %pM\n", sdata->dev->name, addr); - } - return NULL; - } - - if (compare_ether_addr(bssid, sdata->u.sta.bssid)) - return NULL; - -#ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", - wiphy_name(local->hw.wiphy), addr, sdata->dev->name); -#endif - - sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); - if (!sta) - return NULL; - - set_sta_flags(sta, WLAN_STA_AUTHORIZED); - - /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(local, band); - - rate_control_rate_init(sta); - - if (sta_info_insert(sta)) - return NULL; - - return sta; + ifmgd->flags |= IEEE80211_STA_WMM_ENABLED; } /* configuration hooks */ -void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - if (sdata->vif.type != NL80211_IFTYPE_STATION) + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - if ((ifsta->flags & (IEEE80211_STA_BSSID_SET | + if ((ifmgd->flags & (IEEE80211_STA_BSSID_SET | IEEE80211_STA_AUTO_BSSID_SEL)) && - (ifsta->flags & (IEEE80211_STA_SSID_SET | + (ifmgd->flags & (IEEE80211_STA_SSID_SET | IEEE80211_STA_AUTO_SSID_SEL))) { - if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) - 
ieee80211_set_disassoc(sdata, ifsta, true, true, + if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) + ieee80211_set_disassoc(sdata, true, true, WLAN_REASON_DEAUTH_LEAVING); - set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); } } -int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) +int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - if (len > IEEE80211_MAX_SSID_LEN) - return -EINVAL; + ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET; - ifsta = &sdata->u.sta; + if (ifmgd->ssid_len) + ifmgd->flags |= IEEE80211_STA_SSID_SET; + else + ifmgd->flags &= ~IEEE80211_STA_SSID_SET; - if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) { - memset(ifsta->ssid, 0, sizeof(ifsta->ssid)); - memcpy(ifsta->ssid, ssid, len); - ifsta->ssid_len = len; - } + return 0; +} - ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; +int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) +{ + struct ieee80211_if_managed *ifmgd; - if (len) - ifsta->flags |= IEEE80211_STA_SSID_SET; - else - ifsta->flags &= ~IEEE80211_STA_SSID_SET; + if (len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + + ifmgd = &sdata->u.mgd; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { - ifsta->ibss_join_req = jiffies; - ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH; - return ieee80211_sta_find_ibss(sdata, ifsta); + if (ifmgd->ssid_len != len || memcmp(ifmgd->ssid, ssid, len) != 0) { + memset(ifmgd->ssid, 0, sizeof(ifmgd->ssid)); + memcpy(ifmgd->ssid, ssid, len); + ifmgd->ssid_len = len; } - return 0; + return ieee80211_sta_commit(sdata); } int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - memcpy(ssid, 
ifsta->ssid, ifsta->ssid_len); - *len = ifsta->ssid_len; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + memcpy(ssid, ifmgd->ssid, ifmgd->ssid_len); + *len = ifmgd->ssid_len; return 0; } int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) { - struct ieee80211_if_sta *ifsta; - - ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (is_valid_ether_addr(bssid)) { - memcpy(ifsta->bssid, bssid, ETH_ALEN); - ifsta->flags |= IEEE80211_STA_BSSID_SET; + memcpy(ifmgd->bssid, bssid, ETH_ALEN); + ifmgd->flags |= IEEE80211_STA_BSSID_SET; } else { - memset(ifsta->bssid, 0, ETH_ALEN); - ifsta->flags &= ~IEEE80211_STA_BSSID_SET; + memset(ifmgd->bssid, 0, ETH_ALEN); + ifmgd->flags &= ~IEEE80211_STA_BSSID_SET; } if (netif_running(sdata->dev)) { @@ -2729,47 +1920,44 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) } } - return ieee80211_sta_set_ssid(sdata, ifsta->ssid, ifsta->ssid_len); + return ieee80211_sta_commit(sdata); } int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - kfree(ifsta->extra_ie); + kfree(ifmgd->extra_ie); if (len == 0) { - ifsta->extra_ie = NULL; - ifsta->extra_ie_len = 0; + ifmgd->extra_ie = NULL; + ifmgd->extra_ie_len = 0; return 0; } - ifsta->extra_ie = kmalloc(len, GFP_KERNEL); - if (!ifsta->extra_ie) { - ifsta->extra_ie_len = 0; + ifmgd->extra_ie = kmalloc(len, GFP_KERNEL); + if (!ifmgd->extra_ie) { + ifmgd->extra_ie_len = 0; return -ENOMEM; } - memcpy(ifsta->extra_ie, ie, len); - ifsta->extra_ie_len = len; + memcpy(ifmgd->extra_ie, ie, len); + ifmgd->extra_ie_len = len; return 0; } int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n", sdata->dev->name, reason); - if 
(sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC) + if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; - ieee80211_set_disassoc(sdata, ifsta, true, true, reason); + ieee80211_set_disassoc(sdata, true, true, reason); return 0; } int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n", sdata->dev->name, reason); @@ -2777,10 +1965,10 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; - if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED)) - return -1; + if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED)) + return -ENOLINK; - ieee80211_set_disassoc(sdata, ifsta, false, true, reason); + ieee80211_set_disassoc(sdata, false, true, reason); return 0; } @@ -2788,14 +1976,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata = local->scan_sdata; - struct ieee80211_if_sta *ifsta; - - if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) { - ifsta = &sdata->u.sta; - if ((!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) || - !ieee80211_sta_active_ibss(sdata)) - ieee80211_sta_find_ibss(sdata, ifsta); - } /* Restart STA timers */ rcu_read_lock(); @@ -2842,3 +2022,36 @@ void ieee80211_dynamic_ps_timer(unsigned long data) queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work); } + +void ieee80211_send_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int powersave) +{ + struct sk_buff *skb; + struct ieee80211_hdr *nullfunc; + __le16 fc; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) + return; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24); + if (!skb) 
{ + printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " + "frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24); + memset(nullfunc, 0, 24); + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_TODS); + if (powersave) + fc |= cpu_to_le16(IEEE80211_FCTL_PM); + nullfunc->frame_control = fc; + memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN); + memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN); + + ieee80211_tx_skb(sdata, skb, 0); +} diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 928da625e28..b9164c9a956 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -62,6 +62,18 @@ static inline void rate_control_rate_init(struct sta_info *sta) ref->ops->rate_init(ref->priv, sband, ista, priv_sta); } +static inline void rate_control_rate_update(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, u32 changed) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + + if (ref->ops->rate_update) + ref->ops->rate_update(ref->priv, sband, ista, + priv_sta, changed); +} static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct ieee80211_sta *sta, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1327d424bf3..66f7ecf51b9 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -838,7 +838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) { u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, NL80211_IFTYPE_ADHOC); - if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0) + if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0) sta->last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1) || @@ -1702,13 +1702,13 @@ static void 
ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; } - if (compare_ether_addr(mgmt->sa, sdata->u.sta.bssid) != 0 || - compare_ether_addr(mgmt->bssid, sdata->u.sta.bssid) != 0) { + if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 || + compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) { /* Not from the current AP. */ return; } - if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATE) { + if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATE) { /* Association in progress; ignore SA Query */ return; } @@ -1727,7 +1727,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, memset(resp, 0, 24); memcpy(resp->da, mgmt->sa, ETH_ALEN); memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(resp->bssid, sdata->u.sta.bssid, ETH_ALEN); + memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN); resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); @@ -1745,7 +1745,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); - struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; struct ieee80211_bss *bss; int len = rx->skb->len; @@ -1803,6 +1802,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) case WLAN_CATEGORY_SPECTRUM_MGMT: if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ) return RX_DROP_MONITOR; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + switch (mgmt->u.action.u.measurement.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: if (len < (IEEE80211_MIN_ACTION_SIZE + @@ -1815,12 +1818,13 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) sizeof(mgmt->u.action.u.chan_switch))) return RX_DROP_MONITOR; - if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0) + if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, 
ETH_ALEN)) return RX_DROP_MONITOR; - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, sdata->u.mgd.bssid, local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + sdata->u.mgd.ssid, + sdata->u.mgd.ssid_len); if (!bss) return RX_DROP_MONITOR; @@ -1876,11 +1880,14 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC) return RX_DROP_MONITOR; - if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) - return RX_DROP_MONITOR; - ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status); - return RX_QUEUED; + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) + return RX_DROP_MONITOR; + return ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status); + } + + return ieee80211_ibss_rx_mgmt(sdata, rx->skb, rx->status); } static void ieee80211_rx_michael_mic_report(struct net_device *dev, @@ -2083,7 +2090,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_STATION: if (!bssid) return 0; - if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { + if (!ieee80211_bssid_match(bssid, sdata->u.mgd.bssid)) { if (!(rx->flags & IEEE80211_RX_IN_SCAN)) return 0; rx->flags &= ~IEEE80211_RX_RA_MATCH; @@ -2101,7 +2108,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, if (ieee80211_is_beacon(hdr->frame_control)) { return 1; } - else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { + else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { if (!(rx->flags & IEEE80211_RX_IN_SCAN)) return 0; rx->flags &= ~IEEE80211_RX_RA_MATCH; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f883ab9f1e6..5030a3c8750 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -63,20 +63,15 @@ ieee80211_bss_info_update(struct ieee80211_local *local, { struct ieee80211_bss *bss; int clen; - enum cfg80211_signal_type sigtype = CFG80211_SIGNAL_TYPE_NONE; s32 signal = 0; - if (local->hw.flags & 
IEEE80211_HW_SIGNAL_DBM) { - sigtype = CFG80211_SIGNAL_TYPE_MBM; + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) signal = rx_status->signal * 100; - } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { - sigtype = CFG80211_SIGNAL_TYPE_UNSPEC; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) signal = (rx_status->signal * 100) / local->hw.max_signal; - } bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel, - mgmt, len, signal, sigtype, - GFP_ATOMIC); + mgmt, len, signal, GFP_ATOMIC); if (!bss) return NULL; @@ -207,34 +202,16 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, return RX_QUEUED; } -void ieee80211_send_nullfunc(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - int powersave) +void ieee80211_scan_failed(struct ieee80211_local *local) { - struct sk_buff *skb; - struct ieee80211_hdr *nullfunc; - __le16 fc; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " - "frame\n", sdata->dev->name); + if (WARN_ON(!local->scan_req)) return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24); - memset(nullfunc, 0, 24); - fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | - IEEE80211_FCTL_TODS); - if (powersave) - fc |= cpu_to_le16(IEEE80211_FCTL_PM); - nullfunc->frame_control = fc; - memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN); - memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN); - memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN); - - ieee80211_tx_skb(sdata, skb, 0); + + /* notify cfg80211 about the failed scan */ + if (local->scan_req != &local->int_scan_req) + cfg80211_scan_done(local->scan_req, true); + + local->scan_req = NULL; } void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) @@ -280,6 +257,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) 
netif_addr_unlock(local->mdev); netif_tx_unlock_bh(local->mdev); + if (local->ops->sw_scan_complete) + local->ops->sw_scan_complete(local_to_hw(local)); + mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (!netif_running(sdata->dev)) @@ -287,7 +267,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION) { - if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) { ieee80211_send_nullfunc(local, sdata, 0); netif_tx_wake_all_queues(sdata->dev); } @@ -305,6 +285,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) done: ieee80211_mlme_notify_scan_completed(local); + ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); } EXPORT_SYMBOL(ieee80211_scan_completed); @@ -367,7 +348,8 @@ void ieee80211_scan_work(struct work_struct *work) ieee80211_send_probe_req( sdata, NULL, local->scan_req->ssids[i].ssid, - local->scan_req->ssids[i].ssid_len); + local->scan_req->ssids[i].ssid_len, + local->scan_req->ie, local->scan_req->ie_len); next_delay = IEEE80211_CHANNEL_TIME; break; } @@ -428,6 +410,8 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, } local->sw_scanning = true; + if (local->ops->sw_scan_start) + local->ops->sw_scan_start(local_to_hw(local)); mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { @@ -442,7 +426,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, IEEE80211_IFCC_BEACON_ENABLED); if (sdata->vif.type == NL80211_IFTYPE_STATION) { - if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) { netif_tx_stop_all_queues(sdata->dev); ieee80211_send_nullfunc(local, sdata, 1); } @@ -477,7 +461,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct 
ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; if (!req) return -EINVAL; @@ -502,9 +486,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, return -EBUSY; } - ifsta = &sdata->u.sta; - set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + ifmgd = &sdata->u.mgd; + set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); return 0; } diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 47bb2aed281..5f7a2624ed7 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -88,16 +88,16 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, void ieee80211_chswitch_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, u.sta.chswitch_work); + container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work); struct ieee80211_bss *bss; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (!netif_running(sdata->dev)) return; - bss = ieee80211_rx_bss_get(sdata->local, ifsta->bssid, + bss = ieee80211_rx_bss_get(sdata->local, ifmgd->bssid, sdata->local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (!bss) goto exit; @@ -108,7 +108,7 @@ void ieee80211_chswitch_work(struct work_struct *work) ieee80211_rx_bss_put(sdata->local, bss); exit: - ifsta->flags &= ~IEEE80211_STA_CSA_RECEIVED; + ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; ieee80211_wake_queues_by_reason(&sdata->local->hw, IEEE80211_QUEUE_STOP_REASON_CSA); } @@ -117,9 +117,9 @@ void ieee80211_chswitch_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - 
queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work); + queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work); } void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, @@ -127,14 +127,14 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss) { struct ieee80211_channel *new_ch; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num); /* FIXME: Handle ADHOC later */ if (sdata->vif.type != NL80211_IFTYPE_STATION) return; - if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATED) + if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATED) return; if (sdata->local->sw_scanning || sdata->local->hw_scanning) @@ -143,7 +143,7 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, /* Disregard subsequent beacons if we are already running a timer processing a CSA */ - if (ifsta->flags & IEEE80211_STA_CSA_RECEIVED) + if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED) return; new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); @@ -153,12 +153,12 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->local->csa_channel = new_ch; if (sw_elem->count <= 1) { - queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work); + queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work); } else { ieee80211_stop_queues_by_reason(&sdata->local->hw, IEEE80211_QUEUE_STOP_REASON_CSA); - ifsta->flags |= IEEE80211_STA_CSA_RECEIVED; - mod_timer(&ifsta->chswitch_timer, + ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; + mod_timer(&ifmgd->chswitch_timer, jiffies + msecs_to_jiffies(sw_elem->count * bss->cbss.beacon_interval)); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 634f65c0130..4ba3c540fcf 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -202,6 +202,18 @@ void sta_info_destroy(struct sta_info *sta) /* Make sure 
timer won't free the tid_rx struct, see below */ if (tid_rx) tid_rx->shutdown = true; + + /* + * The stop callback cannot find this station any more, but + * it didn't complete its work -- start the queue if necessary + */ + if (sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_INITIATOR_MSK && + sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_REQ_STOP_BA_MSK && + local->hw.ampdu_queues) + ieee80211_wake_queue_by_reason(&local->hw, + local->hw.queues + sta->tid_to_tx_q[i], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + spin_unlock_bh(&sta->lock); /* @@ -275,8 +287,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, * enable session_timer's data differentiation. refer to * sta_rx_agg_session_timer_expired for useage */ sta->timer_to_tid[i] = i; - /* tid to tx queue: initialize according to HW (0 is valid) */ - sta->tid_to_tx_q[i] = ieee80211_num_queues(&local->hw); + sta->tid_to_tx_q[i] = -1; /* rx */ sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE; sta->ampdu_mlme.tid_rx[i] = NULL; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index d9653231992..1f45573c580 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -90,6 +90,7 @@ struct tid_ampdu_tx { * @buf_size: buffer size for incoming A-MPDUs * @timeout: reset timer value (in TUs). 
* @dialog_token: dialog token for aggregation session + * @shutdown: this session is being shut down due to STA removal */ struct tid_ampdu_rx { struct sk_buff **reorder_buf; @@ -200,7 +201,7 @@ struct sta_ampdu_mlme { * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers - * @tid_to_tx_q: map tid to tx queue + * @tid_to_tx_q: map tid to tx queue (invalid == negative values) * @llid: Local link ID * @plid: Peer link ID * @reason: Cancel reason on PLINK_HOLDING state @@ -275,7 +276,7 @@ struct sta_info { */ struct sta_ampdu_mlme ampdu_mlme; u8 timer_to_tid[STA_TID_NUM]; - u8 tid_to_tx_q[STA_TID_NUM]; + s8 tid_to_tx_q[STA_TID_NUM]; #ifdef CONFIG_MAC80211_MESH /* diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 33926831c64..457238a2f3f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -784,6 +784,8 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) skb_copy_queue_mapping(frag, first); frag->do_not_encrypt = first->do_not_encrypt; + frag->dev = first->dev; + frag->iif = first->iif; pos += copylen; left -= copylen; @@ -876,7 +878,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) return TX_CONTINUE; } - /* actual transmit path */ /* @@ -1016,12 +1017,20 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx, tx->sta = sta_info_get(local, hdr->addr1); if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) { + unsigned long flags; qc = ieee80211_get_qos_ctl(hdr); tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + spin_lock_irqsave(&tx->sta->lock, flags); state = &tx->sta->ampdu_mlme.tid_state_tx[tid]; - if (*state == HT_AGG_STATE_OPERATIONAL) + if (*state == HT_AGG_STATE_OPERATIONAL) { info->flags |= IEEE80211_TX_CTL_AMPDU; + if (local->hw.ampdu_queues) + skb_set_queue_mapping( + skb, tx->local->hw.queues + + tx->sta->tid_to_tx_q[tid]); + } + spin_unlock_irqrestore(&tx->sta->lock, flags); } if (is_multicast_ether_addr(hdr->addr1)) { @@ -1085,7 +1094,8 @@ 
static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb, int ret, i; if (skb) { - if (netif_subqueue_stopped(local->mdev, skb)) + if (ieee80211_queue_stopped(&local->hw, + skb_get_queue_mapping(skb))) return IEEE80211_TX_PENDING; ret = local->ops->tx(local_to_hw(local), skb); @@ -1101,8 +1111,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb, info = IEEE80211_SKB_CB(tx->extra_frag[i]); info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT); - if (netif_subqueue_stopped(local->mdev, - tx->extra_frag[i])) + if (ieee80211_queue_stopped(&local->hw, + skb_get_queue_mapping(tx->extra_frag[i]))) return IEEE80211_TX_FRAG_AGAIN; ret = local->ops->tx(local_to_hw(local), @@ -1625,7 +1635,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, case NL80211_IFTYPE_STATION: fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ - memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN); + memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; @@ -1634,7 +1644,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, sdata->u.sta.bssid, ETH_ALEN); + memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); hdrlen = 24; break; default: @@ -1920,7 +1930,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_if_ap *ap = NULL; - struct ieee80211_if_sta *ifsta = NULL; struct beacon_data *beacon; struct ieee80211_supported_band *sband; enum ieee80211_band band = local->hw.conf.channel->band; @@ -1972,13 +1981,13 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, } else goto out; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + struct ieee80211_if_ibss *ifibss = 
&sdata->u.ibss; struct ieee80211_hdr *hdr; - ifsta = &sdata->u.sta; - if (!ifsta->probe_resp) + if (!ifibss->probe_resp) goto out; - skb = skb_copy(ifsta->probe_resp, GFP_ATOMIC); + skb = skb_copy(ifibss->probe_resp, GFP_ATOMIC); if (!skb) goto out; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 73c7d7345ab..e0431a1d218 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -344,15 +344,36 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - /* we don't need to track ampdu queues */ - if (queue < ieee80211_num_regular_queues(hw)) { - __clear_bit(reason, &local->queue_stop_reasons[queue]); + if (queue >= hw->queues) { + if (local->ampdu_ac_queue[queue - hw->queues] < 0) + return; + + /* + * for virtual aggregation queues, we need to refcount the + * internal mac80211 disable (multiple times!), keep track of + * driver disable _and_ make sure the regular queue is + * actually enabled. + */ + if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION) + local->amdpu_ac_stop_refcnt[queue - hw->queues]--; + else + __clear_bit(reason, &local->queue_stop_reasons[queue]); - if (local->queue_stop_reasons[queue] != 0) - /* someone still has this queue stopped */ + if (local->queue_stop_reasons[queue] || + local->amdpu_ac_stop_refcnt[queue - hw->queues]) return; + + /* now go on to treat the corresponding regular queue */ + queue = local->ampdu_ac_queue[queue - hw->queues]; + reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION; } + __clear_bit(reason, &local->queue_stop_reasons[queue]); + + if (local->queue_stop_reasons[queue] != 0) + /* someone still has this queue stopped */ + return; + if (test_bit(queue, local->queues_pending)) { set_bit(queue, local->queues_pending_run); tasklet_schedule(&local->tx_pending_tasklet); @@ -361,8 +382,8 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, } } -static void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int 
queue, - enum queue_stop_reason reason) +void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; @@ -384,15 +405,33 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - /* we don't need to track ampdu queues */ - if (queue < ieee80211_num_regular_queues(hw)) - __set_bit(reason, &local->queue_stop_reasons[queue]); + if (queue >= hw->queues) { + if (local->ampdu_ac_queue[queue - hw->queues] < 0) + return; + + /* + * for virtual aggregation queues, we need to refcount the + * internal mac80211 disable (multiple times!), keep track of + * driver disable _and_ make sure the regular queue is + * actually enabled. + */ + if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION) + local->amdpu_ac_stop_refcnt[queue - hw->queues]++; + else + __set_bit(reason, &local->queue_stop_reasons[queue]); + + /* now go on to treat the corresponding regular queue */ + queue = local->ampdu_ac_queue[queue - hw->queues]; + reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION; + } + + __set_bit(reason, &local->queue_stop_reasons[queue]); netif_stop_subqueue(local->mdev, queue); } -static void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason) +void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; @@ -418,7 +457,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - for (i = 0; i < ieee80211_num_queues(hw); i++) + for (i = 0; i < hw->queues; i++) __ieee80211_stop_queue(hw, i, reason); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -434,6 +473,16 @@ EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) 
{ struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + + if (queue >= hw->queues) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + queue = local->ampdu_ac_queue[queue - hw->queues]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + if (queue < 0) + return true; + } + return __netif_subqueue_stopped(local->mdev, queue); } EXPORT_SYMBOL(ieee80211_queue_stopped); @@ -701,6 +750,27 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata) local->ops->conf_tx(local_to_hw(local), i, &qparam); } +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates) +{ + struct ieee80211_local *local = sdata->local; + int i, have_higher_than_11mbit = 0; + + /* cf. IEEE 802.11 9.2.12 */ + for (i = 0; i < supp_rates_len; i++) + if ((supp_rates[i] & 0x7f) * 5 > 110) + have_higher_than_11mbit = 1; + + if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && + have_higher_than_11mbit) + sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; + else + sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; + + ieee80211_set_wmm_default(sdata); +} + void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int encrypt) { @@ -767,3 +837,161 @@ u32 ieee80211_mandatory_rates(struct ieee80211_local *local, mandatory_rates |= BIT(i); return mandatory_rates; } + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, + const u8 *bssid, int encrypt) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + const u8 *ie_auth = NULL; + int ie_auth_len = 0; + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + ie_auth_len = sdata->u.mgd.ie_auth_len; + ie_auth = sdata->u.mgd.ie_auth; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*mgmt) + 6 + extra_len + ie_auth_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to 
allocate buffer for auth " + "frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); + memset(mgmt, 0, 24 + 6); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_AUTH); + if (encrypt) + mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + memcpy(mgmt->da, bssid, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); + mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); + mgmt->u.auth.status_code = cpu_to_le16(0); + if (extra) + memcpy(skb_put(skb, extra_len), extra, extra_len); + if (ie_auth) + memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len); + + ieee80211_tx_skb(sdata, skb, encrypt); +} + +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + u8 *ssid, size_t ssid_len, + u8 *ie, size_t ie_len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL; + int i, extra_preq_ie_len = 0; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + extra_preq_ie_len = sdata->u.mgd.ie_probereq_len; + extra_preq_ie = sdata->u.mgd.ie_probereq; + break; + default: + break; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + + ie_len + extra_preq_ie_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for probe " + "request\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + if (dst) { + memcpy(mgmt->da, dst, ETH_ALEN); + memcpy(mgmt->bssid, dst, ETH_ALEN); + 
} else { + memset(mgmt->da, 0xff, ETH_ALEN); + memset(mgmt->bssid, 0xff, ETH_ALEN); + } + pos = skb_put(skb, 2 + ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ssid_len; + memcpy(pos, ssid, ssid_len); + + supp_rates = skb_put(skb, 2); + supp_rates[0] = WLAN_EID_SUPP_RATES; + supp_rates[1] = 0; + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + for (i = 0; i < sband->n_bitrates; i++) { + struct ieee80211_rate *rate = &sband->bitrates[i]; + if (esupp_rates) { + pos = skb_put(skb, 1); + esupp_rates[1]++; + } else if (supp_rates[1] == 8) { + esupp_rates = skb_put(skb, 3); + esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES; + esupp_rates[1] = 1; + pos = &esupp_rates[2]; + } else { + pos = skb_put(skb, 1); + supp_rates[1]++; + } + *pos = rate->bitrate / 5; + } + + if (ie) + memcpy(skb_put(skb, ie_len), ie, ie_len); + if (extra_preq_ie) + memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie, + extra_preq_ie_len); + + ieee80211_tx_skb(sdata, skb, 0); +} + +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_rate *bitrates; + size_t num_rates; + u32 supp_rates; + int i, j; + sband = local->hw.wiphy->bands[band]; + + if (!sband) { + WARN_ON(1); + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + } + + bitrates = sband->bitrates; + num_rates = sband->n_bitrates; + supp_rates = 0; + for (i = 0; i < elems->supp_rates_len + + elems->ext_supp_rates_len; i++) { + u8 rate = 0; + int own_rate; + if (i < elems->supp_rates_len) + rate = elems->supp_rates[i]; + else if (elems->ext_supp_rates) + rate = elems->ext_supp_rates + [i - elems->supp_rates_len]; + own_rate = 5 * (rate & 0x7f); + for (j = 0; j < num_rates; j++) + if (bitrates[j].bitrate == own_rate) + supp_rates |= BIT(j); + } + return supp_rates; +} diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 2b023dce8b2..935c63ed3df 100644 --- a/net/mac80211/wext.c +++ 
b/net/mac80211/wext.c @@ -132,139 +132,37 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev, if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) return -EOPNOTSUPP; - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length); if (ret) return ret; - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + ieee80211_sta_req_auth(sdata); return 0; } return -EOPNOTSUPP; } -static u8 ieee80211_get_wstats_flags(struct ieee80211_local *local) -{ - u8 wstats_flags = 0; - - wstats_flags |= local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC | - IEEE80211_HW_SIGNAL_DBM) ? - IW_QUAL_QUAL_UPDATED : IW_QUAL_QUAL_INVALID; - wstats_flags |= local->hw.flags & IEEE80211_HW_NOISE_DBM ? - IW_QUAL_NOISE_UPDATED : IW_QUAL_NOISE_INVALID; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - wstats_flags |= IW_QUAL_DBM; - - return wstats_flags; -} - -static int ieee80211_ioctl_giwrange(struct net_device *dev, - struct iw_request_info *info, - struct iw_point *data, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct iw_range *range = (struct iw_range *) extra; - enum ieee80211_band band; - int c = 0; - - data->length = sizeof(struct iw_range); - memset(range, 0, sizeof(struct iw_range)); - - range->we_version_compiled = WIRELESS_EXT; - range->we_version_source = 21; - range->retry_capa = IW_RETRY_LIMIT; - range->retry_flags = IW_RETRY_LIMIT; - range->min_retry = 0; - range->max_retry = 255; - range->min_rts = 0; - range->max_rts = 2347; - range->min_frag = 256; - range->max_frag = 2346; - - range->encoding_size[0] = 5; - range->encoding_size[1] = 13; - range->num_encoding_sizes = 2; - range->max_encoding_tokens = NUM_DEFAULT_KEYS; - - /* cfg80211 requires this, and enforces 0..100 */ - if 
(local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - range->max_qual.level = 100; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - range->max_qual.level = -110; - else - range->max_qual.level = 0; - - if (local->hw.flags & IEEE80211_HW_NOISE_DBM) - range->max_qual.noise = -110; - else - range->max_qual.noise = 0; - - range->max_qual.qual = 100; - range->max_qual.updated = ieee80211_get_wstats_flags(local); - - range->avg_qual.qual = 50; - /* not always true but better than nothing */ - range->avg_qual.level = range->max_qual.level / 2; - range->avg_qual.noise = range->max_qual.noise / 2; - range->avg_qual.updated = ieee80211_get_wstats_flags(local); - - range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 | - IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP; - - - for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { - int i; - struct ieee80211_supported_band *sband; - - sband = local->hw.wiphy->bands[band]; - - if (!sband) - continue; - - for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { - struct ieee80211_channel *chan = &sband->channels[i]; - - if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { - range->freq[c].i = - ieee80211_frequency_to_channel( - chan->center_freq); - range->freq[c].m = chan->center_freq; - range->freq[c].e = 6; - c++; - } - } - } - range->num_channels = c; - range->num_frequency = c; - - IW_EVENT_CAPA_SET_KERNEL(range->event_capa); - IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); - IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); - - range->scan_capa |= IW_SCAN_CAPA_ESSID; - - return 0; -} - - static int ieee80211_ioctl_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_ADHOC || - sdata->vif.type == NL80211_IFTYPE_STATION) - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->u.ibss.flags &= 
~IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; /* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */ if (freq->e == 0) { if (freq->m < 0) { - if (sdata->vif.type == NL80211_IFTYPE_ADHOC || - sdata->vif.type == NL80211_IFTYPE_STATION) - sdata->u.sta.flags |= + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->u.ibss.flags |= + IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_CHANNEL_SEL; return 0; } else @@ -301,32 +199,35 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, { struct ieee80211_sub_if_data *sdata; size_t len = data->length; + int ret; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - int ret; + if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { if (len > IEEE80211_MAX_SSID_LEN) return -EINVAL; - memcpy(sdata->u.sta.ssid, ssid, len); - sdata->u.sta.ssid_len = len; + memcpy(sdata->u.mgd.ssid, ssid, len); + sdata->u.mgd.ssid_len = len; return 0; } + if (data->flags) - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; else - sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL; + ret = ieee80211_sta_set_ssid(sdata, ssid, len); if (ret) return ret; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + + ieee80211_sta_req_auth(sdata); return 0; - } + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + return ieee80211_ibss_set_ssid(sdata, ssid, len); return -EOPNOTSUPP; } @@ -340,8 +241,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if 
(sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int res = ieee80211_sta_get_ssid(sdata, ssid, &len); if (res == 0) { data->length = len; @@ -349,6 +249,14 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, } else data->flags = 0; return res; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + int res = ieee80211_ibss_get_ssid(sdata, ssid, &len); + if (res == 0) { + data->length = len; + data->flags = 1; + } else + data->flags = 0; + return res; } return -EOPNOTSUPP; @@ -362,26 +270,35 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int ret; if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { - memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data, + memcpy(sdata->u.mgd.bssid, (u8 *) &ap_addr->sa_data, ETH_ALEN); return 0; } if (is_zero_ether_addr((u8 *) &ap_addr->sa_data)) - sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL | + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL; else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data)) - sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL; else - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data); if (ret) return ret; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + ieee80211_sta_req_auth(sdata); return 0; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (is_zero_ether_addr((u8 *) &ap_addr->sa_data)) + sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL | + IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data)) + 
sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL; + else + sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_BSSID_SEL; + + return ieee80211_ibss_set_bssid(sdata, (u8 *) &ap_addr->sa_data); } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { /* * If it is necessary to update the WDS peer address @@ -410,17 +327,20 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED || - sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATED) { ap_addr->sa_family = ARPHRD_ETHER; - memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); - return 0; - } else { + memcpy(&ap_addr->sa_data, sdata->u.mgd.bssid, ETH_ALEN); + } else memset(&ap_addr->sa_data, 0, ETH_ALEN); - return 0; - } + return 0; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->u.ibss.state == IEEE80211_IBSS_MLME_JOINED) { + ap_addr->sa_family = ARPHRD_ETHER; + memcpy(&ap_addr->sa_data, sdata->u.ibss.bssid, ETH_ALEN); + } else + memset(&ap_addr->sa_data, 0, ETH_ALEN); + return 0; } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { ap_addr->sa_family = ARPHRD_ETHER; memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN); @@ -486,7 +406,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev, rcu_read_lock(); - sta = sta_info_get(local, sdata->u.sta.bssid); + sta = sta_info_get(local, sdata->u.mgd.bssid); if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)) rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate; @@ -687,8 +607,7 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev, struct iw_mlme *mlme = (struct iw_mlme *) extra; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type != NL80211_IFTYPE_STATION && - 
sdata->vif.type != NL80211_IFTYPE_ADHOC) + if (!(sdata->vif.type == NL80211_IFTYPE_STATION)) return -EINVAL; switch (mlme->cmd) { @@ -784,8 +703,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev, erq->flags |= IW_ENCODE_ENABLED; if (sdata->vif.type == NL80211_IFTYPE_STATION) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - switch (ifsta->auth_alg) { + switch (sdata->u.mgd.auth_alg) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: erq->flags |= IW_ENCODE_OPEN; @@ -849,7 +767,7 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev, ret = ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT); - if (!(sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED)) + if (!(sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED)) return ret; if (conf->dynamic_ps_timeout > 0 && @@ -908,10 +826,10 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (data->value & (IW_AUTH_CIPHER_WEP40 | IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP)) - sdata->u.sta.flags |= + sdata->u.mgd.flags |= IEEE80211_STA_TKIP_WEP_USED; else - sdata->u.sta.flags &= + sdata->u.mgd.flags &= ~IEEE80211_STA_TKIP_WEP_USED; } break; @@ -922,21 +840,20 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) ret = -EINVAL; else { - sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; + sdata->u.mgd.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; /* * Privacy invoked by wpa_supplicant, store the * value and allow associating to a protected * network without having a key up front. 
*/ if (data->value) - sdata->u.sta.flags |= + sdata->u.mgd.flags |= IEEE80211_STA_PRIVACY_INVOKED; } break; case IW_AUTH_80211_AUTH_ALG: - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - sdata->u.sta.auth_algs = data->value; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.auth_algs = data->value; else ret = -EOPNOTSUPP; break; @@ -945,17 +862,16 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, ret = -EOPNOTSUPP; break; } - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { switch (data->value) { case IW_AUTH_MFP_DISABLED: - sdata->u.sta.mfp = IEEE80211_MFP_DISABLED; + sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED; break; case IW_AUTH_MFP_OPTIONAL: - sdata->u.sta.mfp = IEEE80211_MFP_OPTIONAL; + sdata->u.mgd.mfp = IEEE80211_MFP_OPTIONAL; break; case IW_AUTH_MFP_REQUIRED: - sdata->u.sta.mfp = IEEE80211_MFP_REQUIRED; + sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED; break; default: ret = -EINVAL; @@ -980,9 +896,9 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev rcu_read_lock(); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - sta = sta_info_get(local, sdata->u.sta.bssid); + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sta = sta_info_get(local, sdata->u.mgd.bssid); + if (!sta) { wstats->discard.fragment = 0; wstats->discard.misc = 0; @@ -991,10 +907,45 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev wstats->qual.noise = 0; wstats->qual.updated = IW_QUAL_ALL_INVALID; } else { - wstats->qual.level = sta->last_signal; - wstats->qual.qual = sta->last_qual; - wstats->qual.noise = sta->last_noise; - wstats->qual.updated = ieee80211_get_wstats_flags(local); + wstats->qual.updated = 0; + /* + * mirror what cfg80211 does for iwrange/scan results, + * otherwise userspace gets confused. 
+ */ + if (local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC | + IEEE80211_HW_SIGNAL_DBM)) { + wstats->qual.updated |= IW_QUAL_LEVEL_UPDATED; + wstats->qual.updated |= IW_QUAL_QUAL_UPDATED; + } else { + wstats->qual.updated |= IW_QUAL_LEVEL_INVALID; + wstats->qual.updated |= IW_QUAL_QUAL_INVALID; + } + + if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + wstats->qual.level = sta->last_signal; + wstats->qual.qual = sta->last_signal; + } else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + int sig = sta->last_signal; + + wstats->qual.updated |= IW_QUAL_DBM; + wstats->qual.level = sig; + if (sig < -110) + sig = -110; + else if (sig > -40) + sig = -40; + wstats->qual.qual = sig + 110; + } + + if (local->hw.flags & IEEE80211_HW_NOISE_DBM) { + /* + * This assumes that if driver reports noise, it also + * reports signal in dBm. + */ + wstats->qual.noise = sta->last_noise; + wstats->qual.updated |= IW_QUAL_NOISE_UPDATED; + } else { + wstats->qual.updated |= IW_QUAL_NOISE_INVALID; + } } rcu_read_unlock(); @@ -1011,9 +962,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev, switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_80211_AUTH_ALG: - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - data->value = sdata->u.sta.auth_algs; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + data->value = sdata->u.mgd.auth_algs; else ret = -EOPNOTSUPP; break; @@ -1116,7 +1066,7 @@ static const iw_handler ieee80211_handler[] = (iw_handler) NULL, /* SIOCSIWSENS */ (iw_handler) NULL, /* SIOCGIWSENS */ (iw_handler) NULL /* not used */, /* SIOCSIWRANGE */ - (iw_handler) ieee80211_ioctl_giwrange, /* SIOCGIWRANGE */ + (iw_handler) cfg80211_wext_giwrange, /* SIOCGIWRANGE */ (iw_handler) NULL /* not used */, /* SIOCSIWPRIV */ (iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */ (iw_handler) NULL /* not used */, /* SIOCSIWSTATS */ diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index ac71b38f7cb..0b8ad1f4ecd 100644 --- 
a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -99,10 +99,13 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb) /* in case we are a client verify acm is not set for this ac */ while (unlikely(local->wmm_acm & BIT(skb->priority))) { if (wme_downgrade_ac(skb)) { - /* The old code would drop the packet in this - * case. + /* + * This should not really happen. The AP has marked all + * lower ACs to require admission control which is not + * a reasonable configuration. Allow the frame to be + * transmitted using AC_BK as a workaround. */ - return 0; + break; } } @@ -114,9 +117,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) { struct ieee80211_master_priv *mpriv = netdev_priv(dev); struct ieee80211_local *local = mpriv->local; - struct ieee80211_hw *hw = &local->hw; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct sta_info *sta; u16 queue; u8 tid; @@ -124,29 +125,11 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) if (unlikely(queue >= local->hw.queues)) queue = local->hw.queues - 1; - if (skb->requeue) { - if (!hw->ampdu_queues) - return queue; - - rcu_read_lock(); - sta = sta_info_get(local, hdr->addr1); - tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; - if (sta) { - int ampdu_queue = sta->tid_to_tx_q[tid]; - - if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) - queue = ampdu_queue; - } - rcu_read_unlock(); - - return queue; - } - - /* Now we know the 1d priority, fill in the QoS header if - * there is one. + /* + * Now we know the 1d priority, fill in the QoS header if + * there is one (and we haven't done this before). 
*/ - if (ieee80211_is_data_qos(hdr->frame_control)) { + if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); u8 ack_policy = 0; tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; @@ -156,140 +139,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) /* qos header is 2 bytes, second reserved */ *p++ = ack_policy | tid; *p = 0; - - if (!hw->ampdu_queues) - return queue; - - rcu_read_lock(); - - sta = sta_info_get(local, hdr->addr1); - if (sta) { - int ampdu_queue = sta->tid_to_tx_q[tid]; - - if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) - queue = ampdu_queue; - } - - rcu_read_unlock(); } return queue; } - -int ieee80211_ht_agg_queue_add(struct ieee80211_local *local, - struct sta_info *sta, u16 tid) -{ - int i; - - /* XXX: currently broken due to cb/requeue use */ - return -EPERM; - - /* prepare the filter and save it for the SW queue - * matching the received HW queue */ - - if (!local->hw.ampdu_queues) - return -EPERM; - - /* try to get a Qdisc from the pool */ - for (i = local->hw.queues; i < ieee80211_num_queues(&local->hw); i++) - if (!test_and_set_bit(i, local->queue_pool)) { - ieee80211_stop_queue(local_to_hw(local), i); - sta->tid_to_tx_q[tid] = i; - - /* IF there are already pending packets - * on this tid first we need to drain them - * on the previous queue - * since HT is strict in order */ -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "allocated aggregation queue" - " %d tid %d addr %pM pool=0x%lX\n", - i, tid, sta->sta.addr, - local->queue_pool[0]); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - return 0; - } - - return -EAGAIN; -} - -/** - * the caller needs to hold netdev_get_tx_queue(local->mdev, X)->lock - */ -void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local, - struct sta_info *sta, u16 tid, - u8 requeue) -{ - int agg_queue = sta->tid_to_tx_q[tid]; - struct ieee80211_hw *hw = &local->hw; - - 
/* return the qdisc to the pool */ - clear_bit(agg_queue, local->queue_pool); - sta->tid_to_tx_q[tid] = ieee80211_num_queues(hw); - - if (requeue) { - ieee80211_requeue(local, agg_queue); - } else { - struct netdev_queue *txq; - spinlock_t *root_lock; - struct Qdisc *q; - - txq = netdev_get_tx_queue(local->mdev, agg_queue); - q = rcu_dereference(txq->qdisc); - root_lock = qdisc_lock(q); - - spin_lock_bh(root_lock); - qdisc_reset(q); - spin_unlock_bh(root_lock); - } -} - -void ieee80211_requeue(struct ieee80211_local *local, int queue) -{ - struct netdev_queue *txq = netdev_get_tx_queue(local->mdev, queue); - struct sk_buff_head list; - spinlock_t *root_lock; - struct Qdisc *qdisc; - u32 len; - - rcu_read_lock_bh(); - - qdisc = rcu_dereference(txq->qdisc); - if (!qdisc || !qdisc->dequeue) - goto out_unlock; - - skb_queue_head_init(&list); - - root_lock = qdisc_root_lock(qdisc); - spin_lock(root_lock); - for (len = qdisc->q.qlen; len > 0; len--) { - struct sk_buff *skb = qdisc->dequeue(qdisc); - - if (skb) - __skb_queue_tail(&list, skb); - } - spin_unlock(root_lock); - - for (len = list.qlen; len > 0; len--) { - struct sk_buff *skb = __skb_dequeue(&list); - u16 new_queue; - - BUG_ON(!skb); - new_queue = ieee80211_select_queue(local->mdev, skb); - skb_set_queue_mapping(skb, new_queue); - - txq = netdev_get_tx_queue(local->mdev, new_queue); - - - qdisc = rcu_dereference(txq->qdisc); - root_lock = qdisc_root_lock(qdisc); - - spin_lock(root_lock); - qdisc_enqueue_root(skb, qdisc); - spin_unlock(root_lock); - } - -out_unlock: - rcu_read_unlock_bh(); -} diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h index bc62f28a4d3..7520d2e014d 100644 --- a/net/mac80211/wme.h +++ b/net/mac80211/wme.h @@ -21,11 +21,5 @@ extern const int ieee802_1d_to_ac[8]; u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb); -int ieee80211_ht_agg_queue_add(struct ieee80211_local *local, - struct sta_info *sta, u16 tid); -void ieee80211_ht_agg_queue_remove(struct ieee80211_local 
*local, - struct sta_info *sta, u16 tid, - u8 requeue); -void ieee80211_requeue(struct ieee80211_local *local, int queue); #endif /* _WME_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 55befe59e1c..dfb447b584d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -728,7 +728,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, NF_CT_ASSERT(skb->nfct); ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); - if (ret < 0) { + if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1b75c9efb0e..7a16bd462f8 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1763,6 +1763,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3, u32 pid, int report) goto out; } + exp->class = 0; exp->expectfn = NULL; exp->flags = 0; exp->master = ct; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7d3944f02ea..e46f3b79adb 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -861,7 +861,7 @@ static int tcp_packet(struct nf_conn *ct, */ if (nf_ct_kill(ct)) return -NF_REPEAT; - return -NF_DROP; + return NF_DROP; } /* Fall through */ case TCP_CONNTRACK_IGNORE: @@ -894,7 +894,7 @@ static int tcp_packet(struct nf_conn *ct, nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); nf_ct_kill(ct); - return -NF_DROP; + return NF_DROP; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 3eae3fca29d..fd326ac27ec 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -39,7 +39,7 @@ #endif #define 
NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE -#define NFULNL_TIMEOUT_DEFAULT HZ /* every second */ +#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ #define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */ @@ -590,8 +590,10 @@ nfulnl_log_packet(u_int8_t pf, qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ - if (qthreshold > li->u.ulog.qthreshold) - qthreshold = li->u.ulog.qthreshold; + if (li->u.ulog.qthreshold) + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + switch (inst->copy_mode) { case NFULNL_COPY_META: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bfcac92d556..509a95621f9 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -843,59 +843,143 @@ static const struct file_operations xt_table_ops = { .release = seq_release_net, }; -static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) +/* + * Traverse state for ip{,6}_{tables,matches} for helping crossing + * the multi-AF mutexes. + */ +struct nf_mttg_trav { + struct list_head *head, *curr; + uint8_t class, nfproto; +}; + +enum { + MTTG_TRAV_INIT, + MTTG_TRAV_NFP_UNSPEC, + MTTG_TRAV_NFP_SPEC, + MTTG_TRAV_DONE, +}; + +static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, + bool is_target) { - struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; - u_int16_t af = (unsigned long)pde->data; + static const uint8_t next_class[] = { + [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, + [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, + }; + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_INIT: + trav->class = MTTG_TRAV_NFP_UNSPEC; + mutex_lock(&xt[NFPROTO_UNSPEC].mutex); + trav->head = trav->curr = is_target ? 
+ &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; + break; + case MTTG_TRAV_NFP_UNSPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + mutex_lock(&xt[trav->nfproto].mutex); + trav->head = trav->curr = is_target ? + &xt[trav->nfproto].target : &xt[trav->nfproto].match; + trav->class = next_class[trav->class]; + break; + case MTTG_TRAV_NFP_SPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + /* fallthru, _stop will unlock */ + default: + return NULL; + } - mutex_lock(&xt[af].mutex); - return seq_list_start(&xt[af].match, *pos); + if (ppos != NULL) + ++*ppos; + return trav; } -static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, + bool is_target) { - struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; - u_int16_t af = (unsigned long)pde->data; + struct nf_mttg_trav *trav = seq->private; + unsigned int j; - return seq_list_next(v, &xt[af].match, pos); + trav->class = MTTG_TRAV_INIT; + for (j = 0; j < *pos; ++j) + if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) + return NULL; + return trav; } -static void xt_match_seq_stop(struct seq_file *seq, void *v) +static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { - struct proc_dir_entry *pde = seq->private; - u_int16_t af = (unsigned long)pde->data; + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + break; + case MTTG_TRAV_NFP_SPEC: + mutex_unlock(&xt[trav->nfproto].mutex); + break; + } +} - mutex_unlock(&xt[af].mutex); +static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, false); } -static int xt_match_seq_show(struct seq_file *seq, void *v) +static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct xt_match *match = 
list_entry(v, struct xt_match, list); + return xt_mttg_seq_next(seq, v, ppos, false); +} - if (strlen(match->name)) - return seq_printf(seq, "%s\n", match->name); - else - return 0; +static int xt_match_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_match *match; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + match = list_entry(trav->curr, struct xt_match, list); + return (*match->name == '\0') ? 0 : + seq_printf(seq, "%s\n", match->name); + } + return 0; } static const struct seq_operations xt_match_seq_ops = { .start = xt_match_seq_start, .next = xt_match_seq_next, - .stop = xt_match_seq_stop, + .stop = xt_mttg_seq_stop, .show = xt_match_seq_show, }; static int xt_match_open(struct inode *inode, struct file *file) { + struct seq_file *seq; + struct nf_mttg_trav *trav; int ret; - ret = seq_open(file, &xt_match_seq_ops); - if (!ret) { - struct seq_file *seq = file->private_data; + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; - seq->private = PDE(inode); + ret = seq_open(file, &xt_match_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; } - return ret; + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; } static const struct file_operations xt_match_ops = { @@ -903,62 +987,63 @@ static const struct file_operations xt_match_ops = { .open = xt_match_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) { - struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; - u_int16_t af = (unsigned long)pde->data; - - mutex_lock(&xt[af].mutex); - return seq_list_start(&xt[af].target, *pos); + return xt_mttg_seq_start(seq, pos, true); } -static void *xt_target_seq_next(struct seq_file *seq, void *v, 
loff_t *pos) +static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; - u_int16_t af = (unsigned long)pde->data; - - return seq_list_next(v, &xt[af].target, pos); -} - -static void xt_target_seq_stop(struct seq_file *seq, void *v) -{ - struct proc_dir_entry *pde = seq->private; - u_int16_t af = (unsigned long)pde->data; - - mutex_unlock(&xt[af].mutex); + return xt_mttg_seq_next(seq, v, ppos, true); } static int xt_target_seq_show(struct seq_file *seq, void *v) { - struct xt_target *target = list_entry(v, struct xt_target, list); - - if (strlen(target->name)) - return seq_printf(seq, "%s\n", target->name); - else - return 0; + const struct nf_mttg_trav *trav = seq->private; + const struct xt_target *target; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + target = list_entry(trav->curr, struct xt_target, list); + return (*target->name == '\0') ? 
0 : + seq_printf(seq, "%s\n", target->name); + } + return 0; } static const struct seq_operations xt_target_seq_ops = { .start = xt_target_seq_start, .next = xt_target_seq_next, - .stop = xt_target_seq_stop, + .stop = xt_mttg_seq_stop, .show = xt_target_seq_show, }; static int xt_target_open(struct inode *inode, struct file *file) { + struct seq_file *seq; + struct nf_mttg_trav *trav; int ret; - ret = seq_open(file, &xt_target_seq_ops); - if (!ret) { - struct seq_file *seq = file->private_data; + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; - seq->private = PDE(inode); + ret = seq_open(file, &xt_target_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; } - return ret; + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; } static const struct file_operations xt_target_ops = { @@ -966,7 +1051,7 @@ static const struct file_operations xt_target_ops = { .open = xt_target_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #define FORMAT_TABLES "_tables_names" diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index fe80b614a40..791e030ea90 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -542,7 +542,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; - union nf_inet_addr addr; + union nf_inet_addr addr = {}; u_int16_t family; bool add, succ; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5b33879c642..b73d4e61c5a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -85,6 +85,7 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 +#define NETLINK_BROADCAST_SEND_ERROR 0x4 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -995,12 +996,15 @@ static inline int 
do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); - p->delivery_failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; @@ -1045,10 +1049,9 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, netlink_unlock_table(); - if (info.skb2) - kfree_skb(info.skb2); + kfree_skb(info.skb2); - if (info.delivery_failure || info.failure) + if (info.delivery_failure) return -ENOBUFS; if (info.delivered) { @@ -1088,6 +1091,13 @@ out: return 0; } +/** + * netlink_set_err - report error to broadcast listeners + * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() + * @pid: the PID of a process that we want to skip (if any) + * @group: the broadcast group that will notice the error + * @code: error code, must be negative (as usual in kernelspace) + */ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) { struct netlink_set_err_data info; @@ -1097,7 +1107,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) info.exclude_sk = ssk; info.pid = pid; info.group = group; - info.code = code; + /* sk->sk_err wants a positive error value */ + info.code = -code; read_lock(&nl_table_lock); @@ -1164,6 +1175,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, err = 0; break; } + case NETLINK_BROADCAST_ERROR: + if (val) + nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + else + nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1196,6 +1214,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case
NETLINK_BROADCAST_ERROR: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1522,8 +1550,7 @@ EXPORT_SYMBOL(netlink_set_nonroot); static void netlink_destroy_callback(struct netlink_callback *cb) { - if (cb->skb) - kfree_skb(cb->skb); + kfree_skb(cb->skb); kfree(cb); } @@ -1740,12 +1767,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, exclude_pid = pid; } - /* errors reported via destination sk->sk_err */ - nlmsg_multicast(sk, skb, exclude_pid, group, flags); + /* errors reported via destination sk->sk_err, but propagate + * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ + err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); } - if (report) - err = nlmsg_unicast(sk, skb, pid); + if (report) { + int err2; + + err2 = nlmsg_unicast(sk, skb, pid); + if (!err || err == -ESRCH) + err = err2; + } return err; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index cba7849de98..6d9c58ec56a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1037,6 +1037,10 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, unsigned char *asmptr; int size; + /* Netrom empty data frame has no meaning : don't send */ + if (len == 0) + return 0; + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; @@ -1167,6 +1171,11 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); copied = skb->len; + /* NetRom empty data frame has no meaning : ignore it */ + if (copied == 0) { + goto out; + } + if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; @@ -1182,7 +1191,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = sizeof(*sax); - skb_free_datagram(sk, skb); +out: skb_free_datagram(sk, skb); release_sock(sk); 
return copied; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1fc4a7885c4..74776de523e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -584,7 +584,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + consume_skb(skb); return 0; } @@ -756,8 +756,7 @@ ring_is_full: spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk, 0); - if (copy_skb) - kfree_skb(copy_skb); + kfree_skb(copy_skb); goto drop_n_restore; } diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 81795ea8779..a662e62a99c 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -382,9 +382,8 @@ out: return NET_RX_DROP; } -static struct packet_type phonet_packet_type = { +static struct packet_type phonet_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_PHONET), - .dev = NULL, .func = phonet_rcv, }; diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 1ceea1f9241..cec4e595168 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); diff --git a/net/rds/Kconfig b/net/rds/Kconfig new file mode 100644 index 00000000000..796773b5df9 --- /dev/null +++ b/net/rds/Kconfig @@ -0,0 +1,14 @@ + +config RDS + tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)" + depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL + depends on INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + RDS provides reliable, sequenced delivery of datagrams + over Infiniband. 
+ +config RDS_DEBUG + bool "Debugging messages" + depends on RDS + default n + diff --git a/net/rds/Makefile b/net/rds/Makefile new file mode 100644 index 00000000000..51f27585fa0 --- /dev/null +++ b/net/rds/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_RDS) += rds.o +rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ + recv.o send.o stats.o sysctl.o threads.o transport.o \ + loop.o page.o rdma.o \ + rdma_transport.o \ + ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ + ib_sysctl.o ib_rdma.o \ + iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ + iw_sysctl.o iw_rdma.o + +ifeq ($(CONFIG_RDS_DEBUG), y) +EXTRA_CFLAGS += -DDEBUG +endif + diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c new file mode 100644 index 00000000000..20cf16fc572 --- /dev/null +++ b/net/rds/af_rds.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/poll.h> +#include <linux/version.h> +#include <net/sock.h> + +#include "rds.h" +#include "rdma.h" +#include "rdma_transport.h" + +/* this is just used for stats gathering :/ */ +static DEFINE_SPINLOCK(rds_sock_lock); +static unsigned long rds_sock_count; +static LIST_HEAD(rds_sock_list); +DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); + +/* + * This is called as the final descriptor referencing this socket is closed. + * We have to unbind the socket so that another socket can be bound to the + * address it was using. + * + * We have to be careful about racing with the incoming path. sock_orphan() + * sets SOCK_DEAD and we use that as an indicator to the rx path that new + * messages shouldn't be queued. + */ +static int rds_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs; + unsigned long flags; + + if (sk == NULL) + goto out; + + rs = rds_sk_to_rs(sk); + + sock_orphan(sk); + /* Note - rds_clear_recv_queue grabs rs_recv_lock, so + * that ensures the recv path has completed messing + * with the socket. */ + rds_clear_recv_queue(rs); + rds_cong_remove_socket(rs); + rds_remove_bound(rs); + rds_send_drop_to(rs, NULL); + rds_rdma_drop_keys(rs); + rds_notify_queue_get(rs, NULL); + + spin_lock_irqsave(&rds_sock_lock, flags); + list_del_init(&rs->rs_item); + rds_sock_count--; + spin_unlock_irqrestore(&rds_sock_lock, flags); + + sock->sk = NULL; + sock_put(sk); +out: + return 0; +} + +/* + * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. + * _bh() isn't OK here, we're called from interrupt handlers. 
It's probably OK + * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but + * this seems more conservative. + * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. + */ +void rds_wake_sk_sleep(struct rds_sock *rs) +{ + unsigned long flags; + + read_lock_irqsave(&rs->rs_recv_lock, flags); + __rds_wake_sk_sleep(rds_rs_to_sk(rs)); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +static int rds_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* racey, don't care */ + if (peer) { + if (!rs->rs_conn_addr) + return -ENOTCONN; + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + } else { + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + } + + sin->sin_family = AF_INET; + + *uaddr_len = sizeof(*sin); + return 0; +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as POLLIN and POLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * POLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion). + * + * POLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). 
+ */ +static unsigned int rds_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, sk->sk_sleep, wait); + + poll_wait(file, &rds_poll_waitq, wait); + + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!rs->rs_cong_monitor) { + /* When a congestion map was updated, we signal POLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. */ + if (rds_cong_updated_since(&rs->rs_cong_track)) + mask |= (POLLIN | POLLRDNORM | POLLWRBAND); + } else { + spin_lock(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (POLLIN | POLLRDNORM); + spin_unlock(&rs->rs_lock); + } + if (!list_empty(&rs->rs_recv_queue) + || !list_empty(&rs->rs_notify_queue)) + mask |= (POLLIN | POLLRDNORM); + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) + mask |= (POLLOUT | POLLWRNORM); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + + return mask; +} + +static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, + int len) +{ + struct sockaddr_in sin; + int ret = 0; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&sin, optval, sizeof(sin))) { + ret = -EFAULT; + goto out; + } + + rds_send_drop_to(rs, &sin); +out: + return ret; +} + +static int rds_set_bool_option(unsigned char *optvar, char __user *optval, + int optlen) +{ + int value; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(value, (int __user *) optval)) + return -EFAULT; + *optvar = !!value; + return 0; +} + +static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int ret; + + ret = 
rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rds_cong_add_socket(rs); + } else { + rds_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return ret; +} + +static int rds_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret; + + if (level != SOL_RDS) { + ret = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case RDS_CANCEL_SENT_TO: + ret = rds_cancel_sent_to(rs, optval, optlen); + break; + case RDS_GET_MR: + ret = rds_get_mr(rs, optval, optlen); + break; + case RDS_FREE_MR: + ret = rds_free_mr(rs, optval, optlen); + break; + case RDS_RECVERR: + ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); + break; + case RDS_CONG_MONITOR: + ret = rds_cong_monitor(rs, optval, optlen); + break; + default: + ret = -ENOPROTOOPT; + } +out: + return ret; +} + +static int rds_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret = -ENOPROTOOPT, len; + + if (level != SOL_RDS) + goto out; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + switch (optname) { + case RDS_INFO_FIRST ... 
RDS_INFO_LAST: + ret = rds_info_getsockopt(sock, optname, optval, + optlen); + break; + + case RDS_RECVERR: + if (len < sizeof(int)) + ret = -EINVAL; + else + if (put_user(rs->rs_recverr, (int __user *) optval) + || put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + default: + break; + } + +out: + return ret; + +} + +static int rds_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + +out: + release_sock(sk); + return ret; +} + +static struct proto rds_proto = { + .name = "RDS", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rds_sock), +}; + +static struct proto_ops rds_proto_ops = { + .family = AF_RDS, + .owner = THIS_MODULE, + .release = rds_release, + .bind = rds_bind, + .connect = rds_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = rds_getname, + .poll = rds_poll, + .ioctl = rds_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rds_setsockopt, + .getsockopt = rds_getsockopt, + .sendmsg = rds_sendmsg, + .recvmsg = rds_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int __rds_create(struct socket *sock, struct sock *sk, int protocol) +{ + unsigned long flags; + struct rds_sock *rs; + + sock_init_data(sock, sk); + sock->ops = &rds_proto_ops; + sk->sk_protocol = protocol; + + rs = rds_sk_to_rs(sk); + spin_lock_init(&rs->rs_lock); + rwlock_init(&rs->rs_recv_lock); + INIT_LIST_HEAD(&rs->rs_send_queue); + 
INIT_LIST_HEAD(&rs->rs_recv_queue); + INIT_LIST_HEAD(&rs->rs_notify_queue); + INIT_LIST_HEAD(&rs->rs_cong_list); + spin_lock_init(&rs->rs_rdma_lock); + rs->rs_rdma_keys = RB_ROOT; + + spin_lock_irqsave(&rds_sock_lock, flags); + list_add_tail(&rs->rs_item, &rds_sock_list); + rds_sock_count++; + spin_unlock_irqrestore(&rds_sock_lock, flags); + + return 0; +} + +static int rds_create(struct net *net, struct socket *sock, int protocol) +{ + struct sock *sk; + + if (sock->type != SOCK_SEQPACKET || protocol) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + if (!sk) + return -ENOMEM; + + return __rds_create(sock, sk, protocol); +} + +void rds_sock_addref(struct rds_sock *rs) +{ + sock_hold(rds_rs_to_sk(rs)); +} + +void rds_sock_put(struct rds_sock *rs) +{ + sock_put(rds_rs_to_sk(rs)); +} + +static struct net_proto_family rds_family_ops = { + .family = AF_RDS, + .create = rds_create, + .owner = THIS_MODULE, +}; + +static void rds_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct sock *sk; + struct rds_incoming *inc; + unsigned long flags; + unsigned int total = 0; + + len /= sizeof(struct rds_info_message); + + spin_lock_irqsave(&rds_sock_lock, flags); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sk = rds_rs_to_sk(rs); + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. 
*/ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_irqrestore(&rds_sock_lock, flags); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_socket sinfo; + struct rds_sock *rs; + unsigned long flags; + + len /= sizeof(struct rds_info_socket); + + spin_lock_irqsave(&rds_sock_lock, flags); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo.sndbuf = rds_sk_sndbuf(rs); + sinfo.rcvbuf = rds_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo, sizeof(sinfo)); + } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds_info_socket); + + spin_unlock_irqrestore(&rds_sock_lock, flags); +} + +static void __exit rds_exit(void) +{ + rds_rdma_exit(); + sock_unregister(rds_family_ops.family); + proto_unregister(&rds_proto); + rds_conn_exit(); + rds_cong_exit(); + rds_sysctl_exit(); + rds_threads_exit(); + rds_stats_exit(); + rds_page_exit(); + rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +} +module_exit(rds_exit); + +static int __init rds_init(void) +{ + int ret; + + ret = rds_conn_init(); + if (ret) + goto out; + ret = rds_threads_init(); + if (ret) + goto out_conn; + ret = rds_sysctl_init(); + if (ret) + goto out_threads; + ret = rds_stats_init(); + if (ret) + goto out_sysctl; + ret = proto_register(&rds_proto, 1); + if (ret) + goto out_stats; + ret = sock_register(&rds_family_ops); + if 
(ret) + goto out_proto; + + rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + + /* ib/iwarp transports currently compiled-in */ + ret = rds_rdma_init(); + if (ret) + goto out_sock; + goto out; + +out_sock: + sock_unregister(rds_family_ops.family); +out_proto: + proto_unregister(&rds_proto); +out_stats: + rds_stats_exit(); +out_sysctl: + rds_sysctl_exit(); +out_threads: + rds_threads_exit(); +out_conn: + rds_conn_exit(); + rds_cong_exit(); + rds_page_exit(); +out: + return ret; +} +module_init(rds_init); + +#define DRV_VERSION "4.0" +#define DRV_RELDATE "Feb 12, 2009" + +MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); +MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" + " v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NETPROTO(PF_RDS); diff --git a/net/rds/bind.c b/net/rds/bind.c new file mode 100644 index 00000000000..c17cc39160c --- /dev/null +++ b/net/rds/bind.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <net/sock.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include "rds.h" + +/* + * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't + * particularly zippy. + * + * This is now called for every incoming frame so we arguably care much more + * about it than we used to. + */ +static DEFINE_SPINLOCK(rds_bind_lock); +static struct rb_root rds_bind_tree = RB_ROOT; + +static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, + struct rds_sock *insert) +{ + struct rb_node **p = &rds_bind_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_sock *rs; + u64 cmp; + u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); + + while (*p) { + parent = *p; + rs = rb_entry(parent, struct rds_sock, rs_bound_node); + + cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | + be16_to_cpu(rs->rs_bound_port); + + if (needle < cmp) + p = &(*p)->rb_left; + else if (needle > cmp) + p = &(*p)->rb_right; + else + return rs; + } + + if (insert) { + rb_link_node(&insert->rs_bound_node, parent, p); + rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + } + return NULL; +} + +/* + * Return the rds_sock bound at the given local address. + * + * The rx path can race with rds_release. We notice if rds_release() has + * marked this socket and don't return a rs ref to the rx path. 
+ */ +struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +{ + struct rds_sock *rs; + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + rs = rds_bind_tree_walk(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) + rds_sock_addref(rs); + else + rs = NULL; + spin_unlock_irqrestore(&rds_bind_lock, flags); + + rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, + ntohs(port)); + return rs; +} + +/* returns -ve errno or +ve port */ +static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +{ + unsigned long flags; + int ret = -EADDRINUSE; + u16 rover, last; + + if (*port != 0) { + rover = be16_to_cpu(*port); + last = rover; + } else { + rover = max_t(u16, net_random(), 2); + last = rover - 1; + } + + spin_lock_irqsave(&rds_bind_lock, flags); + + do { + if (rover == 0) + rover++; + if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { + *port = cpu_to_be16(rover); + ret = 0; + break; + } + } while (rover++ != last); + + if (ret == 0) { + rs->rs_bound_addr = addr; + rs->rs_bound_port = *port; + rds_sock_addref(rs); + + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); + + return ret; +} + +void rds_remove_bound(struct rds_sock *rs) +{ + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + + if (rs->rs_bound_addr) { + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); + + rb_erase(&rs->rs_bound_node, &rds_bind_tree); + rds_sock_put(rs); + rs->rs_bound_addr = 0; + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); +} + +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct rds_transport *trans; + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in) || + sin->sin_family != AF_INET || 
+ rs->rs_bound_addr || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EINVAL; + goto out; + } + + ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + if (ret) + goto out; + + trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + if (trans == NULL) { + ret = -EADDRNOTAVAIL; + rds_remove_bound(rs); + goto out; + } + + rs->rs_transport = trans; + ret = 0; + +out: + release_sock(sk); + return ret; +} diff --git a/net/rds/cong.c b/net/rds/cong.c new file mode 100644 index 00000000000..710e4599d76 --- /dev/null +++ b/net/rds/cong.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/types.h> +#include <linux/rbtree.h> + +#include <asm-generic/bitops/le.h> + +#include "rds.h" + +/* + * This file implements the receive side of the unconventional congestion + * management in RDS. + * + * Messages waiting in the receive queue on the receiving socket are accounted + * against the socket's SO_RCVBUF option value. Only the payload bytes in the + * message are accounted for. If the number of bytes queued equals or exceeds + * rcvbuf then the socket is congested. All sends attempted to this socket's + * address should block or return -EWOULDBLOCK. + * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rds_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. + */ +static atomic_t rds_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static LIST_HEAD(rds_cong_monitor); +static DEFINE_RWLOCK(rds_cong_monitor_lock); + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. + */ +static DEFINE_SPINLOCK(rds_cong_lock); +static struct rb_root rds_cong_tree = RB_ROOT; + +static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, + struct rds_cong_map *insert) +{ + struct rb_node **p = &rds_cong_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_cong_map *map; + + while (*p) { + parent = *p; + map = rb_entry(parent, struct rds_cong_map, m_rb_node); + + if (addr < map->m_addr) + p = &(*p)->rb_left; + else if (addr > map->m_addr) + p = &(*p)->rb_right; + else + return map; + } + + if (insert) { + rb_link_node(&insert->m_rb_node, parent, p); + rb_insert_color(&insert->m_rb_node, &rds_cong_tree); + } + return NULL; +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. 
+ */ +static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +{ + struct rds_cong_map *map; + struct rds_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + unsigned long flags; + + map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + if (map == NULL) + return NULL; + + map->m_addr = addr; + init_waitqueue_head(&map->m_waitq); + INIT_LIST_HEAD(&map->m_conn_list); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + zp = get_zeroed_page(GFP_KERNEL); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + spin_lock_irqsave(&rds_cong_lock, flags); + ret = rds_cong_tree_walk(addr, map); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (ret == NULL) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } + + rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + + return ret; +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rds_conn_lock, sadly. 
+ */ +void rds_cong_add_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_remove_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_del_init(&conn->c_map_item); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +int rds_cong_get_maps(struct rds_connection *conn) +{ + conn->c_lcong = rds_cong_from_addr(conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + + if (conn->c_lcong == NULL || conn->c_fcong == NULL) + return -ENOMEM; + + return 0; +} + +void rds_cong_queue_updates(struct rds_cong_map *map) +{ + struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); + + list_for_each_entry(conn, &map->m_conn_list, c_map_item) { + if (!test_and_set_bit(0, &conn->c_map_queued)) { + rds_stats_inc(s_cong_update_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + } + } + + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) +{ + rdsdebug("waking map %p for %pI4\n", + map, &map->m_addr); + rds_stats_inc(s_cong_update_received); + atomic_inc(&rds_cong_generation); + if (waitqueue_active(&map->m_waitq)) + wake_up(&map->m_waitq); + if (waitqueue_active(&rds_poll_waitq)) + wake_up_all(&rds_poll_waitq); + + if (portmask && !list_empty(&rds_cong_monitor)) { + unsigned long flags; + struct rds_sock *rs; + + read_lock_irqsave(&rds_cong_monitor_lock, flags); + list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { + spin_lock(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + spin_unlock(&rs->rs_lock); + if 
(rs->rs_cong_notify) + rds_wake_sk_sleep(rs); + } + read_unlock_irqrestore(&rds_cong_monitor_lock, flags); + } +} + +int rds_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_read(&rds_cong_generation); + + if (likely(*recent == gen)) + return 0; + *recent = gen; + return 1; +} + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. + */ +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("setting congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + generic___set_le_bit(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("clearing congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); +} + +static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_add_socket(struct rds_sock *rs) +{ + unsigned long flags; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + if (list_empty(&rs->rs_cong_list)) + list_add(&rs->rs_cong_list, &rds_cong_monitor); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); +} + +void rds_cong_remove_socket(struct 
rds_sock *rs) +{ + unsigned long flags; + struct rds_cong_map *map; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + list_del_init(&rs->rs_cong_list); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); + + /* update congestion map for now-closed port */ + spin_lock_irqsave(&rds_cong_lock, flags); + map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { + rds_cong_clear_bit(map, rs->rs_bound_port); + rds_cong_queue_updates(map); + } +} + +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, + struct rds_sock *rs) +{ + if (!rds_cong_test_bit(map, port)) + return 0; + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + unsigned long flags; + + /* It would have been nice to have an atomic set_bit on + * a uint64_t. */ + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); + spin_unlock_irqrestore(&rs->rs_lock, flags); + + /* Test again - a congestion update may have arrived in + * the meantime. */ + if (!rds_cong_test_bit(map, port)) + return 0; + } + rds_stats_inc(s_cong_send_error); + return -ENOBUFS; + } + + rds_stats_inc(s_cong_send_blocked); + rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); + + return wait_event_interruptible(map->m_waitq, + !rds_cong_test_bit(map, port)); +} + +void rds_cong_exit(void) +{ + struct rb_node *node; + struct rds_cong_map *map; + unsigned long i; + + while ((node = rb_first(&rds_cong_tree))) { + map = rb_entry(node, struct rds_cong_map, m_rb_node); + rdsdebug("freeing map %p\n", map); + rb_erase(&map->m_rb_node, &rds_cong_tree); + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } +} + +/* + * Allocate a RDS message containing a congestion update. 
+ */ +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) +{ + struct rds_cong_map *map = conn->c_lcong; + struct rds_message *rm; + + rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; + + return rm; +} diff --git a/net/rds/connection.c b/net/rds/connection.c new file mode 100644 index 00000000000..273f064930a --- /dev/null +++ b/net/rds/connection.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/list.h> +#include <net/inet_hashtables.h> + +#include "rds.h" +#include "loop.h" +#include "rdma.h" + +#define RDS_CONNECTION_HASH_BITS 12 +#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) +#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) + +/* converting this to RCU is a chore for another day.. */ +static DEFINE_SPINLOCK(rds_conn_lock); +static unsigned long rds_conn_count; +static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; +static struct kmem_cache *rds_conn_slab; + +static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +{ + /* Pass NULL, don't need struct net for hash */ + unsigned long hash = inet_ehashfn(NULL, + be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0); + return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; +} + +#define rds_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +static inline int rds_conn_is_sending(struct rds_connection *conn) +{ + int ret = 0; + + if (!mutex_trylock(&conn->c_send_lock)) + ret = 1; + else + mutex_unlock(&conn->c_send_lock); + + return ret; +} + +static struct rds_connection *rds_conn_lookup(struct hlist_head *head, + __be32 laddr, __be32 faddr, + struct rds_transport *trans) +{ + struct rds_connection *conn, *ret = NULL; + struct hlist_node *pos; + + hlist_for_each_entry(conn, pos, head, c_hash_node) { + if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_trans == trans) { + ret = conn; + break; + } + } + rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, + &laddr, &faddr); + return ret; +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. It is up to + * the transport to have serialized this call with its send and recv. 
+ */ +void rds_conn_reset(struct rds_connection *conn) +{ + rdsdebug("connection %pI4 to %pI4 reset\n", + &conn->c_laddr, &conn->c_faddr); + + rds_stats_inc(s_conn_reset); + rds_send_reset(conn); + conn->c_flags = 0; + + /* Do not clear next_rx_seq here, else we cannot distinguish + * retransmitted packets from new packets, and will hand all + * of them to the application. That is not consistent with the + * reliability guarantees of RDS. */ +} + +/* + * There is only every one 'conn' for a given pair of addresses in the + * system at a time. They contain messages to be retransmitted and so + * span the lifetime of the actual underlying transport connections. + * + * For now they are not garbage collected once they're created. They + * are torn down as the module is removed, if ever. + */ +static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp, + int is_outgoing) +{ + struct rds_connection *conn, *tmp, *parent = NULL; + struct hlist_head *head = rds_conn_bucket(laddr, faddr); + unsigned long flags; + int ret; + + spin_lock_irqsave(&rds_conn_lock, flags); + conn = rds_conn_lookup(head, laddr, faddr, trans); + if (conn + && conn->c_loopback + && conn->c_trans != &rds_loop_transport + && !is_outgoing) { + /* This is a looped back IB connection, and we're + * called by the code handling the incoming connect. + * We need a second connection object into which we + * can stick the other QP. 
*/ + parent = conn; + conn = parent->c_passive; + } + spin_unlock_irqrestore(&rds_conn_lock, flags); + if (conn) + goto out; + + conn = kmem_cache_alloc(rds_conn_slab, gfp); + if (conn == NULL) { + conn = ERR_PTR(-ENOMEM); + goto out; + } + + memset(conn, 0, sizeof(*conn)); + + INIT_HLIST_NODE(&conn->c_hash_node); + conn->c_version = RDS_PROTOCOL_3_0; + conn->c_laddr = laddr; + conn->c_faddr = faddr; + spin_lock_init(&conn->c_lock); + conn->c_next_tx_seq = 1; + + mutex_init(&conn->c_send_lock); + INIT_LIST_HEAD(&conn->c_send_queue); + INIT_LIST_HEAD(&conn->c_retrans); + + ret = rds_cong_get_maps(conn); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + /* + * This is where a connection becomes loopback. If *any* RDS sockets + * can bind to the destination address then we'd rather the messages + * flow through loopback rather than either transport. + */ + if (rds_trans_get_preferred(faddr)) { + conn->c_loopback = 1; + if (is_outgoing && trans->t_prefer_loopback) { + /* "outgoing" connection - and the transport + * says it wants the connection handled by the + * loopback transport. This is what TCP does. + */ + trans = &rds_loop_transport; + } + } + + conn->c_trans = trans; + + ret = trans->conn_alloc(conn, gfp); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); + INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); + INIT_WORK(&conn->c_down_w, rds_shutdown_worker); + mutex_init(&conn->c_cm_lock); + conn->c_flags = 0; + + rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", + conn, &laddr, &faddr, + trans->t_name ? trans->t_name : "[unknown]", + is_outgoing ? 
"(outgoing)" : ""); + + spin_lock_irqsave(&rds_conn_lock, flags); + if (parent == NULL) { + tmp = rds_conn_lookup(head, laddr, faddr, trans); + if (tmp == NULL) + hlist_add_head(&conn->c_hash_node, head); + } else { + tmp = parent->c_passive; + if (!tmp) + parent->c_passive = conn; + } + + if (tmp) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = tmp; + } else { + rds_cong_add_conn(conn); + rds_conn_count++; + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); + +out: + return conn; +} + +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 0); +} + +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 1); +} + +void rds_conn_destroy(struct rds_connection *conn) +{ + struct rds_message *rm, *rtmp; + + rdsdebug("freeing conn %p for %pI4 -> " + "%pI4\n", conn, &conn->c_laddr, + &conn->c_faddr); + + hlist_del_init(&conn->c_hash_node); + + /* wait for the rds thread to shut it down */ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + cancel_delayed_work(&conn->c_conn_w); + queue_work(rds_wq, &conn->c_down_w); + flush_workqueue(rds_wq); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &conn->c_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (conn->c_xmit_rm) + rds_message_put(conn->c_xmit_rm); + + conn->c_trans->conn_free(conn->c_transport_data); + + /* + * The congestion maps aren't freed up here. They're + * freed by rds_cong_exit() after all the connections + * have been freed. 
+ */ + rds_cong_remove_conn(conn); + + BUG_ON(!list_empty(&conn->c_retrans)); + kmem_cache_free(rds_conn_slab, conn); + + rds_conn_count--; +} + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct list_head *list; + struct rds_connection *conn; + struct rds_message *rm; + unsigned long flags; + unsigned int total = 0; + size_t i; + + len /= sizeof(struct rds_info_message); + + spin_lock_irqsave(&rds_conn_lock, flags); + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry(conn, pos, head, c_hash_node) { + if (want_send) + list = &conn->c_send_queue; + else + list = &conn->c_retrans; + + spin_lock(&conn->c_lock); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, iter, + conn->c_laddr, + conn->c_faddr, 0); + } + + spin_unlock(&conn->c_lock); + } + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 1); +} + +static void rds_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 0); +} + +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len) +{ + uint64_t buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct hlist_node *pos; + struct hlist_node *tmp; + struct rds_connection *conn; + unsigned 
long flags; + size_t i; + + spin_lock_irqsave(&rds_conn_lock, flags); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + + /* XXX no c_lock usage.. */ + if (!visitor(conn, buffer)) + continue; + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); +} + +static int rds_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = conn->c_next_tx_seq; + cinfo->next_rx_seq = conn->c_next_rx_seq; + cinfo->laddr = conn->c_laddr; + cinfo->faddr = conn->c_faddr; + strncpy(cinfo->transport, conn->c_trans->t_name, + sizeof(cinfo->transport)); + cinfo->flags = 0; + + rds_conn_info_set(cinfo->flags, + rds_conn_is_sending(conn), SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_UP, + CONNECTED); + return 1; +} + +static void rds_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_conn_info_visitor, + sizeof(struct rds_info_connection)); +} + +int __init rds_conn_init(void) +{ + rds_conn_slab = kmem_cache_create("rds_connection", + sizeof(struct rds_connection), + 0, 0, NULL); + if (rds_conn_slab == NULL) + return -ENOMEM; + + rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_register_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + 
rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); + + return 0; +} + +void rds_conn_exit(void) +{ + rds_loop_exit(); + + WARN_ON(!hlist_empty(rds_conn_hash)); + + kmem_cache_destroy(rds_conn_slab); + + rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); +} + +/* + * Force a disconnect + */ +void rds_conn_drop(struct rds_connection *conn) +{ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); +} + +/* + * An error occurred on the connection + */ +void +__rds_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_drop(conn); +} diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 00000000000..06a7b798d9a --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/if_arp.h> +#include <linux/delay.h> + +#include "rds.h" +#include "ib.h" + +unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); +module_param(fmr_message_size, int, 0444); +MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); + +struct list_head rds_ib_devices; + +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); + rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; + 
rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + rds_ibdev->max_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + fmr_pool_size; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) + goto free_dev; + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) + goto err_pd; + + rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); + if (IS_ERR(rds_ibdev->mr_pool)) { + rds_ibdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + + goto free_attr; + +err_mr: + ib_dereg_mr(rds_ibdev->mr); +err_pd: + ib_dealloc_pd(rds_ibdev->pd); +free_dev: + kfree(rds_ibdev); +free_attr: + kfree(dev_attr); +} + +void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr, *i_next; + + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + rds_ib_remove_conns(rds_ibdev); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + + ib_dereg_mr(rds_ibdev->mr); + + while (ib_dealloc_pd(rds_ibdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); + msleep(1); + } + + list_del(&rds_ibdev->list); + kfree(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if 
(conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. 
+ */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + if (!cm_id) + return -EADDRNOTAVAIL; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_ib_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_remove_nodev_conns(); + ib_unregister_client(&rds_ib_client); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rds_ib_xmit_rdma, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_purge = rds_ib_inc_purge, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", +}; + +int __init rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + 
ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_ib_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_recv: + rds_ib_recv_exit(); +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_ib_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 00000000000..8be563a1363 --- /dev/null +++ b/net/rds/ib.h @@ -0,0 +1,367 @@ +#ifndef _RDS_IB_H +#define _RDS_IB_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FMR_SIZE 256 +#define RDS_FMR_POOL_SIZE 4096 + +#define RDS_IB_MAX_SGE 8 +#define RDS_IB_RECV_SGE 2 + +#define RDS_IB_DEFAULT_RECV_WR 1024 +#define RDS_IB_DEFAULT_SEND_WR 256 + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_ib_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_ib_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. 
*/ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_ib_send_work { + struct rds_message *s_rm; + struct rds_rdma_op *s_op; + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IB_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_ib_recv_work { + struct rds_ib_incoming *r_ibinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_ib_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_ib_device; + +struct rds_ib_connection { + + struct list_head ib_node; + struct rds_ib_device *rds_ibdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_ib_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_ib_send_work *i_sends; + + /* rx */ + struct mutex i_recv_mutex; + struct rds_ib_work_ring i_recv_ring; + struct rds_ib_incoming *i_ibinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_ib_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK received */ + + /* sending acks */ + unsigned long i_ack_flags; + u64 i_ack_next; /* next ACK to send */ + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. 
+ * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_ib_ipaddr { + struct list_head list; + __be32 ipaddr; +}; + +struct rds_ib_device { + struct list_head list; + struct list_head ipaddr_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_ib_mr_pool *mr_pool; + int fmr_page_shift; + int fmr_page_size; + u64 fmr_page_mask; + unsigned int fmr_max_remaps; + unsigned int max_fmrs; + int max_sge; + unsigned int max_wrs; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IB_ACK_WR_ID (~(u64) 0) + +struct rds_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_tx_cq_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_call; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_alloc; + uint64_t s_ib_rdma_mr_free; 
+ uint64_t s_ib_rdma_mr_used; + uint64_t s_ib_rdma_mr_pool_flush; + uint64_t s_ib_rdma_mr_pool_wait; + uint64_t s_ib_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_ib_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu + +static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device + + +/* ib.c */ +extern struct rds_transport rds_ib_transport; +extern void rds_ib_add_one(struct ib_device *device); +extern void rds_ib_remove_one(struct ib_device *device); +extern struct ib_client rds_ib_client; + +extern unsigned int fmr_pool_size; +extern unsigned int fmr_message_size; + +extern spinlock_t ib_nodev_conns_lock; +extern struct list_head ib_nodev_conns; + +/* ib_cm.c */ +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_ib_conn_free(void *arg); +int rds_ib_conn_connect(struct rds_connection *conn); +void rds_ib_conn_shutdown(struct rds_connection *conn); +void rds_ib_state_change(struct sock *sk); +int __init rds_ib_listen_init(void); +void rds_ib_listen_stop(void); +void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void 
rds_ib_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_ib_conn_error(conn, fmt...) \ + __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) + +/* ib_rdma.c */ +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_nodev_conns(void); +void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); + +/* ib_recv.c */ +int __init rds_ib_recv_init(void); +void rds_ib_recv_exit(void); +int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_ib_inc_purge(struct rds_incoming *inc); +void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_init_ring(struct rds_ib_connection *ic); +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); +void rds_ib_recv_init_ack(struct rds_ib_connection *ic); +void rds_ib_attempt_ack(struct rds_ib_connection *ic); +void rds_ib_ack_send_complete(struct rds_ib_connection *ic); +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); + +/* ib_ring.c */ +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); 
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); +int rds_ib_ring_empty(struct rds_ib_work_ring *ring); +int rds_ib_ring_low(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_ib_ring_empty_wait; + +/* ib_send.c */ +void rds_ib_xmit_complete(struct rds_connection *conn); +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_init_ring(struct rds_ib_connection *ic); +void rds_ib_send_clear_ring(struct rds_ib_connection *ic); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int __init rds_ib_sysctl_init(void); +void rds_ib_sysctl_exit(void); +extern unsigned long rds_ib_sysctl_max_send_wr; +extern unsigned long rds_ib_sysctl_max_recv_wr; +extern unsigned long rds_ib_sysctl_max_unsig_wrs; +extern unsigned long rds_ib_sysctl_max_unsig_bytes; +extern unsigned long rds_ib_sysctl_max_recv_allocation; +extern unsigned int rds_ib_sysctl_flow_control; +extern ctl_table rds_ib_sysctl_table[]; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * 
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +static inline void rds_ib_set_64bit(u64 *ptr, u64 val) +{ +#if BITS_PER_LONG == 64 + *ptr = val; +#else + set_64bit(ptr, val); +#endif +} + +#endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 00000000000..0532237bd12 --- /dev/null +++ b/net/rds/ib_cm.c @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/vmalloc.h> + +#include "rds.h" +#include "ib.h" + +/* + * Set the selected protocol version + */ +static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (rds_ib_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_ib_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Tune RNR behavior. Without flow control, we use a rather + * low timeout, but not the absolute minimum - this should + * be tunable. + * + * We already set the RNR retry count to 7 (which is the + * smallest infinite number :-) above. + * If flow control is off, we want to change this back to 0 + * so that we learn quickly when our credit accounting is + * buggy. + * + * Caller passes in a qp_attr pointer - don't waste stack spacv + * by allocation this twice. + */ +static void +rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. 
+ */ +void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = NULL; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev; + struct ib_qp_attr qp_attr; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_laddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + /* Tune RNR behavior */ + rds_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); + + /* update ib_device with this local ipaddr & conn */ + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + if (err) + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); + err = rds_ib_add_conn(rds_ibdev, conn); + if (err) + printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + rds_connect_complete(conn); +} + +static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_ib_connect_private *dp, + u32 protocol_version) +{ + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? 
*/ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + conn_param->retry_count = 7; + conn_param->rnr_retry_count = 7; + + if (dp) { + struct rds_ib_connection *ic = conn->c_transport_data; + + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_ib_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + printk(KERN_WARNING "RDS/ib: unhandled QP event %u " + "on connection to %pI4\n", event->event, + &conn->c_faddr); + break; + } +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. 
+ */ +static int rds_ib_setup_qp(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_ib_device *rds_ibdev; + int ret; + + /* rds_ib_add_one creates a rds_ib_device object per IB device, + * and allocates a protection domain, memory range and FMR pool + * for each. If that fails for any reason, it will not register + * the rds_ibdev at all. + */ + rds_ibdev = ib_get_client_data(dev, &rds_ib_client); + if (rds_ibdev == NULL) { + if (printk_ratelimit()) + printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + ic->i_mr = rds_ibdev->mr; + + ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, 0); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, 0); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + rdsdebug("ib_create_cq recv failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + + /* XXX negotiate max send/recv with remote? 
*/ + memset(&attr, 0, sizeof(attr)); + attr.event_handler = rds_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDS_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (ic->i_send_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (ic->i_recv_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (ic->i_ack == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); + if (ic->i_sends == NULL) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_ib_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); + if (ic->i_recvs == NULL) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_ib_recv_init_ring(ic); + rds_ib_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will 
update + * the posted credit count. */ + rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; + __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + struct rds_ib_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. 
*/ + version = rds_ib_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)be64_to_cpu(lguid), + (unsigned long long)be64_to_cpu(fguid)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_ib_set_protocol(conn, version); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. 
*/ + destroy = 0; + + err = rds_ib_setup_qp(conn); + if (err) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_ib_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rds_ib_setup_qp(conn); + if (ret) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. 
*/ + if (ret) { + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_ib_conn_connect(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_ib_conn_shutdown(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int err = 0; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. 
+ */ + rdsdebug("failed to disconnect, cm: %p err %d\n", + ic->i_cm_id, err); + } + + wait_event(rds_ib_ring_empty_wait, + rds_ib_ring_empty(&ic->i_send_ring) && + rds_ib_ring_empty(&ic->i_recv_ring)); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_ib_send_clear_ring(ic); + if (ic->i_recvs) + rds_ib_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* + * Move connection back to the nodev list. + */ + if (ic->rds_ibdev) { + + spin_lock_irq(&ic->rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&ic->rds_ibdev->spinlock); + + spin_lock_irq(&ib_nodev_conns_lock); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irq(&ib_nodev_conns_lock); + ic->rds_ibdev = NULL; + } + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_ibdev); + + /* Clear pending transmit */ + if (ic->i_rm) { + rds_message_put(ic->i_rm); + ic->i_rm = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_set_64bit(&ic->i_ack_next, 0); + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + if (ic->i_ibinc) { + 
rds_inc_put(&ic->i_ibinc->ii_inc); + ic->i_ibinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; +} + +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); + if (ic == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&ic->ib_node); + mutex_init(&ic->i_recv_mutex); + + /* + * rds_ib_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&ib_nodev_conns_lock, flags); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +void rds_ib_conn_free(void *arg) +{ + struct rds_ib_connection *ic = arg; + rdsdebug("ic %p\n", ic); + list_del(&ic->ib_node); + kfree(ic); +} + + +/* + * An error occurred on the connection + */ +void +__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 00000000000..69a6289ed67 --- /dev/null +++ b/net/rds/ib_rdma.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "rdma.h" +#include "ib.h" + + +/* + * This is stored as mr->r_trans_private. 
+ */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct ib_fmr *fmr; + struct list_head list; + unsigned int remap_count; /* bumped on each successful map; compared against fmr_attr.max_maps */ + + struct scatterlist *sg; /* s/g list currently mapped, NULL when none */ + unsigned int sg_len; /* entries in sg as handed to the map call */ + u64 *dma; + int sg_dma_len; /* entries returned by ib_dma_map_sg(), 0 when unmapped */ +}; + +/* + * Our own little FMR pool + */ +struct rds_ib_mr_pool { + struct mutex flush_lock; /* serialize fmr invalidate */ + struct work_struct flush_worker; /* flush worker */ + + spinlock_t list_lock; /* protect variables below */ + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # of dirty MRs (freed but not yet flushed) */ + struct list_head drop_list; /* MRs that have reached their max_maps limit */ + struct list_head free_list; /* unused MRs */ + struct list_head clean_list; /* unused & unmapped MRs */ + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; /* hard cap on item_count */ + unsigned long max_items_soft; + unsigned long max_free_pinned; /* free_pinned level at which a flush is queued */ + struct ib_fmr_attr fmr_attr; +}; + +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); +static void rds_ib_mr_pool_flush_worker(struct work_struct *work); + +static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + spin_unlock_irq(&rds_ibdev->spinlock); + return rds_ibdev; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); + } + + return NULL; +} + +static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + + i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + if (!i_ipaddr) + return -ENOMEM; + + i_ipaddr->ipaddr = ipaddr; + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + 
return 0; +} + +static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr, *next; + + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + break; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); +} + +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev_old; + + rds_ibdev_old = rds_ib_get_device(ipaddr); + if (rds_ibdev_old) + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); +} + +int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&ib_nodev_conns_lock); + BUG_ON(list_empty(&ib_nodev_conns)); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&ib_nodev_conns_lock); + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + ic->rds_ibdev = rds_ibdev; + + return 0; +} + +void rds_ib_remove_nodev_conns(void) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + INIT_LIST_HEAD(&ib_nodev_conns); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_ibdev->spinlock); + list_splice(&rds_ibdev->conn_list, &tmp_list); + 
INIT_LIST_HEAD(&rds_ibdev->conn_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->drop_list); + INIT_LIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + spin_lock_init(&pool->list_lock); + INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + pool->fmr_attr.max_pages = fmr_message_size; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift; + pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. 
+ * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; + pool->max_items = rds_ibdev->max_fmrs; + + return pool; +} + +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->fmr_attr.max_pages; +} + +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + flush_workqueue(rds_wq); + rds_ib_flush_mr_pool(pool, 1); + BUG_ON(atomic_read(&pool->item_count)); + BUG_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + if (!list_empty(&pool->clean_list)) { + ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); + list_del_init(&ibmr->list); + } + spin_unlock_irqrestore(&pool->list_lock, flags); + + return ibmr; +} + +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr *ibmr = NULL; + int err = 0, iter = 0; + + while (1) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. 
Flush them out. */ + rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + rds_ib_flush_mr_pool(pool, 0); + } + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE), + &pool->fmr_attr); + if (IS_ERR(ibmr->fmr)) { + err = PTR_ERR(ibmr->fmr); + ibmr->fmr = NULL; + printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); + goto out_no_cigar; + } + + rds_ib_stats_inc(s_ib_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + if (ibmr->fmr) + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +/* + * DMA-map @sg and program @ibmr's FMR with the resulting page addresses. + * + * Only the first entry may start, and only the last entry may end, off an + * FMR page boundary; any other layout is rejected with -EINVAL. On success + * the MR's previous mapping is torn down and @sg/@nents are recorded in + * @ibmr. On any failure after ib_dma_map_sg() succeeded the new DMA mapping + * is undone here, because nothing has been recorded in @ibmr that would let + * a later teardown find it. Returns 0 or a negative errno. + */ +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages = NULL; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~rds_ibdev->fmr_page_mask) { + if (i > 0) { + ret = -EINVAL; + goto out_unmap; + } + ++page_cnt; + } + if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) { + if (i < sg_dma_len - 1) { + ret = -EINVAL; + goto out_unmap; + } + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> rds_ibdev->fmr_page_shift; + if (page_cnt > fmr_message_size) { + ret = -EINVAL; + goto out_unmap; + } + + dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + if (!dma_pages) { + ret = -ENOMEM; + goto out_unmap; + } + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, 
&scat[i]); + + for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size) + dma_pages[page_cnt++] = + (dma_addr & rds_ibdev->fmr_page_mask) + j; + } + + ret = ib_map_phys_fmr(ibmr->fmr, + dma_pages, page_cnt, io_addr); + if (ret) + goto out_unmap; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + rds_ib_stats_inc(s_ib_rdma_mr_used); + kfree(dma_pages); + return 0; + +out_unmap: + /* The new mapping was never recorded in ibmr, so release it here; + * dma_pages may still be NULL, which kfree() accepts. */ + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + kfree(dma_pages); + return ret; +} + +void rds_ib_sync_mr(void *trans_private, int direction) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_device *rds_ibdev = ibmr->device; + + if (ibmr->sg_dma_len) { + ib_dma_unmap_sg(rds_ibdev->dev, + ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + } + + /* Release the s/g list */ + if (ibmr->sg_len) { + unsigned int i; + + for (i = 0; i < ibmr->sg_len; ++i) { + struct page *page = sg_page(&ibmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + set_page_dirty(page); + put_page(page); + } + kfree(ibmr->sg); + + ibmr->sg = NULL; + ibmr->sg_len = 0; + } +} + +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + unsigned int pinned = ibmr->sg_len; + + __rds_ib_teardown_mr(ibmr); + if (pinned) { + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + atomic_sub(pinned, &pool->free_pinned); + } +} + +static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) +{ + unsigned int 
item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) +{ + struct rds_ib_mr *ibmr, *next; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + unsigned long unpinned = 0; + unsigned long flags; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + + mutex_lock(&pool->flush_lock); + + spin_lock_irqsave(&pool->list_lock, flags); + /* Get the list of all MRs to be dropped. Ordering matters - + * we want to put drop_list ahead of free_list. */ + list_splice_init(&pool->free_list, &unmap_list); + list_splice_init(&pool->drop_list, &unmap_list); + if (free_all) + list_splice_init(&pool->clean_list, &unmap_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + free_goal = rds_ib_flush_goal(pool, free_all); + + if (list_empty(&unmap_list)) + goto out; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, &unmap_list, list) + list_add(&ibmr->fmr->list, &fmr_list); + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, &unmap_list, list) { + unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { + rds_ib_stats_inc(s_ib_rdma_mr_free); + list_del(&ibmr->list); + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + nfreed++; + } + ncleaned++; + } + + spin_lock_irqsave(&pool->list_lock, flags); + list_splice(&unmap_list, &pool->clean_list); + spin_unlock_irqrestore(&pool->list_lock, 
flags); + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + +out: + mutex_unlock(&pool->flush_lock); + return ret; +} + +static void rds_ib_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); + + rds_ib_flush_mr_pool(pool, 0); +} + +void rds_ib_free_mr(void *trans_private, int invalidate) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + unsigned long flags; + + rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + + /* Return it to the pool's free list */ + spin_lock_irqsave(&pool->list_lock, flags); + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + list_add(&ibmr->list, &pool->drop_list); + else + list_add(&ibmr->list, &pool->free_list); + + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + spin_unlock_irqrestore(&pool->list_lock, flags); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned + || atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_work(rds_wq, &pool->flush_worker); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_ib_flush_mr_pool(pool, 0); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. 
*/ + queue_work(rds_wq, &pool->flush_worker); + } + } +} + +void rds_ib_flush_mrs(void) +{ + struct rds_ib_device *rds_ibdev; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + if (pool) + rds_ib_flush_mr_pool(pool, 0); + } +} + +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; + int ret; + + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + if (!rds_ibdev) { + ret = -ENODEV; + goto out; + } + + if (!rds_ibdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_ib_alloc_fmr(rds_ibdev); + if (IS_ERR(ibmr)) + return ibmr; + + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->fmr->rkey; + else + printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); + + ibmr->device = rds_ibdev; + + out: + if (ret) { + if (ibmr) + rds_ib_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + return ibmr; +} diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 00000000000..5061b550216 --- /dev/null +++ b/net/rds/ib_recv.c @@ -0,0 +1,869 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <rdma/rdma_cm.h> + +#include "rds.h" +#include "ib.h" + +static struct kmem_cache *rds_ib_incoming_slab; +static struct kmem_cache *rds_ib_frag_slab; +static atomic_t rds_ib_allocation = ATOMIC_INIT(0); + +static void rds_ib_frag_drop_page(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + __free_page(frag->f_page); + frag->f_page = NULL; +} + +static void rds_ib_frag_free(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + BUG_ON(frag->f_page != NULL); + kmem_cache_free(rds_ib_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. + * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. 
+ */ +static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + struct rds_page_frag *frag = recv->r_frag; + + rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); + if (frag->f_mapped) + ib_dma_unmap_page(ic->i_cm_id->device, + frag->f_mapped, + RDS_FRAG_SIZE, DMA_FROM_DEVICE); + frag->f_mapped = 0; +} + +void rds_ib_recv_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + + sge = rds_ib_data_sge(ic, recv->r_sge); + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + } +} + +static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + rds_ib_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rds_ib_frag_drop_page(recv->r_frag); + rds_ib_frag_free(recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rds_ib_frag_drop_page(&ic->i_frag); +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, + gfp_t kptr_gfp, gfp_t page_gfp) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + dma_addr_t dma_addr; + struct ib_sge *sge; + int ret = -ENOMEM; + + if (recv->r_ibinc == NULL) { + if 
(atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + goto out; + } + recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, + kptr_gfp); + if (recv->r_ibinc == NULL) + goto out; + atomic_inc(&rds_ib_allocation); + INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); + rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); + } + + if (recv->r_frag == NULL) { + recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); + if (recv->r_frag == NULL) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (ic->i_frag.f_page == NULL) { + ic->i_frag.f_page = alloc_page(page_gfp); + if (ic->i_frag.f_page == NULL) + goto out; + ic->i_frag.f_offset = 0; + } + + dma_addr = ib_dma_map_page(ic->i_cm_id->device, + ic->i_frag.f_page, + ic->i_frag.f_offset, + RDS_FRAG_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + goto out; + + /* + * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. + */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = dma_addr; + + sge = rds_ib_data_sge(ic, recv->r_sge); + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + + sge = rds_ib_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + get_page(recv->r_frag->f_page); + + if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDS_FRAG_SIZE; + } else { + put_page(ic->i_frag.f_page); + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. 
The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) + && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, + recv->r_ibinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_ib_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. 
*/ + if (ic->i_flowctl && posted) + rds_ib_advertise_credits(conn, posted); + + if (ret) + rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +void rds_ib_inc_purge(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); + + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_drop_page(frag); + rds_ib_frag_free(frag); + } +} + +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + rds_ib_inc_purge(inc); + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + BUG_ON(!list_empty(&ibinc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, ibinc); + atomic_dec(&rds_ib_allocation); + BUG_ON(atomic_read(&rds_ib_allocation) < 0); +} + +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, 
iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_ib_recv_init_ack(struct rds_ib_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IB_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. 
Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + rds_ib_set_64bit(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return ic->i_ack_next; +} + +static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_ib_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_ib_stats_inc(s_ib_ack_send_failure); + /* Need to finesse this later. */ + BUG(); + } else + rds_ib_stats_inc(s_ib_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_ib_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). 
If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_ib_attempt_ack(struct rds_ib_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) { + rds_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_ib_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_ib_ack_send_complete(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. 
+ */ +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_ib_stats_inc(s_ib_ack_send_piggybacked); + return rds_ib_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_ib_cong_recv(struct rds_connection *conn, + struct rds_ib_incoming *ibinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + + /* A page can back several fragments, so start at this + * fragment's own offset within f_page. */ + src = addr + frag->f_offset + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 1 (set in *dst, our old copy) + * to 0 (clear in *src, the incoming map). 
*/ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_ib_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 byte_len, + struct rds_ib_ack_state *state) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. 
*/ + if (!rds_message_verify_checksum(ihdr)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_ib_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_ib_stats_inc(s_ib_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_ib_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. 
+ */ + if (ibinc == NULL) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { + rds_ib_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&ibinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. 
+ */ +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_ib_ack_state state = { 0, }; + struct rds_ib_recv_work *recv; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_rx_cq_call); + + ib_req_notify_cq(cq, IB_CQ_SOLICITED); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_rx_cq_event); + + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + + rds_ib_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, &state); + } else { + rds_ib_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_ib_ring_free(&ic->i_recv_ring, 1); + } + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_ib_ring_empty(&ic->i_recv_ring)) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. 
+ */ + if (rds_ib_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_ib_recv(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_ib_stats_inc(s_ib_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + return ret; +} + +int __init rds_ib_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, 0, NULL); + if (rds_ib_incoming_slab == NULL) + goto out; + + rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (rds_ib_frag_slab == NULL) + kmem_cache_destroy(rds_ib_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_ib_recv_exit(void) +{ + kmem_cache_destroy(rds_ib_incoming_slab); + kmem_cache_destroy(rds_ib_frag_slab); +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 00000000000..99a6ccae964 --- /dev/null +++ b/net/rds/ib_ring.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "ib.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. 
+ * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); + +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) == 0; +} + +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_ib_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_ib_ring_empty(ring) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); +} + +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - 
val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_empty(ring); +} + +int rds_ib_ring_low(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 00000000000..cb6c52cb1c4 --- /dev/null +++ b/net/rds/ib_send.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/device.h> +#include <linux/dmapool.h> + +#include "rds.h" +#include "rdma.h" +#include "ib.h" + +static void rds_ib_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rds_rdma_op *op) +{ + if (op->r_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, + op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->r_mapped = 0; + } +} + +static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_nents, + DMA_TO_DEVICE); + + if (rm->m_rdma_op != NULL) { + rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. 
This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_rdma_complete(rm, wc_status); + + if (rm->m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_ib_send_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_ib_data_sge(ic, send->s_sge); + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + } +} + +void rds_ib_send_clear_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, 
send++) { + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_ib_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_ib_send_work *send; + u32 completed; + u32 oldest; + u32 i = 0; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_ib_stats_inc(s_ib_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc.wr_id == RDS_IB_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + continue; + } + + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_ib_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. 
*/ + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (send->s_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_ib_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) + || test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the reciever's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. 
+ * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. 
+ */ +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? 
", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_ib_stats_inc(s_ib_rx_credit_updates); +} + +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_ib_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. 
There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if 
(ic->i_flowctl) { + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (ic->i_rm == NULL) { + /* + printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->m_nents) { + rm->m_count = ib_dma_map_sg(dev, + rm->m_sg, rm->m_nents, DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); + if (rm->m_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->m_count = 0; + } + + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. 
*/ + if (rm->m_rdma_op) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_ib_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_ib_send_grab_credits(ic, 0, &posted, 1); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } else if (ic->i_rm != rm) + BUG(); + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->m_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. 
+ */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_ib_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. 
*/ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_ib_stats_inc(s_ib_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->m_sg[rm->m_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_ib_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. 
*/ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + /* Finesse this later */ + BUG(); + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->r_remote_addr; + u32 pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + /* map the message the first time we see it */ + if (!op->r_mapped) { + op->r_count = ib_dma_map_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, (op->r_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); + if (op->r_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->r_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. 
+ */ + i = ceil(op->r_count, rds_ibdev->max_sge); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &op->r_sg[0]; + sent = 0; + num_sge = op->r_count; + + for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->r_key; + send->s_op = op; + + if (num_sge > rds_ibdev->max_sge) { + send->s_wr.num_sge = rds_ibdev->max_sge; + num_sge -= rds_ibdev->max_sge; + } else { + send->s_wr.num_sge = num_sge; + } + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + send->s_sge[j].addr = + ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = ic->i_mr->lkey; + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + + remote_addr += len; + scat++; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->r_sg[op->r_count]) + prev->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) 
{ + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + + if (unlikely(failed_wr != &first->s_wr)) { + printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &first->s_wr); + } + + +out: + return ret; +} + +void rds_ib_xmit_complete(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_ib_attempt_ack(ic); +} diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 00000000000..02e3e3d50d4 --- /dev/null +++ b/net/rds/ib_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" +#include "ib.h" + +DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; + +static char *rds_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "ib_tx_cq_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_call", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_alloc", + "ib_rdma_mr_free", + "ib_rdma_mr_used", + "ib_rdma_mr_pool_flush", + "ib_rdma_mr_pool_wait", + "ib_rdma_mr_pool_depleted", +}; + +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_ib_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < 
sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, + ARRAY_SIZE(rds_ib_stat_names)); +out: + return ARRAY_SIZE(rds_ib_stat_names); +} diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 00000000000..d87830db93a --- /dev/null +++ b/net/rds/ib_sysctl.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + +#include "ib.h" + +static struct ctl_table_header *rds_ib_sysctl_hdr; + +unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; +unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; +unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_ib_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_ib_sysctl_max_unsig_wrs = 16; +static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_ib_sysctl_flow_control = 1; + +ctl_table rds_ib_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_send_wr", + .data = &rds_ib_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_wr", + .data = &rds_ib_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_wr", + .data = &rds_ib_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_wr_min, + .extra2 = &rds_ib_sysctl_max_unsig_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_bytes", + .data = &rds_ib_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + 
.mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, + .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_allocation", + .data = &rds_ib_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "flow_control", + .data = &rds_ib_sysctl_flow_control, + .maxlen = sizeof(rds_ib_sysctl_flow_control), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0} +}; + +static struct ctl_path rds_ib_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, + { .procname = "ib", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + +void rds_ib_sysctl_exit(void) +{ + if (rds_ib_sysctl_hdr) + unregister_sysctl_table(rds_ib_sysctl_hdr); +} + +int __init rds_ib_sysctl_init(void) +{ + rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); + if (rds_ib_sysctl_hdr == NULL) + return -ENOMEM; + return 0; +} diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 00000000000..1d885535214 --- /dev/null +++ b/net/rds/info.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. 
+ * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +struct rds_info_iterator { + struct page **pages; + void *addr; + unsigned long offset; +}; + +static DEFINE_SPINLOCK(rds_info_lock); +static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; + +void rds_info_register_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != NULL); + rds_info_funcs[offset] = func; + spin_unlock(&rds_info_lock); +} + +void rds_info_deregister_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != func); + rds_info_funcs[offset] = NULL; + spin_unlock(&rds_info_lock); +} + +/* + * Typically we hold an atomic kmap across multiple rds_info_copy() calls + * because the kmap is so expensive. This must be called before using blocking + * operations while holding the mapping and as the iterator is torn down. + */ +void rds_info_iter_unmap(struct rds_info_iterator *iter) +{ + if (iter->addr != NULL) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + } +} + +/* + * get_user_pages() called flush_dcache_page() on the pages for us. 
+ */ +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes) +{ + unsigned long this; + + while (bytes) { + /* map the current destination page on demand; the mapping is left in place when we return */ + if (iter->addr == NULL) + iter->addr = kmap_atomic(*iter->pages, KM_USER0); + + /* never copy past the end of the current page */ + this = min(bytes, PAGE_SIZE - iter->offset); + + rdsdebug("page %p addr %p offset %lu this %lu data %p " + "bytes %lu\n", *iter->pages, iter->addr, + iter->offset, this, data, bytes); + + memcpy(iter->addr + iter->offset, data, this); + + data += this; + bytes -= this; + iter->offset += this; + + /* page is full: unmap it and advance to the next page in the array */ + if (iter->offset == PAGE_SIZE) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + iter->offset = 0; + iter->pages++; + } + } +} + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. 
+ */ +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen) +{ + struct rds_info_iterator iter; + struct rds_info_lengths lens; + unsigned long nr_pages = 0; + unsigned long start; + unsigned long i; + rds_info_func func; + struct page **pages = NULL; + int ret; + int len; + int total; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + /* check for all kinds of wrapping and the like */ + start = (unsigned long)optval; + if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + ret = -EINVAL; + goto out; + } + + /* a 0 len call is just trying to probe its length */ + if (len == 0) + goto call_func; + + /* number of pages touched by [start, start + len) */ + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) + >> PAGE_SHIFT; + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + /* pin the destination pages; mmap_sem is held for reading across get_user_pages() */ + down_read(&current->mm->mmap_sem); + ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0, + pages, NULL); + up_read(&current->mm->mmap_sem); + /* on a short pin, shrink nr_pages so the exit path only releases pages we actually hold */ + if (ret != nr_pages) { + if (ret > 0) + nr_pages = ret; + else + nr_pages = 0; + ret = -EAGAIN; /* XXX ? 
*/ + goto out; + } + + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); + +call_func: + func = rds_info_funcs[optname - RDS_INFO_FIRST]; + if (func == NULL) { + ret = -ENOPROTOOPT; + goto out; + } + + iter.pages = pages; + iter.addr = NULL; + iter.offset = start & (PAGE_SIZE - 1); + + func(sock, len, &iter, &lens); + BUG_ON(lens.each == 0); + + total = lens.nr * lens.each; + + rds_info_iter_unmap(&iter); + + if (total > len) { + len = total; + ret = -ENOSPC; + } else { + len = total; + ret = lens.each; + } + + if (put_user(len, optlen)) + ret = -EFAULT; + +out: + for (i = 0; pages != NULL && i < nr_pages; i++) + put_page(pages[i]); + kfree(pages); + + return ret; +} diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 00000000000..b6c052ca7d2 --- /dev/null +++ b/net/rds/info.h @@ -0,0 +1,30 @@ +#ifndef _RDS_INFO_H +#define _RDS_INFO_H + +struct rds_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rds_info_iterator; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. + */ +typedef void (*rds_info_func)(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens); + +void rds_info_register_func(int optname, rds_info_func func); +void rds_info_deregister_func(int optname, rds_info_func func); +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen); +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes); +void rds_info_iter_unmap(struct rds_info_iterator *iter); + + +#endif diff --git a/net/rds/iw.c b/net/rds/iw.c new file mode 100644 index 00000000000..1b56905c4c0 --- /dev/null +++ b/net/rds/iw.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/if_arp.h> +#include <linux/delay.h> + +#include "rds.h" +#include "iw.h" + +unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; +unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fastreg_pool_size, int, 0444); +MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); +module_param(fastreg_message_size, int, 0444); +MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); + +struct list_head rds_iw_devices; + +DEFINE_SPINLOCK(iw_nodev_conns_lock); +LIST_HEAD(iw_nodev_conns); + +void rds_iw_add_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct ib_device_attr *dev_attr; + + /* Only handle iwarp devices */ + if (device->node_type != RDMA_NODE_RNIC) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); + if (!rds_iwdev) + goto free_attr; + + spin_lock_init(&rds_iwdev->spinlock); + + rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = dev_attr->max_qp_wr; + rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + + rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1); + + rds_iwdev->dev = device; + rds_iwdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_iwdev->pd)) + goto free_dev; + + if (!rds_iwdev->dma_local_lkey) { + if (device->node_type != RDMA_NODE_RNIC) { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_LOCAL_WRITE); + } else { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); + } + if 
(IS_ERR(rds_iwdev->mr)) + goto err_pd; + } else + rds_iwdev->mr = NULL; + + rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); + if (IS_ERR(rds_iwdev->mr_pool)) { + rds_iwdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_iwdev->cm_id_list); + INIT_LIST_HEAD(&rds_iwdev->conn_list); + list_add_tail(&rds_iwdev->list, &rds_iw_devices); + + ib_set_client_data(device, &rds_iw_client, rds_iwdev); + + /* success: dev_attr was only needed during setup, so share the free_attr exit */ + goto free_attr; + +err_mr: + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); +err_pd: + ib_dealloc_pd(rds_iwdev->pd); +free_dev: + kfree(rds_iwdev); +free_attr: + kfree(dev_attr); +} + +/* + * Tear down the per-device state attached to @device as client data: + * free the entries queued on cm_id_list, hand the device to + * rds_iw_remove_conns(), destroy the MR pool and MR if present, release + * the PD and finally free the rds_iw_device itself. + */ +void rds_iw_remove_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_cm_id *i_cm_id, *next; + + rds_iwdev = ib_get_client_data(device, &rds_iw_client); + if (!rds_iwdev) + return; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + } + spin_unlock_irq(&rds_iwdev->spinlock); + + rds_iw_remove_conns(rds_iwdev); + + if (rds_iwdev->mr_pool) + rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); + + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); + + /* a non-zero return means the PD could not be released yet; sleep and retry */ + while (ib_dealloc_pd(rds_iwdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); + msleep(1); + } + + list_del(&rds_iwdev->list); + kfree(rds_iwdev); +} + +struct ib_client rds_iw_client = { + .name = "rds_iw", + .add = rds_iw_add_one, + .remove = rds_iw_remove_one +}; + +static int rds_iw_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_iw_connection *ic; + + /* We will only ever look at iWARP transports */ + if (conn->c_trans != &rds_iw_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_iw_device 
*rds_iwdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_iwdev->max_sge; + rds_iw_get_mr_info(rds_iwdev, iinfo); + } + return 1; +} + +static void rds_iw_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_iw_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + * + * Returns 0 if @addr can be bound on an RNIC, otherwise a negative errno + * (-EADDRNOTAVAIL when the bind fails or the device is not an RNIC). + */ +static int rds_iw_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + /* rdma_create_id() reports failure with ERR_PTR(), not NULL */ + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support IB devices unless we + check node_type. cm_id->device is tested first because the + debug print below already allows for it being NULL. 
*/ + if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_RNIC) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_iw_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + rds_iw_remove_nodev_conns(); + ib_unregister_client(&rds_iw_client); + rds_iw_sysctl_exit(); + rds_iw_recv_exit(); + rds_trans_unregister(&rds_iw_transport); +} + +struct rds_transport rds_iw_transport = { + .laddr_check = rds_iw_laddr_check, + .xmit_complete = rds_iw_xmit_complete, + .xmit = rds_iw_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rds_iw_xmit_rdma, + .recv = rds_iw_recv, + .conn_alloc = rds_iw_conn_alloc, + .conn_free = rds_iw_conn_free, + .conn_connect = rds_iw_conn_connect, + .conn_shutdown = rds_iw_conn_shutdown, + .inc_copy_to_user = rds_iw_inc_copy_to_user, + .inc_purge = rds_iw_inc_purge, + .inc_free = rds_iw_inc_free, + .cm_initiate_connect = rds_iw_cm_initiate_connect, + .cm_handle_connect = rds_iw_cm_handle_connect, + .cm_connect_complete = rds_iw_cm_connect_complete, + .stats_info_copy = rds_iw_stats_info_copy, + .exit = rds_iw_exit, + .get_mr = rds_iw_get_mr, + .sync_mr = rds_iw_sync_mr, + .free_mr = rds_iw_free_mr, + .flush_mrs = rds_iw_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "iwarp", + .t_prefer_loopback = 1, +}; + +int __init rds_iw_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_iw_devices); + + ret = ib_register_client(&rds_iw_client); + if (ret) + goto out; + + ret = rds_iw_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_iw_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_iw_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + + goto out; + +out_recv: + rds_iw_recv_exit(); +out_sysctl: + rds_iw_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_iw_client); +out: + return ret; +} + 
+MODULE_LICENSE("GPL"); + diff --git a/net/rds/iw.h b/net/rds/iw.h new file mode 100644 index 00000000000..0ddda34f2a1 --- /dev/null +++ b/net/rds/iw.h @@ -0,0 +1,395 @@ +#ifndef _RDS_IW_H +#define _RDS_IW_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FASTREG_SIZE 20 +#define RDS_FASTREG_POOL_SIZE 2048 + +#define RDS_IW_MAX_SGE 8 +#define RDS_IW_RECV_SGE 2 + +#define RDS_IW_DEFAULT_RECV_WR 1024 +#define RDS_IW_DEFAULT_SEND_WR 256 + +#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_iw_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_iw_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_iw_connect_private { + /* Add new fields at the end, and don't permute existing fields. 
*/ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_iw_scatterlist { + struct scatterlist *list; + unsigned int len; + int dma_len; + unsigned int dma_npages; + unsigned int bytes; +}; + +struct rds_iw_mapping { + spinlock_t m_lock; /* protect the mapping struct */ + struct list_head m_list; + struct rds_iw_mr *m_mr; + uint32_t m_rkey; + struct rds_iw_scatterlist m_sg; +}; + +struct rds_iw_send_work { + struct rds_message *s_rm; + + /* We should really put these into a union: */ + struct rds_rdma_op *s_op; + struct rds_iw_mapping *s_mapping; + struct ib_mr *s_mr; + struct ib_fast_reg_page_list *s_page_list; + unsigned char s_remap_count; + + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IW_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_iw_recv_work { + struct rds_iw_incoming *r_iwinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_iw_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_iw_device; + +struct rds_iw_connection { + + struct list_head iw_node; + struct rds_iw_device *rds_iwdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_iw_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_iw_send_work *i_sends; + + /* rx */ + struct mutex i_recv_mutex; + struct rds_iw_work_ring i_recv_ring; + struct rds_iw_incoming *i_iwinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_iw_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK 
received */ + + /* sending acks */ + unsigned long i_ack_flags; + u64 i_ack_next; /* next ACK to send */ + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. + * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + unsigned int i_dma_local_lkey:1; + unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_iw_cm_id { + struct list_head list; + struct rdma_cm_id *cm_id; +}; + +struct rds_iw_device { + struct list_head list; + struct list_head cm_id_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_iw_mr_pool *mr_pool; + int page_shift; + int max_sge; + unsigned int max_wrs; + unsigned int dma_local_lkey:1; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) +#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) +#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) + +struct rds_iw_statistics { + uint64_t s_iw_connect_raced; + uint64_t s_iw_listen_closed_stale; + uint64_t s_iw_tx_cq_call; + 
uint64_t s_iw_tx_cq_event; + uint64_t s_iw_tx_ring_full; + uint64_t s_iw_tx_throttle; + uint64_t s_iw_tx_sg_mapping_failure; + uint64_t s_iw_tx_stalled; + uint64_t s_iw_tx_credit_updates; + uint64_t s_iw_rx_cq_call; + uint64_t s_iw_rx_cq_event; + uint64_t s_iw_rx_ring_empty; + uint64_t s_iw_rx_refill_from_cq; + uint64_t s_iw_rx_refill_from_thread; + uint64_t s_iw_rx_alloc_limit; + uint64_t s_iw_rx_credit_updates; + uint64_t s_iw_ack_sent; + uint64_t s_iw_ack_send_failure; + uint64_t s_iw_ack_send_delayed; + uint64_t s_iw_ack_send_piggybacked; + uint64_t s_iw_ack_received; + uint64_t s_iw_rdma_mr_alloc; + uint64_t s_iw_rdma_mr_free; + uint64_t s_iw_rdma_mr_used; + uint64_t s_iw_rdma_mr_pool_flush; + uint64_t s_iw_rdma_mr_pool_wait; + uint64_t s_iw_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_iw_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu + +static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device + +static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) +{ + return ic->i_dma_local_lkey ? 
ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; +} + +/* ib.c */ +extern struct rds_transport rds_iw_transport; +extern void rds_iw_add_one(struct ib_device *device); +extern void rds_iw_remove_one(struct ib_device *device); +extern struct ib_client rds_iw_client; + +extern unsigned int fastreg_pool_size; +extern unsigned int fastreg_message_size; + +extern spinlock_t iw_nodev_conns_lock; +extern struct list_head iw_nodev_conns; + +/* ib_cm.c */ +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_iw_conn_free(void *arg); +int rds_iw_conn_connect(struct rds_connection *conn); +void rds_iw_conn_shutdown(struct rds_connection *conn); +void rds_iw_state_change(struct sock *sk); +int __init rds_iw_listen_init(void); +void rds_iw_listen_stop(void); +void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_iw_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_iw_conn_error(conn, fmt...) 
\ + __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) + +/* ib_rdma.c */ +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); +int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void rds_iw_remove_nodev_conns(void); +void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev); +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_iw_sync_mr(void *trans_private, int dir); +void rds_iw_free_mr(void *trans_private, int invalidate); +void rds_iw_flush_mrs(void); +void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); + +/* ib_recv.c */ +int __init rds_iw_recv_init(void); +void rds_iw_recv_exit(void); +int rds_iw_recv(struct rds_connection *conn); +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_iw_inc_purge(struct rds_incoming *inc); +void rds_iw_inc_free(struct rds_incoming *inc); +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_recv_init_ring(struct rds_iw_connection *ic); +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); +void rds_iw_recv_init_ack(struct rds_iw_connection *ic); +void rds_iw_attempt_ack(struct rds_iw_connection *ic); +void rds_iw_ack_send_complete(struct rds_iw_connection *ic); +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); + +/* ib_ring.c */ +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); +void rds_iw_ring_free(struct 
rds_iw_work_ring *ring, u32 val); +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); +int rds_iw_ring_empty(struct rds_iw_work_ring *ring); +int rds_iw_ring_low(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_iw_ring_empty_wait; + +/* ib_send.c */ +void rds_iw_xmit_complete(struct rds_connection *conn); +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_send_init_ring(struct rds_iw_connection *ic); +void rds_iw_send_clear_ring(struct rds_iw_connection *ic); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); +#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int __init rds_iw_sysctl_init(void); +void rds_iw_sysctl_exit(void); +extern unsigned long rds_iw_sysctl_max_send_wr; +extern unsigned long rds_iw_sysctl_max_recv_wr; +extern unsigned long rds_iw_sysctl_max_unsig_wrs; +extern unsigned long rds_iw_sysctl_max_unsig_bytes; +extern unsigned long rds_iw_sysctl_max_recv_allocation; +extern unsigned int rds_iw_sysctl_flow_control; +extern ctl_table rds_iw_sysctl_table[]; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * +rds_iw_header_sge(struct 
rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +static inline void rds_iw_set_64bit(u64 *ptr, u64 val) +{ +#if BITS_PER_LONG == 64 + *ptr = val; +#else + set_64bit(ptr, val); +#endif +} + +#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c new file mode 100644 index 00000000000..57ecb3d4b8a --- /dev/null +++ b/net/rds/iw_cm.c @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/vmalloc.h> + +#include "rds.h" +#include "iw.h" + +/* + * Set the selected protocol version + */ +static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (rds_iw_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_iw_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = NULL; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_iw_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + /* update ib_device with this local ipaddr & conn */ + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); + if (err) + printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); + err = rds_iw_add_conn(rds_iwdev, conn); + if (err) + printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. 
*/ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + rds_connect_complete(conn); +} + +static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_iw_connect_private *dp, + u32 protocol_version) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? */ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_iw_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_iw_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_iw_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_FATAL: + default: + 
rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", + event->event, &conn->c_laddr, + &conn->c_faddr); + break; + } +} + +/* + * Create a QP + */ +static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, + struct rds_iw_device *rds_iwdev, + struct rds_iw_work_ring *send_ring, + void (*send_cq_handler)(struct ib_cq *, void *), + struct rds_iw_work_ring *recv_ring, + void (*recv_cq_handler)(struct ib_cq *, void *), + void *context) +{ + struct ib_device *dev = rds_iwdev->dev; + unsigned int send_size, recv_size; + int ret; + + /* The offset of 1 is to accomodate the additional ACK WR. */ + send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); + recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); + rds_iw_ring_resize(send_ring, send_size - 1); + rds_iw_ring_resize(recv_ring, recv_size - 1); + + memset(attr, 0, sizeof(*attr)); + attr->event_handler = rds_iw_qp_event_handler; + attr->qp_context = context; + attr->cap.max_send_wr = send_size; + attr->cap.max_recv_wr = recv_size; + attr->cap.max_send_sge = rds_iwdev->max_sge; + attr->cap.max_recv_sge = RDS_IW_RECV_SGE; + attr->sq_sig_type = IB_SIGNAL_REQ_WR; + attr->qp_type = IB_QPT_RC; + + attr->send_cq = ib_create_cq(dev, send_cq_handler, + rds_iw_cq_event_handler, + context, send_size, 0); + if (IS_ERR(attr->send_cq)) { + ret = PTR_ERR(attr->send_cq); + attr->send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + attr->recv_cq = ib_create_cq(dev, recv_cq_handler, + rds_iw_cq_event_handler, + context, recv_size, 0); + if (IS_ERR(attr->recv_cq)) { + ret = PTR_ERR(attr->recv_cq); + attr->recv_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); + if (ret) { + 
rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + +out: + if (ret) { + if (attr->send_cq) + ib_destroy_cq(attr->send_cq); + if (attr->recv_cq) + ib_destroy_cq(attr->recv_cq); + } + return ret; +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_iw_setup_qp(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_iw_device *rds_iwdev; + int ret; + + /* rds_iw_add_one creates a rds_iw_device object per IB device, + * and allocates a protection domain, memory range and MR pool + * for each. If that fails for any reason, it will not register + * the rds_iwdev at all. + */ + rds_iwdev = ib_get_client_data(dev, &rds_iw_client); + if (rds_iwdev == NULL) { + if (printk_ratelimit()) + printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + /* Protection domain and memory range */ + ic->i_pd = rds_iwdev->pd; + ic->i_mr = rds_iwdev->mr; + + ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, + &ic->i_send_ring, rds_iw_send_cq_comp_handler, + &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, + conn); + if (ret < 0) + goto out; + + ic->i_send_cq = attr.send_cq; + ic->i_recv_cq = attr.recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? 
+ */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (ic->i_send_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (ic->i_recv_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (ic->i_ack == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); + if (ic->i_sends == NULL) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_iw_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); + if (ic->i_recvs == NULL) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_iw_recv_init_ring(ic); + rds_iw_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. 
+ * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = event->param.conn.private_data; + struct rds_iw_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_iw_connection *ic = NULL; + struct rdma_conn_param conn_param; + struct rds_iw_device *rds_iwdev; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_iw_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", + &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. 
+ * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_iw_stats_inc(s_iw_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_iw_stats_inc(s_iw_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_iw_set_protocol(conn, version); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. 
*/ + destroy = 0; + + err = rds_iw_setup_qp(conn); + if (err) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_iw_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ + + ret = rds_iw_setup_qp(conn); + if (ret) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. 
*/ + if (ret) { + struct rds_iw_connection *ic = conn->c_transport_data; + + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_iw_conn_connect(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + /* First, bind to the local address and device. */ + ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); + if (ret) { + rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", + &conn->c_laddr, ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + goto out; + } + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. 
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	int err = 0;
+	struct ib_qp_attr qp_attr;
+
+	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/* Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+				   " cm: %p err %d\n", ic->i_cm_id, err);
+		}
+
+		/* Move the QP to the error state before waiting for the
+		 * send and receive rings to drain below.
+		 */
+		if (ic->i_cm_id->qp) {
+			qp_attr.qp_state = IB_QPS_ERR;
+			ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+		}
+
+		wait_event(rds_iw_ring_empty_wait,
+			rds_iw_ring_empty(&ic->i_send_ring) &&
+			rds_iw_ring_empty(&ic->i_recv_ring));
+
+		/* Both rings are empty: release the coherent header buffers. */
+		if (ic->i_send_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_send_hdrs,
+					   ic->i_send_hdrs_dma);
+
+		if (ic->i_recv_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_recv_hdrs,
+					   ic->i_recv_hdrs_dma);
+
+		if (ic->i_ack)
+			ib_dma_free_coherent(dev, sizeof(struct rds_header),
+					     ic->i_ack, ic->i_ack_dma);
+
+		if (ic->i_sends)
+			rds_iw_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rds_iw_recv_clear_ring(ic);
+
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+
+		/*
+		 * If associated with an rds_iw_device:
+		 *	Move connection back to the nodev list.
+		 *	Remove cm_id from the device cm_id list.
+		 */
+		if (ic->rds_iwdev) {
+
+			spin_lock_irq(&ic->rds_iwdev->spinlock);
+			BUG_ON(list_empty(&ic->iw_node));
+			list_del(&ic->iw_node);
+			spin_unlock_irq(&ic->rds_iwdev->spinlock);
+
+			spin_lock_irq(&iw_nodev_conns_lock);
+			list_add_tail(&ic->iw_node, &iw_nodev_conns);
+			spin_unlock_irq(&iw_nodev_conns_lock);
+			rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+			ic->rds_iwdev = NULL;
+		}
+
+		rdma_destroy_id(ic->i_cm_id);
+
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	BUG_ON(ic->rds_iwdev);
+
+	/* Clear pending transmit */
+	if (ic->i_rm) {
+		rds_message_put(ic->i_rm);
+		ic->i_rm = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	rds_iw_set_64bit(&ic->i_ack_next, 0);
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	atomic_set(&ic->i_credits, 0);
+
+	/* Re-arm the rings so a later connect starts from a clean state. */
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	if (ic->i_iwinc) {
+		rds_inc_put(&ic->i_iwinc->ii_inc);
+		ic->i_iwinc = NULL;
+	}
+
+	vfree(ic->i_sends);
+	ic->i_sends = NULL;
+	vfree(ic->i_recvs);
+	ic->i_recvs = NULL;
+	rdsdebug("shutdown complete\n");
+}
+
+/*
+ * Allocate the iWARP private state for @conn and hook it up as
+ * conn->c_transport_data.  A new connection has no device yet, so it is
+ * queued on iw_nodev_conns.
+ *
+ * @gfp is the allocation mode requested by the caller and must be
+ * honoured: the caller may not be allowed to sleep.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_iw_connection *ic;
+	unsigned long flags;
+
+	/* Use the caller's gfp mask rather than assuming GFP_KERNEL. */
+	ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
+	if (ic == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ic->iw_node);
+	mutex_init(&ic->i_recv_mutex);
+
+	/*
+	 * rds_iw_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+	spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Release the iWARP private state attached to a connection.
+ *
+ * @arg is the struct rds_iw_connection set up by rds_iw_conn_alloc().
+ */
+void rds_iw_conn_free(void *arg)
+{
+	struct rds_iw_connection *ic = arg;
+	spinlock_t *lock_ptr;
+	unsigned long flags;
+
+	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * iw_node is linked either into a device's conn_list (guarded by
+	 * rds_iwdev->spinlock) or into iw_nodev_conns (guarded by
+	 * iw_nodev_conns_lock).  Unlink under the matching lock so that a
+	 * concurrent list walk or insertion never sees a half-removed node.
+	 */
+	lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
+
+	spin_lock_irqsave(lock_ptr, flags);
+	list_del(&ic->iw_node);
+	spin_unlock_irqrestore(lock_ptr, flags);
+
+	kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ *
+ * Drops the connection via rds_conn_drop() and then prints the
+ * printf-style message @fmt with vprintk().
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	rds_conn_drop(conn);
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 00000000000..1c02a8f952d
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "iw.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+	struct rds_iw_device	*device;
+	struct rds_iw_mr_pool	*pool;
+	struct rdma_cm_id	*cm_id;
+
+	struct ib_mr	*mr;
+	struct ib_fast_reg_page_list *page_list;
+
+	struct rds_iw_mapping	mapping;
+	unsigned char		remap_count;	/* rolling key passed to ib_update_fast_reg_key() */
+};
+
+/*
+ * Our own little MR pool
+ */
+struct rds_iw_mr_pool {
+	struct rds_iw_device	*device;		/* back ptr to the device that owns us */
+
+	struct mutex		flush_lock;		/* serialize fmr invalidate */
+	struct work_struct	flush_worker;		/* flush worker */
+
+	spinlock_t		list_lock;		/* protect variables below */
+	atomic_t		item_count;		/* total # of MRs */
+	atomic_t		dirty_count;		/* # of dirty MRs */
+	struct list_head	dirty_list;		/* dirty mappings */
+	struct list_head	clean_list;		/* unused & unmapped MRs */
+	atomic_t		free_pinned;		/* memory pinned by free MRs */
+	unsigned long		max_message_size;	/* in pages */
+	unsigned long		max_items;
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;
+	int			max_pages;
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			  struct rds_iw_mr *ibmr,
+			  struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+			struct list_head *unmap_list,
+			struct list_head *kill_list);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+/*
+ * Find the device and cm_id that match the socket @rs.
+ *
+ * Walks every device on rds_iw_devices and, under that device's
+ * spinlock, every entry on its cm_id_list.  On a match *rds_iwdev and
+ * *cm_id are filled in and 0 is returned; otherwise both are left NULL
+ * and 1 is returned.
+ */
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
+{
+	struct rds_iw_device *iwdev;
+	struct rds_iw_cm_id *i_cm_id;
+
+	*rds_iwdev = NULL;
+	*cm_id = NULL;
+
+	list_for_each_entry(iwdev, &rds_iw_devices, list) {
+		spin_lock_irq(&iwdev->spinlock);
+		list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+			struct sockaddr_in *src_addr, *dst_addr;
+
+			src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+			dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+			rdsdebug("local ipaddr = %x port %d, "
+				 "remote ipaddr = %x port %d"
+				 "..looking for %x port %d, "
+				 "remote ipaddr = %x port %d\n",
+				src_addr->sin_addr.s_addr,
+				src_addr->sin_port,
+				dst_addr->sin_addr.s_addr,
+				dst_addr->sin_port,
+				rs->rs_bound_addr,
+				rs->rs_bound_port,
+				rs->rs_conn_addr,
+				rs->rs_conn_port);
+#ifdef WORKING_TUPLE_DETECTION
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+			    src_addr->sin_port == rs->rs_bound_port &&
+			    dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+			    dst_addr->sin_port == rs->rs_conn_port) {
+#else
+			/* FIXME - needs to compare the local and remote
+			 * ipaddr/port tuple, but the ipaddr is the only
+			 * available information in the rds_sock (as the rest
+			 * are zeroed). It doesn't appear to be properly
+			 * populated during connection setup...
+			 */
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
+#endif
+				spin_unlock_irq(&iwdev->spinlock);	/* drop the lock before the early return */
+				*rds_iwdev = iwdev;
+				*cm_id = i_cm_id->cm_id;
+				return 0;
+			}
+		}
+		spin_unlock_irq(&iwdev->spinlock);
+	}
+
+	return 1;
+}
+/*
+ * Append @cm_id to @rds_iwdev's cm_id_list, wrapped in a freshly
+ * allocated struct rds_iw_cm_id.  Returns 0, or -ENOMEM if the wrapper
+ * cannot be allocated.
+ */
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *i_cm_id;
+
+	i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
+	if (!i_cm_id)
+		return -ENOMEM;
+
+	i_cm_id->cm_id = cm_id;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	return 0;
+}
+/*
+ * Remove and free the first cm_id_list entry of @rds_iwdev that refers
+ * to @cm_id.  Does nothing if no entry matches.
+ */
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *i_cm_id;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
+		if (i_cm_id->cm_id == cm_id) {
+			list_del(&i_cm_id->list);
+			kfree(i_cm_id);
+			break;	/* entry is freed; must not keep iterating from it */
+		}
+	}
+	spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+/*
+ * Register @cm_id on @rds_iwdev's cm_id_list.
+ *
+ * A temporary rds_sock is filled with the cm_id's source/destination
+ * address and port so that rds_iw_get_device() can be used for the
+ * lookup.  Returns the result of rds_iw_add_cm_id().
+ *
+ * NOTE(review): rds_iw_get_device() returns non-zero when nothing
+ * matched, so the removal below runs on the "not found" result; this
+ * looks inverted - confirm the intent.  rds_iwdev_old and pcm_id are
+ * written by the lookup but never read here.
+ */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct sockaddr_in *src_addr, *dst_addr;
+	struct rds_iw_device *rds_iwdev_old;
+	struct rds_sock rs;	/* only the four address/port fields below are initialized */
+	struct rdma_cm_id *pcm_id;
+	int rc;
+
+	src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+	dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+
+	rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+	rs.rs_bound_port = src_addr->sin_port;
+	rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+	rs.rs_conn_port = dst_addr->sin_port;
+
+	rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
+	if (rc)
+		rds_iw_remove_cm_id(rds_iwdev, cm_id);
+
+	return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+/*
+ * Move @conn's iWARP state from the iw_nodev_conns list onto
+ * @rds_iwdev's conn_list and record the device in ic->rds_iwdev.
+ * Always returns 0.
+ */
+int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* conn was previously on the nodev_conns_list */
+	spin_lock_irq(&iw_nodev_conns_lock);
+	BUG_ON(list_empty(&iw_nodev_conns));
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	spin_unlock_irq(&iw_nodev_conns_lock);
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	ic->rds_iwdev = rds_iwdev;
+
+	return 0;
+}
+/*
+ * Destroy every connection that is still on the iw_nodev_conns list.
+ * The list is emptied onto a private list under the lock; the
+ * connections are then destroyed with the lock released.
+ */
+void rds_iw_remove_nodev_conns(void)
+{
+	struct rds_iw_connection *ic, *_ic;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&iw_nodev_conns_lock);
+	list_splice(&iw_nodev_conns, &tmp_list);
+	INIT_LIST_HEAD(&iw_nodev_conns);
+	spin_unlock_irq(&iw_nodev_conns_lock);
+
+	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+		if (ic->conn->c_passive)
+			rds_conn_destroy(ic->conn->c_passive);
+		rds_conn_destroy(ic->conn);
+	}
+}
+/*
+ * Destroy every connection on @rds_iwdev's conn_list, using the same
+ * splice-then-destroy scheme as rds_iw_remove_nodev_conns().
+ */
+void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_connection *ic, *_ic;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_splice(&rds_iwdev->conn_list, &tmp_list);
+	INIT_LIST_HEAD(&rds_iwdev->conn_list);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+		if (ic->conn->c_passive)
+			rds_conn_destroy(ic->conn->c_passive);
+		rds_conn_destroy(ic->conn);
+	}
+}
+/*
+ * Point @sg at @list (@sg_len entries) and reset all DMA bookkeeping
+ * to zero, i.e. mark it as not yet DMA-mapped.
+ */
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+		struct scatterlist *list, unsigned int sg_len)
+{
+	sg->list = list;
+	sg->len = sg_len;
+	sg->dma_len = 0;
+	sg->dma_npages = 0;
+	sg->bytes = 0;
+}
+/*
+ * DMA-map @sg and build the list of page addresses it covers.
+ *
+ * Pages are 1 << @dma_page_shift bytes.  Only the first entry may start
+ * off a page boundary and only the last entry may end off one; any
+ * other misalignment, or more than fastreg_message_size pages, fails
+ * with -EINVAL.
+ *
+ * Returns a kmalloc()ed array of sg->dma_npages addresses that the
+ * caller must kfree(), or an ERR_PTR() (-EBUSY, -EINVAL or -ENOMEM).
+ * On the -EINVAL and -ENOMEM paths the mapping is undone and
+ * sg->dma_len is reset to 0.
+ */
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+			struct rds_iw_scatterlist *sg,
+			unsigned int dma_page_shift)
+{
+	struct ib_device *dev = rds_iwdev->dev;
+	u64 *dma_pages = NULL;
+	u64 dma_mask;
+	unsigned int dma_page_size;
+	int i, j, ret;
+
+	dma_page_size = 1 << dma_page_shift;
+	dma_mask = dma_page_size - 1;
+
+	WARN_ON(sg->dma_len);
+
+	sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	if (unlikely(!sg->dma_len)) {
+		printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	sg->bytes = 0;
+	sg->dma_npages = 0;
+
+	/* First pass: validate alignment and count bytes and pages. */
+	ret = -EINVAL;
+	for (i = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		sg->bytes += dma_len;
+
+		end_addr = dma_addr + dma_len;
+		if (dma_addr & dma_mask) {
+			if (i > 0)
+				goto out_unmap;
+			dma_addr &= ~dma_mask;	/* round the start down to a page */
+		}
+		if (end_addr & dma_mask) {
+			if (i < sg->dma_len - 1)
+				goto out_unmap;
+			end_addr = (end_addr + dma_mask) & ~dma_mask;	/* round the end up */
+		}
+
+		sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
+	}
+
+	/* Now gather the dma addrs into one list */
+	if (sg->dma_npages > fastreg_message_size)
+		goto out_unmap;
+
+	dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+	if (!dma_pages) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	for (i = j = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		end_addr = dma_addr + dma_len;
+		dma_addr &= ~dma_mask;
+		for (; dma_addr < end_addr; dma_addr += dma_page_size)
+			dma_pages[j++] = dma_addr;
+		BUG_ON(j > sg->dma_npages);
+	}
+
+	return dma_pages;
+
+out_unmap:
+	ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	sg->dma_len = 0;
+	kfree(dma_pages);	/* still NULL on every path that reaches here */
+	return ERR_PTR(ret);
+}
+
+/*
+ * Allocate and initialize an MR pool for @rds_iwdev.  Limits are taken
+ * from fastreg_message_size and fastreg_pool_size.  Returns the pool or
+ * ERR_PTR(-ENOMEM).
+ */
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->device = rds_iwdev;
+	INIT_LIST_HEAD(&pool->dirty_list);
+	INIT_LIST_HEAD(&pool->clean_list);
+	mutex_init(&pool->flush_lock);
+	spin_lock_init(&pool->list_lock);
+	INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
+
+	pool->max_message_size = fastreg_message_size;
+	pool->max_items = fastreg_pool_size;
+	pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
+	pool->max_pages = fastreg_message_size;
+
+	/* We never allow more than max_items MRs to be allocated.
+	 * When we exceed more than max_items_soft, we start freeing
+	 * items more aggressively.
+	 * Make sure that max_items > max_items_soft > max_items / 2
+	 */
+	pool->max_items_soft = pool->max_items * 3 / 4;
+
+	return pool;
+}
+/*
+ * Report the pool limits of @rds_iwdev (max_items and max_pages) in
+ * @iinfo.  The device's mr_pool is dereferenced without a NULL check.
+ */
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+	iinfo->rdma_mr_max = pool->max_items;
+	iinfo->rdma_mr_size = pool->max_pages;
+}
+/*
+ * Tear down @pool: flush rds_wq, free all MRs with a free_all flush and
+ * free the pool.  BUGs if any item or pinned-page count remains.
+ */
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+	flush_workqueue(rds_wq);
+	rds_iw_flush_mr_pool(pool, 1);
+	BUG_ON(atomic_read(&pool->item_count));
+	BUG_ON(atomic_read(&pool->free_pinned));
+	kfree(pool);
+}
+/*
+ * Take the first MR off @pool's clean_list, or return NULL if the list
+ * is empty.
+ */
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+	struct rds_iw_mr *ibmr = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->list_lock, flags);
+	if (!list_empty(&pool->clean_list)) {
+		ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
+		list_del_init(&ibmr->mapping.m_list);
+	}
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+
+	return ibmr;
+}
+/*
+ * Get an MR from @rds_iwdev's pool: reuse a clean one if available,
+ * otherwise allocate a new one while item_count stays within max_items.
+ * When the pool is full the dirty MRs are flushed and the attempt is
+ * repeated; the third failed attempt gives up.
+ *
+ * Returns the MR, or ERR_PTR(-EAGAIN) when the pool stays depleted,
+ * ERR_PTR(-ENOMEM), or the error from rds_iw_init_fastreg().
+ */
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+	struct rds_iw_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+
+	while (1) {
+		ibmr = rds_iw_reuse_fmr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the driver
+		 * tells us we can't use more than N fmrs, we shouldn't start
+		 * arguing with it */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;	/* slot reserved; allocate below */
+
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+		rds_iw_flush_mr_pool(pool, 0);
+	}
+
+	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	spin_lock_init(&ibmr->mapping.m_lock);
+	INIT_LIST_HEAD(&ibmr->mapping.m_list);
+	ibmr->mapping.m_mr = ibmr;
+
+	err = rds_iw_init_fastreg(pool, ibmr);
+	if (err)
+		goto out_no_cigar;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		rds_iw_destroy_fastreg(pool, ibmr);
+		kfree(ibmr);
+	}
+	atomic_dec(&pool->item_count);	/* give back the slot reserved in the loop */
+	return ERR_PTR(err);
+}
+/*
+ * Sync the MR's scatterlist for the CPU (DMA_FROM_DEVICE) or for the
+ * device (DMA_TO_DEVICE).  Any other @direction is ignored.
+ */
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_device *rds_iwdev = ibmr->device;
+
+	switch (direction) {
+	case DMA_FROM_DEVICE:
+		ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	case DMA_TO_DEVICE:
+		ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	}
+}
+/*
+ * Number of MRs a flush should try to free: the current item_count when
+ * @free_all is set, otherwise 0.
+ */
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+	unsigned int item_count;
+
+	item_count = atomic_read(&pool->item_count);
+	if (free_all)
+		return item_count;
+
+	return 0;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+	struct rds_iw_mr *ibmr, *next;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(kill_list);
+	unsigned long flags;
+	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	int ret = 0;	/* never changed below: this function always returns 0 */
+
+	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+	mutex_lock(&pool->flush_lock);
+
+	spin_lock_irqsave(&pool->list_lock, flags);
+	/* Get the list of all mappings to be destroyed */
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	if (free_all)
+		list_splice_init(&pool->clean_list, &kill_list);
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+
+	free_goal = rds_iw_flush_goal(pool, free_all);	/* computed but not used below */
+
+	/* Batched invalidate of dirty MRs.
+	 * For FMR based MRs, the mappings on the unmap list are
+	 * actually members of an ibmr (ibmr->mapping). They either
+	 * migrate to the kill_list, or have been cleaned and should be
+	 * moved to the clean_list.
+	 * For fastregs, they will be dynamically allocated, and
+	 * will be destroyed by the unmap function.
+	 */
+	if (!list_empty(&unmap_list)) {
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		/* If we've been asked to destroy all MRs, move those
+		 * that were simply cleaned to the kill list */
+		if (free_all)
+			list_splice_init(&unmap_list, &kill_list);
+	}
+
+	/* Destroy any MRs that are past their best before date */
+	list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+		rds_iw_stats_inc(s_iw_rdma_mr_free);
+		list_del(&ibmr->mapping.m_list);
+		rds_iw_destroy_fastreg(pool, ibmr);
+		kfree(ibmr);
+		nfreed++;
+	}
+
+	/* Anything that remains are laundered ibmrs, which we can add
+	 * back to the clean list. */
+	if (!list_empty(&unmap_list)) {
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_splice(&unmap_list, &pool->clean_list);
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	atomic_sub(ncleaned, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+	mutex_unlock(&pool->flush_lock);
+	return ret;
+}
+/*
+ * Work item handler: run a non-destructive flush (free_all == 0) on the
+ * pool that embeds @work as its flush_worker.
+ */
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+	rds_iw_flush_mr_pool(pool, 0);
+}
+/*
+ * Release an MR obtained from rds_iw_get_mr().
+ *
+ * @trans_private is the struct rds_iw_mr.  The MR is handed to
+ * rds_iw_free_fastreg(); a flush is queued on rds_wq once enough pages
+ * or dirty MRs have accumulated.  When @invalidate is non-zero the pool
+ * is flushed right away, or via the work queue if called in interrupt
+ * context.
+ */
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+	rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+	if (!pool)
+		return;
+
+	/* Return it to the pool's free list */
+	rds_iw_free_fastreg(pool, ibmr);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+	 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_work(rds_wq, &pool->flush_worker);
+
+	if (invalidate) {
+		if (likely(!in_interrupt())) {
+			rds_iw_flush_mr_pool(pool, 0);
+		} else {
+			/* We get here if the user created a MR marked
+			 * as use_once and invalidate at the same time. */
+			queue_work(rds_wq, &pool->flush_worker);
+		}
+	}
+}
+/*
+ * Run a non-destructive flush on the MR pool of every device on
+ * rds_iw_devices that has one.
+ */
+void rds_iw_flush_mrs(void)
+{
+	struct rds_iw_device *rds_iwdev;
+
+	list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
+		struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+		if (pool)
+			rds_iw_flush_mr_pool(pool, 0);
+	}
+}
+/*
+ * Set up an MR covering the @nents entries of @sg for socket @rs.
+ *
+ * The device and cm_id are looked up from @rs, an MR is taken from the
+ * device's pool and the scatterlist is mapped into it.  On success the
+ * MR is returned and its rkey is stored in *key_ret.  On failure an
+ * ERR_PTR() is returned: -ENODEV if no device/cm_id/pool is found, the
+ * error from rds_iw_alloc_mr(), or the error from rds_iw_map_fastreg()
+ * (in which case the MR is released again via rds_iw_free_mr()).
+ */
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_mr *ibmr = NULL;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
+	if (ret || !cm_id) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!rds_iwdev->mr_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_iw_alloc_mr(rds_iwdev);
+	if (IS_ERR(ibmr))
+		return ibmr;
+
+	ibmr->cm_id = cm_id;
+	ibmr->device = rds_iwdev;
+
+	ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+	if (ret == 0)
+		*key_ret = ibmr->mr->rkey;
+	else
+		printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+	if (ret) {
+		if (ibmr)
+			rds_iw_free_mr(ibmr, 0);
+		ibmr = ERR_PTR(ret);
+	}
+	return ibmr;
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
+ * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched to the MR has a bit set showing that RDMa can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+				struct rds_iw_mr *ibmr)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct ib_fast_reg_page_list *page_list = NULL;
+	struct ib_mr *mr;
+	int err;
+
+	mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+		return err;
+	}
+
+	/* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+	 * is not filled in.
+	 */
+	page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+	if (IS_ERR(page_list)) {
+		err = PTR_ERR(page_list);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+		ib_dereg_mr(mr);	/* undo the MR allocation above */
+		return err;
+	}
+
+	/* Only publish the resources once both allocations succeeded. */
+	ibmr->page_list = page_list;
+	ibmr->mr = mr;
+	return 0;
+}
+/*
+ * Post a FAST_REG_MR work request for @mapping on the send queue of the
+ * owning MR's cm_id.  The MR key is advanced first and the new rkey is
+ * stored in mapping->m_rkey.  Returns the result of ib_post_send().
+ */
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+	struct rds_iw_mr *ibmr = mapping->m_mr;
+	struct ib_send_wr f_wr, *failed_wr;
+	int ret;
+
+	/*
+	 * Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR. The key used is a rolling 8bit
+	 * counter, which should guarantee uniqueness.
+	 */
+	ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+	mapping->m_rkey = ibmr->mr->rkey;
+
+	memset(&f_wr, 0, sizeof(f_wr));
+	f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+	f_wr.opcode = IB_WR_FAST_REG_MR;
+	f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+	f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+	f_wr.wr.fast_reg.page_list = ibmr->page_list;
+	f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;	/* NOTE(review): sg entry count, not dma_npages - confirm */
+	f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
+	f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_READ |
+				IB_ACCESS_REMOTE_WRITE;
+	f_wr.wr.fast_reg.iova_start = 0;
+	f_wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &f_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+	BUG_ON(failed_wr != &f_wr);
+	if (ret && printk_ratelimit())
+		printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+	return ret;
+}
+/*
+ * Post a LOCAL_INV work request that invalidates @ibmr's current rkey.
+ * Returns 0 without posting anything if the cm_id has no QP or the MR
+ * is not set up; otherwise returns the result of ib_post_send().
+ */
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+	struct ib_send_wr s_wr, *failed_wr;
+	int ret = 0;
+
+	if (!ibmr->cm_id->qp || !ibmr->mr)
+		goto out;
+
+	memset(&s_wr, 0, sizeof(s_wr));
+	s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+	s_wr.opcode = IB_WR_LOCAL_INV;
+	s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+	s_wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &s_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+	if (ret && printk_ratelimit()) {
+		printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+		goto out;
+	}
+out:
+	return ret;
+}
+/*
+ * DMA-map the @sg_len entries of @sg into @ibmr's mapping, copy the
+ * resulting page addresses into the MR's page list and post the
+ * fast-register work request.
+ *
+ * Returns 0 on success, the error from rds_iw_map_scatterlist(),
+ * -EMSGSIZE if the mapping has more DMA entries than
+ * pool->max_message_size, or the error from
+ * rds_iw_rdma_build_fastreg().
+ */
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			struct rds_iw_mr *ibmr,
+			struct scatterlist *sg,
+			unsigned int sg_len)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct rds_iw_mapping *mapping = &ibmr->mapping;
+	u64 *dma_pages;
+	int i, ret = 0;
+
+	rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+
+	dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+				&mapping->m_sg,
+				rds_iwdev->page_shift);
+	if (IS_ERR(dma_pages)) {
+		ret = PTR_ERR(dma_pages);
+		dma_pages = NULL;	/* so the kfree() at out: is a no-op */
+		goto out;
+	}
+
+	if (mapping->m_sg.dma_len > pool->max_message_size) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
+	for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+		ibmr->page_list->page_list[i] = dma_pages[i];
+
+	ret = rds_iw_rdma_build_fastreg(mapping);
+	if (ret)
+		goto out;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+	kfree(dma_pages);	/* the addresses were copied into the page list above */
+
+	return ret;
+}
+
+/*
+ * "Free" a fastreg MR.
+ *
+ * Nothing is done if the MR has no DMA mapping.  Otherwise a LOCAL_INV
+ * is posted for it; if posting fails the function returns without
+ * queueing the MR.  On success the MR is appended to the pool's
+ * dirty_list and the free_pinned and dirty_count counters are raised.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!ibmr->mapping.m_sg.dma_len)
+		return;
+
+	ret = rds_iw_rdma_fastreg_inv(ibmr);
+	if (ret)
+		return;
+
+	/* The LOCAL_INV WR was posted; queue the MR on the dirty list. */
+	spin_lock_irqsave(&pool->list_lock, flags);
+
+	list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+	atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+/*
+ * Process the mappings on @unmap_list.  Every mapping is moved to a
+ * local list, counted, and then spliced back onto @unmap_list;
+ * @kill_list is not touched.  Returns the number of mappings handled.
+ */
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+				struct list_head *unmap_list,
+				struct list_head *kill_list)
+{
+	struct rds_iw_mapping *mapping, *next;
+	unsigned int ncleaned = 0;
+	LIST_HEAD(laundered);
+
+	/* Batched invalidation of fastreg MRs.
+	 * Why do we do it this way, even though we could pipeline unmap
+	 * and remap? The reason is the application semantics - when the
+	 * application requests an invalidation of MRs, it expects all
+	 * previously released R_Keys to become invalid.
+	 *
+	 * If we implement MR reuse naively, we risk memory corruption
+	 * (this has actually been observed). So the default behavior
+	 * requires that a MR goes through an explicit unmap operation before
+	 * we can reuse it again.
+	 *
+	 * We could probably improve on this a little, by allowing immediate
+	 * reuse of a MR on the same socket (eg you could add small
+	 * cache of unused MRs to struct rds_sock - GET_MR could grab one
+	 * of these without requiring an explicit invalidate).
+	 */
+	while (!list_empty(unmap_list)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			list_move(&mapping->m_list, &laundered);
+			ncleaned++;
+		}
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	/* Move all laundered mappings back to the unmap list.
+	 * We do not kill any WRs right now - it doesn't seem the
+	 * fastreg API has a max_remap limit. */
+	list_splice_init(&laundered, unmap_list);
+
+	return ncleaned;
+}
+/*
+ * Release the verbs resources of @ibmr: the fast-register page list and
+ * the MR itself, each only if it is set.  @ibmr is not freed here.
+ */
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	if (ibmr->page_list)
+		ib_free_fast_reg_page_list(ibmr->page_list);
+	if (ibmr->mr)
+		ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 00000000000..a1931f0027a
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;	/* backs struct rds_iw_incoming */
+static struct kmem_cache *rds_iw_frag_slab;	/* backs struct rds_page_frag */
+static atomic_t	rds_iw_allocation = ATOMIC_INIT(0);	/* count of live rds_iw_incoming objects */
+/*
+ * Free the page held by @frag and clear frag->f_page.
+ */
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, frag->f_page);
+	__free_page(frag->f_page);
+	frag->f_page = NULL;
+}
+/*
+ * Return @frag to rds_iw_frag_slab.  The frag must no longer hold a
+ * page (BUGs otherwise), so rds_iw_frag_drop_page() has to come first.
+ */
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, frag->f_page);
+	BUG_ON(frag->f_page != NULL);
+	kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get send completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+				   struct rds_iw_recv_work *recv)
+{
+	struct rds_page_frag *frag = recv->r_frag;
+
+	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+	if (frag->f_mapped)
+		ib_dma_unmap_page(ic->i_cm_id->device,
+			       frag->f_mapped,
+			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+	frag->f_mapped = 0;	/* mark as unmapped so a second call is a no-op */
+}
+/*
+ * Initialize every receive work request in the ring: no incoming and no
+ * frag attached, wr_id set to the ring index, and the data and header
+ * SGEs preset.  The header SGE points at slot i of the receive header
+ * DMA area; the data SGE address is left at 0 here.
+ */
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+	struct rds_iw_recv_work *recv;
+	u32 i;
+
+	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+		struct ib_sge *sge;
+
+		recv->r_iwinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = i;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+		sge = rds_iw_data_sge(ic, recv->r_sge);
+		sge->addr = 0;
+		sge->length = RDS_FRAG_SIZE;
+		sge->lkey = 0;
+
+		sge = rds_iw_header_sge(ic, recv->r_sge);
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = 0;
+	}
+}
+/*
+ * Release whatever @recv still holds: drop the reference on its
+ * incoming, and unmap, drop the page of, and free its frag.
+ */
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+				  struct rds_iw_recv_work *recv)
+{
+	if (recv->r_iwinc) {
+		rds_inc_put(&recv->r_iwinc->ii_inc);
+		recv->r_iwinc = NULL;
+	}
+	if (recv->r_frag) {
+		rds_iw_recv_unmap_page(ic, recv);
+		if (recv->r_frag->f_page)
+			rds_iw_frag_drop_page(recv->r_frag);
+		rds_iw_frag_free(recv->r_frag);
+		recv->r_frag = NULL;
+	}
+}
+/*
+ * Clear every entry of the receive ring and drop the partially used
+ * page cached in ic->i_frag, if any.
+ */
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+		rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
+
+	if (ic->i_frag.f_page)
+		rds_iw_frag_drop_page(&ic->i_frag);
+}
+/*
+ * Give @recv everything it needs before it can be posted: an incoming
+ * (subject to rds_iw_sysctl_max_recv_allocation), a frag, and a
+ * DMA-mapped RDS_FRAG_SIZE slice of the page cached in ic->i_frag.
+ * Allocations that already exist on @recv are kept.
+ *
+ * @kptr_gfp is used for the slab allocations, @page_gfp for the page.
+ * Returns 0 on success or -ENOMEM if any step fails; pieces obtained
+ * before the failure stay attached to @recv.
+ */
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+				  struct rds_iw_recv_work *recv,
+				  gfp_t kptr_gfp, gfp_t page_gfp)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	dma_addr_t dma_addr;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+
+	if (recv->r_iwinc == NULL) {
+		if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
+			rds_iw_stats_inc(s_iw_rx_alloc_limit);
+			goto out;
+		}
+		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+						 kptr_gfp);
+		if (recv->r_iwinc == NULL)
+			goto out;
+		atomic_inc(&rds_iw_allocation);
+		INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+	}
+
+	if (recv->r_frag == NULL) {
+		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+		if (recv->r_frag == NULL)
+			goto out;
+		INIT_LIST_HEAD(&recv->r_frag->f_item);
+		recv->r_frag->f_page = NULL;
+	}
+
+	if (ic->i_frag.f_page == NULL) {
+		ic->i_frag.f_page = alloc_page(page_gfp);
+		if (ic->i_frag.f_page == NULL)
+			goto out;
+		ic->i_frag.f_offset = 0;
+	}
+
+	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+				  ic->i_frag.f_page,
+				  ic->i_frag.f_offset,
+				  RDS_FRAG_SIZE,
+				  DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+		goto out;
+
+	/*
+	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
+	 * must be called on this recv. This happens as completions hit
+	 * in order or on connection shutdown.
+	 */
+	recv->r_frag->f_page = ic->i_frag.f_page;
+	recv->r_frag->f_offset = ic->i_frag.f_offset;
+	recv->r_frag->f_mapped = dma_addr;
+
+	sge = rds_iw_data_sge(ic, recv->r_sge);
+	sge->addr = dma_addr;
+	sge->length = RDS_FRAG_SIZE;
+
+	sge = rds_iw_header_sge(ic, recv->r_sge);
+	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+	sge->length = sizeof(struct rds_header);
+
+	get_page(recv->r_frag->f_page);	/* the frag now holds its own page reference */
+
+	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+		ic->i_frag.f_offset += RDS_FRAG_SIZE;	/* next refill uses the next slice */
+	} else {
+		/* Last slice handed out: drop the cache's reference. */
+		put_page(ic->i_frag.f_page);
+		ic->i_frag.f_page = NULL;
+		ic->i_frag.f_offset = 0;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.
The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) + && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, + recv->r_iwinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_iw_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. 
*/ + if (ic->i_flowctl && posted) + rds_iw_advertise_credits(conn, posted); + + if (ret) + rds_iw_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +void rds_iw_inc_purge(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); + + list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_iw_frag_drop_page(frag); + rds_iw_frag_free(frag); + } +} + +void rds_iw_inc_free(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + + rds_iw_inc_purge(inc); + rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); + BUG_ON(!list_empty(&iwinc->ii_frags)); + kmem_cache_free(rds_iw_incoming_slab, iwinc); + atomic_dec(&rds_iw_allocation); + BUG_ON(atomic_read(&rds_iw_allocation) < 0); +} + +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, 
iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_iw_recv_init_ack(struct rds_iw_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IW_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by having a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. 
Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + rds_iw_set_64bit(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return ic->i_ack_next; +} + +static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_iw_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_iw_stats_inc(s_iw_ack_send_failure); + /* Need to finesse this later. */ + BUG(); + } else + rds_iw_stats_inc(s_iw_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_iw_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). 
If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_iw_attempt_ack(struct rds_iw_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_iw_stats_inc(s_iw_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) { + rds_iw_stats_inc(s_iw_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_iw_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_iw_ack_send_complete(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_iw_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. 
+ */ +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_iw_stats_inc(s_iw_ack_send_piggybacked); + return rds_iw_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_iw_cong_recv(struct rds_connection *conn, + struct rds_iw_incoming *iwinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. 
*/ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_iw_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_iw_process_recv(struct rds_connection *conn, + struct rds_iw_recv_work *recv, u32 byte_len, + struct rds_iw_ack_state *state) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_incoming *iwinc = ic->i_iwinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. 
*/ + if (!rds_message_verify_checksum(ihdr)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_iw_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_iw_stats_inc(s_iw_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_iw_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. 
+ */ + if (iwinc == NULL) { + iwinc = recv->r_iwinc; + recv->r_iwinc = NULL; + ic->i_iwinc = iwinc; + + hdr = &iwinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &iwinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { + rds_iw_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_iwinc = NULL; + + if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_iw_cong_recv(conn, iwinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &iwinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&iwinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. 
+ */ +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_ack_state state = { 0, }; + struct rds_iw_recv_work *recv; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_iw_stats_inc(s_iw_rx_cq_call); + + ib_req_notify_cq(cq, IB_CQ_SOLICITED); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_rx_cq_event); + + recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; + + rds_iw_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_iw_process_recv(conn, recv, wc.byte_len, &state); + } else { + rds_iw_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_iw_ring_free(&ic->i_recv_ring, 1); + } + + if (state.ack_next_valid) + rds_iw_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_iw_ring_empty(&ic->i_recv_ring)) + rds_iw_stats_inc(s_iw_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. 
+ */ + if (rds_iw_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_iw_recv(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_iw_stats_inc(s_iw_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + return ret; +} + +int __init rds_iw_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", + sizeof(struct rds_iw_incoming), + 0, 0, NULL); + if (rds_iw_incoming_slab == NULL) + goto out; + + rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (rds_iw_frag_slab == NULL) + kmem_cache_destroy(rds_iw_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_iw_recv_exit(void) +{ + kmem_cache_destroy(rds_iw_incoming_slab); + kmem_cache_destroy(rds_iw_frag_slab); +} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c new file mode 100644 index 00000000000..d422d4b5dee --- /dev/null +++ b/net/rds/iw_ring.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "iw.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. 
+ * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); + +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_iw_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) == 0; +} + +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_iw_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_iw_ring_empty(ring) && + waitqueue_active(&rds_iw_ring_empty_wait)) + wake_up(&rds_iw_ring_empty_wait); +} + +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - 
val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_empty(ring); +} + +int rds_iw_ring_low(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2); +} + + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c new file mode 100644 index 00000000000..22dd38ffd60 --- /dev/null +++ b/net/rds/iw_send.c @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/device.h> +#include <linux/dmapool.h> + +#include "rds.h" +#include "rdma.h" +#include "iw.h" + +static void rds_iw_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, + struct rds_rdma_op *op) +{ + if (op->r_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, + op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->r_mapped = 0; + } +} + +static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_nents, + DMA_TO_DEVICE); + + if (rm->m_rdma_op != NULL) { + rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. 
This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_iw_send_rdma_complete(rm, wc_status); + + if (rm->m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_iw_send_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + send->s_mapping = NULL; + + send->s_wr.next = NULL; + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_iw_data_sge(ic, send->s_sge); + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + + send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + if (IS_ERR(send->s_mr)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr 
failed\n"); + break; + } + + send->s_page_list = ib_alloc_fast_reg_page_list( + ic->i_cm_id->device, fastreg_message_size); + if (IS_ERR(send->s_page_list)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); + break; + } + } +} + +void rds_iw_send_clear_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + BUG_ON(!send->s_mr); + ib_dereg_mr(send->s_mr); + BUG_ON(!send->s_page_list); + ib_free_fast_reg_page_list(send->s_page_list); + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_iw_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. 
+ */ +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_send_work *send; + u32 completed; + u32 oldest; + u32 i; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_iw_stats_inc(s_iw_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_tx_cq_event); + + if (wc.status != IB_WC_SUCCESS) { + printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); + break; + } + + if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { + ic->i_fastreg_posted = 0; + continue; + } + + if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { + ic->i_fastreg_posted = 1; + continue; + } + + if (wc.wr_id == RDS_IW_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + rds_iw_ack_send_complete(ic); + continue; + } + + oldest = rds_iw_ring_oldest(&ic->i_send_ring); + + completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_FAST_REG_MR: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. 
*/ + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (send->s_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_iw_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_iw_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) + || test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_iw_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the reciever's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. 
+ * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_iw_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. 
+ */ +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? 
", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_iw_stats_inc(s_iw_rx_credit_updates); +} + +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_iw_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = rds_iw_local_dma_lkey(ic); + + sge = rds_iw_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. 
There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Fastreg support */ + if (rds_rdma_cookie_key(rm->m_rdma_cookie) + && !ic->i_fastreg_posted) { + ret = -EAGAIN; + goto out; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + 
set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (ic->i_rm == NULL) { + /* + printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->m_nents) { + rm->m_count = ib_dma_map_sg(dev, + rm->m_sg, rm->m_nents, DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); + if (rm->m_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->m_count = 0; + } + + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. 
*/ + if (rm->m_rdma_op) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_iw_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_iw_send_grab_credits(ic, 0, &posted, 1); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } else if (ic->i_rm != rm) + BUG(); + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->m_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. 
+ */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_iw_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. 
*/ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_iw_stats_inc(s_iw_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->m_sg[rm->m_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_iw_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. 
*/ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) +{ + BUG_ON(nent > send->s_page_list->max_page_list_len); + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. + */ + send->s_wr.opcode = IB_WR_FAST_REG_MR; + send->s_wr.wr.fast_reg.length = len; + send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; + send->s_wr.wr.fast_reg.page_list = send->s_page_list; + send->s_wr.wr.fast_reg.page_list_len = nent; + send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift; + send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; + send->s_wr.wr.fast_reg.iova_start = sg_addr; + + ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); +} + +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_iw_device *rds_iwdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->r_remote_addr; + u32 pos, fr_pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + + /* map the message the first 
time we see it */ + if (!op->r_mapped) { + op->r_count = ib_dma_map_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, (op->r_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); + if (op->r_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->r_mapped = 1; + } + + if (!op->r_write) { + /* Alloc space on the send queue for the fastreg */ + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); + if (work_alloc != 1) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->r_count, rds_iwdev->max_sge); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + if (!op->r_write) { + first = prev = &ic->i_sends[fr_pos]; + } else { + first = send; + prev = NULL; + } + scat = &op->r_sg[0]; + sent = 0; + num_sge = op->r_count; + + for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + /* To avoid the need to have the plumbing to invalidate the fastreg_mr used + * for local access after RDS is finished with it, using + * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. 
+ */ + if (op->r_write) + send->s_wr.opcode = IB_WR_RDMA_WRITE; + else + send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->r_key; + send->s_op = op; + + if (num_sge > rds_iwdev->max_sge) { + send->s_wr.num_sge = rds_iwdev->max_sge; + num_sge -= rds_iwdev->max_sge; + } else + send->s_wr.num_sge = num_sge; + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) + send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); + else { + send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); + } + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + remote_addr += len; + + scat++; + } + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { + send->s_wr.num_sge = 1; + send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; + send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; + send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->r_sg[op->r_count]) + first->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not + * recommended. Putting the lkey on the wire is a security hole, as it can + * allow for memory access to all of memory on the remote system. 
Some + * adapters do not allow using the lkey for this at all. To bypass this use a + * fastreg_mr (or possibly a dma_mr) + */ + if (!op->r_write) { + rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], + op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + work_alloc++; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + +out: + return ret; +} + +void rds_iw_xmit_complete(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_iw_attempt_ack(ic); +} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c new file mode 100644 index 00000000000..ccc7e8f0bf0 --- /dev/null +++ b/net/rds/iw_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" +#include "iw.h" + +DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; + +static char *rds_iw_stat_names[] = { + "iw_connect_raced", + "iw_listen_closed_stale", + "iw_tx_cq_call", + "iw_tx_cq_event", + "iw_tx_ring_full", + "iw_tx_throttle", + "iw_tx_sg_mapping_failure", + "iw_tx_stalled", + "iw_tx_credit_updates", + "iw_rx_cq_call", + "iw_rx_cq_event", + "iw_rx_ring_empty", + "iw_rx_refill_from_cq", + "iw_rx_refill_from_thread", + "iw_rx_alloc_limit", + "iw_rx_credit_updates", + "iw_ack_sent", + "iw_ack_send_failure", + "iw_ack_send_delayed", + "iw_ack_send_piggybacked", + "iw_ack_received", + "iw_rdma_mr_alloc", + "iw_rdma_mr_free", + "iw_rdma_mr_used", + "iw_rdma_mr_pool_flush", + "iw_rdma_mr_pool_wait", + "iw_rdma_mr_pool_depleted", +}; + +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_iw_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_iw_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < 
sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, + ARRAY_SIZE(rds_iw_stat_names)); +out: + return ARRAY_SIZE(rds_iw_stat_names); +} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c new file mode 100644 index 00000000000..9590678cd61 --- /dev/null +++ b/net/rds/iw_sysctl.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + +#include "iw.h" + +static struct ctl_table_header *rds_iw_sysctl_hdr; + +unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; +unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; +unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_iw_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_iw_sysctl_max_unsig_wrs = 16; +static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_iw_sysctl_flow_control = 1; + +ctl_table rds_iw_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_send_wr", + .data = &rds_iw_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_wr", + .data = &rds_iw_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_wr", + .data = &rds_iw_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_wr_min, + .extra2 = &rds_iw_sysctl_max_unsig_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_bytes", + .data = &rds_iw_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + 
.mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, + .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_allocation", + .data = &rds_iw_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "flow_control", + .data = &rds_iw_sysctl_flow_control, + .maxlen = sizeof(rds_iw_sysctl_flow_control), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0} +}; + +static struct ctl_path rds_iw_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, + { .procname = "iw", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + +void rds_iw_sysctl_exit(void) +{ + if (rds_iw_sysctl_hdr) + unregister_sysctl_table(rds_iw_sysctl_hdr); +} + +int __init rds_iw_sysctl_init(void) +{ + rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); + if (rds_iw_sysctl_hdr == NULL) + return -ENOMEM; + return 0; +} diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 00000000000..4a61997f554 --- /dev/null +++ b/net/rds/loop.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> + +#include "rds.h" +#include "loop.h" + +static DEFINE_SPINLOCK(loop_conns_lock); +static LIST_HEAD(loop_conns); + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. + * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. 
+ */ +static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + BUG_ON(hdr_off || sg || off); /* any non-zero header offset, sg index or byte offset is treated as a fatal bug */ + /* set up the rds_incoming embedded in the message, using the conn's local address */ + rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + rds_message_addref(rm); /* for the inc */ + /* pass the embedded inc to the receive path on this same conn */ + rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + GFP_KERNEL, KM_USER0); + /* report this message's own sequence number as acked */ + rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), + NULL); + /* presumably balances the addref above -- TODO confirm against rds_inc_put() */ + rds_inc_put(&rm->m_inc); + /* result is one header plus the payload length recorded in the header */ + return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +} +/* + * Copy every page of @map into the conn's c_fcong map and then call + * rds_cong_map_updated() on c_fcong with an all-ones 64-bit mask. + * @offset must be zero and @map must be conn->c_lcong (both BUG_ON). + * Returns sizeof(struct rds_header) + RDS_CONG_MAP_BYTES. + */ +static int rds_loop_xmit_cong_map(struct rds_connection *conn, + struct rds_cong_map *map, + unsigned long offset) +{ + unsigned long i; + + BUG_ON(offset); + BUG_ON(map != conn->c_lcong); + /* page-by-page copy, RDS_CONG_MAP_PAGES pages of PAGE_SIZE bytes each */ + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + memcpy((void *)conn->c_fcong->m_page_addrs[i], + (void *)map->m_page_addrs[i], PAGE_SIZE); + } + + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; +} + +/* we need to at least give the thread something to succeed; this hook does nothing and always returns 0 */ +static int rds_loop_recv(struct rds_connection *conn) +{ + return 0; +} +/* per-connection loopback state: a list node plus a pointer to the owning rds_connection */ +struct rds_loop_connection { + struct list_head loop_node; + struct rds_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rds_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. 
+ */ +static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_loop_connection *lc; + unsigned long flags; + + /* honour the caller's allocation context instead of forcing GFP_KERNEL */ + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + if (lc == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + /* publish the new connection on the global list under its lock */ + spin_lock_irqsave(&loop_conns_lock, flags); + list_add_tail(&lc->loop_node, &loop_conns); + spin_unlock_irqrestore(&loop_conns_lock, flags); + + return 0; +} + +/* + * Release the state allocated by rds_loop_conn_alloc(). @arg is the + * struct rds_loop_connection stored in c_transport_data. The node is + * unlinked under loop_conns_lock, the same lock that protects insertion + * in rds_loop_conn_alloc() and the splice in rds_loop_exit(). + */ +static void rds_loop_conn_free(void *arg) +{ + struct rds_loop_connection *lc = arg; + unsigned long flags; + + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); + list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); + kfree(lc); +} + +/* Loopback has nothing to negotiate: report the connection as complete at once. */ +static int rds_loop_conn_connect(struct rds_connection *conn) +{ + rds_connect_complete(conn); + return 0; +} + +/* Nothing to tear down for loopback; intentionally empty. */ +static void rds_loop_conn_shutdown(struct rds_connection *conn) +{ +} + +/* + * Destroy every loopback connection still on loop_conns. The list is + * moved to a private head while holding the lock, and the connections + * are destroyed only after the lock has been dropped. + */ +void rds_loop_exit(void) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + INIT_LIST_HEAD(&loop_conns); + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rds_loop_transport. 
+ */ +struct rds_transport rds_loop_transport = { /* hook table for the loopback transport; the inc_* hooks reuse the rds_message_inc_* functions */ + .xmit = rds_loop_xmit, + .xmit_cong_map = rds_loop_xmit_cong_map, + .recv = rds_loop_recv, + .conn_alloc = rds_loop_conn_alloc, + .conn_free = rds_loop_conn_free, + .conn_connect = rds_loop_conn_connect, + .conn_shutdown = rds_loop_conn_shutdown, + .inc_copy_to_user = rds_message_inc_copy_to_user, + .inc_purge = rds_message_inc_purge, + .inc_free = rds_message_inc_free, + .t_name = "loopback", +}; diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 00000000000..f32b0939a04 --- /dev/null +++ b/net/rds/loop.h @@ -0,0 +1,9 @@ +#ifndef _RDS_LOOP_H +#define _RDS_LOOP_H + +/* loop.c */ +extern struct rds_transport rds_loop_transport; + +void rds_loop_exit(void); + +#endif diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 00000000000..5a15dc8d0cd --- /dev/null +++ b/net/rds/message.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "rdma.h" + +static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); +/* payload size in bytes of each extension header, indexed by RDS_EXTHDR_* type */ +static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { +[RDS_EXTHDR_NONE] = 0, +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), +[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), +[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +}; + +/* take one more reference on @rm by atomically incrementing m_refcount */ +void rds_message_addref(struct rds_message *rm) +{ + rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + atomic_inc(&rm->m_refcount); +} + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. 
+ */ +static void rds_message_purge(struct rds_message *rm) +{ + unsigned long i; + /* RDS_MSG_PAGEVEC messages (flag set by rds_message_map_pages) are left untouched; presumably their pages belong to the caller -- TODO confirm */ + if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) + return; + /* free each data page referenced by the scatterlist, then mark the list empty */ + for (i = 0; i < rm->m_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + /* XXX will have to put_page for page refs */ + __free_page(sg_page(&rm->m_sg[i])); + } + rm->m_nents = 0; + /* release any attached RDMA op and drop the reference on any attached MR */ + if (rm->m_rdma_op) + rds_rdma_free_op(rm->m_rdma_op); + if (rm->m_rdma_mr) + rds_mr_put(rm->m_rdma_mr); +} +/* purge hook taking an rds_incoming: recover the enclosing rds_message and purge it */ +void rds_message_inc_purge(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_purge(rm); +} +/* + * Drop one reference on @rm. When the count reaches zero the message must + * be off both the socket and connection lists (BUG_ON otherwise); it is + * then purged and freed. + */ +void rds_message_put(struct rds_message *rm) +{ + rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + + if (atomic_dec_and_test(&rm->m_refcount)) { + BUG_ON(!list_empty(&rm->m_sock_item)); + BUG_ON(!list_empty(&rm->m_conn_item)); + rds_message_purge(rm); + + kfree(rm); + } +} +/* free hook taking an rds_incoming: drops a reference on the enclosing rds_message */ +void rds_message_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); +} +/* + * Fill in a header: flags cleared, ports stored as given, @seq converted + * to big-endian, and the extension area marked empty. + */ +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = cpu_to_be64(seq); + hdr->h_exthdr[0] = RDS_EXTHDR_NONE; +} +/* + * Store one extension of @type with @len bytes of @data in the header. + * Returns 1 on success and 0 if an extension is already present, @type is + * out of range, @len does not equal rds_exthdr_size[type], or the type + * byte plus payload would not leave room for the terminator. + */ +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof(u8) + len; + unsigned char *dst; + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) + return 0; + + if (type >= __RDS_EXTHDR_MAX + || len != rds_exthdr_size[type]) + return 0; + + if (ext_len >= RDS_HEADER_EXT_SPACE) + return 0; + dst = hdr->h_exthdr; + /* layout written: type byte, payload, then an RDS_EXTHDR_NONE terminator */ + *dst++ = type; + memcpy(dst, data, len); + + dst[len] = RDS_EXTHDR_NONE; + return 1; +} + +/* + * If a message has extension headers, retrieve them here. 
+ * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDS_EXTHDR_NONE) + * break; + * ... + * } + */ +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + u8 *src = hdr->h_exthdr; + /* *pos is the byte offset into h_exthdr at which to resume parsing */ + offset = *pos; + if (offset >= RDS_HEADER_EXT_SPACE) + goto none; + + /* Get the extension type and length. For now, the + * length is implied by the extension type. */ + ext_type = src[offset++]; + /* stop on the terminator, an unknown type, or a payload that would run past the extension area */ + if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) + goto none; + ext_len = rds_exthdr_size[ext_type]; + if (offset + ext_len > RDS_HEADER_EXT_SPACE) + goto none; + /* advance past this extension; copy at most *buflen bytes and report the number copied in *buflen */ + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + memcpy(buf, src + offset, *buflen); + return ext_type; + +none: /* no (further) extension: park *pos at the end and report zero bytes */ + *pos = RDS_HEADER_EXT_SPACE; + *buflen = 0; + return RDS_EXTHDR_NONE; +} +/* add a version extension carrying @version in big-endian; returns what rds_message_add_extension() returns */ +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) +{ + struct rds_ext_header_version ext_hdr; + + ext_hdr.h_version = cpu_to_be32(version); + return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); +} +/* read the version extension into *version (CPU byte order); returns 1 if the first extension is a version extension, else 0 */ +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) +{ + struct rds_ext_header_version ext_hdr; + unsigned int pos = 0, len = sizeof(ext_hdr); + + /* We assume the version extension is the only one present */ + if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) + return 0; + *version = be32_to_cpu(ext_hdr.h_version); + return 1; +} +/* add an RDMA destination extension carrying @r_key and @offset, both big-endian; returns what rds_message_add_extension() returns */ +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) +{ + struct rds_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); + ext_hdr.h_rdma_offset = cpu_to_be32(offset); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); +} 
+/* allocate a zeroed message with room for @nents trailing scatterlist entries and one reference held; returns NULL on allocation failure */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +{ + struct rds_message *rm; + + rm = kzalloc(sizeof(struct rds_message) + + (nents * sizeof(struct scatterlist)), gfp); + if (!rm) + goto out; + + if (nents) + sg_init_table(rm->m_sg, nents); + atomic_set(&rm->m_refcount, 1); + INIT_LIST_HEAD(&rm->m_sock_item); + INIT_LIST_HEAD(&rm->m_conn_item); + spin_lock_init(&rm->m_rs_lock); + +out: + return rm; +} +/* + * Build a message whose scatterlist points at existing pages given by + * their kernel virtual addresses in @page_addrs, one full page per entry. + * The message is flagged RDS_MSG_PAGEVEC. Returns ERR_PTR(-ENOMEM) if the + * message cannot be allocated. + */ +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rds_message *rm; + unsigned int i; + + rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + if (rm == NULL) + return ERR_PTR(-ENOMEM); + + set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->m_nents = ceil(total_len, PAGE_SIZE); + + for (i = 0; i < rm->m_nents; ++i) { + sg_set_page(&rm->m_sg[i], + virt_to_page(page_addrs[i]), + PAGE_SIZE, 0); + } + + return rm; +} +/* + * Allocate a message and fill it with @total_len bytes gathered from the + * user iovec array starting at @first_iov. Returns the message, or an + * ERR_PTR carrying the failing errno (the partly built message is released). + */ +struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, + size_t total_len) +{ + unsigned long to_copy; + unsigned long iov_off; + unsigned long sg_off; + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + int ret; + + rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + if (rm == NULL) { + ret = -ENOMEM; + goto out; + } + + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + + /* + * now allocate and copy in the data payload. + */ + sg = rm->m_sg; + iov = first_iov; + iov_off = 0; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. 
*/ + /* each pass copies the largest run that fits in both the current iovec and the current sg entry */ + while (total_len) { + if (sg_page(sg) == NULL) { + ret = rds_page_remainder_alloc(sg, total_len, + GFP_HIGHUSER); + if (ret) + goto out; + rm->m_nents++; + sg_off = 0; + } + /* skip iovecs that are fully consumed (or empty) */ + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); + to_copy = min_t(size_t, to_copy, total_len); + + rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + (void *)sg_page(sg), sg->offset, sg->length, sg_off); + + ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, + iov->iov_base + iov_off, + to_copy); + if (ret) + goto out; + + iov_off += to_copy; + total_len -= to_copy; + sg_off += to_copy; + /* current sg entry is full: move to the next one, which is allocated on the next pass */ + if (sg_off == sg->length) + sg++; + } + + ret = 0; +out: + if (ret) { + if (rm) + rds_message_put(rm); + rm = ERR_PTR(ret); + } + return rm; +} +/* + * Copy message payload out to the user iovec array at @first_iov, up to + * the smaller of @size and the length stored in the message header. + * Returns the number of bytes copied, or the error from + * rds_page_copy_to_user() if a copy fails. + */ +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size) +{ + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + unsigned long to_copy; + unsigned long iov_off; + unsigned long vec_off; + int copied; + int ret; + u32 len; + + rm = container_of(inc, struct rds_message, m_inc); + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + iov = first_iov; + iov_off = 0; + sg = rm->m_sg; + vec_off = 0; + copied = 0; + + while (copied < size && copied < len) { + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + /* bound the run by the iovec space, the sg entry, the caller's size and the message length */ + to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(sg), sg->offset, sg->length, vec_off); + + ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + 
iov_off += to_copy; + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return copied; +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void rds_message_wait(struct rds_message *rm) +{ + wait_event(rds_message_flush_waitq, + !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); +} +/* clear RDS_MSG_MAPPED on @rm and wake any sleeper in rds_message_wait() */ +void rds_message_unmapped(struct rds_message *rm) +{ + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + if (waitqueue_active(&rds_message_flush_waitq)) + wake_up(&rds_message_flush_waitq); +} + diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 00000000000..c460743a89a --- /dev/null +++ b/net/rds/page.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/highmem.h> + +#include "rds.h" +/* a cached page and the offset of its first unused byte; one instance exists per CPU */ +struct rds_page_remainder { + struct page *r_page; + unsigned long r_offset; +}; + +DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; + +/* + * returns 0 on success or -errno on failure. + * + * We don't have to worry about flush_dcache_page() as this only works + * with private pages. If, say, we were to do directed receive to pinned + * user pages we'd have to worry more about cache coherence. (Though + * the flush_dcache_page() in get_user_pages() would probably be enough). + */ +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user) +{ + unsigned long ret; + void *addr; + /* account the requested byte count in the matching direction's statistic */ + if (to_user) + rds_stats_add(s_copy_to_user, bytes); + else + rds_stats_add(s_copy_from_user, bytes); + /* first attempt: atomic mapping with the _inatomic copy helpers */ + addr = kmap_atomic(page, KM_USER0); + if (to_user) + ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); + else + ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); + kunmap_atomic(addr, KM_USER0); + /* if that left bytes uncopied, redo the whole copy through kmap() and the regular copy helpers */ + if (ret) { + addr = kmap(page); + if (to_user) + ret = copy_to_user(ptr, addr + offset, bytes); + else + ret = copy_from_user(addr + offset, ptr, bytes); + kunmap(page); + if (ret) + return -EFAULT; + } + + return 0; +} + +/* + * Message allocation uses this to build up regions of a message. + * + * @bytes - the number of bytes needed. + * @gfp - the waiting behaviour of the allocation + * + * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to + * kmap the pages, etc. + * + * If @bytes is at least a full page then this just returns a page from + * alloc_page(). 
+ * + * If @bytes is a partial page then this stores the unused region of the + * page in a per-cpu structure. Future partial-page allocations may be + * satisfied from that cached region. This lets us waste less memory on + * small allocations with minimal complexity. It works because the transmit + * path passes read-only page regions down to devices. They hold a page + * reference until they are done with the region. + */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp) +{ + struct rds_page_remainder *rem; + unsigned long flags; + struct page *page; + int ret; + + gfp |= __GFP_HIGHMEM; + + /* jump straight to allocation if we're trying for a huge page */ + if (bytes >= PAGE_SIZE) { + page = alloc_page(gfp); + if (page == NULL) { + ret = -ENOMEM; + } else { + sg_set_page(scat, page, PAGE_SIZE, 0); + ret = 0; + } + goto out; + } + /* partial page: work on this CPU's remainder; get_cpu() and local_irq_save() are held while it is examined */ + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + while (1) { + /* avoid a tiny region getting stuck by tossing it */ + if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { + rds_stats_inc(s_page_remainder_miss); + __free_page(rem->r_page); + rem->r_page = NULL; + } + + /* hand out a fragment from the cached page */ + if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { + sg_set_page(scat, rem->r_page, bytes, rem->r_offset); + get_page(sg_page(scat)); /* the scatterlist entry gets its own page reference */ + + if (rem->r_offset != 0) + rds_stats_inc(s_page_remainder_hit); + /* once the page is fully handed out, drop the cache's reference and forget it */ + rem->r_offset += bytes; + if (rem->r_offset == PAGE_SIZE) { + __free_page(rem->r_page); + rem->r_page = NULL; + } + ret = 0; + break; + } + + /* alloc if there is nothing for us to use */ + local_irq_restore(flags); + put_cpu(); + /* allocate with irqs restored and after put_cpu(), then look up the current CPU's remainder again */ + page = alloc_page(gfp); + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + if (page == NULL) { + ret = -ENOMEM; + break; + } + + /* did someone race to fill the remainder before us? 
*/ + if (rem->r_page) { + __free_page(page); + continue; + } + + /* otherwise install our page and loop around to alloc */ + rem->r_page = page; + rem->r_offset = 0; + } + + local_irq_restore(flags); + put_cpu(); +out: + rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, + ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, + ret ? 0 : scat->length); + return ret; +} +/* + * CPU notifier callback: on CPU_DEAD, free the page cached in the + * remainder of the CPU passed in @hcpu. Other actions are ignored. + * Always returns 0. + */ +static int rds_page_remainder_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct rds_page_remainder *rem; + long cpu = (long)hcpu; + + rem = &per_cpu(rds_page_remainders, cpu); + + rdsdebug("cpu %ld action 0x%lx\n", cpu, action); + + switch (action) { + case CPU_DEAD: + if (rem->r_page) + __free_page(rem->r_page); + rem->r_page = NULL; + break; + } + + return 0; +} + +static struct notifier_block rds_page_remainder_nb = { + .notifier_call = rds_page_remainder_cpu_notify, +}; +/* release every possible CPU's cached page by invoking the notifier callback with CPU_DEAD for each */ +void rds_page_exit(void) +{ + int i; + + for_each_possible_cpu(i) + rds_page_remainder_cpu_notify(&rds_page_remainder_nb, + (unsigned long)CPU_DEAD, + (void *)(long)i); +} diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 00000000000..eaeeb91e111 --- /dev/null +++ b/net/rds/rdma.c @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ + +#include "rdma.h" + +/* + * XXX + * - build with sparse + * - should we limit the size of a mr region? let transport return failure? + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. 
+ */ +static unsigned int rds_pages_in_vec(struct rds_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (u64)UINT_MAX)) + return 0; /* rejects zero length, address wrap-around and lengths above UINT_MAX */ + /* index of the page just past the last byte, minus the index of the first page */ + return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (vec->addr >> PAGE_SHIFT); +} +/* + * Look up @key in the rbtree at @root. Returns the matching MR if one + * exists. Otherwise returns NULL and, when @insert is non-NULL, links + * @insert at the position the search ended and increments its refcount. + * No locking is done here. + */ +static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, + struct rds_mr *insert) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rds_mr *mr; + + while (*p) { + parent = *p; + mr = rb_entry(parent, struct rds_mr, r_rb_node); + + if (key < mr->r_key) + p = &(*p)->rb_left; + else if (key > mr->r_key) + p = &(*p)->rb_right; + else + return mr; + } + + if (insert) { + rb_link_node(&insert->r_rb_node, parent, p); + rb_insert_color(&insert->r_rb_node, root); + atomic_inc(&insert->r_refcount); + } + return NULL; +} + +/* + * Destroy the transport-specific part of a MR. + */ +static void rds_destroy_mr(struct rds_mr *mr) +{ + struct rds_sock *rs = mr->r_sock; + void *trans_private = NULL; + unsigned long flags; + + rdsdebug("RDS: destroy mr key is %x refcnt %u\n", + mr->r_key, atomic_read(&mr->r_refcount)); + /* RDS_MR_DEAD makes this run at most once per MR; later callers return here */ + if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) + return; + /* under the socket's rdma lock: unlink from the key tree if still linked, and take over trans_private */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + if (!RB_EMPTY_NODE(&mr->r_rb_node)) + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + /* the transport's free_mr hook is called after the lock has been released */ + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} +/* tear down the transport part of @mr and free the structure itself */ +void __rds_put_mr_final(struct rds_mr *mr) +{ + rds_destroy_mr(mr); + kfree(mr); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. 
+ */ +void rds_rdma_drop_keys(struct rds_sock *rs) +{ + struct rds_mr *mr; + struct rb_node *node; + + /* Release any MRs associated with this socket */ + while ((node = rb_first(&rs->rs_rdma_keys))) { + mr = container_of(node, struct rds_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + rds_mr_put(mr); + } + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. + */ +static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + down_read(&current->mm->mmap_sem); + ret = get_user_pages(current, current->mm, user_addr, + nr_pages, write, 0, pages, NULL); + up_read(&current->mm->mmap_sem); + + if (0 <= ret && (unsigned) ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, + u64 *cookie_ret, struct rds_mr **mr_ret) +{ + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; + struct page **pages = NULL; + struct scatterlist *sg; + void *trans_private; + unsigned long flags; + rds_rdma_cookie_t cookie; + unsigned int nents; + long i; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (rs->rs_transport->get_mr == NULL) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_pages = rds_pages_in_vec(&args->vec); + if (nr_pages == 0) { + ret = -EINVAL; + goto out; + } + + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", + args->vec.addr, args->vec.bytes, nr_pages); + + /* XXX clamp nr_pages to limit the size of this alloc? 
*/ + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + + mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + if (mr == NULL) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&mr->r_refcount, 1); + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDS_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDS_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Pin the pages that make up the user buffer and transfer the page + * pointers to the mr's sg array. We check to see if we've mapped + * the whole region after transferring the partial page references + * to the sg array so that we can have one page ref cleanup path. + * + * For now we have no flag that tells us whether the mapping is + * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to + * the zero page. + */ + ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + if (ret < 0) + goto out; + + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (sg == NULL) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); + + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + rdsdebug("RDS: trans_private nents is %u\n", nents); + + /* Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. 
*/ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", + mr->r_key, (void *)(unsigned long) args->cookie_addr); + + /* The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing <R_Key, offset> and pass that + * around. */ + cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (cookie_ret) + *cookie_ret = cookie; + + if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + ret = -EFAULT; + goto out; + } + + /* Inserting the new MR into the rbtree bumps its + * reference count. */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + BUG_ON(found && found != mr); + + rdsdebug("RDS: get_mr key is %x\n", mr->r_key); + if (mr_ret) { + atomic_inc(&mr->r_refcount); + *mr_ret = mr; + } + + ret = 0; +out: + kfree(pages); + if (mr) + rds_mr_put(mr); + return ret; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_args args; + + if (optlen != sizeof(struct rds_get_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, + sizeof(struct rds_get_mr_args))) + return -EFAULT; + + return __rds_rdma_map(rs, &args, NULL, NULL); +} + +/* + * Free the MR indicated by the given R_Key + */ +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_free_mr_args args; + struct rds_mr *mr; + unsigned long flags; + + if (optlen != sizeof(struct rds_free_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_free_mr_args 
__user *)optval, + sizeof(struct rds_free_mr_args))) + return -EFAULT; + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return -EINVAL; + rs->rs_transport->flush_mrs(); + return 0; + } + + /* Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rds_rdma_unuse. + */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); + if (mr) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (!mr) + return -EINVAL; + + /* + * call rds_destroy_mr() ourselves so that we're sure it's done by the time + * we return. If we let rds_mr_put() do it it might not happen until + * someone else drops their ref. + */ + rds_destroy_mr(mr); + rds_mr_put(mr); + return 0; +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. It allows us to implement + * use_once semantics + */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) +{ + struct rds_mr *mr; + unsigned long flags; + int zot_me = 0; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr && (mr->r_use_once || force)) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } else if (mr) + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + /* May have to issue a dma_sync on this memory region. + * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. 
*/ + if (mr != NULL) { + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); + } +} + +void rds_rdma_free_op(struct rds_rdma_op *ro) +{ + unsigned int i; + + for (i = 0; i < ro->r_nents; i++) { + struct page *page = sg_page(&ro->r_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + if (!ro->r_write) + set_page_dirty(page); + put_page(page); + } + + kfree(ro->r_notifier); + kfree(ro); +} + +/* + * args is a pointer to an in-kernel copy in the sendmsg cmsg. + */ +static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, + struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_rdma_op *op = NULL; + unsigned int nr_pages; + unsigned int max_pages; + unsigned int nr_bytes; + struct page **pages = NULL; + struct rds_iovec __user *local_vec; + struct scatterlist *sg; + unsigned int nr; + unsigned int i, j; + int ret; + + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (args->nr_local > (u64)UINT_MAX) { + ret = -EMSGSIZE; + goto out; + } + + nr_pages = 0; + max_pages = 0; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) { + ret = -EFAULT; + goto out; + } + + nr = rds_pages_in_vec(&vec); + if (nr == 0) { + ret = -EINVAL; + goto out; + } + + max_pages = max(nr, max_pages); + nr_pages += nr; + } + + pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + + op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); + if (op == NULL) { + ret = -ENOMEM; + goto out; + } 
+ + op->r_write = !!(args->flags & RDS_RDMA_READWRITE); + op->r_fence = !!(args->flags & RDS_RDMA_FENCE); + op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->r_recverr = rs->rs_recverr; + WARN_ON(!nr_pages); + sg_init_table(op->r_sg, nr_pages); + + if (op->r_notify || op->r_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->r_notifier) { + ret = -ENOMEM; + goto out; + } + op->r_notifier->n_user_token = args->user_token; + op->r_notifier->n_status = RDS_RDMA_SUCCESS; + } + + /* The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. + * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->r_key = rds_rdma_cookie_key(args->cookie); + op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->r_key); + + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) { + ret = -EFAULT; + goto out; + } + + nr = rds_pages_in_vec(&vec); + if (nr == 0) { + ret = -EINVAL; + goto out; + } + + rs->rs_user_addr = vec.addr; + rs->rs_user_bytes = vec.bytes; + + /* did the user change the vec under us? */ + if (nr > max_pages || op->r_nents + nr > nr_pages) { + ret = -EINVAL; + goto out; + } + /* If it's a WRITE operation, we want to pin the pages for reading. + * If it's a READ operation, we need to pin the pages for writing. 
+ */ + ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + if (ret < 0) + goto out; + + rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", + nr_bytes, nr, vec.bytes, vec.addr); + + nr_bytes += vec.bytes; + + for (j = 0; j < nr; j++) { + unsigned int offset = vec.addr & ~PAGE_MASK; + + sg = &op->r_sg[op->r_nents + j]; + sg_set_page(sg, pages[j], + min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + offset); + + rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", + sg->offset, sg->length, vec.addr, vec.bytes); + + vec.addr += sg->length; + vec.bytes -= sg->length; + } + + op->r_nents += nr; + } + + + if (nr_bytes > args->remote_vec.bytes) { + rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", + nr_bytes, + (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->r_bytes = nr_bytes; + + ret = 0; +out: + kfree(pages); + if (ret) { + if (op) + rds_rdma_free_op(op); + op = ERR_PTR(ret); + } + return op; +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_op *op; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->m_rdma_op != NULL) + return -EINVAL; + + op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); + if (IS_ERR(op)) + return PTR_ERR(op); + rds_stats_inc(s_send_rdma); + rm->m_rdma_op = op; + return 0; +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + unsigned long flags; + struct rds_mr *mr; + u32 r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) + || rm->m_rdma_cookie != 0) + return -EINVAL; + + memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); + + /* We are reusing a previously mapped MR here. 
Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr == NULL) + err = -EINVAL; /* invalid r_key */ + else + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->m_rdma_mr = mr; + } + return err; +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the <R_Key,offset> pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. + */ +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) + || rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); +} diff --git a/net/rds/rdma.h b/net/rds/rdma.h new file mode 100644 index 00000000000..425512098b0 --- /dev/null +++ b/net/rds/rdma.h @@ -0,0 +1,84 @@ +#ifndef _RDS_RDMA_H +#define _RDS_RDMA_H + +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/scatterlist.h> + +#include "rds.h" + +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. 
+ */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +struct rds_rdma_op { + u32 r_key; + u64 r_remote_addr; + unsigned int r_write:1; + unsigned int r_fence:1; + unsigned int r_notify:1; + unsigned int r_recverr:1; + unsigned int r_mapped:1; + struct rds_notifier *r_notifier; + unsigned int r_bytes; + unsigned int r_nents; + unsigned int r_count; + struct scatterlist r_sg[0]; +}; + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rds_rdma_op *ro); +void rds_rdma_send_complete(struct rds_message *rm, int); + +extern void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} + +#endif diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 00000000000..7b19024f970 --- /dev/null +++ b/net/rds/rdma_transport.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2009 Oracle. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <rdma/rdma_cm.h> + +#include "rdma_transport.h" + +static struct rdma_cm_id *rds_iw_listen_id; + +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rds_connection *conn = cm_id->context; + struct rds_transport *trans; + int ret = 0; + + rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, + event->event); + + if (cm_id->device->node_type == RDMA_NODE_RNIC) + trans = &rds_iw_transport; + else + trans = &rds_ib_transport; + + /* Prevent shutdown from tearing down the connection + * while we're executing. 
*/ + if (conn) { + mutex_lock(&conn->c_cm_lock); + + /* If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely */ + if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { + /* Reject incoming connections while we're tearing + * down an existing one. */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* XXX worry about racing with listen acceptance */ + ret = trans->cm_initiate_connect(cm_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection " + "%pI4->%pI4\n", &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + + default: + /* things like device disconnect? 
*/ + printk(KERN_ERR "unknown event %u\n", event->event); + BUG(); + break; + } + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + + rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + + return ret; +} + +static int __init rds_rdma_listen_init(void) +{ + struct sockaddr_in sin; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_create_id() returned %d\n", ret); + goto out; + } + + sin.sin_family = PF_INET, + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_PORT); + + /* + * XXX I bet this binds the cm_id to a device. If we want to support + * fail-over we'll have to take this into consideration. + */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_bind_addr() returned %d\n", ret); + goto out; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_listen() returned %d\n", ret); + goto out; + } + + rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + + rds_iw_listen_id = cm_id; + cm_id = NULL; +out: + if (cm_id) + rdma_destroy_id(cm_id); + return ret; +} + +static void rds_rdma_listen_stop(void) +{ + if (rds_iw_listen_id) { + rdsdebug("cm %p\n", rds_iw_listen_id); + rdma_destroy_id(rds_iw_listen_id); + rds_iw_listen_id = NULL; + } +} + +int __init rds_rdma_init(void) +{ + int ret; + + ret = rds_rdma_listen_init(); + if (ret) + goto out; + + ret = rds_iw_init(); + if (ret) + goto err_iw_init; + + ret = rds_ib_init(); + if (ret) + goto err_ib_init; + + goto out; + +err_ib_init: + rds_iw_exit(); +err_iw_init: + rds_rdma_listen_stop(); +out: + return ret; +} + +void rds_rdma_exit(void) +{ + /* stop listening first to ensure no new connections are attempted */ + 
rds_rdma_listen_stop(); + rds_ib_exit(); + rds_iw_exit(); +} + diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 00000000000..2f2c7d976c2 --- /dev/null +++ b/net/rds/rdma_transport.h @@ -0,0 +1,28 @@ +#ifndef _RDMA_TRANSPORT_H +#define _RDMA_TRANSPORT_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" + +#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rds_rdma_conn_connect(struct rds_connection *conn); +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from rdma_transport.c */ +int rds_rdma_init(void); +void rds_rdma_exit(void); + +/* from ib.c */ +extern struct rds_transport rds_ib_transport; +int rds_ib_init(void); +void rds_ib_exit(void); + +/* from iw.c */ +extern struct rds_transport rds_iw_transport; +int rds_iw_init(void); +void rds_iw_exit(void); + +#endif diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 00000000000..06040070497 --- /dev/null +++ b/net/rds/rds.h @@ -0,0 +1,686 @@ +#ifndef _RDS_RDS_H +#define _RDS_RDS_H + +#include <net/sock.h> +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <rdma/rdma_cm.h> +#include <linux/mutex.h> +#include <linux/rds.h> + +#include "info.h" + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + */ +#define RDS_PORT 18634 + +#ifdef DEBUG +#define rdsdebug(fmt, args...) 
pr_debug("%s(): " fmt, __func__ , ##args) +#else +/* sigh, pr_debug() causes unused variable warnings */ +static inline void __attribute__ ((format (printf, 1, 2))) +rdsdebug(char *fmt, ...) +{ +} +#endif + +/* XXX is there one of these somewhere? */ +#define ceil(x, y) \ + ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) + +#define RDS_FRAG_SHIFT 12 +#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) + +#define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) +#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) +#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rds_cong_map { + struct rb_node m_rb_node; + __be32 m_addr; + wait_queue_head_t m_waitq; + struct list_head m_conn_list; + unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. 
+ */ +enum { + RDS_CONN_DOWN = 0, + RDS_CONN_CONNECTING, + RDS_CONN_DISCONNECTING, + RDS_CONN_UP, + RDS_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDS_LL_SEND_FULL 0 +#define RDS_RECONNECT_PENDING 1 + +struct rds_connection { + struct hlist_node c_hash_node; + __be32 c_laddr; + __be32 c_faddr; + unsigned int c_loopback:1; + struct rds_connection *c_passive; + + struct rds_cong_map *c_lcong; + struct rds_cong_map *c_fcong; + + struct mutex c_send_lock; /* protect send ring */ + struct rds_message *c_xmit_rm; + unsigned long c_xmit_sg; + unsigned int c_xmit_hdr_off; + unsigned int c_xmit_data_off; + unsigned int c_xmit_rdma_sent; + + spinlock_t c_lock; /* protect msg queues */ + u64 c_next_tx_seq; + struct list_head c_send_queue; + struct list_head c_retrans; + + u64 c_next_rx_seq; + + struct rds_transport *c_trans; + void *c_transport_data; + + atomic_t c_state; + unsigned long c_flags; + unsigned long c_reconnect_jiffies; + struct delayed_work c_send_w; + struct delayed_work c_recv_w; + struct delayed_work c_conn_w; + struct work_struct c_down_w; + struct mutex c_cm_lock; /* protect conn state & cm */ + + struct list_head c_map_item; + unsigned long c_map_queued; + unsigned long c_map_offset; + unsigned long c_map_bytes; + + unsigned int c_unacked_packets; + unsigned int c_unacked_bytes; + + /* Protocol version */ + unsigned int c_version; +}; + +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_MAX_ADV_CREDIT 127 + +/* + * Maximum space available for extension headers. 
+ */ +#define RDS_HEADER_EXT_SPACE 16 + +struct rds_header { + __be64 h_sequence; + __be64 h_ack; + __be32 h_len; + __be16 h_sport; + __be16 h_dport; + u8 h_flags; + u8 h_credit; + u8 h_padding[4]; + __sum16 h_csum; + + u8 h_exthdr[RDS_HEADER_EXT_SPACE]; +}; + +/* + * Reserved - indicates end of extensions + */ +#define RDS_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + * NB: This is no longer true for IB, where we do a version + * negotiation during the connection setup phase (protocol + * version information is included in the RDMA CM private data). + */ +#define RDS_EXTHDR_VERSION 1 +struct rds_ext_header_version { + __be32 h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDS_EXTHDR_RDMA 2 +struct rds_ext_header_rdma { + __be32 h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination <R_Key,offset> of the requested RDMA + * operation. + */ +#define RDS_EXTHDR_RDMA_DEST 3 +struct rds_ext_header_rdma_dest { + __be32 h_rdma_rkey; + __be32 h_rdma_offset; +}; + +#define __RDS_EXTHDR_MAX 16 /* for now */ + +struct rds_incoming { + atomic_t i_refcount; + struct list_head i_item; + struct rds_connection *i_conn; + struct rds_header i_hdr; + unsigned long i_rx_jiffies; + __be32 i_saddr; + + rds_rdma_cookie_t i_rdma_cookie; +}; + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. 
It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rds_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDS_MSG_ON_SOCK 1 +#define RDS_MSG_ON_CONN 2 +#define RDS_MSG_HAS_ACK_SEQ 3 +#define RDS_MSG_ACK_REQUIRED 4 +#define RDS_MSG_RETRANSMITTED 5 +#define RDS_MSG_MAPPED 6 +#define RDS_MSG_PAGEVEC 7 + +struct rds_message { + atomic_t m_refcount; + struct list_head m_sock_item; + struct list_head m_conn_item; + struct rds_incoming m_inc; + u64 m_ack_seq; + __be32 m_daddr; + unsigned long m_flags; + + /* Never access m_rs without holding m_rs_lock. + * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + spinlock_t m_rs_lock; + struct rds_sock *m_rs; + struct rds_rdma_op *m_rdma_op; + rds_rdma_cookie_t m_rdma_cookie; + struct rds_mr *m_rdma_mr; + unsigned int m_nents; + unsigned int m_count; + struct scatterlist m_sg[0]; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. 
Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rds_notifier { + struct list_head n_list; + uint64_t n_user_token; + int n_status; +}; + +/** + * struct rds_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. + * + * @xmit_cong_map: This asks the transport to send the local bitmap down the + * given connection. XXX get a better story about the bitmap + * flag and header. 
+ */ + +struct rds_transport { + char t_name[TRANSNAMSIZ]; + struct list_head t_item; + struct module *t_owner; + unsigned int t_prefer_loopback:1; + + int (*laddr_check)(__be32 addr); + int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); + void (*conn_free)(void *data); + int (*conn_connect)(struct rds_connection *conn); + void (*conn_shutdown)(struct rds_connection *conn); + void (*xmit_prepare)(struct rds_connection *conn); + void (*xmit_complete)(struct rds_connection *conn); + int (*xmit)(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_cong_map)(struct rds_connection *conn, + struct rds_cong_map *map, unsigned long offset); + int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*recv)(struct rds_connection *conn); + int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, + size_t size); + void (*inc_purge)(struct rds_incoming *inc); + void (*inc_free)(struct rds_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rds_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, + struct rds_sock *rs, u32 *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +}; + +struct rds_sock { + struct sock rs_sk; + + u64 rs_user_addr; + u64 rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. 
+ */ + struct rb_node rs_bound_node; + __be32 rs_bound_addr; + __be32 rs_conn_addr; + __be16 rs_bound_port; + __be16 rs_conn_port; + + /* + * This is only used to communicate the transport between bind and + * initiating connections. All other trans use is referenced through + * the connection. + */ + struct rds_transport *rs_transport; + + /* + * rds_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rds_connection *rs_conn; + + /* flag indicating we were congested or not */ + int rs_congested; + + /* rs_lock protects all these adjacent members before the newline */ + spinlock_t rs_lock; + struct list_head rs_send_queue; + u32 rs_snd_bytes; + int rs_rcv_bytes; + struct list_head rs_notify_queue; /* currently used for failed RDMAs */ + + /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived. + */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_head rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rds_release. + */ + rwlock_t rs_recv_lock; + struct list_head rs_recv_queue; + + /* just for stats reporting */ + struct list_head rs_item; + + /* these have their own lock */ + spinlock_t rs_rdma_lock; + struct rb_root rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; +}; + +static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) +{ + return container_of(sk, struct rds_sock, rs_sk); +} +static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) +{ + return &rs->rs_sk; +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. 
+ */ +static inline int rds_sk_sndbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_sndbuf / 2; +} +static inline int rds_sk_rcvbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_rcvbuf / 2; +} + +struct rds_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_sem_contention; + uint64_t s_send_sem_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; +}; + +/* af_rds.c */ +void rds_sock_addref(struct rds_sock *rs); +void rds_sock_put(struct rds_sock *rs); +void rds_wake_sk_sleep(struct rds_sock *rs); +static inline void __rds_wake_sk_sleep(struct sock *sk) +{ + wait_queue_head_t *waitq = sk->sk_sleep; + + if (!sock_flag(sk, SOCK_DEAD) && waitq) + wake_up(waitq); +} +extern wait_queue_head_t rds_poll_waitq; + + +/* bind.c */ +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +void rds_remove_bound(struct rds_sock *rs); +struct rds_sock *rds_find_bound(__be32 addr, __be16 port); + +/* cong.c */ +int rds_cong_get_maps(struct rds_connection *conn); +void rds_cong_add_conn(struct rds_connection *conn); +void rds_cong_remove_conn(struct rds_connection *conn); 
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); +void rds_cong_queue_updates(struct rds_cong_map *map); +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); +int rds_cong_updated_since(unsigned long *recent); +void rds_cong_add_socket(struct rds_sock *); +void rds_cong_remove_socket(struct rds_sock *); +void rds_cong_exit(void); +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); + +/* conn.c */ +int __init rds_conn_init(void); +void rds_conn_exit(void); +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_reset(struct rds_connection *conn); +void rds_conn_drop(struct rds_connection *conn); +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len); +void __rds_conn_error(struct rds_connection *conn, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +#define rds_conn_error(conn, fmt...) 
\ + __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_transition(struct rds_connection *conn, int old, int new) +{ + return atomic_cmpxchg(&conn->c_state, old, new) == old; +} + +static inline int +rds_conn_state(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state); +} + +static inline int +rds_conn_up(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_UP; +} + +static inline int +rds_conn_connecting(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; +} + +/* message.c */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); +struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, + size_t total_len); +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq); +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size); +void rds_message_inc_purge(struct rds_incoming *inc); +void rds_message_inc_free(struct rds_incoming *inc); +void rds_message_addref(struct rds_message *rm); +void rds_message_put(struct rds_message *rm); +void rds_message_wait(struct rds_message *rm); +void rds_message_unmapped(struct rds_message *rm); + +static inline void rds_message_make_checksum(struct rds_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 
2); +} + +static inline int rds_message_verify_checksum(const struct rds_header *hdr) +{ + return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; +} + + +/* page.c */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp); +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user); +#define rds_page_copy_to_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 1) +#define rds_page_copy_from_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 0) +void rds_page_exit(void); + +/* recv.c */ +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr); +void rds_inc_addref(struct rds_incoming *inc); +void rds_inc_put(struct rds_incoming *inc); +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp, enum km_type km); +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags); +void rds_clear_recv_queue(struct rds_sock *rs); +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip); + +/* send.c */ +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len); +void rds_send_reset(struct rds_connection *conn); +int rds_send_xmit(struct rds_connection *conn); +struct sockaddr_in; +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked); +int rds_send_acked_before(struct rds_connection *conn, u64 seq); +void rds_send_remove_from_sock(struct list_head *messages, int status); +int rds_send_pong(struct rds_connection *conn, __be16 
dport); +struct rds_message *rds_send_get_message(struct rds_connection *, + struct rds_rdma_op *); + +/* rdma.c */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); + +/* stats.c */ +DECLARE_PER_CPU(struct rds_statistics, rds_stats); +#define rds_stats_inc_which(which, member) do { \ + per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) +#define rds_stats_add_which(which, member, count) do { \ + per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) +int __init rds_stats_init(void); +void rds_stats_exit(void); +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, char **names, size_t nr); + +/* sysctl.c */ +int __init rds_sysctl_init(void); +void rds_sysctl_exit(void); +extern unsigned long rds_sysctl_sndbuf_min; +extern unsigned long rds_sysctl_sndbuf_default; +extern unsigned long rds_sysctl_sndbuf_max; +extern unsigned long rds_sysctl_reconnect_min_jiffies; +extern unsigned long rds_sysctl_reconnect_max_jiffies; +extern unsigned int rds_sysctl_max_unacked_packets; +extern unsigned int rds_sysctl_max_unacked_bytes; +extern unsigned int rds_sysctl_ping_enable; +extern unsigned long rds_sysctl_trace_flags; +extern unsigned int rds_sysctl_trace_level; + +/* threads.c */ +int __init rds_threads_init(void); +void rds_threads_exit(void); +extern struct workqueue_struct *rds_wq; +void rds_connect_worker(struct work_struct *); +void rds_shutdown_worker(struct work_struct *); +void rds_send_worker(struct work_struct *); +void rds_recv_worker(struct work_struct *); +void rds_connect_complete(struct rds_connection *conn); + +/* transport.c */ +int rds_trans_register(struct rds_transport *trans); +void rds_trans_unregister(struct rds_transport *trans); +struct rds_transport *rds_trans_get_preferred(__be32 addr); +unsigned int 
rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); +int __init rds_trans_init(void); +void rds_trans_exit(void); + +#endif diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 00000000000..f2118c51cfa --- /dev/null +++ b/net/rds/recv.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+/*
+ * Set up a new incoming message: a single reference is held, it is on no
+ * list, it is tagged with its connection and source address, and no RDMA
+ * cookie is recorded.
+ */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  __be32 saddr)
+{
+	inc->i_conn = conn;
+	inc->i_saddr = saddr;
+	inc->i_rdma_cookie = 0;
+	INIT_LIST_HEAD(&inc->i_item);
+	atomic_set(&inc->i_refcount, 1);
+}
+
+/* Take an additional reference on @inc. */
+void rds_inc_addref(struct rds_incoming *inc)
+{
+	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	atomic_inc(&inc->i_refcount);
+}
+
+/*
+ * Drop a reference on @inc.  The final put requires that the message is
+ * no longer linked on any list and hands it to the transport's inc_free.
+ */
+void rds_inc_put(struct rds_incoming *inc)
+{
+	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	if (!atomic_dec_and_test(&inc->i_refcount))
+		return;
+
+	BUG_ON(!list_empty(&inc->i_item));
+	inc->i_conn->c_trans->inc_free(inc);
+}
+
+/*
+ * Account @delta bytes against the socket's receive usage and update the
+ * congestion state for @port in @map.  The socket becomes congested as soon
+ * as usage exceeds rds_sk_rcvbuf(), but is only reported uncongested again
+ * once usage has dropped below half of it, so the state does not bounce.
+ */
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+				  struct rds_cong_map *map,
+				  int delta, __be16 port)
+{
+	int over_limit;
+
+	if (!delta)
+		return;
+
+	rs->rs_rcv_bytes += delta;
+	over_limit = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+	  "now_cong %d delta %d\n",
+	  rs, &rs->rs_bound_addr,
+	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+	  rds_sk_rcvbuf(rs), over_limit, delta);
+
+	if (!rs->rs_congested) {
+		/* still below the limit: nothing changes */
+		if (!over_limit)
+			return;
+
+		/* wasn't -> am congested */
+		rs->rs_congested = 1;
+		rds_cong_set_bit(map, port);
+	} else {
+		/* not enough space freed yet: stay congested */
+		if (rs->rs_rcv_bytes >= (rds_sk_rcvbuf(rs)/2))
+			return;
+
+		/* was -> aren't congested */
+		rs->rs_congested = 0;
+		rds_cong_clear_bit(map, port);
+	}
+
+	/* only reached when the congestion state actually changed */
+	rds_cong_queue_updates(map);
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+	union {
+		struct rds_ext_header_version version;
+		struct rds_ext_header_rdma rdma;
+		struct rds_ext_header_rdma_dest rdma_dest;
+	} ext;
+	struct rds_header *hdr = &inc->i_hdr;
+	unsigned int offset = 0;
+	unsigned int ext_type, ext_len;
+
+	for (;;) {
+		/* each lookup gets the full scratch buffer to fill */
+		ext_len = sizeof(ext);
+		ext_type = rds_message_next_extension(hdr, &offset, &ext, &ext_len);
+		if (ext_type == RDS_EXTHDR_NONE)
+			return;
+
+		/* Extension types not listed here are skipped. */
+		if (ext_type == RDS_EXTHDR_RDMA) {
+			rds_rdma_unuse(rs, be32_to_cpu(ext.rdma.h_rdma_rkey), 0);
+		} else if (ext_type == RDS_EXTHDR_RDMA_DEST) {
+			/* We ignore the size for now. We could stash it
+			 * somewhere and use it for error checking. */
+			inc->i_rdma_cookie = rds_rdma_make_cookie(
+					be32_to_cpu(ext.rdma_dest.h_rdma_rkey),
+					be32_to_cpu(ext.rdma_dest.h_rdma_offset));
+		}
+	}
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time.  This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival.  It does mean
+ * that small messages will wait behind large ones.  Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn.  This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+		       struct rds_incoming *inc, gfp_t gfp, enum km_type km)
+{
+	struct rds_sock *rs;
+	struct sock *sk;
+	unsigned long flags;
+	u64 seq;
+
+	inc->i_conn = conn;
+	inc->i_rx_jiffies = jiffies;
+	seq = be64_to_cpu(inc->i_hdr.h_sequence);
+
+	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+		 "flags 0x%x rx_jiffies %lu\n", conn,
+		 (unsigned long long)conn->c_next_rx_seq,
+		 inc,
+		 (unsigned long long)seq,
+		 be32_to_cpu(inc->i_hdr.h_len),
+		 be16_to_cpu(inc->i_hdr.h_sport),
+		 be16_to_cpu(inc->i_hdr.h_dport),
+		 inc->i_hdr.h_flags,
+		 inc->i_rx_jiffies);
+
+	/*
+	 * Sequence numbers should only increase.  Messages get their
+	 * sequence number as they're queued in a sending conn.  They
+	 * can be dropped, though, if the sending socket is closed before
+	 * they hit the wire.  So sequence numbers can skip forward
+	 * under normal operation.  They can also drop back in the conn
+	 * failover case as previously sent messages are resent down the
+	 * new instance of a conn.  We drop those, otherwise we have
+	 * to assume that the next valid seq does not come after a
+	 * hole in the fragment stream.
+	 *
+	 * The headers don't give us a way to realize if fragments of
+	 * a message have been dropped.  We assume that frags that arrive
+	 * to a flow are part of the current message on the flow that is
+	 * being reassembled.  This means that senders can't drop messages
+	 * from the sending conn until all their frags are sent.
+	 *
+	 * XXX we could spend more on the wire to get more robust failure
+	 * detection, arguably worth it to avoid data corruption.
+	 */
+	if (seq < conn->c_next_rx_seq
+	    && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+		rds_stats_inc(s_recv_drop_old_seq);
+		return;
+	}
+	conn->c_next_rx_seq = seq + 1;
+
+	/* A message to port 0 is a ping: answer it and stop here. */
+	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+		rds_stats_inc(s_recv_ping);
+		rds_send_pong(conn, inc->i_hdr.h_sport);
+		return;
+	}
+
+	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+	if (rs == NULL) {
+		rds_stats_inc(s_recv_drop_no_sock);
+		return;
+	}
+
+	/* Process extension headers */
+	rds_recv_incoming_exthdrs(inc, rs);
+
+	/* We can be racing with rds_release() which marks the socket dead. */
+	sk = rds_rs_to_sk(rs);
+
+	/* serialize with rds_release -> sock_orphan */
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (sock_flag(sk, SOCK_DEAD)) {
+		rds_stats_inc(s_recv_drop_dead_sock);
+	} else {
+		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+		rds_stats_inc(s_recv_queued);
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		/* the receive queue holds its own reference */
+		rds_inc_addref(inc);
+		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+		__rds_wake_sk_sleep(sk);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	/* every path that found a socket releases it here */
+	rds_sock_put(rs);
+}
+
+/*
+ * be very careful here.  This is being called as the condition in
+ * wait_event_*() needs to cope with being called many times.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+	unsigned long flags;
+
+	/* Only consult the queue while the caller holds no message yet, so
+	 * repeated calls never take a second reference. */
+	if (*inc == NULL) {
+		read_lock_irqsave(&rs->rs_recv_lock, flags);
+		if (!list_empty(&rs->rs_recv_queue)) {
+			*inc = list_entry(rs->rs_recv_queue.next,
+					  struct rds_incoming,
+					  i_item);
+			rds_inc_addref(*inc);
+		}
+		read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+	}
+
+	return *inc != NULL;
+}
+
+/*
+ * Report whether @inc is still linked on a receive queue.  When @drop is
+ * set and it is, the message is also unlinked, its bytes are handed back to
+ * the receive accounting and the queue's reference is dropped.
+ *
+ * Returns 1 if the message was still queued, 0 otherwise.
+ */
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+			    int drop)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	int ret = 0;
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!list_empty(&inc->i_item)) {
+		ret = 1;
+		if (drop) {
+			/* XXX make sure this i_conn is reliable */
+			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+					      -be32_to_cpu(inc->i_hdr.h_len),
+					      inc->i_hdr.h_dport);
+			list_del_init(&inc->i_item);
+			rds_inc_put(inc);
+		}
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+	return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ *
+ * Returns 0 on success (including when nothing was queued) or the error
+ * reported by put_cmsg(); in the latter case the notifications that were
+ * not delivered are put back on the socket's queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	struct rds_notifier *notifier;
+	struct rds_rdma_notify cmsg;
+	unsigned int count = 0, max_messages = ~0U;
+	unsigned long flags;
+	LIST_HEAD(copy);
+	int err = 0;
+
+
+	/* put_cmsg copies to user space and thus may sleep. We can't do this
+	 * with rs_lock held, so first grab as many notifications as we can stuff
+	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
+	 * losing notifications - except when the buffer is so small that it wouldn't
+	 * even hold a single notification. Then we give him as much of this single
+	 * msg as we can squeeze in, and set MSG_CTRUNC.
+	 */
+	if (msghdr) {
+		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+		if (!max_messages)
+			max_messages = 1;
+	}
+
+	/* Move up to max_messages notifiers onto the private list. */
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+		notifier = list_entry(rs->rs_notify_queue.next,
+				struct rds_notifier, n_list);
+		list_move(&notifier->n_list, &copy);
+		count++;
+	}
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	if (!count)
+		return 0;
+
+	/* Deliver (or, without a msghdr, simply discard) each notifier. */
+	while (!list_empty(&copy)) {
+		notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+		if (msghdr) {
+			cmsg.user_token = notifier->n_user_token;
+			cmsg.status  = notifier->n_status;
+
+			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+					sizeof(cmsg), &cmsg);
+			if (err)
+				break;
+		}
+
+		list_del_init(&notifier->n_list);
+		kfree(notifier);
+	}
+
+	/* If we bailed out because of an error in put_cmsg,
+	 * we may be left with one or more notifications that we
+	 * didn't process. Return them to the head of the list. */
+	if (!list_empty(&copy)) {
+		spin_lock_irqsave(&rs->rs_lock, flags);
+		list_splice(&copy, &rs->rs_notify_queue);
+		spin_unlock_irqrestore(&rs->rs_lock, flags);
+	}
+
+	return err;
+}
+
+/*
+ * Queue a congestion notification
+ *
+ * The pending rs_cong_notify bits are sampled once, handed to the caller as
+ * an RDS_CMSG_CONG_UPDATE control message, and only the bits that were
+ * actually reported are cleared afterwards, under rs_lock.
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	uint64_t notify = rs->rs_cong_notify;
+	unsigned long flags;
+	int err;
+
+	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+			sizeof(notify), &notify);
+	if (err)
+		return err;
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	rs->rs_cong_notify &= ~notify;
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Receive any control messages.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+	int ret = 0;
+
+	/* An RDMA destination cookie is only reported when one was recorded. */
+	if (inc->i_rdma_cookie) {
+		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+				sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Receive one message, or pending notifications, on an RDS socket.
+ *
+ * Pending RDMA notifications and congestion updates take priority: if any
+ * are queued they are returned as control messages and no data is copied.
+ * Otherwise the call waits (subject to MSG_DONTWAIT and the socket receive
+ * timeout) for a message at the head of the receive queue and copies it to
+ * the caller's iovec through the transport.
+ *
+ * Returns the number of bytes copied (or the full message length when the
+ * message was truncated and MSG_TRUNC was requested), 0 for MSG_OOB, or a
+ * negative errno: -EAGAIN for a non-blocking call with nothing queued,
+ * -ETIMEDOUT when the wait timed out, -EFAULT when a control message could
+ * not be delivered, or the error from the wait or the transport copy.
+ */
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int msg_flags)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	long timeo;
+	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+	struct sockaddr_in *sin;
+	struct rds_incoming *inc = NULL;
+
+	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+	if (msg_flags & MSG_OOB)
+		goto out;
+
+	/* If there are pending notifications, do those - and nothing else */
+	if (!list_empty(&rs->rs_notify_queue)) {
+		ret = rds_notify_queue_get(rs, msg);
+		goto out;
+	}
+
+	if (rs->rs_cong_notify) {
+		ret = rds_notify_cong(rs, msg);
+		goto out;
+	}
+
+	while (1) {
+		if (!rds_next_incoming(rs, &inc)) {
+			if (nonblock) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+						rds_next_incoming(rs, &inc),
+						timeo);
+			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+				 timeo);
+			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+				continue;
+
+			/* 0 means the timeout expired, negative is an error */
+			ret = timeo;
+			if (ret == 0)
+				ret = -ETIMEDOUT;
+			break;
+		}
+
+		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+			 &inc->i_conn->c_faddr,
+			 ntohs(inc->i_hdr.h_sport));
+		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+							     size);
+		if (ret < 0)
+			break;
+
+		/*
+		 * if the message we just copied isn't at the head of the
+		 * recv queue then someone else raced us to return it, try
+		 * to get the next message.
+		 */
+		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+			rds_inc_put(inc);
+			inc = NULL;
+			rds_stats_inc(s_recv_deliver_raced);
+			continue;
+		}
+
+		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+			if (msg_flags & MSG_TRUNC)
+				ret = be32_to_cpu(inc->i_hdr.h_len);
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		if (rds_cmsg_recv(inc, msg)) {
+			/* Leave through the common exit below so that the
+			 * reference taken by rds_next_incoming() is dropped;
+			 * jumping straight to "out" would leak it. */
+			ret = -EFAULT;
+			break;
+		}
+
+		rds_stats_inc(s_recv_delivered);
+
+		/* Fill in the sender's address if the caller asked for it. */
+		sin = (struct sockaddr_in *)msg->msg_name;
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_port = inc->i_hdr.h_sport;
+			sin->sin_addr.s_addr = inc->i_saddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		}
+		break;
+	}
+
+	/* Drop the reference rds_next_incoming() took for us, if any. */
+	if (inc)
+		rds_inc_put(inc);
+
+out:
+	return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg.  The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct rds_incoming *inc, *tmp;
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+		/* give the bytes back before dropping the queue's reference */
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      -be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		list_del_init(&inc->i_item);
+		rds_inc_put(inc);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ *
+ * Describe @inc as a struct rds_info_message and pass it to rds_info_copy().
+ * With @flip clear, @saddr and the header's source port are reported as the
+ * local end and @daddr and the destination port as the foreign end; with
+ * @flip set the two ends are swapped.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip)
+{
+	struct rds_info_message minfo;
+
+	/* All sizeof(minfo) bytes are handed to rds_info_copy() below, so
+	 * clear the structure first: any member or padding that is not
+	 * assigned here must not carry stale stack contents. */
+	memset(&minfo, 0, sizeof(minfo));
+
+	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+
+	if (flip) {
+		minfo.laddr = daddr;
+		minfo.faddr = saddr;
+		minfo.lport = inc->i_hdr.h_dport;
+		minfo.fport = inc->i_hdr.h_sport;
+	} else {
+		minfo.laddr = saddr;
+		minfo.faddr = daddr;
+		minfo.lport = inc->i_hdr.h_sport;
+		minfo.fport = inc->i_hdr.h_dport;
+	}
+
+	rds_info_copy(iter, &minfo, sizeof(minfo));
+}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 00000000000..1b37364656f
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the soft-lockup
+ * watchdog will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+/*
+ * Reset the send state.  Caller must hold c_send_lock when calling here.
+ *
+ * Any partially transmitted message is released, the per-connection
+ * transmit cursors and the unacked budgets are reset, and everything on
+ * the retransmit list is flagged as a retransmission and put back on the
+ * send queue.
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+	struct rds_message *rm, *tmp;
+	unsigned long flags;
+
+	if (conn->c_xmit_rm) {
+		/* Tell the user the RDMA op is no longer mapped by the
+		 * transport. This isn't entirely true (it's flushed out
+		 * independently) but as the connection is down, there's
+		 * no ongoing RDMA to/from that memory */
+		rds_message_unmapped(conn->c_xmit_rm);
+		rds_message_put(conn->c_xmit_rm);
+		conn->c_xmit_rm = NULL;
+	}
+	conn->c_xmit_sg = 0;
+	conn->c_xmit_hdr_off = 0;
+	conn->c_xmit_data_off = 0;
+	conn->c_xmit_rdma_sent = 0;
+
+	conn->c_map_queued = 0;
+
+	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+	/* Mark messages as retransmissions, and move them to the send q */
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+	}
+	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ *   Pro:
+ *      - tx queueing is a simple fifo list
+ *   	- reassembly is optional and easily done by transports per conn
+ *      - no per flow rx lookup at all, straight to the socket
+ *   	- less per-frag memory and wire overhead
+ *   Con:
+ *      - queued acks can be delayed behind large messages
+ *   Depends:
+ *      - small message latency is higher behind queued large messages
+ *      - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	unsigned int tmp;
+	unsigned int send_quota = send_batch_count;
+	struct scatterlist *sg;
+	int ret = 0;
+	int was_empty = 0;
+	LIST_HEAD(to_be_dropped);
+
+	/*
+	 * sendmsg calls here after having queued its message on the send
+	 * queue.  We only have one task feeding the connection at a time.  If
+	 * another thread is already feeding the queue then we back off.  This
+	 * avoids blocking the caller and trading per-connection data between
+	 * caches per message.
+	 *
+	 * The sem holder will issue a retry if they notice that someone queued
+	 * a message after they stopped walking the send queue but before they
+	 * dropped the sem.
+	 *
+	 * Note that losing the trylock is reported to the caller as -ENOMEM.
+	 */
+	if (!mutex_trylock(&conn->c_send_lock)) {
+		rds_stats_inc(s_send_sem_contention);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (conn->c_trans->xmit_prepare)
+		conn->c_trans->xmit_prepare(conn);
+
+	/*
+	 * spin trying to push headers and data down the connection until
+	 * the connection doesn't make forward progress.
+	 */
+	while (--send_quota) {
+		/*
+		 * See if need to send a congestion map update if we're
+		 * between sending messages.  The send_sem protects our sole
+		 * use of c_map_offset and _bytes.
+		 * Note this is used only by transports that define a special
+		 * xmit_cong_map function. For all others, we allocate
+		 * a cong_map message and treat it just like any other send.
+		 */
+		if (conn->c_map_bytes) {
+			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+						conn->c_map_offset);
+			if (ret <= 0)
+				break;
+
+			conn->c_map_offset += ret;
+			conn->c_map_bytes -= ret;
+			if (conn->c_map_bytes)
+				continue;
+		}
+
+		/* If we're done sending the current message, clear the
+		 * offset and S/G temporaries.
+		 */
+		rm = conn->c_xmit_rm;
+		if (rm != NULL &&
+		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+		    conn->c_xmit_sg == rm->m_nents) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+
+			/* Release the reference to the previous message. */
+			rds_message_put(rm);
+			rm = NULL;
+		}
+
+		/* If we're asked to send a cong map update, do so.
+		 */
+		if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+			if (conn->c_trans->xmit_cong_map != NULL) {
+				conn->c_map_offset = 0;
+				conn->c_map_bytes = sizeof(struct rds_header) +
+					RDS_CONG_MAP_BYTES;
+				continue;
+			}
+
+			rm = rds_cong_update_alloc(conn);
+			if (IS_ERR(rm)) {
+				ret = PTR_ERR(rm);
+				break;
+			}
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/*
+		 * Grab the next message from the send queue, if there is one.
+		 *
+		 * c_xmit_rm holds a ref while we're sending this message down
+		 * the connection.  We can use this ref while holding the
+		 * send_sem.. rds_send_reset() is serialized with it.
+		 */
+		if (rm == NULL) {
+			unsigned int len;
+
+			spin_lock_irqsave(&conn->c_lock, flags);
+
+			if (!list_empty(&conn->c_send_queue)) {
+				rm = list_entry(conn->c_send_queue.next,
+						struct rds_message,
+						m_conn_item);
+				rds_message_addref(rm);
+
+				/*
+				 * Move the message from the send queue to the retransmit
+				 * list right away.
+				 */
+				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+			}
+
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+
+			if (rm == NULL) {
+				was_empty = 1;
+				break;
+			}
+
+			/* Unfortunately, the way Infiniband deals with
+			 * RDMA to a bad MR key is by moving the entire
+			 * queue pair to error state. We could possibly
+			 * recover from that, but right now we drop the
+			 * connection.
+			 * Therefore, we never retransmit messages with RDMA ops.
+			 */
+			if (rm->m_rdma_op
+			 && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+				spin_lock_irqsave(&conn->c_lock, flags);
+				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+					list_move(&rm->m_conn_item, &to_be_dropped);
+				spin_unlock_irqrestore(&conn->c_lock, flags);
+				rds_message_put(rm);
+				continue;
+			}
+
+			/* Require an ACK every once in a while */
+			len = ntohl(rm->m_inc.i_hdr.h_len);
+			if (conn->c_unacked_packets == 0
+			 || conn->c_unacked_bytes < len) {
+				__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+				conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+				conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+				rds_stats_inc(s_send_ack_required);
+			} else {
+				conn->c_unacked_bytes -= len;
+				conn->c_unacked_packets--;
+			}
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/*
+		 * Try and send an rdma message.  Let's see if we can
+		 * keep this simple and require that the transport either
+		 * send the whole rdma or none of it.
+		 */
+		if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+			if (ret)
+				break;
+			conn->c_xmit_rdma_sent = 1;
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+		    conn->c_xmit_sg < rm->m_nents) {
+			ret = conn->c_trans->xmit(conn, rm,
+						  conn->c_xmit_hdr_off,
+						  conn->c_xmit_sg,
+						  conn->c_xmit_data_off);
+			if (ret <= 0)
+				break;
+
+			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+				tmp = min_t(int, ret,
+					    sizeof(struct rds_header) -
+					    conn->c_xmit_hdr_off);
+				conn->c_xmit_hdr_off += tmp;
+				ret -= tmp;
+			}
+
+			sg = &rm->m_sg[conn->c_xmit_sg];
+			while (ret) {
+				tmp = min_t(int, ret, sg->length -
+						      conn->c_xmit_data_off);
+				conn->c_xmit_data_off += tmp;
+				ret -= tmp;
+				if (conn->c_xmit_data_off == sg->length) {
+					conn->c_xmit_data_off = 0;
+					sg++;
+					conn->c_xmit_sg++;
+					BUG_ON(ret != 0 &&
+					       conn->c_xmit_sg == rm->m_nents);
+				}
+			}
+		}
+	}
+
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped))
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+
+	if (conn->c_trans->xmit_complete)
+		conn->c_trans->xmit_complete(conn);
+
+	/*
+	 * We might be racing with another sender who queued a message but
+	 * backed off on noticing that we held the c_send_lock.  If we check
+	 * for queued messages after dropping the sem then either we'll
+	 * see the queued message or the queuer will get the sem.  If we
+	 * notice the queued message then we trigger an immediate retry.
+	 *
+	 * We need to be careful only to do this when we stopped processing
+	 * the send queue because it was empty.  It's the only way we
+	 * stop processing the loop when the transport hasn't taken
+	 * responsibility for forward progress.
+	 */
+	mutex_unlock(&conn->c_send_lock);
+
+	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+		/* We exhausted the send quota, but there's work left to
+		 * do. Return and (re-)schedule the send worker.
+		 */
+		ret = -EAGAIN;
+	}
+
+	if (ret == 0 && was_empty) {
+		/* A simple bit test would be way faster than taking the
+		 * spin lock */
+		spin_lock_irqsave(&conn->c_lock, flags);
+		if (!list_empty(&conn->c_send_queue)) {
+			rds_stats_inc(s_send_sem_queue_raced);
+			ret = -EAGAIN;
+		}
+		spin_unlock_irqrestore(&conn->c_lock, flags);
+	}
+out:
+	return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	assert_spin_locked(&rs->rs_lock);
+
+	BUG_ON(rs->rs_snd_bytes < len);
+	rs->rs_snd_bytes -= len;
+
+	if (rs->rs_snd_bytes == 0)
+		rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+				    is_acked_func is_acked)
+{
+	if (is_acked)
+		return is_acked(rm, ack);
+	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+}
+
+/*
+ * Returns true unless the message at the head of the retransmit queue or
+ * at the head of the send queue has a sequence number smaller than the
+ * given sequence number.  Only the first entry of each queue is examined
+ * (note the unconditional break in both loops).
+ */
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+{
+	struct rds_message *rm, *tmp;
+	int ret = 1;
+
+	spin_lock(&conn->c_lock);
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+			ret = 0;
+		break;
+	}
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+			ret = 0;
+		break;
+	}
+
+	spin_unlock(&conn->c_lock);
+
+	return ret;
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rds_rdma_op *ro;
+	struct rds_notifier *notifier;
+
+	spin_lock(&rm->m_rs_lock);
+
+	ro = rm->m_rdma_op;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	 && ro && ro->r_notify && ro->r_notifier) {
+		notifier = ro->r_notifier;
+		rs = rm->m_rs;
+		/* hold the socket until it has been woken up below */
+		sock_hold(rds_rs_to_sk(rs));
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		/* the notifier is on the socket's notify queue now */
+		ro->r_notifier = NULL;
+	}
+
+	spin_unlock(&rm->m_rs_lock);
+
+	/* rs is only set when a notifier was queued above */
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+	struct rds_rdma_op *ro;
+
+	ro = rm->m_rdma_op;
+	if (ro && ro->r_notify && ro->r_notifier) {
+		ro->r_notifier->n_status = status;
+		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+		ro->r_notifier = NULL;
+	}
+
+	/* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+					 struct rds_rdma_op *op)
+{
+	struct rds_message *rm, *tmp, *found = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (rm->m_rdma_op == op) {
+			atomic_inc(&rm->m_refcount);
+			found = rm;
+			goto out;
+		}
+	}
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+		if (rm->m_rdma_op == op) {
+			atomic_inc(&rm->m_refcount);
+			found = rm;
+			break;
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	return found;
+}
+
+/*
+ * This removes messages from the socket's list if they're on it.  The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks.  The messages must have a reference held for their
+ * position on the list.  This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ *
+ * 'status' is stored in a queued RDMA notifier whose n_status is still 0.
+ */
+void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+	unsigned long flags = 0; /* silence gcc :P */
+	struct rds_sock *rs = NULL;
+	struct rds_message *rm;
+
+	local_irq_save(flags);
+	while (!list_empty(messages)) {
+		rm = list_entry(messages->next, struct rds_message,
+				m_conn_item);
+		list_del_init(&rm->m_conn_item);
+
+		/*
+		 * If we see this flag cleared then we're *sure* that someone
+		 * else beat us to removing it from the sock.  If we race
+		 * with their flag update we'll get the lock and then really
+		 * see that the flag has been cleared.
+		 *
+		 * The message spinlock makes sure nobody clears rm->m_rs
+		 * while we're messing with it. It does not prevent the
+		 * message from being removed from the socket, though.
+		 */
+		spin_lock(&rm->m_rs_lock);
+		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+			goto unlock_and_drop;
+
+		/*
+		 * Consecutive messages for the same socket share one
+		 * rs_lock hold; switch locks only when the socket changes.
+		 */
+		if (rs != rm->m_rs) {
+			if (rs) {
+				spin_unlock(&rs->rs_lock);
+				rds_wake_sk_sleep(rs);
+				sock_put(rds_rs_to_sk(rs));
+			}
+			rs = rm->m_rs;
+			spin_lock(&rs->rs_lock);
+			sock_hold(rds_rs_to_sk(rs));
+		}
+
+		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+			struct rds_rdma_op *ro = rm->m_rdma_op;
+			struct rds_notifier *notifier;
+
+			list_del_init(&rm->m_sock_item);
+			rds_send_sndbuf_remove(rs, rm);
+
+			if (ro && ro->r_notifier
+			 && (status || ro->r_notify)) {
+				notifier = ro->r_notifier;
+				list_add_tail(&notifier->n_list,
+					      &rs->rs_notify_queue);
+				if (!notifier->n_status)
+					notifier->n_status = status;
+				ro->r_notifier = NULL;
+			}
+			/*
+			 * Clear the back pointer before dropping the
+			 * socket's reference, so rm is never written to
+			 * after a put.
+			 */
+			rm->m_rs = NULL;
+			rds_message_put(rm);
+		}
+
+unlock_and_drop:
+		spin_unlock(&rm->m_rs_lock);
+		/* drop the reference held for the 'messages' list */
+		rds_message_put(rm);
+	}
+
+	if (rs) {
+		spin_unlock(&rs->rs_lock);
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number.  Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction.  Maybe it should bail if it sees SOCK_DEAD.
+ */ +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (!rds_send_is_acked(rm, ack, is_acked)) + break; + + list_move(&rm->m_conn_item, &list); + clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); + } + + /* order flag updates with spin locks */ + if (!list_empty(&list)) + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* now remove the messages from the sock list as needed */ + rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); +} + +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +{ + struct rds_message *rm, *tmp; + struct rds_connection *conn; + unsigned long flags; + LIST_HEAD(list); + int wake = 0; + + /* get all the messages we're dropping under the rs lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + wake = 1; + list_move(&rm->m_sock_item, &list); + rds_send_sndbuf_remove(rs, rm); + clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + + /* If this is a RDMA operation, notify the app. 
*/ + __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); + } + + /* order flag updates with the rs lock */ + if (wake) + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (wake) + rds_wake_sk_sleep(rs); + + conn = NULL; + + /* now remove the messages from the conn list as needed */ + list_for_each_entry(rm, &list, m_sock_item) { + /* We do this here rather than in the loop above, so that + * we don't have to nest m_rs_lock under rs->rs_lock */ + spin_lock(&rm->m_rs_lock); + rm->m_rs = NULL; + spin_unlock(&rm->m_rs_lock); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the conn. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + */ + if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + continue; + + if (conn != rm->m_inc.i_conn) { + if (conn) + spin_unlock_irqrestore(&conn->c_lock, flags); + conn = rm->m_inc.i_conn; + spin_lock_irqsave(&conn->c_lock, flags); + } + + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + list_del_init(&rm->m_conn_item); + rds_message_put(rm); + } + } + + if (conn) + spin_unlock_irqrestore(&conn->c_lock, flags); + + while (!list_empty(&list)) { + rm = list_entry(list.next, struct rds_message, m_sock_item); + list_del_init(&rm->m_sock_item); + + rds_message_wait(rm); + rds_message_put(rm); + } +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDS_CANCEL_SENT_TO. 
+ */ +static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_message *rm, __be16 sport, + __be16 dport, int *queued) +{ + unsigned long flags; + u32 len; + + if (*queued) + goto out; + + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* this is the only place which holds both the socket's rs_lock + * and the connection's c_lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. 
+ */ + if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); + set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + rds_message_addref(rm); + rm->m_rs = rs; + + /* The code ordering is a little weird, but we're + trying to minimize the time we hold c_lock */ + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); + rm->m_inc.i_conn = conn; + rds_message_addref(rm); + + spin_lock(&conn->c_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + spin_unlock(&conn->c_lock); + + rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + spin_unlock_irqrestore(&rs->rs_lock, flags); +out: + return *queued; +} + +static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + /* As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->m_rdma_cookie and rm->m_rdma_mr. 
+		 */
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_MAP:
+			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
+			if (!ret)
+				*allocated_mr = 1;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Send one message on an RDS socket.
+ *
+ * The destination is taken from msg->msg_name when given, otherwise from
+ * the address stored on the socket.  The payload is copied into a new
+ * rds_message, control messages are parsed, and the message is queued on
+ * the socket and its connection.
+ *
+ * Returns payload_len on success or a negative errno.  When the send buffer
+ * is full, a non-blocking caller gets -EAGAIN; a blocking caller waits for
+ * up to the socket's send timeout and gets -ETIMEDOUT when it expires.
+ */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t payload_len)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+	__be32 daddr;
+	__be16 dport;
+	struct rds_message *rm = NULL;
+	struct rds_connection *conn;
+	int ret = 0;
+	int queued = 0, allocated_mr = 0;
+	int nonblock = msg->msg_flags & MSG_DONTWAIT;
+	/* this is the send path: honour the send timeout, not the receive one */
+	long timeo = sock_sndtimeo(sk, nonblock);
+
+	/* Mirror Linux UDP mirror of BSD error message compatibility */
+	/* XXX: Perhaps MSG_MORE someday */
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+		printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (msg->msg_namelen) {
+		/* XXX fail non-unicast destination IPs? */
+		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+			ret = -EINVAL;
+			goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+	} else {
+		/* We only care about consistency with ->connect() */
+		lock_sock(sk);
+		daddr = rs->rs_conn_addr;
+		dport = rs->rs_conn_port;
+		release_sock(sk);
+	}
+
+	/* racing with another thread binding seems ok here */
+	if (daddr == 0 || rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+	if (IS_ERR(rm)) {
+		ret = PTR_ERR(rm);
+		/* keep the error pointer away from the put at 'out' */
+		rm = NULL;
+		goto out;
+	}
+
+	rm->m_daddr = daddr;
+
+	/* Parse any control messages the user may have included. */
+	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+	if (ret)
+		goto out;
+
+	/* rds_conn_create has a spinlock that runs with IRQ off.
+	 * Caching the conn in the socket helps a lot. */
+	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+		conn = rs->rs_conn;
+	else {
+		conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+					rs->rs_transport,
+					sock->sk->sk_allocation);
+		if (IS_ERR(conn)) {
+			ret = PTR_ERR(conn);
+			goto out;
+		}
+		rs->rs_conn = conn;
+	}
+
+	if ((rm->m_rdma_cookie || rm->m_rdma_op)
+	 && conn->c_trans->xmit_rdma == NULL) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+				rm->m_rdma_op, conn->c_trans->xmit_rdma);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* If the connection is down, trigger a connect. We may
+	 * have scheduled a delayed reconnect however - in this case
+	 * we should not interfere.
+	 */
+	if (rds_conn_state(conn) == RDS_CONN_DOWN
+	 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+	if (ret)
+		goto out;
+
+	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+				  dport, &queued)) {
+		rds_stats_inc(s_send_queue_full);
+		/* XXX make sure this is reasonable */
+		if (payload_len > rds_sk_sndbuf(rs)) {
+			ret = -EMSGSIZE;
+			goto out;
+		}
+		if (nonblock) {
+			ret = -EAGAIN;
+			goto out;
+		}
+
+		timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+					rds_send_queue_rm(rs, conn, rm,
+							  rs->rs_bound_port,
+							  dport,
+							  &queued),
+					timeo);
+		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+			continue;
+
+		/* negative: interrupted by a signal; zero: timed out */
+		ret = timeo;
+		if (ret == 0)
+			ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	/*
+	 * By now we've committed to the send.  We reuse rds_send_worker()
+	 * to retry sends in the rds thread if the transport asks us to.
+	 */
+	rds_stats_inc(s_send_queued);
+
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_worker(&conn->c_send_w.work);
+
+	rds_message_put(rm);
+	return payload_len;
+
+out:
+	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+	 * or in any other way, we need to destroy the MR again */
+	if (allocated_mr)
+		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	int ret = 0;
+
+	rm = rds_message_alloc(0, GFP_ATOMIC);
+	if (rm == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rm->m_daddr = conn->c_faddr;
+
+	/* If the connection is down, trigger a connect. We may
+	 * have scheduled a delayed reconnect however - in this case
+	 * we should not interfere.
+	 */
+	if (rds_conn_state(conn) == RDS_CONN_DOWN
+	 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	rds_message_addref(rm);
+	rm->m_inc.i_conn = conn;
+
+	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+				    conn->c_next_tx_seq);
+	conn->c_next_tx_seq++;
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	rds_stats_inc(s_send_queued);
+	rds_stats_inc(s_send_pong);
+
+	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	rds_message_put(rm);
+	return 0;
+
+out:
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 00000000000..637146893cf
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static char *rds_stat_names[] = {
+	"conn_reset",
+	"recv_drop_bad_checksum",
+	"recv_drop_old_seq",
+	"recv_drop_no_sock",
+	"recv_drop_dead_sock",
+	"recv_deliver_raced",
+	"recv_delivered",
+	"recv_queued",
+	"recv_immediate_retry",
+	"recv_delayed_retry",
+	"recv_ack_required",
+	"recv_rdma_bytes",
+	"recv_ping",
+	"send_queue_empty",
+	"send_queue_full",
+	"send_sem_contention",
+	"send_sem_queue_raced",
+	"send_immediate_retry",
+	"send_delayed_retry",
+	"send_drop_acked",
+	"send_ack_required",
+	"send_queued",
+	"send_rdma",
+	"send_rdma_bytes",
+	"send_pong",
+	"page_remainder_hit",
+	"page_remainder_miss",
+	"copy_to_user",
+	"copy_from_user",
+	"cong_update_queued",
+	"cong_update_received",
+	"cong_send_error",
+	"cong_send_blocked",
+};
+
+/*
+ * Copy 'nr' counters out through the info iterator, one rds_info_counter
+ * per entry: names[i] supplies the counter name and values[i] its value.
+ * Every name must be shorter than the name field (enforced by BUG_ON).
+ */
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, char **names, size_t nr)
+{
+	struct rds_info_counter ctr;
+	size_t i;
+
+	for (i = 0; i < nr; i++) {
+		BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+		/* strncpy() zero-fills the first sizeof(ctr.name) - 1 bytes */
+		strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+		/*
+		 * ... but never writes the last one.  ctr lives on the stack
+		 * and is not initialised, so terminate explicitly rather than
+		 * copying out a stale byte and a possibly unterminated name.
+		 */
+		ctr.name[sizeof(ctr.name) - 1] = '\0';
+		ctr.value = values[i];
+
+		rds_info_copy(iter, &ctr, sizeof(ctr));
+	}
+}
+
+/*
+ * This gives global counters across all the transports.  The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting.  Some are pretty implementation dependent
+ * and may change over time.  That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte granular
+ * length in userspace.  It converts it to number of stat entries that the
+ * rest of the functions operate in.
+ */ +static void rds_stats_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof(struct rds_info_counter); + + if (avail < ARRAY_SIZE(rds_stat_names)) { + avail = 0; + goto trans; + } + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, + ARRAY_SIZE(rds_stat_names)); + avail -= ARRAY_SIZE(rds_stat_names); + +trans: + lens->each = sizeof(struct rds_info_counter); + lens->nr = rds_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rds_stat_names); +} + +void rds_stats_exit(void) +{ + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); +} + +int __init rds_stats_init(void) +{ + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); + return 0; +} diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 00000000000..307dc5c1be1 --- /dev/null +++ b/net/rds/sysctl.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int  rds_sysctl_max_unacked_packets = 8;
+unsigned int  rds_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rds_sysctl_ping_enable = 1;
+
+/*
+ * Entries registered under the "net" / "rds" path below.  .maxlen must be
+ * the size of the object .data points at: proc_dointvec derives the number
+ * of ints it handles from maxlen, so an oversized maxlen lets it reach
+ * past the variable into whatever follows it.
+ */
+static ctl_table rds_sysctl_rds_table[] = {
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "reconnect_min_delay_ms",
+		.data		= &rds_sysctl_reconnect_min_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min,
+		.extra2		= &rds_sysctl_reconnect_max_jiffies,
+	},
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "reconnect_max_delay_ms",
+		.data		= &rds_sysctl_reconnect_max_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min_jiffies,
+		.extra2		= &rds_sysctl_reconnect_max,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max_unacked_packets",
+		.data		= &rds_sysctl_max_unacked_packets,
+		/* an unsigned int, not an unsigned long */
+		.maxlen         = sizeof(rds_sysctl_max_unacked_packets),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max_unacked_bytes",
+		.data		= &rds_sysctl_max_unacked_bytes,
+		/* an unsigned int, not an unsigned long */
+		.maxlen         = sizeof(rds_sysctl_max_unacked_bytes),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ping_enable",
+		.data		= &rds_sysctl_ping_enable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+	{ .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+	{ }
+};
+
+
+/* Unregister the sysctl table if rds_sysctl_init() registered one. */
+void rds_sysctl_exit(void)
+{
+	if (rds_sysctl_reg_table)
+		unregister_sysctl_table(rds_sysctl_reg_table);
+}
+
+/*
+ * Set the minimum reconnect delay to one millisecond's worth of jiffies and
+ * register the table.  Returns 0, or -ENOMEM if registration fails.
+ */
+int __init rds_sysctl_init(void)
+{
+	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+	rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+	if (rds_sysctl_reg_table == NULL)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 00000000000..828a1bf9ea9
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/random.h> + +#include "rds.h" + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. + */ + +/* Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through c_send_lock, and with connection management callbacks + * via c_cm_lock. + * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. 
+ */ +struct workqueue_struct *rds_wq; + +void rds_connect_complete(struct rds_connection *conn) +{ + if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { + printk(KERN_WARNING "%s: Cannot transition to state UP, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); + return; + } + + rdsdebug("conn %p for %pI4 to %pI4 complete\n", + conn, &conn->c_laddr, &conn->c_faddr); + + conn->c_reconnect_jiffies = 0; + set_bit(0, &conn->c_map_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. 
+ */ +static void rds_queue_reconnect(struct rds_connection *conn) +{ + unsigned long rand; + + rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + conn->c_reconnect_jiffies); + + set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (conn->c_reconnect_jiffies == 0) { + conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + return; + } + + get_random_bytes(&rand, sizeof(rand)); + rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + conn, &conn->c_laddr, &conn->c_faddr); + queue_delayed_work(rds_wq, &conn->c_conn_w, + rand % conn->c_reconnect_jiffies); + + conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + rds_sysctl_reconnect_max_jiffies); +} + +void rds_connect_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + int ret; + + clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + ret = conn->c_trans->conn_connect(conn); + rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); + + if (ret) { + if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) + rds_queue_reconnect(conn); + else + rds_conn_error(conn, "RDS: connect failed\n"); + } + } +} + +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. 
Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + mutex_lock(&conn->c_send_lock); + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + mutex_unlock(&conn->c_send_lock); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. 
*/
+	cancel_delayed_work(&conn->c_conn_w);
+	if (!hlist_unhashed(&conn->c_hash_node))
+		rds_queue_reconnect(conn);
+}
+
+/*
+ * Work handler for c_send_w.  While the connection is UP it runs
+ * rds_send_xmit() and requeues itself on rds_wq: immediately on -EAGAIN,
+ * after a delay of 2 jiffies on -ENOMEM.  Any other result is not retried
+ * here.
+ */
+void rds_send_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = rds_send_xmit(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_send_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_send_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+			/* explicit break: no silent fall-through into default */
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Work handler for c_recv_w.  While the connection is UP it calls the
+ * transport's recv() and requeues itself on rds_wq: immediately on -EAGAIN,
+ * after a delay of 2 jiffies on -ENOMEM.
+ */
+void rds_recv_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = conn->c_trans->recv(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_recv_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_recv_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+			/* explicit break: no silent fall-through into default */
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* Destroy the rds_wq workqueue. */
+void rds_threads_exit(void)
+{
+	destroy_workqueue(rds_wq);
+}
+
+/*
+ * Create the single-threaded "krdsd" workqueue used as rds_wq.
+ * Returns 0, or -ENOMEM if the workqueue cannot be created.
+ */
+int __init rds_threads_init(void)
+{
+	rds_wq = create_singlethread_workqueue("krdsd");
+	if (rds_wq == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 00000000000..767da61ad2f
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/in.h> + +#include "rds.h" +#include "loop.h" + +static LIST_HEAD(rds_transports); +static DECLARE_RWSEM(rds_trans_sem); + +int rds_trans_register(struct rds_transport *trans) +{ + BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); + + down_write(&rds_trans_sem); + + list_add_tail(&trans->t_item, &rds_transports); + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); + + return 0; +} + +void rds_trans_unregister(struct rds_transport *trans) +{ + down_write(&rds_trans_sem); + + list_del_init(&trans->t_item); + printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); +} + +struct rds_transport *rds_trans_get_preferred(__be32 addr) +{ + struct rds_transport *trans; + struct rds_transport *ret = NULL; + + if (IN_LOOPBACK(ntohl(addr))) + return &rds_loop_transport; + + down_read(&rds_trans_sem); + list_for_each_entry(trans, &rds_transports, t_item) { + if (trans->laddr_check(addr) == 0) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +/* + * This returns the number of stats entries in the snapshot and only + * copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. 
+ */ +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) + +{ + struct rds_transport *trans; + unsigned int total = 0; + unsigned int part; + + rds_info_iter_unmap(iter); + down_read(&rds_trans_sem); + + list_for_each_entry(trans, &rds_transports, t_item) { + if (trans->stats_info_copy == NULL) + continue; + + part = trans->stats_info_copy(iter, avail); + avail -= min(avail, part); + total += part; + } + + up_read(&rds_trans_sem); + + return total; +} + diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 5c72a116b1a..f8f047b6124 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -183,13 +183,6 @@ override: if (R_tab == NULL) goto failure; - if (!est && (ret == ACT_P_CREATED || - !gen_estimator_active(&police->tcf_bstats, - &police->tcf_rate_est))) { - err = -EINVAL; - goto failure; - } - if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE]); @@ -205,6 +198,12 @@ override: &police->tcf_lock, est); if (err) goto failure_unlock; + } else if (tb[TCA_POLICE_AVRATE] && + (ret == ACT_P_CREATED || + !gen_estimator_active(&police->tcf_bstats, + &police->tcf_rate_est))) { + err = -EINVAL; + goto failure_unlock; } /* No failure allowed after this point */ diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 9e43ed94916..d728d811173 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1960,8 +1960,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) cbq_rmprio(q, cl); sch_tree_unlock(sch); - if (--cl->refcnt == 0) - cbq_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ return 0; } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index f6b4fa97df7..7597fe14686 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -66,11 +66,15 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DRR_MAX + 1]; u32 quantum; int err; - err = nla_parse_nested(tb, TCA_DRR_MAX, tca[TCA_OPTIONS], drr_policy); + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy); if (err < 0) return err; @@ -151,8 +155,11 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg) drr_purge_queue(cl); qdisc_class_hash_remove(&q->clhash, &cl->common); - if (--cl->refcnt == 0) - drr_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 74226b26552..5022f9c1f34 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1139,8 +1139,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) hfsc_purge_queue(sch, cl); qdisc_class_hash_remove(&q->clhash, &cl->cl_common); - if (--cl->refcnt == 0) - hfsc_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 355974f610c..88cd0262662 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1275,8 +1275,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) if (last_child) htb_parent_to_leaf(q, cl, new_q); - if (--cl->refcnt == 0) - htb_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a2f93c09f3c..e22dfe85e43 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -236,7 +236,6 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) struct tc_tbf_qopt *qopt; struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; - struct qdisc_rate_table *tmp; struct Qdisc *child = NULL; int max_size,n; @@ -295,13 +294,9 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - tmp = q->R_tab; - q->R_tab = rtab; - rtab = tmp; + swap(q->R_tab, rtab); + swap(q->P_tab, ptab); - tmp = q->P_tab; - q->P_tab = ptab; - ptab = tmp; sch_tree_unlock(sch); err = 0; done: diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 67715f4eb84..7ff548a30cf 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -86,6 +86,9 @@ const char *sctp_cname(const sctp_subtype_t cid) case SCTP_CID_FWD_TSN: return "FWD_TSN"; + case SCTP_CID_AUTH: + return "AUTH"; + default: break; } @@ -135,6 +138,7 @@ static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = { "PRIMITIVE_ABORT", "PRIMITIVE_SEND", "PRIMITIVE_REQUESTHEARTBEAT", + "PRIMITIVE_ASCONF", }; /* Lookup primitive debug name. 
*/ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 4c8d9f45ce0..905fda582b9 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -111,7 +111,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, if (sctp_addip_enable) { auth_chunks->chunks[0] = SCTP_CID_ASCONF; auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK; - auth_chunks->param_hdr.length += htons(2); + auth_chunks->param_hdr.length = + htons(sizeof(sctp_paramhdr_t) + 2); } } diff --git a/net/sctp/output.c b/net/sctp/output.c index 07d58903a74..7d08f522ec8 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -49,13 +49,10 @@ #include <linux/ipv6.h> #include <linux/init.h> #include <net/inet_ecn.h> +#include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> -#ifndef TEST_FRAME -#include <net/tcp.h> -#endif /* TEST_FRAME (not defined) */ - #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index bc411c89621..d765fc53e74 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -428,7 +428,8 @@ void sctp_retransmit_mark(struct sctp_outq *q, * retransmitting due to T3 timeout. */ if (reason == SCTP_RTXR_T3_RTX && - (jiffies - chunk->sent_at) < transport->last_rto) + time_before(jiffies, chunk->sent_at + + transport->last_rto)) continue; /* RFC 2960 6.2.1 Processing a Received SACK @@ -1757,6 +1758,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) struct sctp_chunk *chunk; struct list_head *lchunk, *temp; + if (!asoc->peer.prsctp_capable) + return; + /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. 
* diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c1e316ee715..cb198af8887 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -692,15 +692,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, static int sctp_ctl_sock_init(void) { int err; - sa_family_t family; + sa_family_t family = PF_INET; if (sctp_get_pf_specific(PF_INET6)) family = PF_INET6; - else - family = PF_INET; err = inet_ctl_sock_create(&sctp_ctl_sock, family, SOCK_SEQPACKET, IPPROTO_SCTP, &init_net); + + /* If IPv6 socket could not be created, try the IPv4 socket */ + if (err < 0 && family == PF_INET6) + err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET, + SOCK_SEQPACKET, IPPROTO_SCTP, + &init_net); + if (err < 0) { printk(KERN_ERR "SCTP: Failed to create the SCTP control socket.\n"); @@ -1297,9 +1302,8 @@ SCTP_STATIC __init int sctp_init(void) out: return status; err_v6_add_protocol: - sctp_v6_del_protocol(); -err_add_protocol: sctp_v4_del_protocol(); +err_add_protocol: inet_ctl_sock_destroy(sctp_ctl_sock); err_ctl_sock_init: sctp_v6_protosw_exit(); @@ -1310,7 +1314,6 @@ err_protosw_init: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); - list_del(&sctp_af_inet.list); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); @@ -1358,7 +1361,6 @@ SCTP_STATIC __exit void sctp_exit(void) sctp_v4_pf_exit(); sctp_sysctl_unregister(); - list_del(&sctp_af_inet.list); free_pages((unsigned long)sctp_assoc_hashtable, get_order(sctp_assoc_hashsize * diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index b40e95f9851..6851ee94e97 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -224,7 +224,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_ext += 2; } - chunksize += sizeof(aiparam); + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); + chunksize += vparam_len; /* Account for AUTH related parameters */ @@ -304,10 
+306,12 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, if (sctp_prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); - aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; - aiparam.param_hdr.length = htons(sizeof(aiparam)); - aiparam.adaptation_ind = htonl(sp->adaptation_ind); - sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } /* Add SCTP-AUTH chunks to the parameter list */ if (sctp_auth_enable) { @@ -332,6 +336,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_inithdr_t initack; struct sctp_chunk *retval; union sctp_params addrs; + struct sctp_sock *sp; int addrs_len; sctp_cookie_param_t *cookie; int cookie_len; @@ -366,22 +371,24 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ + sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. 
*/ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); - if (sctp_prsctp_enable) + if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); - if (sctp_addip_enable) { + if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } - chunksize += sizeof(aiparam); + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); if (asoc->peer.auth_capable) { auth_random = (sctp_paramhdr_t *)asoc->c.auth_random; @@ -432,10 +439,12 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); - aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; - aiparam.param_hdr.length = htons(sizeof(aiparam)); - aiparam.adaptation_ind = htonl(sctp_sk(asoc->base.sk)->adaptation_ind); - sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 0146cfb1f18..e2020eb2c8c 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -434,7 +434,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { * */ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, - struct sctp_transport *transport) + struct sctp_transport *transport, + int is_hb) { /* The check for association's overall error counter exceeding the * threshold is done in the state function. @@ -466,7 +467,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, * The first unacknowleged HB triggers it. We do this with a flag * that indicates that we have an outstanding HB. 
*/ - if (transport->hb_sent) { + if (!is_hb || transport->hb_sent) { transport->last_rto = transport->rto; transport->rto = min((transport->rto * 2), transport->asoc->rto_max); } @@ -657,20 +658,6 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, sctp_transport_hold(t); } -/* Helper function to do a transport reset at the expiry of the hearbeat - * timer. - */ -static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds, - struct sctp_association *asoc, - struct sctp_transport *t) -{ - sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); - - /* Mark one strike against a transport. */ - sctp_do_8_2_transport_strike(asoc, t); - - t->hb_sent = 1; -} /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, @@ -800,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct sctp_operr_chunk *operr_chunk; struct sctp_errhdr *err_hdr; + struct sctp_ulpevent *ev; - operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr; - err_hdr = &operr_chunk->err_hdr; + while (chunk->chunk_end > chunk->skb->data) { + err_hdr = (struct sctp_errhdr *)(chunk->skb->data); - switch (err_hdr->cause) { - case SCTP_ERROR_UNKNOWN_CHUNK: - { - struct sctp_chunkhdr *unk_chunk_hdr; + ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, + GFP_ATOMIC); + if (!ev) + return; - unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable; - switch (unk_chunk_hdr->type) { - /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an - * ERROR chunk reporting that it did not recognized the ASCONF - * chunk type, the sender of the ASCONF MUST NOT send any - * further ASCONF chunks and MUST stop its T-4 timer. 
- */ - case SCTP_CID_ASCONF: - asoc->peer.asconf_capable = 0; - sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, + sctp_ulpq_tail_event(&asoc->ulpq, ev); + + switch (err_hdr->cause) { + case SCTP_ERROR_UNKNOWN_CHUNK: + { + sctp_chunkhdr_t *unk_chunk_hdr; + + unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable; + switch (unk_chunk_hdr->type) { + /* ADDIP 4.1 A9) If the peer responds to an ASCONF with + * an ERROR chunk reporting that it did not recognized + * the ASCONF chunk type, the sender of the ASCONF MUST + * NOT send any further ASCONF chunks and MUST stop its + * T-4 timer. + */ + case SCTP_CID_ASCONF: + if (asoc->peer.asconf_capable == 0) + break; + + asoc->peer.asconf_capable = 0; + sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + break; + default: + break; + } break; + } default: break; } - break; - } - default: - break; } } @@ -1459,12 +1458,19 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_STRIKE: /* Mark one strike against a transport. */ - sctp_do_8_2_transport_strike(asoc, cmd->obj.transport); + sctp_do_8_2_transport_strike(asoc, cmd->obj.transport, + 0); + break; + + case SCTP_CMD_TRANSPORT_IDLE: + t = cmd->obj.transport; + sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); break; - case SCTP_CMD_TRANSPORT_RESET: + case SCTP_CMD_TRANSPORT_HB_SENT: t = cmd->obj.transport; - sctp_cmd_transport_reset(commands, asoc, t); + sctp_do_8_2_transport_strike(asoc, t, 1); + t->hb_sent = 1; break; case SCTP_CMD_TRANSPORT_ON: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3a0cd075914..55a61aa6966 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -988,7 +988,9 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, /* Set transport error counter and association error counter * when sending heartbeat. 
*/ - sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE, + SCTP_TRANSPORT(transport)); + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, SCTP_TRANSPORT(transport)); } sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE, @@ -3163,7 +3165,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - struct sctp_ulpevent *ev; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); @@ -3173,21 +3174,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, return sctp_sf_violation_chunklen(ep, asoc, type, arg, commands); - while (chunk->chunk_end > chunk->skb->data) { - ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, - GFP_ATOMIC); - if (!ev) - goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, + SCTP_CHUNK(chunk)); - sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, - SCTP_ULPEVENT(ev)); - sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, - SCTP_CHUNK(chunk)); - } return SCTP_DISPOSITION_CONSUME; - -nomem: - return SCTP_DISPOSITION_NOMEM; } /* @@ -4967,7 +4957,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( * to that address and not acknowledged within one RTO. 
* */ - sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, SCTP_TRANSPORT(arg)); return SCTP_DISPOSITION_CONSUME; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index dea864f5de5..5fb3a8c9792 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3069,9 +3069,6 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int val; int assoc_id = 0; - if (optlen < sizeof(int)) - return -EINVAL; - if (optlen == sizeof(int)) { printk(KERN_WARNING "SCTP: Use of int in max_burst socket option deprecated\n"); @@ -5283,16 +5280,14 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_sock *sp; struct sctp_association *asoc; - if (len < sizeof(int)) - return -EINVAL; - if (len == sizeof(int)) { printk(KERN_WARNING "SCTP: Use of int in max_burst socket option deprecated\n"); printk(KERN_WARNING "SCTP: Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; - } else if (len == sizeof (struct sctp_assoc_value)) { + } else if (len >= sizeof(struct sctp_assoc_value)) { + len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; } else @@ -5848,37 +5843,28 @@ static int sctp_get_port(struct sock *sk, unsigned short snum) } /* - * 3.1.3 listen() - UDP Style Syntax - * - * By default, new associations are not accepted for UDP style sockets. - * An application uses listen() to mark a socket as being able to - * accept new associations. + * Move a socket to LISTENING state. */ -SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) +SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; + struct crypto_hash *tfm = NULL; - /* Only UDP style sockets that are not peeled off are allowed to - * listen(). - */ - if (!sctp_style(sk, UDP)) - return -EINVAL; - - /* If backlog is zero, disable listening. 
*/ - if (!backlog) { - if (sctp_sstate(sk, CLOSED)) - return 0; - - sctp_unhash_endpoint(ep); - sk->sk_state = SCTP_SS_CLOSED; - return 0; + /* Allocate HMAC for generating cookie. */ + if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { + tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + if (net_ratelimit()) { + printk(KERN_INFO + "SCTP: failed to load transform for %s: %ld\n", + sctp_hmac_alg, PTR_ERR(tfm)); + } + return -ENOSYS; + } + sctp_sk(sk)->hmac = tfm; } - /* Return if we are already listening. */ - if (sctp_sstate(sk, LISTENING)) - return 0; - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -5889,7 +5875,6 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) * extensions draft, but follows the practice as seen in TCP * sockets. * - * Additionally, turn off fastreuse flag since we are not listening */ sk->sk_state = SCTP_SS_LISTENING; if (!ep->base.bind_addr.port) { @@ -5900,113 +5885,71 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) sk->sk_state = SCTP_SS_CLOSED; return -EADDRINUSE; } - sctp_sk(sk)->bind_hash->fastreuse = 0; } - sctp_hash_endpoint(ep); - return 0; -} - -/* - * 4.1.3 listen() - TCP Style Syntax - * - * Applications uses listen() to ready the SCTP endpoint for accepting - * inbound associations. - */ -SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog) -{ - struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; - - /* If backlog is zero, disable listening. 
*/ - if (!backlog) { - if (sctp_sstate(sk, CLOSED)) - return 0; - - sctp_unhash_endpoint(ep); - sk->sk_state = SCTP_SS_CLOSED; - return 0; - } - - if (sctp_sstate(sk, LISTENING)) - return 0; - - /* - * If a bind() or sctp_bindx() is not called prior to a listen() - * call that allows new associations to be accepted, the system - * picks an ephemeral port and will choose an address set equivalent - * to binding with a wildcard address. - * - * This is not currently spelled out in the SCTP sockets - * extensions draft, but follows the practice as seen in TCP - * sockets. - */ - sk->sk_state = SCTP_SS_LISTENING; - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) - return -EAGAIN; - } else - sctp_sk(sk)->bind_hash->fastreuse = 0; - sk->sk_max_ack_backlog = backlog; sctp_hash_endpoint(ep); return 0; } /* + * 4.1.3 / 5.1.3 listen() + * + * By default, new associations are not accepted for UDP style sockets. + * An application uses listen() to mark a socket as being able to + * accept new associations. + * + * On TCP style sockets, applications use listen() to ready the SCTP + * endpoint for accepting inbound associations. + * + * On both types of endpoints a backlog of '0' disables listening. + * * Move a socket to LISTENING state. */ int sctp_inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; - struct crypto_hash *tfm = NULL; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; int err = -EINVAL; if (unlikely(backlog < 0)) - goto out; + return err; sctp_lock_sock(sk); + /* Peeled-off sockets are not allowed to listen(). */ + if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) + goto out; + if (sock->state != SS_UNCONNECTED) goto out; - /* Allocate HMAC for generating cookie. 
*/ - if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { - tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) { - if (net_ratelimit()) { - printk(KERN_INFO - "SCTP: failed to load transform for %s: %ld\n", - sctp_hmac_alg, PTR_ERR(tfm)); - } - err = -ENOSYS; + /* If backlog is zero, disable listening. */ + if (!backlog) { + if (sctp_sstate(sk, CLOSED)) goto out; - } - } - switch (sock->type) { - case SOCK_SEQPACKET: - err = sctp_seqpacket_listen(sk, backlog); - break; - case SOCK_STREAM: - err = sctp_stream_listen(sk, backlog); - break; - default: - break; + err = 0; + sctp_unhash_endpoint(ep); + sk->sk_state = SCTP_SS_CLOSED; + if (sk->sk_reuse) + sctp_sk(sk)->bind_hash->fastreuse = 1; + goto out; } - if (err) - goto cleanup; + /* If we are already listening, just update the backlog */ + if (sctp_sstate(sk, LISTENING)) + sk->sk_max_ack_backlog = backlog; + else { + err = sctp_listen_start(sk, backlog); + if (err) + goto out; + } - /* Store away the transform reference. */ - if (!sctp_sk(sk)->hmac) - sctp_sk(sk)->hmac = tfm; + err = 0; out: sctp_release_sock(sk); return err; -cleanup: - crypto_free_hash(tfm); - goto out; } /* diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 5c29b14ee9a..e5dde45c79d 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -543,8 +543,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * congestion indications more than once every window of * data (or more loosely more than once every round-trip time). */ - if ((jiffies - transport->last_time_ecne_reduced) > - transport->rtt) { + if (time_after(jiffies, transport->last_time_ecne_reduced + + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; @@ -561,7 +561,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * to be done every RTO interval, we do it every hearbeat * interval. 
*/ - if ((jiffies - transport->last_time_used) > transport->rto) + if (time_after(jiffies, transport->last_time_used + + transport->rto)) transport->cwnd = max(transport->cwnd/2, 4*transport->asoc->pathmtu); break; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 3ddaff42d1b..a3bfd406491 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -119,7 +119,7 @@ static struct bclink *bclink = NULL; static struct link *bcl = NULL; static DEFINE_SPINLOCK(bc_lock); -char tipc_bclink_name[] = "multicast-link"; +const char tipc_bclink_name[] = "multicast-link"; static u32 buf_seqno(struct sk_buff *buf) @@ -800,7 +800,7 @@ int tipc_bclink_init(void) tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->b_ptr = &bcbearer->bearer; bcl->state = WORKING_WORKING; - sprintf(bcl->name, tipc_bclink_name); + strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); if (BCLINK_LOG_BUF_SIZE) { char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 2f2d731bc1c..4c1771e95c9 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -70,7 +70,7 @@ struct port_list { struct tipc_node; -extern char tipc_bclink_name[]; +extern const char tipc_bclink_name[]; /** diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c index 29ecae85166..1885a7edb0c 100644 --- a/net/tipc/dbg.c +++ b/net/tipc/dbg.c @@ -258,7 +258,7 @@ void tipc_printf(struct print_buf *pb, const char *fmt, ...) 
} if (pb->echo) - printk(print_string); + printk("%s", print_string); spin_unlock_bh(&print_lock); } diff --git a/net/tipc/node.c b/net/tipc/node.c index 20d98c56e15..2c24e7d6d95 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -703,7 +703,7 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) link_info.dest = htonl(tipc_own_addr & 0xfffff00); link_info.up = htonl(1); - sprintf(link_info.str, tipc_bclink_name); + strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME); tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); /* Add TLVs for any other links in scope */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d1b89820ab4..baac91049b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1178,8 +1178,7 @@ out_unlock: unix_state_unlock(other); out: - if (skb) - kfree_skb(skb); + kfree_skb(skb); if (newsk) unix_release_sock(newsk, 0); if (other) diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 39701dec1db..466e2d22d25 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -86,8 +86,10 @@ static int wanrouter_device_del_if(struct wan_device *wandev, static struct wan_device *wanrouter_find_device(char *name); static int wanrouter_delete_interface(struct wan_device *wandev, char *name); -static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); -static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); +static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock); +static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock); @@ -763,12 +765,14 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name) } static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock) { spin_lock_irqsave(lock, *smp_flags); } static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock) { 
spin_unlock_irqrestore(lock, *smp_flags); } diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index 267f7ff4982..c44d96b3a43 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -80,6 +80,7 @@ static struct proc_dir_entry *proc_router; * Iterator */ static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(kernel_lock) { struct wan_device *wandev; loff_t l = *pos; @@ -101,6 +102,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) } static void r_stop(struct seq_file *m, void *v) + __releases(kernel_lock) { unlock_kernel(); } diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index e28e2b8fa43..092ae6faccc 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -102,3 +102,13 @@ config LIB80211_CRYPT_CCMP config LIB80211_CRYPT_TKIP tristate + +config LIB80211_DEBUG + bool "lib80211 debugging messages" + depends on LIB80211 + default n + ---help--- + You can enable this if you want verbose debugging messages + from lib80211. + + If unsure, say N. diff --git a/net/wireless/core.c b/net/wireless/core.c index 0668b2bfc1d..17fe3904974 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -7,7 +7,6 @@ #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/mutex.h> #include <linux/list.h> #include <linux/nl80211.h> #include <linux/debugfs.h> @@ -31,18 +30,29 @@ MODULE_DESCRIPTION("wireless configuration support"); * only read the list, and that can happen quite * often because we need to do it for each command */ LIST_HEAD(cfg80211_drv_list); -DEFINE_MUTEX(cfg80211_drv_mutex); + +/* + * This is used to protect the cfg80211_drv_list, cfg80211_regdomain, + * country_ie_regdomain, the reg_beacon_list and the the last regulatory + * request receipt (last_request). + */ +DEFINE_MUTEX(cfg80211_mutex); /* for debugfs */ static struct dentry *ieee80211_debugfs_dir; -/* requires cfg80211_drv_mutex to be held! 
*/ -static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy) +/* requires cfg80211_mutex to be held! */ +struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *drv; + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + list_for_each_entry(drv, &cfg80211_drv_list, list) { - if (drv->idx == wiphy) { + if (drv->wiphy_idx == wiphy_idx) { result = drv; break; } @@ -51,17 +61,44 @@ static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy) return result; } +int get_wiphy_idx(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *drv; + if (!wiphy) + return WIPHY_IDX_STALE; + drv = wiphy_to_dev(wiphy); + return drv->wiphy_idx; +} + /* requires cfg80211_drv_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) +{ + struct cfg80211_registered_device *drv; + + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + + drv = cfg80211_drv_by_wiphy_idx(wiphy_idx); + if (!drv) + return NULL; + return &drv->wiphy; +} + +/* requires cfg80211_mutex to be held! 
*/ static struct cfg80211_registered_device * __cfg80211_drv_from_info(struct genl_info *info) { int ifindex; - struct cfg80211_registered_device *bywiphy = NULL, *byifidx = NULL; + struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL; struct net_device *dev; int err = -EINVAL; + assert_cfg80211_lock(); + if (info->attrs[NL80211_ATTR_WIPHY]) { - bywiphy = cfg80211_drv_by_wiphy( + bywiphyidx = cfg80211_drv_by_wiphy_idx( nla_get_u32(info->attrs[NL80211_ATTR_WIPHY])); err = -ENODEV; } @@ -78,14 +115,14 @@ __cfg80211_drv_from_info(struct genl_info *info) err = -ENODEV; } - if (bywiphy && byifidx) { - if (bywiphy != byifidx) + if (bywiphyidx && byifidx) { + if (bywiphyidx != byifidx) return ERR_PTR(-EINVAL); else - return bywiphy; /* == byifidx */ + return bywiphyidx; /* == byifidx */ } - if (bywiphy) - return bywiphy; + if (bywiphyidx) + return bywiphyidx; if (byifidx) return byifidx; @@ -98,7 +135,7 @@ cfg80211_get_dev_from_info(struct genl_info *info) { struct cfg80211_registered_device *drv; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); drv = __cfg80211_drv_from_info(info); /* if it is not an error we grab the lock on @@ -107,7 +144,7 @@ cfg80211_get_dev_from_info(struct genl_info *info) if (!IS_ERR(drv)) mutex_lock(&drv->mtx); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return drv; } @@ -118,7 +155,7 @@ cfg80211_get_dev_from_ifindex(int ifindex) struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV); struct net_device *dev; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); dev = dev_get_by_index(&init_net, ifindex); if (!dev) goto out; @@ -129,7 +166,7 @@ cfg80211_get_dev_from_ifindex(int ifindex) drv = ERR_PTR(-ENODEV); dev_put(dev); out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return drv; } @@ -143,16 +180,16 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { struct cfg80211_registered_device *drv; - int idx, taken = 
-1, result, digits; + int wiphy_idx, taken = -1, result, digits; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); /* prohibit calling the thing phy%d when %d is not its number */ - sscanf(newname, PHY_NAME "%d%n", &idx, &taken); - if (taken == strlen(newname) && idx != rdev->idx) { - /* count number of places needed to print idx */ + sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); + if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { + /* count number of places needed to print wiphy_idx */ digits = 1; - while (idx /= 10) + while (wiphy_idx /= 10) digits++; /* * deny the name if it is phy<idx> where <idx> is printed @@ -193,7 +230,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, result = 0; out_unlock: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); if (result == 0) nl80211_notify_dev_rename(rdev); @@ -220,22 +257,22 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) drv->ops = ops; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); - drv->idx = wiphy_counter++; + drv->wiphy_idx = wiphy_counter++; - if (unlikely(drv->idx < 0)) { + if (unlikely(!wiphy_idx_valid(drv->wiphy_idx))) { wiphy_counter--; - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); /* ugh, wrapped! 
*/ kfree(drv); return NULL; } - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); /* give it a proper name */ - dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->idx); + dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->wiphy_idx); mutex_init(&drv->mtx); mutex_init(&drv->devlist_mtx); @@ -310,10 +347,10 @@ int wiphy_register(struct wiphy *wiphy) /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); /* set up regulatory info */ - wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE); + wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); res = device_add(&drv->wiphy.dev); if (res) @@ -328,9 +365,20 @@ int wiphy_register(struct wiphy *wiphy) if (IS_ERR(drv->wiphy.debugfsdir)) drv->wiphy.debugfsdir = NULL; + if (wiphy->custom_regulatory) { + struct regulatory_request request; + + request.wiphy_idx = get_wiphy_idx(wiphy); + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + request.alpha2[0] = '9'; + request.alpha2[1] = '9'; + + nl80211_send_reg_change_event(&request); + } + res = 0; out_unlock: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return res; } EXPORT_SYMBOL(wiphy_register); @@ -340,7 +388,7 @@ void wiphy_unregister(struct wiphy *wiphy) struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy); /* protect the device list */ - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); BUG_ON(!list_empty(&drv->netdev_list)); @@ -366,7 +414,7 @@ void wiphy_unregister(struct wiphy *wiphy) device_del(&drv->wiphy.dev); debugfs_remove(drv->wiphy.debugfsdir); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); } EXPORT_SYMBOL(wiphy_unregister); diff --git a/net/wireless/core.h b/net/wireless/core.h index e29ad4cd464..6acd483a61f 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -10,6 +10,7 @@ #include <linux/netdevice.h> #include <linux/kref.h> #include <linux/rbtree.h> +#include <linux/mutex.h> #include 
<net/genetlink.h> #include <net/wireless.h> #include <net/cfg80211.h> @@ -37,7 +38,7 @@ struct cfg80211_registered_device { enum environment_cap env; /* wiphy index, internal only */ - int idx; + int wiphy_idx; /* associate netdev list */ struct mutex devlist_mtx; @@ -49,6 +50,7 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + unsigned long suspend_at; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ @@ -62,9 +64,27 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) return container_of(wiphy, struct cfg80211_registered_device, wiphy); } -extern struct mutex cfg80211_drv_mutex; +/* Note 0 is valid, hence phy0 */ +static inline +bool wiphy_idx_valid(int wiphy_idx) +{ + return (wiphy_idx >= 0); +} + +extern struct mutex cfg80211_mutex; extern struct list_head cfg80211_drv_list; +static inline void assert_cfg80211_lock(void) +{ + WARN_ON(!mutex_is_locked(&cfg80211_mutex)); +} + +/* + * You can use this to mark a wiphy_idx as not having an associated wiphy. + * It guarantees cfg80211_drv_by_wiphy_idx(wiphy_idx) will return NULL + */ +#define WIPHY_IDX_STALE -1 + struct cfg80211_internal_bss { struct list_head list; struct rb_node rbn; @@ -74,6 +94,9 @@ struct cfg80211_internal_bss { struct cfg80211_bss pub; }; +struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx); +int get_wiphy_idx(struct wiphy *wiphy); + /* * This function returns a pointer to the driver * that the genl_info item that is passed refers to. @@ -81,13 +104,13 @@ struct cfg80211_internal_bss { * the driver's mutex! * * This means that you need to call cfg80211_put_dev() - * before being allowed to acquire &cfg80211_drv_mutex! + * before being allowed to acquire &cfg80211_mutex! 
* * This is necessary because we need to lock the global * mutex to get an item off the list safely, and then * we lock the drv mutex so it doesn't go away under us. * - * We don't want to keep cfg80211_drv_mutex locked + * We don't want to keep cfg80211_mutex locked * for all the time in order to allow requests on * other interfaces to go through at the same time. * @@ -97,6 +120,9 @@ struct cfg80211_internal_bss { extern struct cfg80211_registered_device * cfg80211_get_dev_from_info(struct genl_info *info); +/* requires cfg80211_drv_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); + /* identical to cfg80211_get_dev_from_info but only operate on ifindex */ extern struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(int ifindex); @@ -110,8 +136,11 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator setby); void cfg80211_bss_expire(struct cfg80211_registered_device *dev); +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs); #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index db428194c16..2301dc1edc4 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -337,6 +337,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) pos += 8; if (ccmp_replay_check(pn, key->rx_pn)) { +#ifdef CONFIG_LIB80211_DEBUG if (net_ratelimit()) { printk(KERN_DEBUG "CCMP: replay detected: STA=%pM " "previous PN %02x%02x%02x%02x%02x%02x " @@ -346,6 +347,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); } +#endif 
key->dot11RSNAStatsCCMPReplays++; return -4; } diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 7e8e22bfed9..c36287399d7 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -465,12 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) pos += 8; if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { +#ifdef CONFIG_LIB80211_DEBUG if (net_ratelimit()) { printk(KERN_DEBUG "TKIP: replay detected: STA=%pM" " previous TSC %08x%04x received TSC " "%08x%04x\n", hdr->addr2, tkey->rx_iv32, tkey->rx_iv16, iv32, iv16); } +#endif tkey->dot11RSNAStatsTKIPReplays++; return -4; } @@ -505,10 +507,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) * it needs to be recalculated for the next packet. */ tkey->rx_phase1_done = 0; } +#ifdef CONFIG_LIB80211_DEBUG if (net_ratelimit()) { printk(KERN_DEBUG "TKIP: ICV error detected: STA=" "%pM\n", hdr->addr2); } +#endif tkey->dot11RSNAStatsTKIPICVErrors++; return -5; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 298a4de5994..ab9d8f14e15 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7,7 +7,6 @@ #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/mutex.h> #include <linux/list.h> #include <linux/if_ether.h> #include <linux/ieee80211.h> @@ -142,7 +141,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, if (!hdr) return -1; - NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx); NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, dev->wiphy.max_scan_ssids); @@ -256,7 +255,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[0]; struct cfg80211_registered_device *dev; - mutex_lock(&cfg80211_drv_mutex); + 
mutex_lock(&cfg80211_mutex); list_for_each_entry(dev, &cfg80211_drv_list, list) { if (++idx <= start) continue; @@ -267,7 +266,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) break; } } - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); cb->args[0] = idx; @@ -470,7 +469,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * struct cfg80211_registered_device *dev; struct wireless_dev *wdev; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); list_for_each_entry(dev, &cfg80211_drv_list, list) { if (wp_idx < wp_start) { wp_idx++; @@ -497,7 +496,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; } out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); cb->args[0] = wp_idx; cb->args[1] = if_idx; @@ -1206,6 +1205,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, nla_nest_end(msg, txrate); } + if (sinfo->filled & STATION_INFO_RX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS, + sinfo->rx_packets); + if (sinfo->filled & STATION_INFO_TX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, + sinfo->tx_packets); nla_nest_end(msg, sinfoattr); return genlmsg_end(msg, hdr); @@ -1900,6 +1905,19 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) int r; char *data = NULL; + /* + * You should only get this when cfg80211 hasn't yet initialized + * completely when built-in to the kernel right between the time + * window between nl80211_init() and regulatory_init(), if that is + * even possible. 
+ */ + mutex_lock(&cfg80211_mutex); + if (unlikely(!cfg80211_regdomain)) { + mutex_unlock(&cfg80211_mutex); + return -EINPROGRESS; + } + mutex_unlock(&cfg80211_mutex); + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) return -EINVAL; @@ -1910,14 +1928,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) if (is_world_regdom(data)) return -EINVAL; #endif - mutex_lock(&cfg80211_drv_mutex); - r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY); - mutex_unlock(&cfg80211_drv_mutex); - /* This means the regulatory domain was already set, however - * we don't want to confuse userspace with a "successful error" - * message so lets just treat it as a success */ - if (r == -EALREADY) - r = 0; + + r = regulatory_hint_user(data); + return r; } @@ -1937,6 +1950,11 @@ static int nl80211_get_mesh_params(struct sk_buff *skb, if (err) return err; + if (!drv->ops->get_mesh_params) { + err = -EOPNOTSUPP; + goto out; + } + /* Get the mesh params */ rtnl_lock(); err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params); @@ -2046,6 +2064,11 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!drv->ops->set_mesh_params) { + err = -EOPNOTSUPP; + goto out; + } + /* This makes sure that there aren't more than 32 mesh config * parameters (otherwise our bitfield scheme would not work.) 
*/ BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); @@ -2090,6 +2113,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask); rtnl_unlock(); + out: /* cleanup */ cfg80211_put_dev(drv); dev_put(dev); @@ -2106,7 +2130,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) unsigned int i; int err = -EINVAL; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); if (!cfg80211_regdomain) goto out; @@ -2169,7 +2193,7 @@ nla_put_failure: genlmsg_cancel(msg, hdr); err = -EMSGSIZE; out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return err; } @@ -2228,9 +2252,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) BUG_ON(rule_idx != num_rules); - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); r = set_regdom(rd); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return r; bad_reg: @@ -2286,6 +2310,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels = 0, i; enum ieee80211_band band; + size_t ie_len; err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); if (err) @@ -2327,9 +2352,15 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } + if (info->attrs[NL80211_ATTR_IE]) + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + else + ie_len = 0; + request = kzalloc(sizeof(*request) + sizeof(*ssid) * n_ssids - + sizeof(channel) * n_channels, GFP_KERNEL); + + sizeof(channel) * n_channels + + ie_len, GFP_KERNEL); if (!request) { err = -ENOMEM; goto out_unlock; @@ -2340,6 +2371,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (n_ssids) request->ssids = (void *)(request->channels + n_channels); request->n_ssids = n_ssids; + if (ie_len) { + if (request->ssids) + request->ie = (void *)(request->ssids + n_ssids); + 
else + request->ie = (void *)(request->channels + n_channels); + } if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { /* user specified, bail out if channel not found */ @@ -2380,6 +2417,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_IE]) { + request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy(request->ie, nla_data(info->attrs[NL80211_ATTR_IE]), + request->ie_len); + } + request->ifidx = dev->ifindex; request->wiphy = &drv->wiphy; @@ -2432,7 +2475,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability); NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq); - switch (res->signal_type) { + switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal); break; @@ -2601,7 +2644,6 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_get_station, .dumpit = nl80211_dump_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, }, { .cmd = NL80211_CMD_SET_STATION, @@ -2708,6 +2750,9 @@ static struct genl_multicast_group nl80211_config_mcgrp = { static struct genl_multicast_group nl80211_scan_mcgrp = { .name = "scan", }; +static struct genl_multicast_group nl80211_regulatory_mcgrp = { + .name = "regulatory", +}; /* notification functions */ @@ -2739,7 +2784,7 @@ static int nl80211_send_scan_donemsg(struct sk_buff *msg, if (!hdr) return -1; - NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->idx); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); /* XXX: we should probably bounce back the request? 
*/ @@ -2787,6 +2832,61 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); } +/* + * This can happen on global regulatory changes or device specific settings + * based on custom world regulatory domains. + */ +void nl80211_send_reg_change_event(struct regulatory_request *request) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* Userspace can always count this one always being set */ + NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator); + + if (request->alpha2[0] == '0' && request->alpha2[1] == '0') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_WORLD); + else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_CUSTOM_WORLD); + else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') || + request->intersect) + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_INTERSECTION); + else { + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_COUNTRY); + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2); + } + + if (wiphy_idx_valid(request->wiphy_idx)) + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) @@ -2811,6 +2911,10 @@ int nl80211_init(void) if (err) goto err_out; + err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp); + if (err) + goto err_out; + return 0; err_out: genl_unregister_family(&nl80211_fam); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h 
index b565a5f84e9..e65a3c38c52 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,6 +11,7 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, struct net_device *netdev); extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, struct net_device *netdev); +extern void nl80211_send_reg_change_event(struct regulatory_request *request); #else static inline int nl80211_init(void) { @@ -27,6 +28,14 @@ static inline void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, struct net_device *netdev) {} +static inline void nl80211_send_scan_aborted( + struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{} +static inline void +nl80211_send_reg_change_event(struct regulatory_request *request) +{ +} #endif /* CONFIG_NL80211 */ #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2323644330c..eb8b8ed1615 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -41,6 +41,7 @@ #include <net/cfg80211.h> #include "core.h" #include "reg.h" +#include "nl80211.h" /* Receipt of information from last regulatory request */ static struct regulatory_request *last_request; @@ -54,22 +55,63 @@ static u32 supported_bandwidths[] = { MHZ_TO_KHZ(20), }; -/* Central wireless core regulatory domains, we only need two, +/* + * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no - * information to give us an alpha2 */ + * information to give us an alpha2 + */ const struct ieee80211_regdomain *cfg80211_regdomain; -/* We use this as a place for the rd structure built from the +/* + * We use this as a place for the rd structure built from the * last parsed country IE to rest until CRDA gets back to us with - * what it thinks should apply for the same country */ + * what it thinks should apply for the same country + */ static const struct ieee80211_regdomain *country_ie_regdomain; +/* 
Used to queue up regulatory hints */ +static LIST_HEAD(reg_requests_list); +static spinlock_t reg_requests_lock; + +/* Used to queue up beacon hints for review */ +static LIST_HEAD(reg_pending_beacons); +static spinlock_t reg_pending_beacons_lock; + +/* Used to keep track of processed beacon hints */ +static LIST_HEAD(reg_beacon_list); + +struct reg_beacon { + struct list_head list; + struct ieee80211_channel chan; +}; + /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { - .n_reg_rules = 1, + .n_reg_rules = 5, .alpha2 = "00", .reg_rules = { - REG_RULE(2412-10, 2462+10, 40, 6, 20, + /* IEEE 802.11b/g, channels 1..11 */ + REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), + /* IEEE 802.11b/g, channels 12..13. No HT40 + * channel fits here. */ + REG_RULE(2467-10, 2472+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + /* IEEE 802.11 channel 14 - Only JP enables + * this and for 802.11b only */ + REG_RULE(2484-10, 2484+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS | + NL80211_RRF_NO_OFDM), + /* IEEE 802.11a, channel 36..48 */ + REG_RULE(5180-10, 5240+10, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + + /* NB: 5260 MHz - 5700 MHz requires DFS */ + + /* IEEE 802.11a, channel 149..165 */ + REG_RULE(5745-10, 5825+10, 40, 6, 20, NL80211_RRF_PASSIVE_SCAN | NL80211_RRF_NO_IBSS), } @@ -83,9 +125,11 @@ static char *ieee80211_regdom = "US"; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); -/* We assume 40 MHz bandwidth for the old regulatory work. +/* + * We assume 40 MHz bandwidth for the old regulatory work. 
* We make emphasis we are using the exact same frequencies - * as before */ + * as before + */ static const struct ieee80211_regdomain us_regdom = { .n_reg_rules = 6, @@ -124,8 +168,10 @@ static const struct ieee80211_regdomain jp_regdom = { static const struct ieee80211_regdomain eu_regdom = { .n_reg_rules = 6, - /* This alpha2 is bogus, we leave it here just for stupid - * backward compatibility */ + /* + * This alpha2 is bogus, we leave it here just for stupid + * backward compatibility + */ .alpha2 = "EU", .reg_rules = { /* IEEE 802.11b/g, channels 1..13 */ @@ -194,8 +240,10 @@ static void reset_regdomains(void) cfg80211_regdomain = NULL; } -/* Dynamic world regulatory domain requested by the wireless - * core upon initialization */ +/* + * Dynamic world regulatory domain requested by the wireless + * core upon initialization + */ static void update_world_regdomain(const struct ieee80211_regdomain *rd) { BUG_ON(!last_request); @@ -236,8 +284,10 @@ static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) return false; - /* Special case where regulatory domain was built by driver - * but a specific alpha2 cannot be determined */ + /* + * Special case where regulatory domain was built by driver + * but a specific alpha2 cannot be determined + */ if (alpha2[0] == '9' && alpha2[1] == '9') return true; return false; @@ -247,9 +297,11 @@ static bool is_intersected_alpha2(const char *alpha2) { if (!alpha2) return false; - /* Special case where regulatory domain is the + /* + * Special case where regulatory domain is the * result of an intersection between two regulatory domain - * structures */ + * structures + */ if (alpha2[0] == '9' && alpha2[1] == '8') return true; return false; @@ -274,8 +326,10 @@ static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) return false; } -static bool regdom_changed(const char *alpha2) +static bool regdom_changes(const char *alpha2) { + assert_cfg80211_lock(); + if (!cfg80211_regdomain) return true; if 
(alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) @@ -302,8 +356,10 @@ static bool country_ie_integrity_changes(u32 checksum) return false; } -/* This lets us keep regulatory code which is updated on a regulatory - * basis in userspace. */ +/* + * This lets us keep regulatory code which is updated on a regulatory + * basis in userspace. + */ static int call_crda(const char *alpha2) { char country_env[9 + 2] = "COUNTRY="; @@ -348,7 +404,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; - if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff) + if (freq_range->end_freq_khz <= freq_range->start_freq_khz || + freq_range->max_bandwidth_khz > freq_diff) return false; return true; @@ -414,10 +471,12 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, #undef ONE_GHZ_IN_KHZ } -/* Converts a country IE to a regulatory domain. A regulatory domain +/* + * Converts a country IE to a regulatory domain. A regulatory domain * structure has a lot of information which the IE doesn't yet have, * so for the other values we use upper max values as we will intersect - * with our userspace regulatory agent to get lower bounds. */ + * with our userspace regulatory agent to get lower bounds. + */ static struct ieee80211_regdomain *country_ie_2_rd( u8 *country_ie, u8 country_ie_len, @@ -462,9 +521,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( *checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8); - /* We need to build a reg rule for each triplet, but first we must + /* + * We need to build a reg rule for each triplet, but first we must * calculate the number of reg rules we will need. 
We will need one - * for each channel subband */ + * for each channel subband + */ while (country_ie_len >= 3) { int end_channel = 0; struct ieee80211_country_ie_triplet *triplet = @@ -502,9 +563,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( if (cur_sub_max_channel < cur_channel) return NULL; - /* Do not allow overlapping channels. Also channels + /* + * Do not allow overlapping channels. Also channels * passed in each subband must be monotonically - * increasing */ + * increasing + */ if (last_sub_max_channel) { if (cur_channel <= last_sub_max_channel) return NULL; @@ -512,10 +575,12 @@ static struct ieee80211_regdomain *country_ie_2_rd( return NULL; } - /* When dot11RegulatoryClassesRequired is supported + /* + * When dot11RegulatoryClassesRequired is supported * we can throw ext triplets as part of this soup, * for now we don't care when those change as we - * don't support them */ + * don't support them + */ *checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) | ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) | ((triplet->chans.max_power ^ cur_sub_max_channel) << 24); @@ -526,8 +591,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( country_ie_len -= 3; num_rules++; - /* Note: this is not a IEEE requirement but - * simply a memory requirement */ + /* + * Note: this is not a IEEE requirement but + * simply a memory requirement + */ if (num_rules > NL80211_MAX_SUPP_REG_RULES) return NULL; } @@ -555,8 +622,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( struct ieee80211_freq_range *freq_range = NULL; struct ieee80211_power_rule *power_rule = NULL; - /* Must parse if dot11RegulatoryClassesRequired is true, - * we don't support this yet */ + /* + * Must parse if dot11RegulatoryClassesRequired is true, + * we don't support this yet + */ if (triplet->ext.reg_extension_id >= IEEE80211_COUNTRY_EXTENSION_ID) { country_ie += 3; @@ -578,10 +647,12 @@ static struct ieee80211_regdomain *country_ie_2_rd( end_channel = 
triplet->chans.first_channel + (4 * (triplet->chans.num_channels - 1)); - /* The +10 is since the regulatory domain expects + /* + * The +10 is since the regulatory domain expects * the actual band edge, not the center of freq for * its start and end freqs, assuming 20 MHz bandwidth on - * the channels passed */ + * the channels passed + */ freq_range->start_freq_khz = MHZ_TO_KHZ(ieee80211_channel_to_frequency( triplet->chans.first_channel) - 10); @@ -589,9 +660,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( MHZ_TO_KHZ(ieee80211_channel_to_frequency( end_channel) + 10); - /* Large arbitrary values, we intersect later */ - /* Increment this if we ever support >= 40 MHz channels - * in IEEE 802.11 */ + /* + * These are large arbitrary values we use to intersect later. + * Increment this if we ever support >= 40 MHz channels + * in IEEE 802.11 + */ freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40); power_rule->max_antenna_gain = DBI_TO_MBI(100); power_rule->max_eirp = DBM_TO_MBM(100); @@ -607,8 +680,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( } -/* Helper for regdom_intersect(), this does the real - * mathematical intersection fun */ +/* + * Helper for regdom_intersect(), this does the real + * mathematical intersection fun + */ static int reg_rules_intersect( const struct ieee80211_reg_rule *rule1, const struct ieee80211_reg_rule *rule2, @@ -686,11 +761,13 @@ static struct ieee80211_regdomain *regdom_intersect( if (!rd1 || !rd2) return NULL; - /* First we get a count of the rules we'll need, then we actually + /* + * First we get a count of the rules we'll need, then we actually * build them. This is to so we can malloc() and free() a * regdomain once. The reason we use reg_rules_intersect() here * is it will return -EINVAL if the rule computed makes no sense. - * All rules that do check out OK are valid. */ + * All rules that do check out OK are valid. 
+ */ for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; @@ -718,14 +795,18 @@ static struct ieee80211_regdomain *regdom_intersect( rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - /* This time around instead of using the stack lets + /* + * This time around instead of using the stack lets * write to the target rule directly saving ourselves - * a memcpy() */ + * a memcpy() + */ intersected_rule = &rd->reg_rules[rule_idx]; r = reg_rules_intersect(rule1, rule2, intersected_rule); - /* No need to memset here the intersected rule here as - * we're not using the stack anymore */ + /* + * No need to memset here the intersected rule here as + * we're not using the stack anymore + */ if (r) continue; rule_idx++; @@ -744,8 +825,10 @@ static struct ieee80211_regdomain *regdom_intersect( return rd; } -/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may - * want to just have the channel structure use these */ +/* + * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may + * want to just have the channel structure use these + */ static u32 map_regdom_flags(u32 rd_flags) { u32 channel_flags = 0; @@ -771,10 +854,12 @@ static int freq_reg_info_regd(struct wiphy *wiphy, regd = custom_regd ? 
custom_regd : cfg80211_regdomain; - /* Follow the driver's regulatory domain, if present, unless a country - * IE has been processed or a user wants to help complaince further */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE && - last_request->initiator != REGDOM_SET_BY_USER && + /* + * Follow the driver's regulatory domain, if present, unless a country + * IE has been processed or a user wants to help complaince further + */ + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + last_request->initiator != NL80211_REGDOM_SET_BY_USER && wiphy->regd) regd = wiphy->regd; @@ -790,9 +875,11 @@ static int freq_reg_info_regd(struct wiphy *wiphy, fr = &rr->freq_range; pr = &rr->power_rule; - /* We only need to know if one frequency rule was + /* + * We only need to know if one frequency rule was * was in center_freq's band, that's enough, so lets - * not overwrite it once found */ + * not overwrite it once found + */ if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); @@ -829,6 +916,11 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, const struct ieee80211_power_rule *power_rule = NULL; struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; + struct wiphy *request_wiphy = NULL; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); sband = wiphy->bands[band]; BUG_ON(chan_idx >= sband->n_channels); @@ -840,7 +932,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, &max_bandwidth, &reg_rule); if (r) { - /* This means no regulatory rule was found in the country IE + /* + * This means no regulatory rule was found in the country IE * with a frequency range on the center_freq's band, since * IEEE-802.11 allows for a country IE to have a subset of the * regulatory information provided in a country we ignore @@ -851,7 +944,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, *
http://tinyurl.com/11d-clarification */ if (r == -ERANGE && - last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { + last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { #ifdef CONFIG_CFG80211_REG_DEBUG printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz " "intact on %s - no rule found in band on " @@ -859,10 +953,13 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, chan->center_freq, wiphy_name(wiphy)); #endif } else { - /* In this case we know the country IE has at least one reg rule - * for the band so we respect its band definitions */ + /* + * In this case we know the country IE has at least one reg rule + * for the band so we respect its band definitions + */ #ifdef CONFIG_CFG80211_REG_DEBUG - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) printk(KERN_DEBUG "cfg80211: Disabling " "channel %d MHz on %s due to " "Country IE\n", @@ -876,12 +973,14 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, power_rule = &reg_rule->power_rule; - if (last_request->initiator == REGDOM_SET_BY_DRIVER && - last_request->wiphy && last_request->wiphy == wiphy && - last_request->wiphy->strict_regulatory) { - /* This gaurantees the driver's requested regulatory domain + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + request_wiphy && request_wiphy == wiphy && + request_wiphy->strict_regulatory) { + /* + * This gaurantees the driver's requested regulatory domain * will always be used as a base for further regulatory - * settings */ + * settings + */ chan->flags = chan->orig_flags = map_regdom_flags(reg_rule->flags); chan->max_antenna_gain = chan->orig_mag = @@ -915,39 +1014,147 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band) handle_channel(wiphy, band, i); } -static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby) +static bool ignore_reg_update(struct wiphy *wiphy, + enum
nl80211_reg_initiator initiator) { if (!last_request) return true; - if (setby == REGDOM_SET_BY_CORE && + if (initiator == NL80211_REGDOM_SET_BY_CORE && wiphy->custom_regulatory) return true; - /* wiphy->regd will be set once the device has its own - * desired regulatory domain set */ + /* + * wiphy->regd will be set once the device has its own + * desired regulatory domain set + */ if (wiphy->strict_regulatory && !wiphy->regd && !is_world_regdom(last_request->alpha2)) return true; return false; } -static void update_all_wiphy_regulatory(enum reg_set_by setby) +static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) { struct cfg80211_registered_device *drv; list_for_each_entry(drv, &cfg80211_drv_list, list) - wiphy_update_regulatory(&drv->wiphy, setby); + wiphy_update_regulatory(&drv->wiphy, initiator); } -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) +static void handle_reg_beacon(struct wiphy *wiphy, + unsigned int chan_idx, + struct reg_beacon *reg_beacon) { - enum ieee80211_band band; +#ifdef CONFIG_CFG80211_REG_DEBUG +#define REG_DEBUG_BEACON_FLAG(desc) \ + printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \ + "frequency: %d MHz (Ch %d) on %s\n", \ + reg_beacon->chan.center_freq, \ + ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \ + wiphy_name(wiphy)); +#else +#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0) +#endif + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + assert_cfg80211_lock(); + + sband = wiphy->bands[reg_beacon->chan.band]; + chan = &sband->channels[chan_idx]; + + if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + return; + + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) { + chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; + REG_DEBUG_BEACON_FLAG("active scanning"); + } + + if (chan->flags & IEEE80211_CHAN_NO_IBSS) { + chan->flags &= ~IEEE80211_CHAN_NO_IBSS; + REG_DEBUG_BEACON_FLAG("beaconing"); + } + + chan->beacon_found = true; +#undef 
REG_DEBUG_BEACON_FLAG +} + +/* + * Called when a scan on a wiphy finds a beacon on + * new channel + */ +static void wiphy_update_new_beacon(struct wiphy *wiphy, + struct reg_beacon *reg_beacon) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + assert_cfg80211_lock(); - if (ignore_reg_update(wiphy, setby)) + if (!wiphy->bands[reg_beacon->chan.band]) return; + + sband = wiphy->bands[reg_beacon->chan.band]; + + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); +} + +/* + * Called upon reg changes or a new wiphy is added + */ +static void wiphy_update_beacon_reg(struct wiphy *wiphy) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + struct reg_beacon *reg_beacon; + + assert_cfg80211_lock(); + + if (list_empty(&reg_beacon_list)) + return; + + list_for_each_entry(reg_beacon, &reg_beacon_list, list) { + if (!wiphy->bands[reg_beacon->chan.band]) + continue; + sband = wiphy->bands[reg_beacon->chan.band]; + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); + } +} + +static bool reg_is_world_roaming(struct wiphy *wiphy) +{ + if (is_world_regdom(cfg80211_regdomain->alpha2) || + (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))) + return true; + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->custom_regulatory) + return true; + return false; +} + +/* Reap the advantages of previously found beacons */ +static void reg_process_beacons(struct wiphy *wiphy) +{ + if (!reg_is_world_roaming(wiphy)) + return; + wiphy_update_beacon_reg(wiphy); +} + +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) +{ + enum ieee80211_band band; + + if (ignore_reg_update(wiphy, initiator)) + goto out; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (wiphy->bands[band]) handle_band(wiphy, band); } +out: + reg_process_beacons(wiphy); if (wiphy->reg_notifier) wiphy->reg_notifier(wiphy, last_request); } @@ -1033,81 +1240,98 @@ static int
reg_copy_regd(const struct ieee80211_regdomain **dst_regd, return 0; } -/* Return value which can be used by ignore_request() to indicate - * it has been determined we should intersect two regulatory domains */ +/* + * Return value which can be used by ignore_request() to indicate + * it has been determined we should intersect two regulatory domains + */ #define REG_INTERSECT 1 /* This has the logic which determines when a new request * should be ignored. */ -static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2) +static int ignore_request(struct wiphy *wiphy, + struct regulatory_request *pending_request) { + struct wiphy *last_wiphy = NULL; + + assert_cfg80211_lock(); + /* All initial requests are respected */ if (!last_request) return 0; - switch (set_by) { - case REGDOM_SET_BY_INIT: + switch (pending_request->initiator) { + case NL80211_REGDOM_SET_BY_CORE: return -EINVAL; - case REGDOM_SET_BY_CORE: - /* - * Always respect new wireless core hints, should only happen - * when updating the world regulatory domain at init. - */ - return 0; - case REGDOM_SET_BY_COUNTRY_IE: - if (unlikely(!is_an_alpha2(alpha2))) + case NL80211_REGDOM_SET_BY_COUNTRY_IE: + + last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (unlikely(!is_an_alpha2(pending_request->alpha2))) return -EINVAL; - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { - if (last_request->wiphy != wiphy) { + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different * different Country IE alpha2s. We could * intersect them, but that seems unlikely * to be correct. Reject second one for now. */ - if (!alpha2_equal(alpha2, - cfg80211_regdomain->alpha2)) + if (regdom_changes(pending_request->alpha2)) return -EOPNOTSUPP; return -EALREADY; } - /* Two consecutive Country IE hints on the same wiphy. 
- * This should be picked up early by the driver/stack */ - if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2, - alpha2))) + /* + * Two consecutive Country IE hints on the same wiphy. + * This should be picked up early by the driver/stack + */ + if (WARN_ON(regdom_changes(pending_request->alpha2))) return 0; return -EALREADY; } return REG_INTERSECT; - case REGDOM_SET_BY_DRIVER: - if (last_request->initiator == REGDOM_SET_BY_CORE) { + case NL80211_REGDOM_SET_BY_DRIVER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { if (is_old_static_regdom(cfg80211_regdomain)) return 0; - if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) + if (regdom_changes(pending_request->alpha2)) return 0; return -EALREADY; } + + /* + * This would happen if you unplug and plug your card + * back in or if you add a new device for which the previously + * loaded card also agrees on the regulatory domain. + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !regdom_changes(pending_request->alpha2)) + return -EALREADY; + return REG_INTERSECT; - case REGDOM_SET_BY_USER: - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + case NL80211_REGDOM_SET_BY_USER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_INTERSECT; - /* If the user knows better the user should set the regdom - * to their country before the IE is picked up */ - if (last_request->initiator == REGDOM_SET_BY_USER && + /* + * If the user knows better the user should set the regdom + * to their country before the IE is picked up + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER && last_request->intersect) return -EOPNOTSUPP; - /* Process user requests only after previous user/driver/core - * requests have been processed */ - if (last_request->initiator == REGDOM_SET_BY_CORE || - last_request->initiator == REGDOM_SET_BY_DRIVER || - last_request->initiator == REGDOM_SET_BY_USER) { - if (!alpha2_equal(last_request->alpha2, - 
cfg80211_regdomain->alpha2)) + /* + * Process user requests only after previous user/driver/core + * requests have been processed + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE || + last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || + last_request->initiator == NL80211_REGDOM_SET_BY_USER) { + if (regdom_changes(last_request->alpha2)) return -EAGAIN; } if (!is_old_static_regdom(cfg80211_regdomain) && - alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) + !regdom_changes(pending_request->alpha2)) return -EALREADY; return 0; @@ -1116,59 +1340,80 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, return -EINVAL; } -/* Caller must hold &cfg80211_drv_mutex */ -int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, - u32 country_ie_checksum, - enum environment_cap env) +/** + * __regulatory_hint - hint to the wireless core a regulatory domain + * @wiphy: if the hint comes from country information from an AP, this + * is required to be set to the wiphy that received the information + * @pending_request: the regulatory request currently being processed + * + * The Wireless subsystem can use this function to hint to the wireless core + * what it believes should be the current regulatory domain. + * + * Returns zero if all went fine, %-EALREADY if a regulatory domain had + * already been set or other standard error codes. 
+ * + * Caller must hold &cfg80211_mutex + */ +static int __regulatory_hint(struct wiphy *wiphy, + struct regulatory_request *pending_request) { - struct regulatory_request *request; bool intersect = false; int r = 0; - r = ignore_request(wiphy, set_by, alpha2); + assert_cfg80211_lock(); + + r = ignore_request(wiphy, pending_request); if (r == REG_INTERSECT) { - if (set_by == REGDOM_SET_BY_DRIVER) { + if (pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); - if (r) + if (r) { + kfree(pending_request); return r; + } } intersect = true; } else if (r) { - /* If the regulatory domain being requested by the + /* + * If the regulatory domain being requested by the * driver has already been set just copy it to the - * wiphy */ - if (r == -EALREADY && set_by == REGDOM_SET_BY_DRIVER) { + * wiphy + */ + if (r == -EALREADY && + pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); - if (r) + if (r) { + kfree(pending_request); return r; + } r = -EALREADY; goto new_request; } + kfree(pending_request); return r; } new_request: - request = kzalloc(sizeof(struct regulatory_request), - GFP_KERNEL); - if (!request) - return -ENOMEM; + kfree(last_request); - request->alpha2[0] = alpha2[0]; - request->alpha2[1] = alpha2[1]; - request->initiator = set_by; - request->wiphy = wiphy; - request->intersect = intersect; - request->country_ie_checksum = country_ie_checksum; - request->country_ie_env = env; + last_request = pending_request; + last_request->intersect = intersect; - kfree(last_request); - last_request = request; + pending_request = NULL; /* When r == REG_INTERSECT we do need to call CRDA */ - if (r < 0) + if (r < 0) { + /* + * Since CRDA will not be called in this case as we already + * have applied the requested regulatory domain before we just + * inform userspace we have processed the request + */ + if (r == -EALREADY) + 
nl80211_send_reg_change_event(last_request); return r; + } /* * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled @@ -1180,34 +1425,194 @@ new_request: * * to intersect with the static rd */ - return call_crda(alpha2); + return call_crda(last_request->alpha2); } -void regulatory_hint(struct wiphy *wiphy, const char *alpha2) +/* This currently only processes user and driver regulatory hints */ +static void reg_process_hint(struct regulatory_request *reg_request) { - int r; - BUG_ON(!alpha2); + int r = 0; + struct wiphy *wiphy = NULL; + + BUG_ON(!reg_request->alpha2); + + mutex_lock(&cfg80211_mutex); + + if (wiphy_idx_valid(reg_request->wiphy_idx)) + wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); + + if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !wiphy) { + kfree(reg_request); + goto out; + } - mutex_lock(&cfg80211_drv_mutex); - r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, - alpha2, 0, ENVIRON_ANY); + r = __regulatory_hint(wiphy, reg_request); /* This is required so that the orig_* parameters are saved */ - if (r == -EALREADY && wiphy->strict_regulatory) - wiphy_update_regulatory(wiphy, REGDOM_SET_BY_DRIVER); - mutex_unlock(&cfg80211_drv_mutex); + if (r == -EALREADY && wiphy && wiphy->strict_regulatory) + wiphy_update_regulatory(wiphy, reg_request->initiator); +out: + mutex_unlock(&cfg80211_mutex); +} + +/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */ +static void reg_process_pending_hints(void) + { + struct regulatory_request *reg_request; + + spin_lock(&reg_requests_lock); + while (!list_empty(&reg_requests_list)) { + reg_request = list_first_entry(&reg_requests_list, + struct regulatory_request, + list); + list_del_init(&reg_request->list); + + spin_unlock(&reg_requests_lock); + reg_process_hint(reg_request); + spin_lock(&reg_requests_lock); + } + spin_unlock(&reg_requests_lock); +} + +/* Processes beacon hints -- this has nothing to do with country IEs */ +static void reg_process_pending_beacon_hints(void) +{ + struct
cfg80211_registered_device *drv; + struct reg_beacon *pending_beacon, *tmp; + + mutex_lock(&cfg80211_mutex); + + /* This goes through the _pending_ beacon list */ + spin_lock_bh(&reg_pending_beacons_lock); + + if (list_empty(&reg_pending_beacons)) { + spin_unlock_bh(&reg_pending_beacons_lock); + goto out; + } + + list_for_each_entry_safe(pending_beacon, tmp, + &reg_pending_beacons, list) { + + list_del_init(&pending_beacon->list); + + /* Applies the beacon hint to current wiphys */ + list_for_each_entry(drv, &cfg80211_drv_list, list) + wiphy_update_new_beacon(&drv->wiphy, pending_beacon); + + /* Remembers the beacon hint for new wiphys or reg changes */ + list_add_tail(&pending_beacon->list, &reg_beacon_list); + } + + spin_unlock_bh(&reg_pending_beacons_lock); +out: + mutex_unlock(&cfg80211_mutex); +} + +static void reg_todo(struct work_struct *work) +{ + reg_process_pending_hints(); + reg_process_pending_beacon_hints(); +} + +static DECLARE_WORK(reg_work, reg_todo); + +static void queue_regulatory_request(struct regulatory_request *request) +{ + spin_lock(&reg_requests_lock); + list_add_tail(&request->list, &reg_requests_list); + spin_unlock(&reg_requests_lock); + + schedule_work(&reg_work); +} + +/* Core regulatory hint -- happens once during cfg80211_init() */ +static int regulatory_hint_core(const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(last_request); + + request = kzalloc(sizeof(struct regulatory_request), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_CORE; + + queue_regulatory_request(request); + + return 0; +} + +/* User hints */ +int regulatory_hint_user(const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(!alpha2); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = WIPHY_IDX_STALE; + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_USER, + + queue_regulatory_request(request); + + return 0; +} + +/* Driver hints */ +int regulatory_hint(struct wiphy *wiphy, const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(!alpha2); + BUG_ON(!wiphy); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = get_wiphy_idx(wiphy); + + /* Must have registered wiphy first */ + BUG_ON(!wiphy_idx_valid(request->wiphy_idx)); + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_DRIVER; + + queue_regulatory_request(request); + + return 0; } EXPORT_SYMBOL(regulatory_hint); static bool reg_same_country_ie_hint(struct wiphy *wiphy, u32 country_ie_checksum) { - if (!last_request->wiphy) + struct wiphy *request_wiphy; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (!request_wiphy) return false; - if (likely(last_request->wiphy != wiphy)) + + if (likely(request_wiphy != wiphy)) return !country_ie_integrity_changes(country_ie_checksum); - /* We should not have let these through at this point, they + /* + * We should not have let these through at this point, they * should have been picked up earlier by the first alpha2 check - * on the device */ + * on the device + */ if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum))) return true; return false; @@ -1221,11 +1626,14 @@ void regulatory_hint_11d(struct wiphy *wiphy, char alpha2[2]; u32 checksum = 0; enum environment_cap env = ENVIRON_ANY; + struct regulatory_request *request; - if (!last_request) - return; + mutex_lock(&cfg80211_mutex); - mutex_lock(&cfg80211_drv_mutex); + if (unlikely(!last_request)) { + mutex_unlock(&cfg80211_mutex); + return; + } /* IE len must be evenly divisible by 2 */ if (country_ie_len & 0x01) @@ -1234,9 +1642,11 @@ void regulatory_hint_11d(struct wiphy *wiphy, if (country_ie_len < 
IEEE80211_COUNTRY_IE_MIN_LEN) goto out; - /* Pending country IE processing, this can happen after we + /* + * Pending country IE processing, this can happen after we * call CRDA and wait for a response if a beacon was received before - * we were able to process the last regulatory_hint_11d() call */ + * we were able to process the last regulatory_hint_11d() call + */ if (country_ie_regdomain) goto out; @@ -1248,33 +1658,44 @@ void regulatory_hint_11d(struct wiphy *wiphy, else if (country_ie[2] == 'O') env = ENVIRON_OUTDOOR; - /* We will run this for *every* beacon processed for the BSSID, so + /* + * We will run this for *every* beacon processed for the BSSID, so * we optimize an early check to exit out early if we don't have to - * do anything */ - if (likely(last_request->wiphy)) { + * do anything + */ + if (likely(wiphy_idx_valid(last_request->wiphy_idx))) { struct cfg80211_registered_device *drv_last_ie; - drv_last_ie = wiphy_to_dev(last_request->wiphy); + drv_last_ie = + cfg80211_drv_by_wiphy_idx(last_request->wiphy_idx); - /* Lets keep this simple -- we trust the first AP - * after we intersect with CRDA */ - if (likely(last_request->wiphy == wiphy)) { - /* Ignore IEs coming in on this wiphy with - * the same alpha2 and environment cap */ + /* + * Lets keep this simple -- we trust the first AP + * after we intersect with CRDA + */ + if (likely(&drv_last_ie->wiphy == wiphy)) { + /* + * Ignore IEs coming in on this wiphy with + * the same alpha2 and environment cap + */ if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, alpha2) && env == drv_last_ie->env)) { goto out; } - /* the wiphy moved on to another BSSID or the AP + /* + * the wiphy moved on to another BSSID or the AP * was reconfigured. 
XXX: We need to deal with the * case where the user suspends and goes to goes * to another country, and then gets IEs from an - * AP with different settings */ + * AP with different settings + */ goto out; } else { - /* Ignore IEs coming in on two separate wiphys with - * the same alpha2 and environment cap */ + /* + * Ignore IEs coming in on two separate wiphys with + * the same alpha2 and environment cap + */ if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, alpha2) && env == drv_last_ie->env)) { @@ -1289,28 +1710,97 @@ void regulatory_hint_11d(struct wiphy *wiphy, if (!rd) goto out; - /* This will not happen right now but we leave it here for the + /* + * This will not happen right now but we leave it here for the * the future when we want to add suspend/resume support and having * the user move to another country after doing so, or having the user - * move to another AP. Right now we just trust the first AP. This is why - * this is marked as likley(). If we hit this before we add this support - * we want to be informed of it as it would indicate a mistake in the - * current design */ - if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))) - goto out; + * move to another AP. Right now we just trust the first AP. 
+ * + * If we hit this before we add this support we want to be informed of + * it as it would indicate a mistake in the current design + */ + if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum))) + goto free_rd_out; + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + goto free_rd_out; - /* We keep this around for when CRDA comes back with a response so - * we can intersect with that */ + /* + * We keep this around for when CRDA comes back with a response so + * we can intersect with that + */ country_ie_regdomain = rd; - __regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE, - country_ie_regdomain->alpha2, checksum, env); + request->wiphy_idx = get_wiphy_idx(wiphy); + request->alpha2[0] = rd->alpha2[0]; + request->alpha2[1] = rd->alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; + request->country_ie_checksum = checksum; + request->country_ie_env = env; + mutex_unlock(&cfg80211_mutex); + + queue_regulatory_request(request); + + return; + +free_rd_out: + kfree(rd); out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); } EXPORT_SYMBOL(regulatory_hint_11d); +static bool freq_is_chan_12_13_14(u16 freq) +{ + if (freq == ieee80211_channel_to_frequency(12) || + freq == ieee80211_channel_to_frequency(13) || + freq == ieee80211_channel_to_frequency(14)) + return true; + return false; +} + +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp) +{ + struct reg_beacon *reg_beacon; + + if (likely((beacon_chan->beacon_found || + (beacon_chan->flags & IEEE80211_CHAN_RADAR) || + (beacon_chan->band == IEEE80211_BAND_2GHZ && + !freq_is_chan_12_13_14(beacon_chan->center_freq))))) + return 0; + + reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); + if (!reg_beacon) + return -ENOMEM; + +#ifdef CONFIG_CFG80211_REG_DEBUG + printk(KERN_DEBUG "cfg80211: Found new beacon on " + "frequency: %d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, + 
ieee80211_frequency_to_channel(beacon_chan->center_freq), + wiphy_name(wiphy)); +#endif + memcpy(&reg_beacon->chan, beacon_chan, + sizeof(struct ieee80211_channel)); + + + /* + * Since we can be called from BH or and non-BH context + * we must use spin_lock_bh() + */ + spin_lock_bh(&reg_pending_beacons_lock); + list_add_tail(&reg_beacon->list, &reg_pending_beacons); + spin_unlock_bh(&reg_pending_beacons_lock); + + schedule_work(&reg_work); + + return 0; +} + static void print_rd_rules(const struct ieee80211_regdomain *rd) { unsigned int i; @@ -1326,8 +1816,10 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) freq_range = &reg_rule->freq_range; power_rule = &reg_rule->power_rule; - /* There may not be documentation for max antenna gain - * in certain regions */ + /* + * There may not be documentation for max antenna gain + * in certain regions + */ if (power_rule->max_antenna_gain) printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), " "(%d mBi, %d mBm)\n", @@ -1350,13 +1842,13 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) { if (is_intersected_alpha2(rd->alpha2)) { - struct wiphy *wiphy = NULL; - struct cfg80211_registered_device *drv; - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { - if (last_request->wiphy) { - wiphy = last_request->wiphy; - drv = wiphy_to_dev(wiphy); + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + struct cfg80211_registered_device *drv; + drv = cfg80211_drv_by_wiphy_idx( + last_request->wiphy_idx); + if (drv) { printk(KERN_INFO "cfg80211: Current regulatory " "domain updated by AP to: %c%c\n", drv->country_ie_alpha2[0], @@ -1422,7 +1914,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) { const struct ieee80211_regdomain *intersected_rd = NULL; struct cfg80211_registered_device *drv = NULL; - struct wiphy *wiphy = NULL; + struct wiphy *request_wiphy; /* Some basic sanity checks first */ if (is_world_regdom(rd->alpha2)) { @@ -1439,23 +1931,27 @@ static int __set_regdom(const struct
ieee80211_regdomain *rd) if (!last_request) return -EINVAL; - /* Lets only bother proceeding on the same alpha2 if the current + /* + * Lets only bother proceeding on the same alpha2 if the current * rd is non static (it means CRDA was present and was used last) - * and the pending request came in from a country IE */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { - /* If someone else asked us to change the rd lets only bother - * checking if the alpha2 changes if CRDA was already called */ + * and the pending request came in from a country IE + */ + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { + /* + * If someone else asked us to change the rd lets only bother + * checking if the alpha2 changes if CRDA was already called + */ if (!is_old_static_regdom(cfg80211_regdomain) && - !regdom_changed(rd->alpha2)) + !regdom_changes(rd->alpha2)) return -EINVAL; } - wiphy = last_request->wiphy; - - /* Now lets set the regulatory domain, update all driver channels + /* + * Now lets set the regulatory domain, update all driver channels * and finally inform them of what we have done, in case they want * to review or adjust their own settings based on their own - * internal EEPROM data */ + * internal EEPROM data + */ if (WARN_ON(!reg_is_valid_request(rd->alpha2))) return -EINVAL; @@ -1467,21 +1963,25 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) return -EINVAL; } + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + if (!last_request->intersect) { int r; - if (last_request->initiator != REGDOM_SET_BY_DRIVER) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) { reset_regdomains(); cfg80211_regdomain = rd; return 0; } - /* For a driver hint, lets copy the regulatory domain the - * driver wanted to the wiphy to deal with conflicts */ + /* + * For a driver hint, lets copy the regulatory domain the + * driver wanted to the wiphy to deal with conflicts + */ - BUG_ON(last_request->wiphy->regd); + 
BUG_ON(request_wiphy->regd); - r = reg_copy_regd(&last_request->wiphy->regd, rd); + r = reg_copy_regd(&request_wiphy->regd, rd); if (r) return r; @@ -1492,17 +1992,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) /* Intersection requires a bit more work */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { intersected_rd = regdom_intersect(rd, cfg80211_regdomain); if (!intersected_rd) return -EINVAL; - /* We can trash what CRDA provided now. + /* + * We can trash what CRDA provided now. * However if a driver requested this specific regulatory - * domain we keep it for its private use */ - if (last_request->initiator == REGDOM_SET_BY_DRIVER) - last_request->wiphy->regd = rd; + * domain we keep it for its private use + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) + request_wiphy->regd = rd; else kfree(rd); @@ -1522,8 +2024,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) BUG_ON(!country_ie_regdomain); if (rd != country_ie_regdomain) { - /* Intersect what CRDA returned and our what we - * had built from the Country IE received */ + /* + * Intersect what CRDA returned and our what we + * had built from the Country IE received + */ intersected_rd = regdom_intersect(rd, country_ie_regdomain); @@ -1533,16 +2037,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) kfree(country_ie_regdomain); country_ie_regdomain = NULL; } else { - /* This would happen when CRDA was not present and + /* + * This would happen when CRDA was not present and * OLD_REGULATORY was enabled. 
We intersect our Country - * IE rd and what was set on cfg80211 originally */ + * IE rd and what was set on cfg80211 originally + */ intersected_rd = regdom_intersect(rd, cfg80211_regdomain); } if (!intersected_rd) return -EINVAL; - drv = wiphy_to_dev(wiphy); + drv = wiphy_to_dev(request_wiphy); drv->country_ie_alpha2[0] = rd->alpha2[0]; drv->country_ie_alpha2[1] = rd->alpha2[1]; @@ -1560,13 +2066,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) } -/* Use this call to set the current regulatory domain. Conflicts with +/* + * Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already - * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */ + * kmalloc'd the rd structure. Caller must hold cfg80211_mutex + */ int set_regdom(const struct ieee80211_regdomain *rd) { int r; + assert_cfg80211_lock(); + /* Note that this doesn't update the wiphys, this is done below */ r = __set_regdom(rd); if (r) { @@ -1583,57 +2093,87 @@ int set_regdom(const struct ieee80211_regdomain *rd) print_regdomain(cfg80211_regdomain); + nl80211_send_reg_change_event(last_request); + return r; } -/* Caller must hold cfg80211_drv_mutex */ +/* Caller must hold cfg80211_mutex */ void reg_device_remove(struct wiphy *wiphy) { + struct wiphy *request_wiphy; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + kfree(wiphy->regd); - if (!last_request || !last_request->wiphy) + if (!last_request || !request_wiphy) return; - if (last_request->wiphy != wiphy) + if (request_wiphy != wiphy) return; - last_request->wiphy = NULL; + last_request->wiphy_idx = WIPHY_IDX_STALE; last_request->country_ie_env = ENVIRON_ANY; } int regulatory_init(void) { - int err; + int err = 0; reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); + spin_lock_init(&reg_requests_lock); + spin_lock_init(&reg_pending_beacons_lock); +
#ifdef CONFIG_WIRELESS_OLD_REGULATORY cfg80211_regdomain = static_regdom(ieee80211_regdom); printk(KERN_INFO "cfg80211: Using static regulatory domain info\n"); print_regdomain_info(cfg80211_regdomain); - /* The old code still requests for a new regdomain and if + /* + * The old code still requests for a new regdomain and if * you have CRDA you get it updated, otherwise you get * stuck with the static values. We ignore "EU" code as - * that is not a valid ISO / IEC 3166 alpha2 */ + * that is not a valid ISO / IEC 3166 alpha2 + */ if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U') - err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, - ieee80211_regdom, 0, ENVIRON_ANY); + err = regulatory_hint_core(ieee80211_regdom); #else cfg80211_regdomain = cfg80211_world_regdom; - err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY); - if (err) - printk(KERN_ERR "cfg80211: calling CRDA failed - " - "unable to update world regulatory domain, " - "using static definition\n"); + err = regulatory_hint_core("00"); #endif + if (err) { + if (err == -ENOMEM) + return err; + /* + * N.B. kobject_uevent_env() can fail mainly for when we're out + * memory which is handled and propagated appropriately above + * but it can also fail during a netlink_broadcast() or during + * early boot for call_usermodehelper(). For now treat these + * errors as non-fatal. 
+ */ + printk(KERN_ERR "cfg80211: kobject_uevent_env() was unable " + "to call CRDA during init"); +#ifdef CONFIG_CFG80211_REG_DEBUG + /* We want to find out exactly why when debugging */ + WARN_ON(err); +#endif + } return 0; } void regulatory_exit(void) { - mutex_lock(&cfg80211_drv_mutex); + struct regulatory_request *reg_request, *tmp; + struct reg_beacon *reg_beacon, *btmp; + + cancel_work_sync(®_work); + + mutex_lock(&cfg80211_mutex); reset_regdomains(); @@ -1644,5 +2184,33 @@ void regulatory_exit(void) platform_device_unregister(reg_pdev); - mutex_unlock(&cfg80211_drv_mutex); + spin_lock_bh(®_pending_beacons_lock); + if (!list_empty(®_pending_beacons)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + spin_unlock_bh(®_pending_beacons_lock); + + if (!list_empty(®_beacon_list)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + + spin_lock(®_requests_lock); + if (!list_empty(®_requests_list)) { + list_for_each_entry_safe(reg_request, tmp, + ®_requests_list, list) { + list_del(®_request->list); + kfree(reg_request); + } + } + spin_unlock(®_requests_lock); + + mutex_unlock(&cfg80211_mutex); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index fe8c83f34fb..e37829a49dc 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -6,6 +6,8 @@ extern const struct ieee80211_regdomain *cfg80211_regdomain; bool is_world_regdom(const char *alpha2); bool reg_is_valid_request(const char *alpha2); +int regulatory_hint_user(const char *alpha2); + void reg_device_remove(struct wiphy *wiphy); int regulatory_init(void); @@ -14,26 +16,24 @@ void regulatory_exit(void); int set_regdom(const struct ieee80211_regdomain *rd); /** - * __regulatory_hint - hint to the wireless core a regulatory domain - * @wiphy: if the hint comes from country information from an AP, this - * is required to be set to the wiphy that 
received the information - * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain - * should be in. - * @country_ie_checksum: checksum of processed country IE, set this to 0 - * if the hint did not come from a country IE - * @country_ie_env: the environment the IE told us we are in, %ENVIRON_* - * - * The Wireless subsystem can use this function to hint to the wireless core - * what it believes should be the current regulatory domain by giving it an - * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be - * in. + * regulatory_hint_found_beacon - hints a beacon was found on a channel + * @wiphy: the wireless device where the beacon was found on + * @beacon_chan: the channel on which the beacon was found on + * @gfp: context flags * - * Returns zero if all went fine, %-EALREADY if a regulatory domain had - * already been set or other standard error codes. + * This informs the wireless core that a beacon from an AP was found on + * the channel provided. This allows the wireless core to make educated + * guesses on regulatory to help with world roaming. This is only used for + * world roaming -- when we do not know our current location. This is + * only useful on channels 12, 13 and 14 on the 2 GHz band as channels + * 1-11 are already enabled by the world regulatory domain; and on + * non-radar 5 GHz channels. * + * Drivers do not need to call this, cfg80211 will do it for after a scan + * on a newly found BSS. 
*/ -extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, u32 country_ie_checksum, - enum environment_cap country_ie_env); +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp); #endif /* __NET_WIRELESS_REG_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c index b1893c863b9..280dbcd02c1 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -62,6 +62,18 @@ static void bss_release(struct kref *ref) } /* must hold dev->bss_lock! */ +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs) +{ + struct cfg80211_internal_bss *bss; + unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); + + list_for_each_entry(bss, &dev->bss_list, list) { + bss->ts -= age_jiffies; + } +} + +/* must hold dev->bss_lock! */ void cfg80211_bss_expire(struct cfg80211_registered_device *dev) { struct cfg80211_internal_bss *bss, *tmp; @@ -358,7 +370,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev, found->pub.beacon_interval = res->pub.beacon_interval; found->pub.tsf = res->pub.tsf; found->pub.signal = res->pub.signal; - found->pub.signal_type = res->pub.signal_type; found->pub.capability = res->pub.capability; found->ts = res->ts; kref_put(&res->ref, bss_release); @@ -380,8 +391,7 @@ struct cfg80211_bss * cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, enum cfg80211_signal_type sigtype, - gfp_t gfp) + s32 signal, gfp_t gfp) { struct cfg80211_internal_bss *res; size_t ielen = len - offsetof(struct ieee80211_mgmt, @@ -389,7 +399,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, bool overwrite; size_t privsz = wiphy->bss_priv_size; - if (WARN_ON(sigtype == NL80211_BSS_SIGNAL_UNSPEC && + if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC && (signal < 0 || signal > 100))) return NULL; @@ -403,7 +413,6 @@ 
cfg80211_inform_bss_frame(struct wiphy *wiphy, memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN); res->pub.channel = channel; - res->pub.signal_type = sigtype; res->pub.signal = signal; res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); @@ -421,6 +430,9 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, if (!res) return NULL; + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } @@ -584,16 +596,25 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info, } } +static inline unsigned int elapsed_jiffies_msecs(unsigned long start) +{ + unsigned long end = jiffies; + + if (end >= start) + return jiffies_to_msecs(end - start); + + return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); +} static char * -ieee80211_bss(struct iw_request_info *info, - struct cfg80211_internal_bss *bss, - char *current_ev, char *end_buf) +ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, + struct cfg80211_internal_bss *bss, char *current_ev, + char *end_buf) { struct iw_event iwe; u8 *buf, *cfg, *p; u8 *ie = bss->pub.information_elements; - int rem = bss->pub.len_information_elements, i; + int rem = bss->pub.len_information_elements, i, sig; bool ismesh = false; memset(&iwe, 0, sizeof(iwe)); @@ -617,19 +638,28 @@ ieee80211_bss(struct iw_request_info *info, current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); - if (bss->pub.signal_type != CFG80211_SIGNAL_TYPE_NONE) { + if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVQUAL; iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | IW_QUAL_NOISE_INVALID | - IW_QUAL_QUAL_INVALID; - switch (bss->pub.signal_type) { + IW_QUAL_QUAL_UPDATED; + switch (wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_MBM: - iwe.u.qual.level = bss->pub.signal / 100; + sig = 
bss->pub.signal / 100; + iwe.u.qual.level = sig; iwe.u.qual.updated |= IW_QUAL_DBM; + if (sig < -110) /* rather bad */ + sig = -110; + else if (sig > -40) /* perfect */ + sig = -40; + /* will give a range of 0 .. 70 */ + iwe.u.qual.qual = sig + 110; break; case CFG80211_SIGNAL_TYPE_UNSPEC: iwe.u.qual.level = bss->pub.signal; + /* will give range 0 .. 100 */ + iwe.u.qual.qual = bss->pub.signal; break; default: /* not reached */ @@ -763,8 +793,8 @@ ieee80211_bss(struct iw_request_info *info, &iwe, buf); memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; - sprintf(buf, " Last beacon: %dms ago", - jiffies_to_msecs(jiffies - bss->ts)); + sprintf(buf, " Last beacon: %ums ago", + elapsed_jiffies_msecs(bss->ts)); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point(info, current_ev, end_buf, &iwe, buf); @@ -793,8 +823,8 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *dev, spin_unlock_bh(&dev->bss_lock); return -E2BIG; } - current_ev = ieee80211_bss(info, bss, - current_ev, end_buf); + current_ev = ieee80211_bss(&dev->wiphy, info, bss, + current_ev, end_buf); } spin_unlock_bh(&dev->bss_lock); return current_ev - buf; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 26a72b0797a..efe3c5c92b2 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -31,7 +31,7 @@ static ssize_t name ## _show(struct device *dev, \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } -SHOW_FMT(index, "%d", idx); +SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); static struct device_attribute ieee80211_dev_attrs[] = { @@ -60,6 +60,8 @@ static int wiphy_suspend(struct device *dev, pm_message_t state) struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; + rdev->suspend_at = get_seconds(); + if (rdev->ops->suspend) { rtnl_lock(); ret = rdev->ops->suspend(&rdev->wiphy); @@ -74,6 +76,11 @@ static int wiphy_resume(struct device *dev) struct cfg80211_registered_device *rdev = 
dev_to_rdev(dev); int ret = 0; + /* Age scan results with time spent in suspend */ + spin_lock_bh(&rdev->bss_lock); + cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + spin_unlock_bh(&rdev->bss_lock); + if (rdev->ops->resume) { rtnl_lock(); ret = rdev->ops->resume(&rdev->wiphy); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 58e489fd4ae..b84a9b4fe96 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -137,3 +137,100 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, return 0; } EXPORT_SYMBOL(cfg80211_wext_giwmode); + + +int cfg80211_wext_giwrange(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct iw_range *range = (struct iw_range *) extra; + enum ieee80211_band band; + int c = 0; + + if (!wdev) + return -EOPNOTSUPP; + + data->length = sizeof(struct iw_range); + memset(range, 0, sizeof(struct iw_range)); + + range->we_version_compiled = WIRELESS_EXT; + range->we_version_source = 21; + range->retry_capa = IW_RETRY_LIMIT; + range->retry_flags = IW_RETRY_LIMIT; + range->min_retry = 0; + range->max_retry = 255; + range->min_rts = 0; + range->max_rts = 2347; + range->min_frag = 256; + range->max_frag = 2346; + + range->encoding_size[0] = 5; + range->encoding_size[1] = 13; + range->num_encoding_sizes = 2; + range->max_encoding_tokens = 4; + + range->max_qual.updated = IW_QUAL_NOISE_INVALID; + + switch (wdev->wiphy->signal_type) { + case CFG80211_SIGNAL_TYPE_NONE: + break; + case CFG80211_SIGNAL_TYPE_MBM: + range->max_qual.level = -110; + range->max_qual.qual = 70; + range->avg_qual.qual = 35; + range->max_qual.updated |= IW_QUAL_DBM; + range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + range->max_qual.level = 100; + range->max_qual.qual = 100; + range->avg_qual.qual = 50; + 
range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + } + + range->avg_qual.level = range->max_qual.level / 2; + range->avg_qual.noise = range->max_qual.noise / 2; + range->avg_qual.updated = range->max_qual.updated; + + range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 | + IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP; + + + for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { + int i; + struct ieee80211_supported_band *sband; + + sband = wdev->wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { + struct ieee80211_channel *chan = &sband->channels[i]; + + if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { + range->freq[c].i = + ieee80211_frequency_to_channel( + chan->center_freq); + range->freq[c].m = chan->center_freq; + range->freq[c].e = 6; + c++; + } + } + } + range->num_channels = c; + range->num_frequency = c; + + IW_EVENT_CAPA_SET_KERNEL(range->event_capa); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); + + range->scan_capa |= IW_SCAN_CAPA_ESSID; + + return 0; +} +EXPORT_SYMBOL(cfg80211_wext_giwrange); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 8f76f4009c2..9ca17b1ce52 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -951,10 +951,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, /* * Incoming Call User Data. 
*/ - if (skb->len >= 0) { - skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); - makex25->calluserdata.cudlength = skb->len; - } + skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); + makex25->calluserdata.cudlength = skb->len; sk->sk_ack_backlog++; @@ -1122,8 +1120,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { - len = x25_output(sk, skb); - if (len < 0) + rc = x25_output(sk, skb); + len = rc; + if (rc < 0) kfree_skb(skb); else if (x25->qbitincl) len++; @@ -1608,7 +1607,7 @@ static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { SOCKOPS_WRAP(x25_proto, AF_X25); -static struct packet_type x25_packet_type = { +static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index e25ff62ab2a..62a5425cc6a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -748,12 +748,51 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) schedule_work(&net->xfrm.state_hash_work); } +static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, + struct flowi *fl, unsigned short family, + xfrm_address_t *daddr, xfrm_address_t *saddr, + struct xfrm_state **best, int *acq_in_progress, + int *error) +{ + /* Resolution logic: + * 1. There is a valid state with matching selector. Done. + * 2. Valid state with inappropriate selector. Skip. + * + * Entering area of "sysdeps". + * + * 3. If state is not valid, selector is temporary, it selects + * only session which triggered previous resolution. Key + * manager will do something to install a state with proper + * selector. 
+ */ + if (x->km.state == XFRM_STATE_VALID) { + if ((x->sel.family && + !xfrm_selector_match(&x->sel, fl, x->sel.family)) || + !security_xfrm_state_pol_flow_match(x, pol, fl)) + return; + + if (!*best || + (*best)->km.dying > x->km.dying || + ((*best)->km.dying == x->km.dying && + (*best)->curlft.add_time < x->curlft.add_time)) + *best = x; + } else if (x->km.state == XFRM_STATE_ACQ) { + *acq_in_progress = 1; + } else if (x->km.state == XFRM_STATE_ERROR || + x->km.state == XFRM_STATE_EXPIRED) { + if (xfrm_selector_match(&x->sel, fl, x->sel.family) && + security_xfrm_state_pol_flow_match(x, pol, fl)) + *error = -ESRCH; + } +} + struct xfrm_state * xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family) { + static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); unsigned int h; struct hlist_node *entry; @@ -773,40 +812,27 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, xfrm_state_addr_check(x, daddr, saddr, family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && - (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) { - /* Resolution logic: - 1. There is a valid state with matching selector. - Done. - 2. Valid state with inappropriate selector. Skip. - - Entering area of "sysdeps". - - 3. If state is not valid, selector is temporary, - it selects only session which triggered - previous resolution. Key manager will do - something to install a state with proper - selector. 
- */ - if (x->km.state == XFRM_STATE_VALID) { - if ((x->sel.family && !xfrm_selector_match(&x->sel, fl, x->sel.family)) || - !security_xfrm_state_pol_flow_match(x, pol, fl)) - continue; - if (!best || - best->km.dying > x->km.dying || - (best->km.dying == x->km.dying && - best->curlft.add_time < x->curlft.add_time)) - best = x; - } else if (x->km.state == XFRM_STATE_ACQ) { - acquire_in_progress = 1; - } else if (x->km.state == XFRM_STATE_ERROR || - x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, x->sel.family) && - security_xfrm_state_pol_flow_match(x, pol, fl)) - error = -ESRCH; - } - } + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, family, daddr, saddr, + &best, &acquire_in_progress, &error); + } + if (best) + goto found; + + h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + if (x->props.family == family && + x->props.reqid == tmpl->reqid && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, family, daddr, saddr, + &best, &acquire_in_progress, &error); } +found: x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && |