diff options
Diffstat (limited to 'net')
215 files changed, 16890 insertions, 5586 deletions
diff --git a/net/802/p8023.c b/net/802/p8023.c index 6368d3dce44..d23e906456e 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -54,8 +54,7 @@ struct datalink_proto *make_8023_client(void) */ void destroy_8023_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_8023_client); diff --git a/net/Makefile b/net/Makefile index 4aa2f46d2a5..f5141b9d4f3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ -obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 8e37e71e34f..1b683f30265 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1138,10 +1138,8 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; /* * Handle digi-peaters to be used. diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 73cfc3411c4..4cf87540fb3 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -401,10 +401,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, } if (dp.ndigi == 0) { - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; } else { /* Reverse the source SABM's path */ memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi)); diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 26b77d97222..b1e945bd6ed 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -54,15 +54,13 @@ void ax25_rt_device_down(struct net_device *dev) if (s->dev == dev) { if (ax25_route_list == s) { ax25_route_list = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); } else { for (t = ax25_route_list; t != NULL; t = t->next) { if (t->next == s) { t->next = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); break; } @@ -90,10 +88,8 @@ static int ax25_rt_add(struct ax25_routes_struct *route) while (ax25_rt != NULL) { if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 && ax25_rt->dev == ax25_dev->dev) { - if (ax25_rt->digipeat != NULL) { - kfree(ax25_rt->digipeat); - ax25_rt->digipeat = NULL; - } + kfree(ax25_rt->digipeat); + ax25_rt->digipeat = NULL; if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock(&ax25_route_lock); @@ -145,8 +141,7 @@ static int ax25_rt_add(struct ax25_routes_struct *route) static void ax25_rt_destroy(ax25_route *ax25_rt) { if (atomic_read(&ax25_rt->ref) == 0) { - if (ax25_rt->digipeat != NULL) - kfree(ax25_rt->digipeat); + kfree(ax25_rt->digipeat); kfree(ax25_rt); return; } @@ -530,9 +525,7 @@ void __exit ax25_rt_free(void) s = ax25_rt; ax25_rt = ax25_rt->next; - if (s->digipeat != NULL) - kfree(s->digipeat); - + kfree(s->digipeat); kfree(s); } write_unlock(&ax25_route_lock); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 12b43345b54..ea616e3fc98 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -36,7 +36,6 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/proc_fs.h> #include <net/sock.h> #if defined(CONFIG_KMOD) @@ -50,10 +49,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" - -struct proc_dir_entry *proc_bt; -EXPORT_SYMBOL(proc_bt); +#define VERSION "2.8" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 @@ -308,20 +304,10 @@ static struct net_proto_family bt_sock_family_ops = { .create = bt_sock_create, }; -extern int hci_sock_init(void); -extern int hci_sock_cleanup(void); - -extern int bt_sysfs_init(void); -extern int bt_sysfs_cleanup(void); - static int __init bt_init(void) { BT_INFO("Core ver %s", VERSION); - proc_bt = proc_mkdir("bluetooth", NULL); - if (proc_bt) - proc_bt->owner = THIS_MODULE; - sock_register(&bt_sock_family_ops); BT_INFO("HCI device and connection manager initialized"); @@ -340,8 +326,6 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); sock_unregister(PF_BLUETOOTH); - - remove_proc_entry("bluetooth", NULL); } subsys_initcall(bt_init); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 55dc42eac92..9106354c781 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -87,7 +87,7 @@ int hci_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&hci_notifier, nb); } -void hci_notify(struct hci_dev *hdev, int event) +static void hci_notify(struct hci_dev *hdev, int event) { notifier_call_chain(&hci_notifier, event, hdev); } @@ -183,7 +183,7 @@ static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) static void hci_init_req(struct hci_dev *hdev, unsigned long opt) { struct sk_buff *skb; - __u16 param; + __le16 param; BT_DBG("%s %ld", hdev->name, opt); @@ -1347,7 +1347,7 @@ static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb); } -void hci_rx_task(unsigned long arg) +static void hci_rx_task(unsigned long arg) { struct hci_dev *hdev = (struct hci_dev *) arg; struct sk_buff *skb; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b61b4e8e36f..eb64555d1fb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -242,7 +242,7 @@ static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb break; status = *((__u8 *) skb->data); - setting = __le16_to_cpu(get_unaligned((__u16 *) sent)); + setting = __le16_to_cpu(get_unaligned((__le16 *) sent)); if (!status && hdev->voice_setting != setting) { hdev->voice_setting = setting; @@ -728,7 +728,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data; - __u16 *ptr; + __le16 *ptr; int i; skb_pull(skb, sizeof(*ev)); @@ -742,7 +742,7 @@ static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *s tasklet_disable(&hdev->tx_task); - for (i = 0, ptr = (__u16 *) skb->data; i < ev->num_hndl; i++) { + for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) { struct hci_conn *conn; __u16 handle, count; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 32ef7975a13..1d6d0a15c09 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -66,20 +66,20 @@ static struct hci_sec_filter hci_sec_filter = { /* Packet types */ 0x10, /* Events */ - { 0x1000d9fe, 0x0000300c }, + { 0x1000d9fe, 0x0000b00c }, /* Commands */ { { 0x0 }, /* OGF_LINK_CTL */ - { 0xbe000006, 0x00000001, 0x0000, 0x00 }, + { 0xbe000006, 0x00000001, 0x000000, 0x00 }, /* OGF_LINK_POLICY */ - { 0x00005200, 0x00000000, 0x0000, 0x00 }, + { 0x00005200, 0x00000000, 0x000000, 0x00 }, /* OGF_HOST_CTL */ - { 0xaab00200, 0x2b402aaa, 0x0154, 0x00 }, + { 0xaab00200, 0x2b402aaa, 0x020154, 0x00 }, /* OGF_INFO_PARAM */ - { 0x000002be, 0x00000000, 0x0000, 0x00 }, + { 0x000002be, 0x00000000, 0x000000, 0x00 }, /* OGF_STATUS_PARAM */ - { 0x000000ea, 0x00000000, 0x0000, 0x00 } + { 0x000000ea, 0x00000000, 0x000000, 0x00 } } }; @@ -416,7 +416,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = (void *) hdev; if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { - u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); + u16 opcode = __le16_to_cpu(get_unaligned((__le16 *) skb->data)); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 7856bc26acc..bd7568ac87f 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -103,7 +103,7 @@ static void bt_release(struct class_device *cdev) kfree(hdev); } -static struct class bt_class = { +struct class bt_class = { .name = "bluetooth", .release = bt_release, #ifdef CONFIG_HOTPLUG @@ -111,6 +111,8 @@ static struct class bt_class = { #endif }; +EXPORT_SYMBOL_GPL(bt_class); + int hci_register_sysfs(struct hci_dev *hdev) { struct class_device *cdev = &hdev->class_dev; diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig index 4e958f7d941..edfea772fb6 100644 --- a/net/bluetooth/hidp/Kconfig +++ b/net/bluetooth/hidp/Kconfig @@ -1,6 +1,6 @@ config BT_HIDP tristate "HIDP protocol support" - depends on BT && BT_L2CAP + depends on BT && BT_L2CAP && (BROKEN || !S390) select INPUT help HIDP (Human Interface Device Protocol) is a transport layer diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index de8af5f4239..cdb9cfafd96 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -520,7 +520,7 @@ static int hidp_session(void *arg) if (session->input) { input_unregister_device(session->input); - kfree(session->input); + session->input = NULL; } up_write(&hidp_session_sem); @@ -536,6 +536,8 @@ static inline void hidp_setup_input(struct hidp_session *session, struct hidp_co input->private = session; + input->name = "Bluetooth HID Boot Protocol Device"; + input->id.bustype = BUS_BLUETOOTH; input->id.vendor = req->vendor; input->id.product = req->product; @@ -582,16 +584,15 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, return -ENOTUNIQ; session = kmalloc(sizeof(struct hidp_session), GFP_KERNEL); - if (!session) + if (!session) return -ENOMEM; memset(session, 0, sizeof(struct hidp_session)); - session->input = kmalloc(sizeof(struct input_dev), GFP_KERNEL); + session->input = input_allocate_device(); if (!session->input) { kfree(session); return -ENOMEM; } - memset(session->input, 0, sizeof(struct input_dev)); down_write(&hidp_session_sem); @@ -651,15 +652,15 @@ unlink: __hidp_unlink_session(session); - if (session->input) + if (session->input) { input_unregister_device(session->input); + session->input = NULL; /* don't try to free it here */ + } failed: up_write(&hidp_session_sem); - if (session->input) - kfree(session->input); - + kfree(session->input); kfree(session); return err; } diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 59b2dd36baa..e3bb11ca423 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -38,9 +38,8 @@ #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/list.h> +#include <linux/device.h> #include <net/sock.h> #include <asm/system.h> @@ -56,7 +55,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" +#define VERSION "2.8" static struct proto_ops l2cap_sock_ops; @@ -2137,94 +2136,29 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *l2cap_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&l2cap_sk_list.lock); - sk_for_each(sk, node, &l2cap_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} + sk_for_each(sk, node, &l2cap_sk_list.head) { + struct l2cap_pinfo *pi = l2cap_pi(sk); -static void *l2cap_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - (*pos)++; - return sk_next(e); -} + str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, + pi->omtu, pi->link_mode); + } -static void l2cap_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&l2cap_sk_list.lock); -} -static int l2cap_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - struct l2cap_pinfo *pi = l2cap_pi(sk); - - seq_printf(seq, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, - pi->omtu, pi->link_mode); - return 0; + return (str - buf); } -static struct seq_operations l2cap_seq_ops = { - .start = l2cap_seq_start, - .next = l2cap_seq_next, - .stop = l2cap_seq_stop, - .show = l2cap_seq_show -}; - -static int l2cap_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &l2cap_seq_ops); -} - -static struct file_operations l2cap_seq_fops = { - .owner = THIS_MODULE, - .open = l2cap_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init l2cap_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("l2cap", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &l2cap_seq_fops; - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - remove_proc_entry("l2cap", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init l2cap_proc_init(void) -{ - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); static struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, @@ -2266,7 +2200,7 @@ static struct hci_proto l2cap_hci_proto = { static int __init l2cap_init(void) { int err; - + err = proto_register(&l2cap_proto, 0); if (err < 0) return err; @@ -2284,7 +2218,7 @@ static int __init l2cap_init(void) goto error; } - l2cap_proc_init(); + class_create_file(&bt_class, &class_attr_l2cap); BT_INFO("L2CAP ver %s", VERSION); BT_INFO("L2CAP socket layer initialized"); @@ -2298,7 +2232,7 @@ error: static void __exit l2cap_exit(void) { - l2cap_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_l2cap); if (bt_sock_unregister(BTPROTO_L2CAP) < 0) BT_ERR("L2CAP socket unregistration failed"); diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile index aecec45ec68..fe07988a370 100644 --- a/net/bluetooth/rfcomm/Makefile +++ b/net/bluetooth/rfcomm/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_BT_RFCOMM) += rfcomm.o -rfcomm-y := core.o sock.o crc.o +rfcomm-y := core.o sock.o rfcomm-$(CONFIG_BT_RFCOMM_TTY) += tty.o diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 35adce6482b..0d89d643413 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -35,9 +35,8 @@ #include <linux/signal.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/device.h> #include <linux/net.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <net/sock.h> #include <asm/uaccess.h> #include <asm/unaligned.h> @@ -47,17 +46,13 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> -#define VERSION "1.5" +#define VERSION "1.6" #ifndef CONFIG_BT_RFCOMM_DEBUG #undef BT_DBG #define BT_DBG(D...) #endif -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_bt_rfcomm; -#endif - static struct task_struct *rfcomm_thread; static DECLARE_MUTEX(rfcomm_sem); @@ -133,6 +128,49 @@ static inline void rfcomm_session_put(struct rfcomm_session *s) /* ---- RFCOMM FCS computation ---- */ +/* reversed, 8-bit, poly=0x07 */ +static unsigned char rfcomm_crc_table[256] = { + 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, + 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, + 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, + 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, + + 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, + 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, + 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, + 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, + + 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, + 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, + 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, + 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, + + 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, + 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, + 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, + 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, + + 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, + 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, + 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, + 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, + + 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, + 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, + 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, + 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, + + 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, + 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, + 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, + 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, + + 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, + 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, + 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, + 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf +}; + /* CRC on 2 bytes */ #define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]]) @@ -1958,117 +1996,32 @@ static struct hci_cb rfcomm_cb = { .encrypt_cfm = rfcomm_encrypt_cfm }; -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) { struct rfcomm_session *s; struct list_head *pp, *p; - loff_t l = *pos; + char *str = buf; rfcomm_lock(); list_for_each(p, &session_list) { s = list_entry(p, struct rfcomm_session, list); - list_for_each(pp, &s->dlcs) - if (!l--) { - seq->private = s; - return pp; - } - } - return NULL; -} - -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct rfcomm_session *s = seq->private; - struct list_head *pp, *p = e; - (*pos)++; + list_for_each(pp, &s->dlcs) { + struct sock *sk = s->sock->sk; + struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list); - if (p->next != &s->dlcs) - return p->next; - - list_for_each(p, &session_list) { - s = list_entry(p, struct rfcomm_session, list); - __list_for_each(pp, &s->dlcs) { - seq->private = s; - return pp; + str += sprintf(str, "%s %s %ld %d %d %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); } } - return NULL; -} -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ rfcomm_unlock(); -} - -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct rfcomm_session *s = seq->private; - struct sock *sk = s->sock->sk; - struct rfcomm_dlc *d = list_entry(e, struct rfcomm_dlc, list); - - seq_printf(seq, "%s %s %ld %d %d %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); -} - -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_proc_init(void) -{ - struct proc_dir_entry *p; - - proc_bt_rfcomm = proc_mkdir("rfcomm", proc_bt); - if (proc_bt_rfcomm) { - proc_bt_rfcomm->owner = THIS_MODULE; - - p = create_proc_entry("dlc", S_IRUGO, proc_bt_rfcomm); - if (p) - p->proc_fops = &rfcomm_seq_fops; - } - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - remove_proc_entry("dlc", proc_bt_rfcomm); - - remove_proc_entry("rfcomm", proc_bt); + return (str - buf); } -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL); /* ---- Initialization ---- */ static int __init rfcomm_init(void) @@ -2079,9 +2032,7 @@ static int __init rfcomm_init(void) kernel_thread(rfcomm_run, NULL, CLONE_KERNEL); - BT_INFO("RFCOMM ver %s", VERSION); - - rfcomm_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm_dlc); rfcomm_init_sockets(); @@ -2089,11 +2040,15 @@ static int __init rfcomm_init(void) rfcomm_init_ttys(); #endif + BT_INFO("RFCOMM ver %s", VERSION); + return 0; } static void __exit rfcomm_exit(void) { + class_remove_file(&bt_class, &class_attr_rfcomm_dlc); + hci_unregister_cb(&rfcomm_cb); /* Terminate working thread. @@ -2110,8 +2065,6 @@ static void __exit rfcomm_exit(void) #endif rfcomm_cleanup_sockets(); - - rfcomm_proc_cleanup(); } module_init(rfcomm_init); diff --git a/net/bluetooth/rfcomm/crc.c b/net/bluetooth/rfcomm/crc.c deleted file mode 100644 index 1011bc4a869..00000000000 --- a/net/bluetooth/rfcomm/crc.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - RFCOMM implementation for Linux Bluetooth stack (BlueZ). - Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> - Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 as - published by the Free Software Foundation; - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY - CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, - COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS - SOFTWARE IS DISCLAIMED. -*/ - -/* - * RFCOMM FCS calculation. - * - * $Id: crc.c,v 1.2 2002/09/21 09:54:32 holtmann Exp $ - */ - -/* reversed, 8-bit, poly=0x07 */ -unsigned char rfcomm_crc_table[256] = { - 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, - 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, - 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, - 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, - - 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, - 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, - 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, - 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, - - 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, - 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, - 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, - 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, - - 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, - 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, - 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, - 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, - - 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, - 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, - 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, - 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, - - 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, - 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, - 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, - 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, - - 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, - 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, - 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, - 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, - - 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, - 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, - 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, - 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf -}; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index a2b30f0aedb..6c34261b232 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -42,8 +42,7 @@ #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/list.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> +#include <linux/device.h> #include <net/sock.h> #include <asm/system.h> @@ -887,89 +886,26 @@ done: return result; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&rfcomm_sk_list.lock); - sk_for_each(sk, node, &rfcomm_sk_list.head) - if (!l--) - return sk; - return NULL; -} - -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &rfcomm_sk_list.head) { + str += sprintf(str, "%s %s %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, rfcomm_pi(sk)->channel); + } -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&rfcomm_sk_list.lock); -} -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, rfcomm_pi(sk)->channel); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; - -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); + return (str - buf); } -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_sock_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sock", S_IRUGO, proc_bt_rfcomm); - if (!p) - return -ENOMEM; - p->proc_fops = &rfcomm_seq_fops; - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - remove_proc_entry("sock", proc_bt_rfcomm); -} - -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_sock_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); static struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, @@ -997,7 +933,7 @@ static struct net_proto_family rfcomm_sock_family_ops = { .create = rfcomm_sock_create }; -int __init rfcomm_init_sockets(void) +int __init rfcomm_init_sockets(void) { int err; @@ -1009,7 +945,7 @@ int __init rfcomm_init_sockets(void) if (err < 0) goto error; - rfcomm_sock_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm); BT_INFO("RFCOMM socket layer initialized"); @@ -1023,7 +959,7 @@ error: void __exit rfcomm_cleanup_sockets(void) { - rfcomm_sock_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_rfcomm); if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) BT_ERR("RFCOMM socket layer unregistration failed"); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 997e42df115..9cb00dc6c08 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -38,8 +38,7 @@ #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> +#include <linux/device.h> #include <linux/list.h> #include <net/sock.h> @@ -55,7 +54,7 @@ #define BT_DBG(D...) #endif -#define VERSION "0.4" +#define VERSION "0.5" static struct proto_ops sco_sock_ops; @@ -893,91 +892,26 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *sco_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t sco_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} - -static void *sco_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &sco_sk_list.head) { + str += sprintf(str, "%s %s %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state); + } -static void sco_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&sco_sk_list.lock); -} - -static int sco_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state); - return 0; -} -static struct seq_operations sco_seq_ops = { - .start = sco_seq_start, - .next = sco_seq_next, - .stop = sco_seq_stop, - .show = sco_seq_show -}; - -static int sco_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &sco_seq_ops); + return (str - buf); } -static struct file_operations sco_seq_fops = { - .owner = THIS_MODULE, - .open = sco_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init sco_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sco", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &sco_seq_fops; - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - remove_proc_entry("sco", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init sco_proc_init(void) -{ - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); static struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, @@ -1035,7 +969,7 @@ static int __init sco_init(void) goto error; } - sco_proc_init(); + class_create_file(&bt_class, &class_attr_sco); BT_INFO("SCO (Voice Link) ver %s", VERSION); BT_INFO("SCO socket layer initialized"); @@ -1049,7 +983,7 @@ error: static void __exit sco_exit(void) { - sco_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_sco); if (bt_sock_unregister(BTPROTO_SCO) < 0) BT_ERR("SCO socket unregistration failed"); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 24396b914d1..1f08a59b51e 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -86,8 +86,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_port *op; list_for_each_entry(op, &br->port_list, list) { if (op != p && - !memcmp(op->dev->dev_addr, - f->addr.addr, ETH_ALEN)) { + !compare_ether_addr(op->dev->dev_addr, + f->addr.addr)) { f->dst = op; goto insert; } @@ -151,8 +151,8 @@ void br_fdb_delete_by_port(struct net_bridge *br, struct net_bridge_port *p) struct net_bridge_port *op; list_for_each_entry(op, &br->port_list, list) { if (op != p && - !memcmp(op->dev->dev_addr, - f->addr.addr, ETH_ALEN)) { + !compare_ether_addr(op->dev->dev_addr, + f->addr.addr)) { f->dst = op; goto skip_delete; } @@ -174,7 +174,7 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, struct net_bridge_fdb_entry *fdb; hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) { - if (!memcmp(fdb->addr.addr, addr, ETH_ALEN)) { + if (!compare_ether_addr(fdb->addr.addr, addr)) { if (unlikely(has_expired(br, fdb))) break; return fdb; @@ -264,7 +264,7 @@ static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, struct net_bridge_fdb_entry *fdb; hlist_for_each_entry_rcu(fdb, h, head, hlist) { - if (!memcmp(fdb->addr.addr, addr, ETH_ALEN)) + if (!compare_ether_addr(fdb->addr.addr, addr)) return fdb; } return NULL; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 9a45e6279c5..b88220a64cd 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -128,7 +128,7 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) dest = eth_hdr(skb)->h_dest; } - if (!memcmp(p->br->dev->dev_addr, dest, ETH_ALEN)) + if (!compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 0da11ff05fa..ac09b6a2352 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -15,6 +15,7 @@ #include <linux/kernel.h> #include <linux/smp_lock.h> +#include <linux/etherdevice.h> #include "br_private.h" #include "br_private_stp.h" @@ -133,10 +134,10 @@ static void br_stp_change_bridge_id(struct net_bridge *br, memcpy(br->dev->dev_addr, addr, ETH_ALEN); list_for_each_entry(p, &br->port_list, list) { - if (!memcmp(p->designated_bridge.addr, oldaddr, ETH_ALEN)) + if (!compare_ether_addr(p->designated_bridge.addr, oldaddr)) memcpy(p->designated_bridge.addr, addr, ETH_ALEN); - if (!memcmp(p->designated_root.addr, oldaddr, ETH_ALEN)) + if (!compare_ether_addr(p->designated_root.addr, oldaddr)) memcpy(p->designated_root.addr, addr, ETH_ALEN); } @@ -157,12 +158,12 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || - memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) + compare_ether_addr(p->dev->dev_addr, addr) < 0) addr = p->dev->dev_addr; } - if (memcmp(br->bridge_id.addr, addr, ETH_ALEN)) + if (compare_ether_addr(br->bridge_id.addr, addr)) br_stp_change_bridge_id(br, addr); } diff --git a/net/core/datagram.c b/net/core/datagram.c index 81987df536e..1bcfef51ac5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, { int i, err, fraglen, end = 0; struct sk_buff *next = skb_shinfo(skb)->frag_list; + + if (!len) + return 0; + next_skb: fraglen = skb_headlen(skb); i = -1; @@ -346,6 +350,20 @@ fault: return -EFAULT; } +unsigned int __skb_checksum_complete(struct sk_buff *skb) +{ + unsigned int sum; + + sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + /** * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. * @skb: skbuff @@ -359,7 +377,7 @@ fault: * -EFAULT - fault during copy. Beware, in this case iovec * can be modified! */ -int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov) { unsigned int csum; @@ -372,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, iov++; if (iov->iov_len < chunk) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, - skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) goto fault; @@ -384,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, goto fault; if ((unsigned short)csum_fold(csum)) goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); iov->iov_len -= chunk; iov->iov_base += chunk; } diff --git a/net/core/dev.c b/net/core/dev.c index a44eeef24ed..0b48e294aaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1108,6 +1108,18 @@ out: return ret; } +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + #ifdef CONFIG_HIGHMEM /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. @@ -2717,6 +2729,20 @@ int register_netdevice(struct net_device *dev) dev->name); dev->features &= ~NETIF_F_TSO; } + if (dev->features & NETIF_F_UFO) { + if (!(dev->features & NETIF_F_HW_CSUM)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_HW_CSUM feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + if (!(dev->features & NETIF_F_SG)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_SG feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + } /* * nil rebuild_header routine, diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index db098ff3cd6..cb530eef0e3 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -194,8 +194,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) done: spin_unlock_bh(&dev->xmit_lock); - if (dmi1) - kfree(dmi1); + kfree(dmi1); return err; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 404b761e82c..0350586e919 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -93,6 +93,20 @@ int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *a } +u32 ethtool_op_get_ufo(struct net_device *dev) +{ + return (dev->features & NETIF_F_UFO) != 0; +} + +int ethtool_op_set_ufo(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_UFO; + else + dev->features &= ~NETIF_F_UFO; + return 0; +} + /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) @@ -483,6 +497,11 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) return err; } + if (!data && dev->ethtool_ops->set_ufo) { + err = dev->ethtool_ops->set_ufo(dev, 0); + if (err) + return err; + } return dev->ethtool_ops->set_sg(dev, data); } @@ -569,6 +588,32 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_tso(dev, edata.data); } +static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTSO }; + + if (!dev->ethtool_ops->get_ufo) + return -EOPNOTSUPP; + edata.data = dev->ethtool_ops->get_ufo(dev); + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} +static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_ufo) + return -EOPNOTSUPP; + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.data && !(dev->features & NETIF_F_SG)) + return -EINVAL; + if (edata.data && !(dev->features & NETIF_F_HW_CSUM)) + return -EINVAL; + return dev->ethtool_ops->set_ufo(dev, edata.data); +} + static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -854,6 +899,12 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; + case ETHTOOL_GUFO: + rc = ethtool_get_ufo(dev, useraddr); + break; + case ETHTOOL_SUFO: + rc = ethtool_set_ufo(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } @@ -882,3 +933,5 @@ EXPORT_SYMBOL(ethtool_op_set_sg); EXPORT_SYMBOL(ethtool_op_set_tso); EXPORT_SYMBOL(ethtool_op_set_tx_csum); EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); +EXPORT_SYMBOL(ethtool_op_set_ufo); +EXPORT_SYMBOL(ethtool_op_get_ufo); diff --git a/net/core/filter.c b/net/core/filter.c index 079c2edff78..2841bfce29d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -116,8 +116,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) A /= X; continue; case BPF_ALU|BPF_DIV|BPF_K: - if (fentry->k == 0) - return 0; A /= fentry->k; continue; case BPF_ALU|BPF_AND|BPF_X: @@ -320,6 +318,10 @@ int sk_chk_filter(struct sock_filter *filter, int flen) } } + /* check for division by zero -Kris Katterjohn 2005-10-30 */ + if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) + return -EINVAL; + /* check that memory operations use valid addresses. */ if (ftest->k >= BPF_MEMWORDS) { /* but it might not be a memory operation... */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1dcf7fa1f0f..e68700f950a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1625,12 +1625,9 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb, memset(&ndst, 0, sizeof(ndst)); - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_cpu(cpu) { struct neigh_statistics *st; - if (!cpu_possible(cpu)) - continue; - st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += st->allocs; ndst.ndts_destroys += st->destroys; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 802fe11efad..49424a42a2c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb) static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { - if (uh->check == 0) + unsigned int psum; + + if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) return 0; - if (skb->ip_summed == CHECKSUM_HW) - return csum_tcpudp_magic( - saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_HW && + !(u16)csum_fold(csum_add(psum, skb->csum))) + return 0; - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + skb->csum = psum; - return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } /* @@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb) if (ulen != len) goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; if (np->local_ip && np->local_ip != ntohl(iph->daddr)) goto out; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5f043d34669..7fc3e9e28c3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -75,7 +75,7 @@ * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" * to /proc gives result code thats should be read be the "writer". - * For pratical use this should be no problem. + * For practical use this should be no problem. * * Note when adding devices to a specific CPU there good idea to also assign * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU. @@ -96,7 +96,7 @@ * New xmit() return, do_div and misc clean up by Stephen Hemminger * <shemminger@osdl.org> 040923 * - * Rany Dunlap fixed u64 printk compiler waring + * Randy Dunlap fixed u64 printk compiler waring * * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org> * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213 @@ -137,6 +137,7 @@ #include <linux/ipv6.h> #include <linux/udp.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/wait.h> #include <net/checksum.h> #include <net/ipv6.h> @@ -151,7 +152,7 @@ #include <asm/timex.h> -#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n" +#define VERSION "pktgen v2.63: Packet Generator for packet performance testing.\n" /* #define PG_DEBUG(a) a */ #define PG_DEBUG(a) @@ -177,8 +178,8 @@ #define T_REMDEV (1<<3) /* Remove all devs */ /* Locks */ -#define thread_lock() spin_lock(&_thread_lock) -#define thread_unlock() spin_unlock(&_thread_lock) +#define thread_lock() down(&pktgen_sem) +#define thread_unlock() up(&pktgen_sem) /* If lock -- can be removed after some work */ #define if_lock(t) spin_lock(&(t->if_lock)); @@ -186,7 +187,9 @@ /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 -#define PG_PROC_DIR "net/pktgen" +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" +static struct proc_dir_entry *pg_proc_dir = NULL; #define MAX_CFLOWS 65536 @@ -202,11 +205,8 @@ struct pktgen_dev { * Try to keep frequent/infrequent used vars. separated. */ - char ifname[32]; - struct proc_dir_entry *proc_ent; + char ifname[IFNAMSIZ]; char result[512]; - /* proc file names */ - char fname[80]; struct pktgen_thread* pg_thread; /* the owner */ struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */ @@ -244,7 +244,7 @@ struct pktgen_dev { __u32 seq_num; int clone_skb; /* Use multiple SKBs during packet gen. If this number - * is greater than 1, then that many coppies of the same + * is greater than 1, then that many copies of the same * packet will be sent before a new packet is allocated. * For instance, if you want to send 1024 identical packets * before creating a new packet, set clone_skb to 1024. @@ -330,8 +330,6 @@ struct pktgen_thread { struct pktgen_dev *if_list; /* All device here */ struct pktgen_thread* next; char name[32]; - char fname[128]; /* name of proc file */ - struct proc_dir_entry *proc_ent; char result[512]; u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ @@ -396,7 +394,7 @@ static inline s64 divremdi3(s64 x, s64 y, int type) /* End of hacks to deal with 64-bit math on x86 */ -/** Convert to miliseconds */ +/** Convert to milliseconds */ static inline __u64 tv_to_ms(const struct timeval* tv) { __u64 ms = tv->tv_usec / 1000; @@ -425,7 +423,7 @@ static inline __u64 pg_div64(__u64 n, __u64 base) { __u64 tmp = n; /* - * How do we know if the architectrure we are running on + * How do we know if the architecture we are running on * supports division with 64 bit base? * */ @@ -473,16 +471,6 @@ static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b) static char version[] __initdata = VERSION; -static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos); -static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos); -static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); - -static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); -static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); -static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); -static int create_proc_dir(void); -static int remove_proc_dir(void); - static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread* t, const char* ifname); static struct pktgen_thread* pktgen_find_thread(const char* name); @@ -503,83 +491,41 @@ static int pg_delay_d = 0; static int pg_clone_skb_d = 0; static int debug = 0; -static DEFINE_SPINLOCK(_thread_lock); +static DECLARE_MUTEX(pktgen_sem); static struct pktgen_thread *pktgen_threads = NULL; -static char module_fname[128]; -static struct proc_dir_entry *module_proc_ent = NULL; - static struct notifier_block pktgen_notifier_block = { .notifier_call = pktgen_device_event, }; -static struct file_operations pktgen_fops = { - .read = proc_pgctrl_read, - .write = proc_pgctrl_write, - /* .ioctl = pktgen_ioctl, later maybe */ -}; - /* * /proc handling functions * */ -static struct proc_dir_entry *pg_proc_dir = NULL; -static int proc_pgctrl_read_eof=0; - -static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, - size_t count, loff_t *ppos) +static int pgctrl_show(struct seq_file *seq, void *v) { - char data[200]; - int len = 0; - - if(proc_pgctrl_read_eof) { - proc_pgctrl_read_eof=0; - len = 0; - goto out; - } - - sprintf(data, "%s", VERSION); - - len = strlen(data); - - if(len > count) { - len =-EFAULT; - goto out; - } - - if (copy_to_user(buf, data, len)) { - len =-EFAULT; - goto out; - } - - *ppos += len; - proc_pgctrl_read_eof=1; /* EOF next call */ - - out: - return len; + seq_puts(seq, VERSION); + return 0; } -static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf, - size_t count, loff_t *ppos) +static ssize_t pgctrl_write(struct file* file,const char __user * buf, + size_t count, loff_t *ppos) { - char *data = NULL; int err = 0; + char data[128]; if (!capable(CAP_NET_ADMIN)){ err = -EPERM; goto out; } - data = (void*)vmalloc ((unsigned int)count); + if (count > sizeof(data)) + count = sizeof(data); - if(!data) { - err = -ENOMEM; - goto out; - } if (copy_from_user(data, buf, count)) { - err =-EFAULT; - goto out_free; + err = -EFAULT; + goto out; } data[count-1] = 0; /* Make string */ @@ -594,31 +540,40 @@ static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf, err = count; - out_free: - vfree (data); out: return err; } -static int proc_if_read(char *buf , char **start, off_t offset, - int len, int *eof, void *data) +static int pgctrl_open(struct inode *inode, struct file *file) +{ + return single_open(file, pgctrl_show, PDE(inode)->data); +} + +static struct file_operations pktgen_fops = { + .owner = THIS_MODULE, + .open = pgctrl_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pgctrl_write, + .release = single_release, +}; + +static int pktgen_if_show(struct seq_file *seq, void *v) { - char *p; int i; - struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); + struct pktgen_dev *pkt_dev = seq->private; __u64 sa; __u64 stopped; __u64 now = getCurUs(); - p = buf; - p += sprintf(p, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", - (unsigned long long) pkt_dev->count, - pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + seq_printf(seq, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long) pkt_dev->count, + pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); - p += sprintf(p, " frags: %d delay: %u clone_skb: %d ifname: %s\n", - pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname); + seq_printf(seq, " frags: %d delay: %u clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname); - p += sprintf(p, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); + seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); if(pkt_dev->flags & F_IPV6) { @@ -626,19 +581,19 @@ static int proc_if_read(char *buf , char **start, off_t offset, fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr); fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr); - p += sprintf(p, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3); + seq_printf(seq, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3); fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr); fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr); - p += sprintf(p, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3); + seq_printf(seq, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3); } else - p += sprintf(p, " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n", - pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max); + seq_printf(seq," dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max); - p += sprintf(p, " src_mac: "); + seq_puts(seq, " src_mac: "); if ((pkt_dev->src_mac[0] == 0) && (pkt_dev->src_mac[1] == 0) && @@ -648,89 +603,89 @@ static int proc_if_read(char *buf , char **start, off_t offset, (pkt_dev->src_mac[5] == 0)) for (i = 0; i < 6; i++) - p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":"); + seq_printf(seq, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":"); else for (i = 0; i < 6; i++) - p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":"); + seq_printf(seq, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":"); - p += sprintf(p, "dst_mac: "); + seq_printf(seq, "dst_mac: "); for (i = 0; i < 6; i++) - p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":"); + seq_printf(seq, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":"); - p += sprintf(p, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", - pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, - pkt_dev->udp_dst_max); + seq_printf(seq, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, + pkt_dev->udp_dst_max); - p += sprintf(p, " src_mac_count: %d dst_mac_count: %d \n Flags: ", - pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + seq_printf(seq, " src_mac_count: %d dst_mac_count: %d \n Flags: ", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); if (pkt_dev->flags & F_IPV6) - p += sprintf(p, "IPV6 "); + seq_printf(seq, "IPV6 "); if (pkt_dev->flags & F_IPSRC_RND) - p += sprintf(p, "IPSRC_RND "); + seq_printf(seq, "IPSRC_RND "); if (pkt_dev->flags & F_IPDST_RND) - p += sprintf(p, "IPDST_RND "); + seq_printf(seq, "IPDST_RND "); if (pkt_dev->flags & F_TXSIZE_RND) - p += sprintf(p, "TXSIZE_RND "); + seq_printf(seq, "TXSIZE_RND "); if (pkt_dev->flags & F_UDPSRC_RND) - p += sprintf(p, "UDPSRC_RND "); + seq_printf(seq, "UDPSRC_RND "); if (pkt_dev->flags & F_UDPDST_RND) - p += sprintf(p, "UDPDST_RND "); + seq_printf(seq, "UDPDST_RND "); if (pkt_dev->flags & F_MACSRC_RND) - p += sprintf(p, "MACSRC_RND "); + seq_printf(seq, "MACSRC_RND "); if (pkt_dev->flags & F_MACDST_RND) - p += sprintf(p, "MACDST_RND "); + seq_printf(seq, "MACDST_RND "); - p += sprintf(p, "\n"); + seq_puts(seq, "\n"); sa = pkt_dev->started_at; stopped = pkt_dev->stopped_at; if (pkt_dev->running) stopped = now; /* not really stopped, more like last-running-at */ - p += sprintf(p, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", - (unsigned long long) pkt_dev->sofar, - (unsigned long long) pkt_dev->errors, - (unsigned long long) sa, - (unsigned long long) stopped, - (unsigned long long) pkt_dev->idle_acc); + seq_printf(seq, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) pkt_dev->sofar, + (unsigned long long) pkt_dev->errors, + (unsigned long long) sa, + (unsigned long long) stopped, + (unsigned long long) pkt_dev->idle_acc); - p += sprintf(p, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", - pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset); + seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, + pkt_dev->cur_src_mac_offset); if(pkt_dev->flags & F_IPV6) { char b1[128], b2[128]; fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); - p += sprintf(p, " cur_saddr: %s cur_daddr: %s\n", b2, b1); + seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1); } else - p += sprintf(p, " cur_saddr: 0x%x cur_daddr: 0x%x\n", - pkt_dev->cur_saddr, pkt_dev->cur_daddr); + seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", + pkt_dev->cur_saddr, pkt_dev->cur_daddr); - p += sprintf(p, " cur_udp_dst: %d cur_udp_src: %d\n", - pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); - p += sprintf(p, " flows: %u\n", pkt_dev->nflows); + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); if (pkt_dev->result[0]) - p += sprintf(p, "Result: %s\n", pkt_dev->result); + seq_printf(seq, "Result: %s\n", pkt_dev->result); else - p += sprintf(p, "Result: Idle\n"); - *eof = 1; + seq_printf(seq, "Result: Idle\n"); - return p - buf; + return 0; } @@ -802,13 +757,14 @@ done_str: return i; } -static int proc_if_write(struct file *file, const char __user *user_buffer, - unsigned long count, void *data) +static ssize_t pktgen_if_write(struct file *file, const char __user *user_buffer, + size_t count, loff_t *offset) { + struct seq_file *seq = (struct seq_file *) file->private_data; + struct pktgen_dev *pkt_dev = seq->private; int i = 0, max, len; char name[16], valstr[32]; unsigned long value = 0; - struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); char* pg_result = NULL; int tmp = 0; char buf[128]; @@ -849,7 +805,8 @@ static int proc_if_write(struct file *file, const char __user *user_buffer, if (copy_from_user(tb, user_buffer, count)) return -EFAULT; tb[count] = 0; - printk("pktgen: %s,%lu buffer -:%s:-\n", name, count, tb); + printk("pktgen: %s,%lu buffer -:%s:-\n", name, + (unsigned long) count, tb); } if (!strcmp(name, "min_pkt_size")) { @@ -1335,92 +1292,98 @@ static int proc_if_write(struct file *file, const char __user *user_buffer, return -EINVAL; } -static int proc_thread_read(char *buf , char **start, off_t offset, - int len, int *eof, void *data) +static int pktgen_if_open(struct inode *inode, struct file *file) { - char *p; - struct pktgen_thread *t = (struct pktgen_thread*)(data); - struct pktgen_dev *pkt_dev = NULL; + return single_open(file, pktgen_if_show, PDE(inode)->data); +} +static struct file_operations pktgen_if_fops = { + .owner = THIS_MODULE, + .open = pktgen_if_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_if_write, + .release = single_release, +}; - if (!t) { - printk("pktgen: ERROR: could not find thread in proc_thread_read\n"); - return -EINVAL; - } +static int pktgen_thread_show(struct seq_file *seq, void *v) +{ + struct pktgen_thread *t = seq->private; + struct pktgen_dev *pkt_dev = NULL; + + BUG_ON(!t); - p = buf; - p += sprintf(p, "Name: %s max_before_softirq: %d\n", + seq_printf(seq, "Name: %s max_before_softirq: %d\n", t->name, t->max_before_softirq); - p += sprintf(p, "Running: "); + seq_printf(seq, "Running: "); if_lock(t); for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) if(pkt_dev->running) - p += sprintf(p, "%s ", pkt_dev->ifname); + seq_printf(seq, "%s ", pkt_dev->ifname); - p += sprintf(p, "\nStopped: "); + seq_printf(seq, "\nStopped: "); for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) if(!pkt_dev->running) - p += sprintf(p, "%s ", pkt_dev->ifname); + seq_printf(seq, "%s ", pkt_dev->ifname); if (t->result[0]) - p += sprintf(p, "\nResult: %s\n", t->result); + seq_printf(seq, "\nResult: %s\n", t->result); else - p += sprintf(p, "\nResult: NA\n"); - - *eof = 1; + seq_printf(seq, "\nResult: NA\n"); if_unlock(t); - return p - buf; + return 0; } -static int proc_thread_write(struct file *file, const char __user *user_buffer, - unsigned long count, void *data) +static ssize_t pktgen_thread_write(struct file *file, + const char __user *user_buffer, + size_t count, loff_t *offset) { + struct seq_file *seq = (struct seq_file *) file->private_data; + struct pktgen_thread *t = seq->private; int i = 0, max, len, ret; char name[40]; - struct pktgen_thread *t; char *pg_result; unsigned long value = 0; - + if (count < 1) { // sprintf(pg_result, "Wrong command format"); return -EINVAL; } - + max = count - i; len = count_trail_chars(&user_buffer[i], max); - if (len < 0) - return len; - + if (len < 0) + return len; + i += len; - + /* Read variable name */ len = strn_len(&user_buffer[i], sizeof(name) - 1); - if (len < 0) - return len; + if (len < 0) + return len; memset(name, 0, sizeof(name)); if (copy_from_user(name, &user_buffer[i], len)) return -EFAULT; i += len; - + max = count -i; len = count_trail_chars(&user_buffer[i], max); - if (len < 0) - return len; - + if (len < 0) + return len; + i += len; - if (debug) - printk("pktgen: t=%s, count=%lu\n", name, count); - + if (debug) + printk("pktgen: t=%s, count=%lu\n", name, + (unsigned long) count); - t = (struct pktgen_thread*)(data); if(!t) { printk("pktgen: ERROR: No thread\n"); ret = -EINVAL; @@ -1474,21 +1437,19 @@ static int proc_thread_write(struct file *file, const char __user *user_buffer, return ret; } -static int create_proc_dir(void) +static int pktgen_thread_open(struct inode *inode, struct file *file) { - pg_proc_dir = proc_mkdir(PG_PROC_DIR, NULL); - - if (!pg_proc_dir) - return -ENODEV; - - return 0; + return single_open(file, pktgen_thread_show, PDE(inode)->data); } -static int remove_proc_dir(void) -{ - remove_proc_entry(PG_PROC_DIR, NULL); - return 0; -} +static struct file_operations pktgen_thread_fops = { + .owner = THIS_MODULE, + .open = pktgen_thread_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_thread_write, + .release = single_release, +}; /* Think find or remove for NN */ static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove) @@ -1702,7 +1663,7 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) start = now = getCurUs(); printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now)); while (now < spin_until_us) { - /* TODO: optimise sleeping behavior */ + /* TODO: optimize sleeping behavior */ if (spin_until_us - now > jiffies_to_usecs(1)+1) schedule_timeout_interruptible(1); else if (spin_until_us - now > 100) { @@ -2361,7 +2322,7 @@ static void pktgen_stop_all_threads_ifs(void) pktgen_stop(t); t = t->next; } - thread_unlock(); + thread_unlock(); } static int thread_is_running(struct pktgen_thread *t ) @@ -2552,10 +2513,9 @@ static void pktgen_rem_thread(struct pktgen_thread *t) struct pktgen_thread *tmp = pktgen_threads; - if (strlen(t->fname)) - remove_proc_entry(t->fname, NULL); + remove_proc_entry(t->name, pg_proc_dir); - thread_lock(); + thread_lock(); if (tmp == t) pktgen_threads = tmp->next; @@ -2825,7 +2785,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* i if_lock(t); for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { - if (strcmp(pkt_dev->ifname, ifname) == 0) { + if (strncmp(pkt_dev->ifname, ifname, IFNAMSIZ) == 0) { break; } } @@ -2864,74 +2824,70 @@ static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev static int pktgen_add_device(struct pktgen_thread *t, const char* ifname) { struct pktgen_dev *pkt_dev; + struct proc_dir_entry *pe; /* We don't allow a device to be on several threads */ - if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) { - - pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL); - if (!pkt_dev) - return -ENOMEM; + pkt_dev = __pktgen_NN_threads(ifname, FIND); + if (pkt_dev) { + printk("pktgen: ERROR: interface already used.\n"); + return -EBUSY; + } - memset(pkt_dev, 0, sizeof(struct pktgen_dev)); + pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + if (!pkt_dev) + return -ENOMEM; - pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state)); - if (pkt_dev->flows == NULL) { - kfree(pkt_dev); - return -ENOMEM; - } - memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state)); - - pkt_dev->min_pkt_size = ETH_ZLEN; - pkt_dev->max_pkt_size = ETH_ZLEN; - pkt_dev->nfrags = 0; - pkt_dev->clone_skb = pg_clone_skb_d; - pkt_dev->delay_us = pg_delay_d / 1000; - pkt_dev->delay_ns = pg_delay_d % 1000; - pkt_dev->count = pg_count_d; - pkt_dev->sofar = 0; - pkt_dev->udp_src_min = 9; /* sink port */ - pkt_dev->udp_src_max = 9; - pkt_dev->udp_dst_min = 9; - pkt_dev->udp_dst_max = 9; - - strncpy(pkt_dev->ifname, ifname, 31); - sprintf(pkt_dev->fname, "%s/%s", PG_PROC_DIR, ifname); - - if (! pktgen_setup_dev(pkt_dev)) { - printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); - if (pkt_dev->flows) - vfree(pkt_dev->flows); - kfree(pkt_dev); - return -ENODEV; - } + pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state)); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state)); - pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL); - if (!pkt_dev->proc_ent) { - printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname); - if (pkt_dev->flows) - vfree(pkt_dev->flows); - kfree(pkt_dev); - return -EINVAL; - } - pkt_dev->proc_ent->read_proc = proc_if_read; - pkt_dev->proc_ent->write_proc = proc_if_write; - pkt_dev->proc_ent->data = (void*)(pkt_dev); - pkt_dev->proc_ent->owner = THIS_MODULE; + pkt_dev->min_pkt_size = ETH_ZLEN; + pkt_dev->max_pkt_size = ETH_ZLEN; + pkt_dev->nfrags = 0; + pkt_dev->clone_skb = pg_clone_skb_d; + pkt_dev->delay_us = pg_delay_d / 1000; + pkt_dev->delay_ns = pg_delay_d % 1000; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + + strncpy(pkt_dev->ifname, ifname, IFNAMSIZ); + + if (! pktgen_setup_dev(pkt_dev)) { + printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -ENODEV; + } + + pe = create_proc_entry(ifname, 0600, pg_proc_dir); + if (!pe) { + printk("pktgen: cannot create %s/%s procfs entry.\n", + PG_PROC_DIR, ifname); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -EINVAL; + } + pe->proc_fops = &pktgen_if_fops; + pe->data = pkt_dev; - return add_dev_to_thread(t, pkt_dev); - } - else { - printk("pktgen: ERROR: interface already used.\n"); - return -EBUSY; - } + return add_dev_to_thread(t, pkt_dev); } static struct pktgen_thread *pktgen_find_thread(const char* name) { struct pktgen_thread *t = NULL; - thread_lock(); + thread_lock(); t = pktgen_threads; while (t) { @@ -2947,6 +2903,7 @@ static struct pktgen_thread *pktgen_find_thread(const char* name) static int pktgen_create_thread(const char* name, int cpu) { struct pktgen_thread *t = NULL; + struct proc_dir_entry *pe; if (strlen(name) > 31) { printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n"); @@ -2958,28 +2915,26 @@ static int pktgen_create_thread(const char* name, int cpu) return -EINVAL; } - t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL)); + t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL); if (!t) { printk("pktgen: ERROR: out of memory, can't create new thread.\n"); return -ENOMEM; } - memset(t, 0, sizeof(struct pktgen_thread)); strcpy(t->name, name); spin_lock_init(&t->if_lock); t->cpu = cpu; - sprintf(t->fname, "%s/%s", PG_PROC_DIR, t->name); - t->proc_ent = create_proc_entry(t->fname, 0600, NULL); - if (!t->proc_ent) { - printk("pktgen: cannot create %s procfs entry.\n", t->fname); + pe = create_proc_entry(t->name, 0600, pg_proc_dir); + if (!pe) { + printk("pktgen: cannot create %s/%s procfs entry.\n", + PG_PROC_DIR, t->name); kfree(t); return -EINVAL; } - t->proc_ent->read_proc = proc_thread_read; - t->proc_ent->write_proc = proc_thread_write; - t->proc_ent->data = (void*)(t); - t->proc_ent->owner = THIS_MODULE; + + pe->proc_fops = &pktgen_thread_fops; + pe->data = t; t->next = pktgen_threads; pktgen_threads = t; @@ -3034,8 +2989,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_ /* Clean up proc file system */ - if (strlen(pkt_dev->fname)) - remove_proc_entry(pkt_dev->fname, NULL); + remove_proc_entry(pkt_dev->ifname, pg_proc_dir); if (pkt_dev->flows) vfree(pkt_dev->flows); @@ -3046,31 +3000,31 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_ static int __init pg_init(void) { int cpu; - printk(version); + struct proc_dir_entry *pe; - module_fname[0] = 0; + printk(version); - create_proc_dir(); + pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net); + if (!pg_proc_dir) + return -ENODEV; + pg_proc_dir->owner = THIS_MODULE; - sprintf(module_fname, "%s/pgctrl", PG_PROC_DIR); - module_proc_ent = create_proc_entry(module_fname, 0600, NULL); - if (!module_proc_ent) { - printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname); + pe = create_proc_entry(PGCTRL, 0600, pg_proc_dir); + if (pe == NULL) { + printk("pktgen: ERROR: cannot create %s procfs entry.\n", PGCTRL); + proc_net_remove(PG_PROC_DIR); return -EINVAL; } - module_proc_ent->proc_fops = &pktgen_fops; - module_proc_ent->data = NULL; + pe->proc_fops = &pktgen_fops; + pe->data = NULL; /* Register us to receive netdevice events */ register_netdevice_notifier(&pktgen_notifier_block); - for (cpu = 0; cpu < NR_CPUS ; cpu++) { + for_each_online_cpu(cpu) { char buf[30]; - if (!cpu_online(cpu)) - continue; - sprintf(buf, "kpktgend_%i", cpu); pktgen_create_thread(buf, cpu); } @@ -3095,10 +3049,8 @@ static void __exit pg_cleanup(void) unregister_netdevice_notifier(&pktgen_notifier_block); /* Clean up proc file system */ - - remove_proc_entry(module_fname, NULL); - - remove_proc_dir(); + remove_proc_entry(PGCTRL, pg_proc_dir); + proc_net_remove(PG_PROC_DIR); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9bed7569ce3..8700379685e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -49,6 +49,7 @@ #include <net/udp.h> #include <net/sock.h> #include <net/pkt_sched.h> +#include <net/netlink.h> DECLARE_MUTEX(rtnl_sem); @@ -462,11 +463,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); } -static int rtnetlink_done(struct netlink_callback *cb) -{ - return 0; -} - /* Protected by RTNL sempahore. */ static struct rtattr **rta_buf; static int rtattr_max; @@ -524,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) } if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { - u32 rlen; - if (link->dumpit == NULL) link = &(rtnetlink_links[PF_UNSPEC][type]); @@ -533,14 +527,11 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) goto err_inval; if ((*errp = netlink_dump_start(rtnl, skb, nlh, - link->dumpit, - rtnetlink_done)) != 0) { + link->dumpit, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -579,75 +570,13 @@ err_inval: return -1; } -/* - * Process one packet of messages. - * Malformed skbs with wrong lengths of messages are discarded silently. - */ - -static inline int rtnetlink_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (rtnetlink_rcv_msg(skb, nlh, &err)) { - /* Not error, but we must interrupt processing here: - * Note, that in this case we do not pull message - * from skb, it will be processed later. - */ - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags&NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - -/* - * rtnetlink input queue processing routine: - * - process as much as there was in the queue upon entry. - * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, - * that will occur, when a dump started. - */ - static void rtnetlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - rtnl_lock(); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (rtnetlink_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg); up(&rtnl_sem); netdev_run_todo(); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 02cd4cde211..b7d13a4fff4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -122,6 +122,8 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask + * @fclone: allocate from fclone cache instead of head cache + * and allocate a cloned (child) skb * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of size bytes. The object has a reference count of one. @@ -174,6 +176,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb_shinfo(skb)->tso_size = 0; skb_shinfo(skb)->tso_segs = 0; skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->ufo_size = 0; + skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: @@ -332,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb) } #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif @@ -410,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); #endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER C(nf_bridge); nf_bridge_get(skb->nf_bridge); @@ -470,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + new->nfct_reasm = old->nfct_reasm; + nf_conntrack_get_reasm(old->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif @@ -1694,6 +1713,78 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, return textsearch_find(config, state); } +/** + * skb_append_datato_frags: - append the user data to a skb + * @sk: sock structure + * @skb: skb structure to be appened with user data. + * @getfrag: call back function to be used for getting the user data + * @from: pointer to user message iov + * @length: length of the iov message + * + * Description: This procedure append the user data in the fragment part + * of the skb if any page alloc fails user this procedure returns -ENOMEM + */ +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length) +{ + int frg_cnt = 0; + skb_frag_t *frag = NULL; + struct page *page = NULL; + int copy, left; + int offset = 0; + int ret; + + do { + /* Return error if we don't have space for new frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + if (frg_cnt >= MAX_SKB_FRAGS) + return -EFAULT; + + /* allocate a new page for next frag */ + page = alloc_pages(sk->sk_allocation, 0); + + /* If alloc_page fails just return failure and caller will + * free previous allocated pages by doing kfree_skb() + */ + if (page == NULL) + return -ENOMEM; + + /* initialize the next frag */ + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + skb_fill_page_desc(skb, frg_cnt, page, 0, 0); + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + + /* get the new initialized frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; + + /* copy the user data to page */ + left = PAGE_SIZE - frag->page_offset; + copy = (length > left)? left : length; + + ret = getfrag(from, (page_address(frag->page) + + frag->page_offset + frag->size), + offset, copy, 0, skb); + if (ret < 0) + return -EFAULT; + + /* copy was successful so update the size parameters */ + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + offset += copy; + length -= copy; + + } while (length > 0); + + return 0; +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1745,3 +1836,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(skb_append_datato_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 1c52fe809ed..13cc3be4f05 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -940,7 +940,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, int noblock, int *errcode) { struct sk_buff *skb; - unsigned int gfp_mask; + gfp_t gfp_mask; long timeo; int err; @@ -1242,8 +1242,7 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - if (sk->sk_protinfo) - kfree(sk->sk_protinfo); + kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) diff --git a/net/core/stream.c b/net/core/stream.c index ac9edfdf874..15bfd03e802 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { struct task_struct *tsk = current; DEFINE_WAIT(wait); + int done; - while (1) { + do { if (sk->sk_err) return sock_error(sk); if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) @@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; - if (sk_wait_event(sk, timeo_p, - !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) - break; + done = sk_wait_event(sk, timeo_p, + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); sk->sk_write_pending--; - } + } while (!done); return 0; } diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5871c027f9d..f97b85d55ad 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -118,7 +118,6 @@ DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); #define DCCP_ADD_STATS_USER(field, val) \ SNMP_ADD_STATS_USER(dccp_statistics, field, val) -extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); extern int dccp_send_response(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6298cf58ff9..ca03521112c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; EXPORT_SYMBOL_GPL(dccp_hashinfo); @@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk) int ret; if (snum == 0) { - int rover; int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; + int rover = net_random() % (high - low) + low; struct hlist_node *node; struct inet_timewait_sock *tw = NULL; local_bh_disable(); - - /* TODO. Actually it is not so bad idea to remove - * dccp_hashinfo.portalloc_lock before next submission to - * Linus. - * As soon as we touch this place at all it is time to think. - * - * Now it protects single _advisory_ variable - * dccp_hashinfo.port_rover, hence it is mostly useless. - * Code will work nicely if we just delete it, but - * I am afraid in contented case it will work not better or - * even worse: another cpu just will hit the same bucket - * and spin there. - * So some cpu salt could remove both contention and - * memory pingpong. Any ideas how to do this in a nice way? - */ - spin_lock(&dccp_hashinfo.portalloc_lock); - rover = dccp_hashinfo.port_rover; - do { - rover++; - if ((rover < low) || (rover > high)) - rover = low; head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; spin_lock(&head->lock); @@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk) next_port: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); local_bh_enable(); @@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk) ok: /* All locks still held and bhs disabled */ - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); - inet_bind_hash(sk, tb, rover); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(rover); @@ -1289,10 +1263,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(&dccp_hashinfo, sk); - if (dp->dccps_service_list != NULL) { - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - } + kfree(dp->dccps_service_list); + dp->dccps_service_list = NULL; ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index 29250749f16..74ff8702587 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -12,6 +12,7 @@ #include <linux/config.h> #include <linux/dccp.h> +#include <linux/kernel.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -25,13 +26,20 @@ static inline void dccp_event_ack_sent(struct sock *sk) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } +static inline void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +{ + skb_set_owner_w(skb, sk); + WARN_ON(sk->sk_send_head); + sk->sk_send_head = skb; +} + /* * All SKB's seen here are completely headerless. It is our * job to build the DCCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. */ -int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { const struct inet_sock *inet = inet_sk(sk); @@ -50,10 +58,21 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) switch (dcb->dccpd_type) { case DCCP_PKT_DATA: set_ack = 0; + /* fall through */ + case DCCP_PKT_DATAACK: break; + case DCCP_PKT_SYNC: case DCCP_PKT_SYNCACK: ackno = dcb->dccpd_seq; + /* fall through */ + default: + /* + * Only data packets should come through with skb->sk + * set. + */ + WARN_ON(skb->sk); + skb_set_owner_w(skb, sk); break; } @@ -63,9 +82,6 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) skb->h.raw = skb_push(skb, dccp_header_size); dh = dccp_hdr(skb); - if (!skb->sk) - skb_set_owner_w(skb, sk); - /* Build DCCP header and checksum it. */ memset(dh, 0, dccp_header_size); dh->dccph_type = dcb->dccpd_type; @@ -393,10 +409,8 @@ int dccp_connect(struct sock *sk) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; skb->csum = 0; - skb_set_owner_w(skb, sk); - BUG_TRAP(sk->sk_send_head == NULL); - sk->sk_send_head = skb; + dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); @@ -425,7 +439,6 @@ void dccp_send_ack(struct sock *sk) skb_reserve(skb, MAX_DCCP_HEADER); skb->csum = 0; DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - skb_set_owner_w(skb, sk); dccp_transmit_skb(sk, skb); } } @@ -482,7 +495,6 @@ void dccp_send_sync(struct sock *sk, const u64 seq, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_seq = seq; - skb_set_owner_w(skb, sk); dccp_transmit_skb(sk, skb); } @@ -495,7 +507,7 @@ void dccp_send_close(struct sock *sk, const int active) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC; + const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; skb = alloc_skb(sk->sk_prot->max_header, prio); if (skb == NULL) @@ -507,10 +519,8 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; - skb_set_owner_w(skb, sk); if (active) { - BUG_TRAP(sk->sk_send_head == NULL); - sk->sk_send_head = skb; + dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); } else dccp_transmit_skb(sk, skb); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index a021c3422f6..e0ace7cbb99 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -238,8 +238,7 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service, lock_sock(sk); dp->dccps_service = service; - if (dp->dccps_service_list != NULL) - kfree(dp->dccps_service_list); + kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 1186dc44cdf..f89e55f814d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -719,22 +719,9 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (saddr->sdn_flags & ~SDF_WILD) return -EINVAL; -#if 1 if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || (saddr->sdn_flags & SDF_WILD))) return -EACCES; -#else - /* - * Maybe put the default actions in the default security ops for - * dn_prot_sock ? Would be nice if the capable call would go there - * too. - */ - if (security_dn_prot_sock(saddr) && - !capable(CAP_NET_BIND_SERVICE) || - saddr->sdn_objnum || (saddr->sdn_flags & SDF_WILD)) - return -EACCES; -#endif - if (!(saddr->sdn_flags & SDF_WILD)) { if (dn_ntohs(saddr->sdn_nodeaddrl)) { @@ -1677,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - if (sk->sk_shutdown & RCV_SHUTDOWN) { - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - rv = -EPIPE; + rv = 0; goto out; } + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { rv = -EOPNOTSUPP; goto out; @@ -1941,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); goto out_err; } diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index eeba56f9932..6f8b5658cb4 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -784,16 +784,14 @@ struct dn_fib_table *dn_fib_get_table(int n, int create) static void dn_fib_del_tree(int n) { - struct dn_fib_table *t; + struct dn_fib_table *t; - write_lock(&dn_fib_tables_lock); - t = dn_fib_tables[n]; - dn_fib_tables[n] = NULL; - write_unlock(&dn_fib_tables_lock); + write_lock(&dn_fib_tables_lock); + t = dn_fib_tables[n]; + dn_fib_tables[n] = NULL; + write_unlock(&dn_fib_tables_lock); - if (t) { - kfree(t); - } + kfree(t); } struct dn_fib_table *dn_fib_empty_table(void) diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 68a5ca86644..e2457736727 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -146,19 +146,6 @@ int eth_rebuild_header(struct sk_buff *skb) return 0; } -static inline unsigned int compare_eth_addr(const unsigned char *__a, const unsigned char *__b) -{ - const unsigned short *dest = (unsigned short *) __a; - const unsigned short *devaddr = (unsigned short *) __b; - unsigned int res; - - BUILD_BUG_ON(ETH_ALEN != 6); - res = ((dest[0] ^ devaddr[0]) | - (dest[1] ^ devaddr[1]) | - (dest[2] ^ devaddr[2])) != 0; - - return res; -} /* * Determine the packet's protocol ID. The rule here is that we @@ -176,7 +163,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); if (*eth->h_dest&1) { - if (!compare_eth_addr(eth->h_dest, dev->broadcast)) + if (!compare_ether_addr(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; @@ -191,7 +178,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) */ else if(1 /*dev->flags&IFF_PROMISC*/) { - if (unlikely(compare_eth_addr(eth->h_dest, dev->dev_addr))) + if (unlikely(compare_ether_addr(eth->h_dest, dev->dev_addr))) skb->pkt_type = PACKET_OTHERHOST; } diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c index 98a494be603..9d57b4fb644 100644 --- a/net/ethernet/pe2.c +++ b/net/ethernet/pe2.c @@ -32,8 +32,7 @@ struct datalink_proto *make_EII_client(void) void destroy_EII_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_EII_client); diff --git a/net/ieee80211/Makefile b/net/ieee80211/Makefile index a6ccac5baea..f988417121d 100644 --- a/net/ieee80211/Makefile +++ b/net/ieee80211/Makefile @@ -7,5 +7,6 @@ ieee80211-objs := \ ieee80211_module.o \ ieee80211_tx.o \ ieee80211_rx.o \ - ieee80211_wx.o + ieee80211_wx.o \ + ieee80211_geo.o diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c index 61a9d92e455..ecc9bb196ab 100644 --- a/net/ieee80211/ieee80211_crypt.c +++ b/net/ieee80211/ieee80211_crypt.c @@ -11,16 +11,14 @@ * */ -#include <linux/config.h> -#include <linux/version.h> +#include <linux/errno.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> -#include <asm/string.h> -#include <asm/errno.h> - +#include <linux/string.h> #include <net/ieee80211.h> + MODULE_AUTHOR("Jouni Malinen"); MODULE_DESCRIPTION("HostAP crypto"); MODULE_LICENSE("GPL"); @@ -30,26 +28,20 @@ struct ieee80211_crypto_alg { struct ieee80211_crypto_ops *ops; }; -struct ieee80211_crypto { - struct list_head algs; - spinlock_t lock; -}; - -static struct ieee80211_crypto *hcrypt; +static LIST_HEAD(ieee80211_crypto_algs); +static DEFINE_SPINLOCK(ieee80211_crypto_lock); void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) { - struct list_head *ptr, *n; - struct ieee80211_crypt_data *entry; - - for (ptr = ieee->crypt_deinit_list.next, n = ptr->next; - ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) { - entry = list_entry(ptr, struct ieee80211_crypt_data, list); + struct ieee80211_crypt_data *entry, *next; + unsigned long flags; + spin_lock_irqsave(&ieee->lock, flags); + list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) { if (atomic_read(&entry->refcnt) != 0 && !force) continue; - list_del(ptr); + list_del(&entry->list); if (entry->ops) { entry->ops->deinit(entry->priv); @@ -57,6 +49,17 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) } kfree(entry); } + spin_unlock_irqrestore(&ieee->lock, flags); +} + +/* After this, crypt_deinit_list won't accept new members */ +void ieee80211_crypt_quiescing(struct ieee80211_device *ieee) +{ + unsigned long flags; + + spin_lock_irqsave(&ieee->lock, flags); + ieee->crypt_quiesced = 1; + spin_unlock_irqrestore(&ieee->lock, flags); } void ieee80211_crypt_deinit_handler(unsigned long data) @@ -64,16 +67,16 @@ void ieee80211_crypt_deinit_handler(unsigned long data) struct ieee80211_device *ieee = (struct ieee80211_device *)data; unsigned long flags; - spin_lock_irqsave(&ieee->lock, flags); ieee80211_crypt_deinit_entries(ieee, 0); - if (!list_empty(&ieee->crypt_deinit_list)) { + + spin_lock_irqsave(&ieee->lock, flags); + if (!list_empty(&ieee->crypt_deinit_list) && !ieee->crypt_quiesced) { printk(KERN_DEBUG "%s: entries remaining in delayed crypt " "deletion list\n", ieee->dev->name); ieee->crypt_deinit_timer.expires = jiffies + HZ; add_timer(&ieee->crypt_deinit_timer); } spin_unlock_irqrestore(&ieee->lock, flags); - } void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee, @@ -93,10 +96,12 @@ void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee, * locking. */ spin_lock_irqsave(&ieee->lock, flags); - list_add(&tmp->list, &ieee->crypt_deinit_list); - if (!timer_pending(&ieee->crypt_deinit_timer)) { - ieee->crypt_deinit_timer.expires = jiffies + HZ; - add_timer(&ieee->crypt_deinit_timer); + if (!ieee->crypt_quiesced) { + list_add(&tmp->list, &ieee->crypt_deinit_list); + if (!timer_pending(&ieee->crypt_deinit_timer)) { + ieee->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&ieee->crypt_deinit_timer); + } } spin_unlock_irqrestore(&ieee->lock, flags); } @@ -106,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) unsigned long flags; struct ieee80211_crypto_alg *alg; - if (hcrypt == NULL) - return -1; - alg = kmalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) return -ENOMEM; @@ -116,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) memset(alg, 0, sizeof(*alg)); alg->ops = ops; - spin_lock_irqsave(&hcrypt->lock, flags); - list_add(&alg->list, &hcrypt->algs); - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_add(&alg->list, &ieee80211_crypto_algs); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n", ops->name); @@ -128,127 +130,75 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *del_alg = NULL; - - if (hcrypt == NULL) - return -1; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (alg->ops == ops) { - list_del(&alg->list); - del_alg = alg; - break; - } - } - spin_unlock_irqrestore(&hcrypt->lock, flags); - if (del_alg) { - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s'\n", ops->name); - kfree(del_alg); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (alg->ops == ops) + goto found; } - - return del_alg ? 0 : -1; + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " + "'%s'\n", ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + kfree(alg); + return 0; } struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *found_alg = NULL; - - if (hcrypt == NULL) - return NULL; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (strcmp(alg->ops->name, name) == 0) { - found_alg = alg; - break; - } + + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; } - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return NULL; - if (found_alg) - return found_alg->ops; - else - return NULL; + found: + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return alg->ops; } static void *ieee80211_crypt_null_init(int keyidx) { return (void *)1; } + static void ieee80211_crypt_null_deinit(void *priv) { } static struct ieee80211_crypto_ops ieee80211_crypt_null = { - .name = "NULL", - .init = ieee80211_crypt_null_init, - .deinit = ieee80211_crypt_null_deinit, - .encrypt_mpdu = NULL, - .decrypt_mpdu = NULL, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = NULL, - .get_key = NULL, - .extra_prefix_len = 0, - .extra_postfix_len = 0, - .owner = THIS_MODULE, + .name = "NULL", + .init = ieee80211_crypt_null_init, + .deinit = ieee80211_crypt_null_deinit, + .owner = THIS_MODULE, }; static int __init ieee80211_crypto_init(void) { - int ret = -ENOMEM; - - hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL); - if (!hcrypt) - goto out; - - memset(hcrypt, 0, sizeof(*hcrypt)); - INIT_LIST_HEAD(&hcrypt->algs); - spin_lock_init(&hcrypt->lock); - - ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null); - if (ret < 0) { - kfree(hcrypt); - hcrypt = NULL; - } - out: - return ret; + return ieee80211_register_crypto_ops(&ieee80211_crypt_null); } static void __exit ieee80211_crypto_deinit(void) { - struct list_head *ptr, *n; - - if (hcrypt == NULL) - return; - - for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs; - ptr = n, n = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - list_del(ptr); - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s' (deinit)\n", alg->ops->name); - kfree(alg); - } - - kfree(hcrypt); + ieee80211_unregister_crypto_ops(&ieee80211_crypt_null); + BUG_ON(!list_empty(&ieee80211_crypto_algs)); } EXPORT_SYMBOL(ieee80211_crypt_deinit_entries); EXPORT_SYMBOL(ieee80211_crypt_deinit_handler); EXPORT_SYMBOL(ieee80211_crypt_delayed_deinit); +EXPORT_SYMBOL(ieee80211_crypt_quiescing); EXPORT_SYMBOL(ieee80211_register_crypto_ops); EXPORT_SYMBOL(ieee80211_unregister_crypto_ops); diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c index 8fc13f45971..47022172850 100644 --- a/net/ieee80211/ieee80211_crypt_ccmp.c +++ b/net/ieee80211/ieee80211_crypt_ccmp.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -119,7 +118,7 @@ static inline void xor_block(u8 * b, u8 * a, size_t len) } static void ccmp_init_blocks(struct crypto_tfm *tfm, - struct ieee80211_hdr *hdr, + struct ieee80211_hdr_4addr *hdr, u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0) { u8 *pos, qc = 0; @@ -191,26 +190,18 @@ static void ccmp_init_blocks(struct crypto_tfm *tfm, ieee80211_ccmp_aes_encrypt(tfm, b0, s0); } -static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +static int ieee80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, void *priv) { struct ieee80211_ccmp_data *key = priv; - int data_len, i, blocks, last, len; - u8 *pos, *mic; - struct ieee80211_hdr *hdr; - u8 *b0 = key->tx_b0; - u8 *b = key->tx_b; - u8 *e = key->tx_e; - u8 *s0 = key->tx_s0; + int i; + u8 *pos; - if (skb_headroom(skb) < CCMP_HDR_LEN || - skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) + if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) return -1; - data_len = skb->len - hdr_len; pos = skb_push(skb, CCMP_HDR_LEN); memmove(pos, pos + CCMP_HDR_LEN, hdr_len); pos += hdr_len; - mic = skb_put(skb, CCMP_MIC_LEN); i = CCMP_PN_LEN - 1; while (i >= 0) { @@ -229,7 +220,31 @@ static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) *pos++ = key->tx_pn[1]; *pos++ = key->tx_pn[0]; - hdr = (struct ieee80211_hdr *)skb->data; + return CCMP_HDR_LEN; +} + +static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct ieee80211_ccmp_data *key = priv; + int data_len, i, blocks, last, len; + u8 *pos, *mic; + struct ieee80211_hdr_4addr *hdr; + u8 *b0 = key->tx_b0; + u8 *b = key->tx_b; + u8 *e = key->tx_e; + u8 *s0 = key->tx_s0; + + if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) + return -1; + + data_len = skb->len - hdr_len; + len = ieee80211_ccmp_hdr(skb, hdr_len, priv); + if (len < 0) + return -1; + + pos = skb->data + hdr_len + CCMP_HDR_LEN; + mic = skb_put(skb, CCMP_MIC_LEN); + hdr = (struct ieee80211_hdr_4addr *)skb->data; ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0); blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN; @@ -258,7 +273,7 @@ static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) { struct ieee80211_ccmp_data *key = priv; u8 keyidx, *pos; - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_4addr *hdr; u8 *b0 = key->rx_b0; u8 *b = key->rx_b; u8 *a = key->rx_a; @@ -272,7 +287,7 @@ static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) return -1; } - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_4addr *)skb->data; pos = skb->data + hdr_len; keyidx = pos[3]; if (!(keyidx & (1 << 5))) { @@ -426,19 +441,20 @@ static char *ieee80211_ccmp_print_stats(char *p, void *priv) } static struct ieee80211_crypto_ops ieee80211_crypt_ccmp = { - .name = "CCMP", - .init = ieee80211_ccmp_init, - .deinit = ieee80211_ccmp_deinit, - .encrypt_mpdu = ieee80211_ccmp_encrypt, - .decrypt_mpdu = ieee80211_ccmp_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = ieee80211_ccmp_set_key, - .get_key = ieee80211_ccmp_get_key, - .print_stats = ieee80211_ccmp_print_stats, - .extra_prefix_len = CCMP_HDR_LEN, - .extra_postfix_len = CCMP_MIC_LEN, - .owner = THIS_MODULE, + .name = "CCMP", + .init = ieee80211_ccmp_init, + .deinit = ieee80211_ccmp_deinit, + .build_iv = ieee80211_ccmp_hdr, + .encrypt_mpdu = ieee80211_ccmp_encrypt, + .decrypt_mpdu = ieee80211_ccmp_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = ieee80211_ccmp_set_key, + .get_key = ieee80211_ccmp_get_key, + .print_stats = ieee80211_ccmp_print_stats, + .extra_mpdu_prefix_len = CCMP_HDR_LEN, + .extra_mpdu_postfix_len = CCMP_MIC_LEN, + .owner = THIS_MODULE, }; static int __init ieee80211_crypto_ccmp_init(void) diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c index d4f9164be1a..e0988320efb 100644 --- a/net/ieee80211/ieee80211_crypt_tkip.c +++ b/net/ieee80211/ieee80211_crypt_tkip.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -59,8 +58,24 @@ struct ieee80211_tkip_data { /* scratch buffers for virt_to_page() (crypto API) */ u8 rx_hdr[16], tx_hdr[16]; + + unsigned long flags; }; +static unsigned long ieee80211_tkip_set_flags(unsigned long flags, void *priv) +{ + struct ieee80211_tkip_data *_priv = priv; + unsigned long old_flags = _priv->flags; + _priv->flags = flags; + return old_flags; +} + +static unsigned long ieee80211_tkip_get_flags(void *priv) +{ + struct ieee80211_tkip_data *_priv = priv; + return _priv->flags; +} + static void *ieee80211_tkip_init(int key_idx) { struct ieee80211_tkip_data *priv; @@ -69,6 +84,7 @@ static void *ieee80211_tkip_init(int key_idx) if (priv == NULL) goto fail; memset(priv, 0, sizeof(*priv)); + priv->key_idx = key_idx; priv->tfm_arc4 = crypto_alloc_tfm("arc4", 0); @@ -255,25 +271,27 @@ static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, #endif } -static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +static u8 *ieee80211_tkip_hdr(struct sk_buff *skb, int hdr_len, void *priv) { struct ieee80211_tkip_data *tkey = priv; int len; - u8 rc4key[16], *pos, *icv; - struct ieee80211_hdr *hdr; + u8 *rc4key, *pos, *icv; + struct ieee80211_hdr_4addr *hdr; u32 crc; - struct scatterlist sg; - if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 4 || - skb->len < hdr_len) - return -1; + hdr = (struct ieee80211_hdr_4addr *)skb->data; + + if (skb_headroom(skb) < 8 || skb->len < hdr_len) + return NULL; - hdr = (struct ieee80211_hdr *)skb->data; if (!tkey->tx_phase1_done) { tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, tkey->tx_iv32); tkey->tx_phase1_done = 1; } + rc4key = kmalloc(16, GFP_ATOMIC); + if (!rc4key) + return NULL; tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); len = skb->len - hdr_len; @@ -282,9 +300,9 @@ static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) pos += hdr_len; icv = skb_put(skb, 4); - *pos++ = rc4key[0]; - *pos++ = rc4key[1]; - *pos++ = rc4key[2]; + *pos++ = *rc4key; + *pos++ = *(rc4key + 1); + *pos++ = *(rc4key + 2); *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; *pos++ = tkey->tx_iv32 & 0xff; *pos++ = (tkey->tx_iv32 >> 8) & 0xff; @@ -297,6 +315,38 @@ static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) icv[2] = crc >> 16; icv[3] = crc >> 24; + return rc4key; +} + +static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct ieee80211_tkip_data *tkey = priv; + int len; + const u8 *rc4key; + u8 *pos; + struct scatterlist sg; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + if (net_ratelimit()) { + struct ieee80211_hdr_4addr *hdr = + (struct ieee80211_hdr_4addr *)skb->data; + printk(KERN_DEBUG "TKIP countermeasures: dropped " + "TX packet to " MAC_FMT "\n", + MAC_ARG(hdr->addr1)); + } + return -1; + } + + if (skb_tailroom(skb) < 4 || skb->len < hdr_len) + return -1; + + len = skb->len - hdr_len; + pos = skb->data + hdr_len; + + rc4key = ieee80211_tkip_hdr(skb, hdr_len, priv); + if (!rc4key) + return -1; + crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16); sg.page = virt_to_page(pos); sg.offset = offset_in_page(pos); @@ -319,16 +369,26 @@ static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) u8 keyidx, *pos; u32 iv32; u16 iv16; - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_4addr *hdr; u8 icv[4]; u32 crc; struct scatterlist sg; int plen; + hdr = (struct ieee80211_hdr_4addr *)skb->data; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + if (net_ratelimit()) { + printk(KERN_DEBUG "TKIP countermeasures: dropped " + "received packet from " MAC_FMT "\n", + MAC_ARG(hdr->addr2)); + } + return -1; + } + if (skb->len < hdr_len + 8 + 4) return -1; - hdr = (struct ieee80211_hdr *)skb->data; pos = skb->data + hdr_len; keyidx = pos[3]; if (!(keyidx & (1 << 5))) { @@ -441,9 +501,9 @@ static int michael_mic(struct ieee80211_tkip_data *tkey, u8 * key, u8 * hdr, static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) { - struct ieee80211_hdr *hdr11; + struct ieee80211_hdr_4addr *hdr11; - hdr11 = (struct ieee80211_hdr *)skb->data; + hdr11 = (struct ieee80211_hdr_4addr *)skb->data; switch (le16_to_cpu(hdr11->frame_ctl) & (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { case IEEE80211_FCTL_TODS: @@ -490,9 +550,9 @@ static int ieee80211_michael_mic_add(struct sk_buff *skb, int hdr_len, return 0; } -#if WIRELESS_EXT >= 18 static void ieee80211_michael_mic_failure(struct net_device *dev, - struct ieee80211_hdr *hdr, int keyidx) + struct ieee80211_hdr_4addr *hdr, + int keyidx) { union iwreq_data wrqu; struct iw_michaelmicfailure ev; @@ -510,28 +570,6 @@ static void ieee80211_michael_mic_failure(struct net_device *dev, wrqu.data.length = sizeof(ev); wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); } -#elif WIRELESS_EXT >= 15 -static void ieee80211_michael_mic_failure(struct net_device *dev, - struct ieee80211_hdr *hdr, int keyidx) -{ - union iwreq_data wrqu; - char buf[128]; - - /* TODO: needed parameters: count, keyid, key type, TSC */ - sprintf(buf, "MLME-MICHAELMICFAILURE.indication(keyid=%d %scast addr=" - MAC_FMT ")", keyidx, hdr->addr1[0] & 0x01 ? "broad" : "uni", - MAC_ARG(hdr->addr2)); - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = strlen(buf); - wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); -} -#else /* WIRELESS_EXT >= 15 */ -static inline void ieee80211_michael_mic_failure(struct net_device *dev, - struct ieee80211_hdr *hdr, - int keyidx) -{ -} -#endif /* WIRELESS_EXT >= 15 */ static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx, int hdr_len, void *priv) @@ -547,8 +585,8 @@ static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx, skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) return -1; if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { - struct ieee80211_hdr *hdr; - hdr = (struct ieee80211_hdr *)skb->data; + struct ieee80211_hdr_4addr *hdr; + hdr = (struct ieee80211_hdr_4addr *)skb->data; printk(KERN_DEBUG "%s: Michael MIC verification failed for " "MSDU from " MAC_FMT " keyidx=%d\n", skb->dev ? skb->dev->name : "N/A", MAC_ARG(hdr->addr2), @@ -654,19 +692,22 @@ static char *ieee80211_tkip_print_stats(char *p, void *priv) } static struct ieee80211_crypto_ops ieee80211_crypt_tkip = { - .name = "TKIP", - .init = ieee80211_tkip_init, - .deinit = ieee80211_tkip_deinit, - .encrypt_mpdu = ieee80211_tkip_encrypt, - .decrypt_mpdu = ieee80211_tkip_decrypt, - .encrypt_msdu = ieee80211_michael_mic_add, - .decrypt_msdu = ieee80211_michael_mic_verify, - .set_key = ieee80211_tkip_set_key, - .get_key = ieee80211_tkip_get_key, - .print_stats = ieee80211_tkip_print_stats, - .extra_prefix_len = 4 + 4, /* IV + ExtIV */ - .extra_postfix_len = 8 + 4, /* MIC + ICV */ - .owner = THIS_MODULE, + .name = "TKIP", + .init = ieee80211_tkip_init, + .deinit = ieee80211_tkip_deinit, + .encrypt_mpdu = ieee80211_tkip_encrypt, + .decrypt_mpdu = ieee80211_tkip_decrypt, + .encrypt_msdu = ieee80211_michael_mic_add, + .decrypt_msdu = ieee80211_michael_mic_verify, + .set_key = ieee80211_tkip_set_key, + .get_key = ieee80211_tkip_get_key, + .print_stats = ieee80211_tkip_print_stats, + .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .extra_msdu_postfix_len = 8, /* MIC */ + .get_flags = ieee80211_tkip_get_flags, + .set_flags = ieee80211_tkip_set_flags, + .owner = THIS_MODULE, }; static int __init ieee80211_crypto_tkip_init(void) diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c index b4d2514a090..073aebdf0f6 100644 --- a/net/ieee80211/ieee80211_crypt_wep.c +++ b/net/ieee80211/ieee80211_crypt_wep.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -229,19 +228,19 @@ static char *prism2_wep_print_stats(char *p, void *priv) } static struct ieee80211_crypto_ops ieee80211_crypt_wep = { - .name = "WEP", - .init = prism2_wep_init, - .deinit = prism2_wep_deinit, - .encrypt_mpdu = prism2_wep_encrypt, - .decrypt_mpdu = prism2_wep_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = prism2_wep_set_key, - .get_key = prism2_wep_get_key, - .print_stats = prism2_wep_print_stats, - .extra_prefix_len = 4, /* IV */ - .extra_postfix_len = 4, /* ICV */ - .owner = THIS_MODULE, + .name = "WEP", + .init = prism2_wep_init, + .deinit = prism2_wep_deinit, + .encrypt_mpdu = prism2_wep_encrypt, + .decrypt_mpdu = prism2_wep_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = prism2_wep_set_key, + .get_key = prism2_wep_get_key, + .print_stats = prism2_wep_print_stats, + .extra_mpdu_prefix_len = 4, /* IV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .owner = THIS_MODULE, }; static int __init ieee80211_crypto_wep_init(void) diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c new file mode 100644 index 00000000000..610cc5cbc25 --- /dev/null +++ b/net/ieee80211/ieee80211_geo.c @@ -0,0 +1,140 @@ +/****************************************************************************** + + Copyright(c) 2005 Intel Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify it + under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 59 + Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + The full GNU General Public License is included in this distribution in the + file called LICENSE. + + Contact Information: + James P. Ketrenos <ipw2100-admin@linux.intel.com> + Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + +******************************************************************************/ +#include <linux/compiler.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/if_arp.h> +#include <linux/in6.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/types.h> +#include <linux/wireless.h> +#include <linux/etherdevice.h> +#include <asm/uaccess.h> + +#include <net/ieee80211.h> + +int ieee80211_is_valid_channel(struct ieee80211_device *ieee, u8 channel) +{ + int i; + + /* Driver needs to initialize the geography map before using + * these helper functions */ + BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0); + + if (ieee->freq_band & IEEE80211_24GHZ_BAND) + for (i = 0; i < ieee->geo.bg_channels; i++) + /* NOTE: If G mode is currently supported but + * this is a B only channel, we don't see it + * as valid. */ + if ((ieee->geo.bg[i].channel == channel) && + (!(ieee->mode & IEEE_G) || + !(ieee->geo.bg[i].flags & IEEE80211_CH_B_ONLY))) + return IEEE80211_24GHZ_BAND; + + if (ieee->freq_band & IEEE80211_52GHZ_BAND) + for (i = 0; i < ieee->geo.a_channels; i++) + if (ieee->geo.a[i].channel == channel) + return IEEE80211_52GHZ_BAND; + + return 0; +} + +int ieee80211_channel_to_index(struct ieee80211_device *ieee, u8 channel) +{ + int i; + + /* Driver needs to initialize the geography map before using + * these helper functions */ + BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0); + + if (ieee->freq_band & IEEE80211_24GHZ_BAND) + for (i = 0; i < ieee->geo.bg_channels; i++) + if (ieee->geo.bg[i].channel == channel) + return i; + + if (ieee->freq_band & IEEE80211_52GHZ_BAND) + for (i = 0; i < ieee->geo.a_channels; i++) + if (ieee->geo.a[i].channel == channel) + return i; + + return -1; +} + +u8 ieee80211_freq_to_channel(struct ieee80211_device * ieee, u32 freq) +{ + int i; + + /* Driver needs to initialize the geography map before using + * these helper functions */ + BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0); + + freq /= 100000; + + if (ieee->freq_band & IEEE80211_24GHZ_BAND) + for (i = 0; i < ieee->geo.bg_channels; i++) + if (ieee->geo.bg[i].freq == freq) + return ieee->geo.bg[i].channel; + + if (ieee->freq_band & IEEE80211_52GHZ_BAND) + for (i = 0; i < ieee->geo.a_channels; i++) + if (ieee->geo.a[i].freq == freq) + return ieee->geo.a[i].channel; + + return 0; +} + +int ieee80211_set_geo(struct ieee80211_device *ieee, + const struct ieee80211_geo *geo) +{ + memcpy(ieee->geo.name, geo->name, 3); + ieee->geo.name[3] = '\0'; + ieee->geo.bg_channels = geo->bg_channels; + ieee->geo.a_channels = geo->a_channels; + memcpy(ieee->geo.bg, geo->bg, geo->bg_channels * + sizeof(struct ieee80211_channel)); + memcpy(ieee->geo.a, geo->a, ieee->geo.a_channels * + sizeof(struct ieee80211_channel)); + return 0; +} + +const struct ieee80211_geo *ieee80211_get_geo(struct ieee80211_device *ieee) +{ + return &ieee->geo; +} + +EXPORT_SYMBOL(ieee80211_is_valid_channel); +EXPORT_SYMBOL(ieee80211_freq_to_channel); +EXPORT_SYMBOL(ieee80211_channel_to_index); +EXPORT_SYMBOL(ieee80211_set_geo); +EXPORT_SYMBOL(ieee80211_get_geo); diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c index 6059e9e3712..321287bc887 100644 --- a/net/ieee80211/ieee80211_module.c +++ b/net/ieee80211/ieee80211_module.c @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(c) 2004 Intel Corporation. All rights reserved. + Copyright(c) 2004-2005 Intel Corporation. All rights reserved. Portions of this file are based on the WEP enablement code provided by the Host AP project hostap-drivers v0.1.3 @@ -45,7 +45,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> @@ -53,12 +52,15 @@ #include <net/ieee80211.h> -MODULE_DESCRIPTION("802.11 data/management/control stack"); -MODULE_AUTHOR - ("Copyright (C) 2004 Intel Corporation <jketreno@linux.intel.com>"); -MODULE_LICENSE("GPL"); +#define DRV_DESCRIPTION "802.11 data/management/control stack" +#define DRV_NAME "ieee80211" +#define DRV_VERSION IEEE80211_VERSION +#define DRV_COPYRIGHT "Copyright (C) 2004-2005 Intel Corporation <jketreno@linux.intel.com>" -#define DRV_NAME "ieee80211" +MODULE_VERSION(DRV_VERSION); +MODULE_DESCRIPTION(DRV_DESCRIPTION); +MODULE_AUTHOR(DRV_COPYRIGHT); +MODULE_LICENSE("GPL"); static inline int ieee80211_networks_allocate(struct ieee80211_device *ieee) { @@ -126,26 +128,34 @@ struct net_device *alloc_ieee80211(int sizeof_priv) /* Default fragmentation threshold is maximum payload size */ ieee->fts = DEFAULT_FTS; + ieee->rts = DEFAULT_FTS; ieee->scan_age = DEFAULT_MAX_SCAN_AGE; ieee->open_wep = 1; /* Default to enabling full open WEP with host based encrypt/decrypt */ ieee->host_encrypt = 1; ieee->host_decrypt = 1; + ieee->host_mc_decrypt = 1; + + /* Host fragementation in Open mode. Default is enabled. + * Note: host fragmentation is always enabled if host encryption + * is enabled. For cards can do hardware encryption, they must do + * hardware fragmentation as well. So we don't need a variable + * like host_enc_frag. */ + ieee->host_open_frag = 1; ieee->ieee802_1x = 1; /* Default to supporting 802.1x */ INIT_LIST_HEAD(&ieee->crypt_deinit_list); init_timer(&ieee->crypt_deinit_timer); ieee->crypt_deinit_timer.data = (unsigned long)ieee; ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler; + ieee->crypt_quiesced = 0; spin_lock_init(&ieee->lock); ieee->wpa_enabled = 0; - ieee->tkip_countermeasures = 0; ieee->drop_unencrypted = 0; ieee->privacy_invoked = 0; - ieee->ieee802_1x = 1; return dev; @@ -161,6 +171,7 @@ void free_ieee80211(struct net_device *dev) int i; + ieee80211_crypt_quiescing(ieee); del_timer_sync(&ieee->crypt_deinit_timer); ieee80211_crypt_deinit_entries(ieee, 1); @@ -195,38 +206,26 @@ static int show_debug_level(char *page, char **start, off_t offset, static int store_debug_level(struct file *file, const char __user * buffer, unsigned long count, void *data) { - char buf[] = "0x00000000"; - char *p = (char *)buf; + char buf[] = "0x00000000\n"; + unsigned long len = min((unsigned long)sizeof(buf) - 1, count); unsigned long val; - if (count > sizeof(buf) - 1) - count = sizeof(buf) - 1; - - if (copy_from_user(buf, buffer, count)) + if (copy_from_user(buf, buffer, len)) return count; - buf[count] = 0; - /* - * what a FPOS... What, sscanf(buf, "%i", &val) would be too - * scary? - */ - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - p++; - if (p[0] == 'x' || p[0] == 'X') - p++; - val = simple_strtoul(p, &p, 16); - } else - val = simple_strtoul(p, &p, 10); - if (p == buf) + buf[len] = 0; + if (sscanf(buf, "%li", &val) != 1) printk(KERN_INFO DRV_NAME ": %s is not in hex or decimal form.\n", buf); else ieee80211_debug_level = val; - return strlen(buf); + return strnlen(buf, len); } +#endif /* CONFIG_IEEE80211_DEBUG */ static int __init ieee80211_init(void) { +#ifdef CONFIG_IEEE80211_DEBUG struct proc_dir_entry *e; ieee80211_debug_level = debug; @@ -246,26 +245,33 @@ static int __init ieee80211_init(void) e->read_proc = show_debug_level; e->write_proc = store_debug_level; e->data = NULL; +#endif /* CONFIG_IEEE80211_DEBUG */ + + printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION ", " DRV_VERSION "\n"); + printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n"); return 0; } static void __exit ieee80211_exit(void) { +#ifdef CONFIG_IEEE80211_DEBUG if (ieee80211_proc) { remove_proc_entry("debug_level", ieee80211_proc); remove_proc_entry(DRV_NAME, proc_net); ieee80211_proc = NULL; } +#endif /* CONFIG_IEEE80211_DEBUG */ } +#ifdef CONFIG_IEEE80211_DEBUG #include <linux/moduleparam.h> module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "debug output mask"); +#endif /* CONFIG_IEEE80211_DEBUG */ module_exit(ieee80211_exit); module_init(ieee80211_init); -#endif const char *escape_essid(const char *essid, u8 essid_len) { diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index f7dcd854139..03efaacbdb7 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -5,7 +5,7 @@ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen * <jkmaline@cc.hut.fi> * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi> - * Copyright (c) 2004, Intel Corporation + * Copyright (c) 2004-2005, Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> @@ -87,7 +86,7 @@ static struct ieee80211_frag_entry *ieee80211_frag_cache_find(struct /* Called only as a tasklet (software IRQ) */ static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee, - struct ieee80211_hdr *hdr) + struct ieee80211_hdr_4addr *hdr) { struct sk_buff *skb = NULL; u16 sc; @@ -101,7 +100,7 @@ static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee, if (frag == 0) { /* Reserve enough space to fit maximum frame length */ skb = dev_alloc_skb(ieee->dev->mtu + - sizeof(struct ieee80211_hdr) + + sizeof(struct ieee80211_hdr_4addr) + 8 /* LLC */ + 2 /* alignment */ + 8 /* WEP */ + ETH_ALEN /* WDS */ ); @@ -138,7 +137,7 @@ static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee, /* Called only as a tasklet (software IRQ) */ static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee, - struct ieee80211_hdr *hdr) + struct ieee80211_hdr_4addr *hdr) { u16 sc; unsigned int seq; @@ -176,7 +175,7 @@ ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb, ieee->dev->name); return 0; /* - hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr *) + hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr_4addr *) skb->data);*/ } @@ -232,13 +231,13 @@ static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee, { struct net_device *dev = ieee->dev; u16 fc, ethertype; - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_3addr *hdr; u8 *pos; if (skb->len < 24) return 0; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_3addr *)skb->data; fc = le16_to_cpu(hdr->frame_ctl); /* check that the frame is unicast frame to us */ @@ -271,26 +270,15 @@ static inline int ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_crypt_data *crypt) { - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_3addr *hdr; int res, hdrlen; if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL) return 0; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_3addr *)skb->data; hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl)); -#ifdef CONFIG_IEEE80211_CRYPT_TKIP - if (ieee->tkip_countermeasures && strcmp(crypt->ops->name, "TKIP") == 0) { - if (net_ratelimit()) { - printk(KERN_DEBUG "%s: TKIP countermeasures: dropped " - "received packet from " MAC_FMT "\n", - ieee->dev->name, MAC_ARG(hdr->addr2)); - } - return -1; - } -#endif - atomic_inc(&crypt->refcnt); res = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv); atomic_dec(&crypt->refcnt); @@ -314,13 +302,13 @@ ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee, struct sk_buff *skb, int keyidx, struct ieee80211_crypt_data *crypt) { - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_3addr *hdr; int res, hdrlen; if (crypt == NULL || crypt->ops->decrypt_msdu == NULL) return 0; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_3addr *)skb->data; hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl)); atomic_inc(&crypt->refcnt); @@ -343,7 +331,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_rx_stats *rx_stats) { struct net_device *dev = ieee->dev; - struct ieee80211_hdr *hdr; + struct ieee80211_hdr_4addr *hdr; size_t hdrlen; u16 fc, type, stype, sc; struct net_device_stats *stats; @@ -363,7 +351,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_crypt_data *crypt = NULL; int keyidx = 0; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_4addr *)skb->data; stats = &ieee->stats; if (skb->len < 10) { @@ -378,35 +366,53 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, frag = WLAN_GET_SEQ_FRAG(sc); hdrlen = ieee80211_get_hdrlen(fc); -#ifdef NOT_YET -#if WIRELESS_EXT > 15 /* Put this code here so that we avoid duplicating it in all * Rx paths. - Jean II */ #ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */ +#ifdef CONFIG_NET_RADIO /* If spy monitoring on */ - if (iface->spy_data.spy_number > 0) { + if (ieee->spy_data.spy_number > 0) { struct iw_quality wstats; - wstats.level = rx_stats->signal; - wstats.noise = rx_stats->noise; - wstats.updated = 6; /* No qual value */ + + wstats.updated = 0; + if (rx_stats->mask & IEEE80211_STATMASK_RSSI) { + wstats.level = rx_stats->rssi; + wstats.updated |= IW_QUAL_LEVEL_UPDATED; + } else + wstats.updated |= IW_QUAL_LEVEL_INVALID; + + if (rx_stats->mask & IEEE80211_STATMASK_NOISE) { + wstats.noise = rx_stats->noise; + wstats.updated |= IW_QUAL_NOISE_UPDATED; + } else + wstats.updated |= IW_QUAL_NOISE_INVALID; + + if (rx_stats->mask & IEEE80211_STATMASK_SIGNAL) { + wstats.qual = rx_stats->signal; + wstats.updated |= IW_QUAL_QUAL_UPDATED; + } else + wstats.updated |= IW_QUAL_QUAL_INVALID; + /* Update spy records */ - wireless_spy_update(dev, hdr->addr2, &wstats); + wireless_spy_update(ieee->dev, hdr->addr2, &wstats); } +#endif /* CONFIG_NET_RADIO */ #endif /* IW_WIRELESS_SPY */ -#endif /* WIRELESS_EXT > 15 */ + +#ifdef NOT_YET hostap_update_rx_stats(local->ap, hdr, rx_stats); #endif -#if WIRELESS_EXT > 15 if (ieee->iw_mode == IW_MODE_MONITOR) { ieee80211_monitor_rx(ieee, skb, rx_stats); stats->rx_packets++; stats->rx_bytes += skb->len; return 1; } -#endif - if (ieee->host_decrypt) { + if ((is_multicast_ether_addr(hdr->addr1) || + is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : + ieee->host_decrypt) { int idx = 0; if (skb->len >= hdrlen + 3) idx = skb->data[hdrlen + 3] >> 6; @@ -531,6 +537,9 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* Nullfunc frames may have PS-bit set, so they must be passed to * hostap_handle_sta_rx() before being dropped here. */ + + stype &= ~IEEE80211_STYPE_QOS_DATA; + if (stype != IEEE80211_STYPE_DATA && stype != IEEE80211_STYPE_DATA_CFACK && stype != IEEE80211_STYPE_DATA_CFPOLL && @@ -549,7 +558,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, (keyidx = ieee80211_rx_frame_decrypt(ieee, skb, crypt)) < 0) goto rx_dropped; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_4addr *)skb->data; /* skb: hdr + (possibly fragmented) plaintext payload */ // PR: FIXME: hostap has additional conditions in the "if" below: @@ -603,7 +612,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* this was the last fragment and the frame will be * delivered, so remove skb from fragment cache */ skb = frag_skb; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_4addr *)skb->data; ieee80211_frag_cache_invalidate(ieee, hdr); } @@ -613,7 +622,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, ieee80211_rx_frame_decrypt_msdu(ieee, skb, keyidx, crypt)) goto rx_dropped; - hdr = (struct ieee80211_hdr *)skb->data; + hdr = (struct ieee80211_hdr_4addr *)skb->data; if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep) { if ( /*ieee->ieee802_1x && */ ieee80211_is_eapol_frame(ieee, skb)) { @@ -755,69 +764,179 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, #define MGMT_FRAME_FIXED_PART_LENGTH 0x24 -static inline int ieee80211_is_ofdm_rate(u8 rate) +static u8 qos_oui[QOS_OUI_LEN] = { 0x00, 0x50, 0xF2 }; + +/* +* Make ther structure we read from the beacon packet has +* the right values +*/ +static int ieee80211_verify_qos_info(struct ieee80211_qos_information_element + *info_element, int sub_type) { - switch (rate & ~IEEE80211_BASIC_RATE_MASK) { - case IEEE80211_OFDM_RATE_6MB: - case IEEE80211_OFDM_RATE_9MB: - case IEEE80211_OFDM_RATE_12MB: - case IEEE80211_OFDM_RATE_18MB: - case IEEE80211_OFDM_RATE_24MB: - case IEEE80211_OFDM_RATE_36MB: - case IEEE80211_OFDM_RATE_48MB: - case IEEE80211_OFDM_RATE_54MB: - return 1; - } + + if (info_element->qui_subtype != sub_type) + return -1; + if (memcmp(info_element->qui, qos_oui, QOS_OUI_LEN)) + return -1; + if (info_element->qui_type != QOS_OUI_TYPE) + return -1; + if (info_element->version != QOS_VERSION_1) + return -1; + return 0; } -static inline int ieee80211_network_init(struct ieee80211_device *ieee, - struct ieee80211_probe_response - *beacon, - struct ieee80211_network *network, - struct ieee80211_rx_stats *stats) +/* + * Parse a QoS parameter element + */ +static int ieee80211_read_qos_param_element(struct ieee80211_qos_parameter_info + *element_param, struct ieee80211_info_element + *info_element) { -#ifdef CONFIG_IEEE80211_DEBUG - char rates_str[64]; - char *p; -#endif - struct ieee80211_info_element *info_element; - u16 left; - u8 i; + int ret = 0; + u16 size = sizeof(struct ieee80211_qos_parameter_info) - 2; - /* Pull out fixed field data */ - memcpy(network->bssid, beacon->header.addr3, ETH_ALEN); - network->capability = beacon->capability; - network->last_scanned = jiffies; - network->time_stamp[0] = beacon->time_stamp[0]; - network->time_stamp[1] = beacon->time_stamp[1]; - network->beacon_interval = beacon->beacon_interval; - /* Where to pull this? beacon->listen_interval; */ - network->listen_interval = 0x0A; - network->rates_len = network->rates_ex_len = 0; - network->last_associate = 0; - network->ssid_len = 0; - network->flags = 0; - network->atim_window = 0; + if ((info_element == NULL) || (element_param == NULL)) + return -1; - if (stats->freq == IEEE80211_52GHZ_BAND) { - /* for A band (No DS info) */ - network->channel = stats->received_channel; + if (info_element->id == QOS_ELEMENT_ID && info_element->len == size) { + memcpy(element_param->info_element.qui, info_element->data, + info_element->len); + element_param->info_element.elementID = info_element->id; + element_param->info_element.length = info_element->len; } else - network->flags |= NETWORK_HAS_CCK; + ret = -1; + if (ret == 0) + ret = ieee80211_verify_qos_info(&element_param->info_element, + QOS_OUI_PARAM_SUB_TYPE); + return ret; +} - network->wpa_ie_len = 0; - network->rsn_ie_len = 0; +/* + * Parse a QoS information element + */ +static int ieee80211_read_qos_info_element(struct + ieee80211_qos_information_element + *element_info, struct ieee80211_info_element + *info_element) +{ + int ret = 0; + u16 size = sizeof(struct ieee80211_qos_information_element) - 2; + + if (element_info == NULL) + return -1; + if (info_element == NULL) + return -1; + + if ((info_element->id == QOS_ELEMENT_ID) && (info_element->len == size)) { + memcpy(element_info->qui, info_element->data, + info_element->len); + element_info->elementID = info_element->id; + element_info->length = info_element->len; + } else + ret = -1; + + if (ret == 0) + ret = ieee80211_verify_qos_info(element_info, + QOS_OUI_INFO_SUB_TYPE); + return ret; +} + +/* + * Write QoS parameters from the ac parameters. + */ +static int ieee80211_qos_convert_ac_to_parameters(struct + ieee80211_qos_parameter_info + *param_elm, struct + ieee80211_qos_parameters + *qos_param) +{ + int rc = 0; + int i; + struct ieee80211_qos_ac_parameter *ac_params; + u32 txop; + u8 cw_min; + u8 cw_max; + + for (i = 0; i < QOS_QUEUE_NUM; i++) { + ac_params = &(param_elm->ac_params_record[i]); + + qos_param->aifs[i] = (ac_params->aci_aifsn) & 0x0F; + qos_param->aifs[i] -= (qos_param->aifs[i] < 2) ? 0 : 2; + + cw_min = ac_params->ecw_min_max & 0x0F; + qos_param->cw_min[i] = (u16) ((1 << cw_min) - 1); + + cw_max = (ac_params->ecw_min_max & 0xF0) >> 4; + qos_param->cw_max[i] = (u16) ((1 << cw_max) - 1); + + qos_param->flag[i] = + (ac_params->aci_aifsn & 0x10) ? 0x01 : 0x00; + + txop = le16_to_cpu(ac_params->tx_op_limit) * 32; + qos_param->tx_op_limit[i] = (u16) txop; + } + return rc; +} + +/* + * we have a generic data element which it may contain QoS information or + * parameters element. check the information element length to decide + * which type to read + */ +static int ieee80211_parse_qos_info_param_IE(struct ieee80211_info_element + *info_element, + struct ieee80211_network *network) +{ + int rc = 0; + struct ieee80211_qos_parameters *qos_param = NULL; + struct ieee80211_qos_information_element qos_info_element; + + rc = ieee80211_read_qos_info_element(&qos_info_element, info_element); + + if (rc == 0) { + network->qos_data.param_count = qos_info_element.ac_info & 0x0F; + network->flags |= NETWORK_HAS_QOS_INFORMATION; + } else { + struct ieee80211_qos_parameter_info param_element; + + rc = ieee80211_read_qos_param_element(¶m_element, + info_element); + if (rc == 0) { + qos_param = &(network->qos_data.parameters); + ieee80211_qos_convert_ac_to_parameters(¶m_element, + qos_param); + network->flags |= NETWORK_HAS_QOS_PARAMETERS; + network->qos_data.param_count = + param_element.info_element.ac_info & 0x0F; + } + } + + if (rc == 0) { + IEEE80211_DEBUG_QOS("QoS is supported\n"); + network->qos_data.supported = 1; + } + return rc; +} + +static int ieee80211_parse_info_param(struct ieee80211_info_element + *info_element, u16 length, + struct ieee80211_network *network) +{ + u8 i; +#ifdef CONFIG_IEEE80211_DEBUG + char rates_str[64]; + char *p; +#endif - info_element = &beacon->info_element; - left = stats->len - ((void *)info_element - (void *)beacon); - while (left >= sizeof(struct ieee80211_info_element_hdr)) { - if (sizeof(struct ieee80211_info_element_hdr) + - info_element->len > left) { - IEEE80211_DEBUG_SCAN - ("SCAN: parse failed: info_element->len + 2 > left : info_element->len+2=%Zd left=%d.\n", - info_element->len + - sizeof(struct ieee80211_info_element), left); + while (length >= sizeof(*info_element)) { + if (sizeof(*info_element) + info_element->len > length) { + IEEE80211_DEBUG_MGMT("Info elem: parse failed: " + "info_element->len + 2 > left : " + "info_element->len+2=%zd left=%d, id=%d.\n", + info_element->len + + sizeof(*info_element), + length, info_element->id); return 1; } @@ -837,7 +956,7 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, memset(network->ssid + network->ssid_len, 0, IW_ESSID_MAX_SIZE - network->ssid_len); - IEEE80211_DEBUG_SCAN("MFIE_TYPE_SSID: '%s' len=%d.\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_SSID: '%s' len=%d.\n", network->ssid, network->ssid_len); break; @@ -845,15 +964,14 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, #ifdef CONFIG_IEEE80211_DEBUG p = rates_str; #endif - network->rates_len = - min(info_element->len, MAX_RATES_LENGTH); + network->rates_len = min(info_element->len, + MAX_RATES_LENGTH); for (i = 0; i < network->rates_len; i++) { network->rates[i] = info_element->data[i]; #ifdef CONFIG_IEEE80211_DEBUG - p += snprintf(p, - sizeof(rates_str) - (p - - rates_str), - "%02X ", network->rates[i]); + p += snprintf(p, sizeof(rates_str) - + (p - rates_str), "%02X ", + network->rates[i]); #endif if (ieee80211_is_ofdm_rate (info_element->data[i])) { @@ -865,7 +983,7 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, } } - IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES: '%s' (%d)\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES: '%s' (%d)\n", rates_str, network->rates_len); break; @@ -873,15 +991,14 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, #ifdef CONFIG_IEEE80211_DEBUG p = rates_str; #endif - network->rates_ex_len = - min(info_element->len, MAX_RATES_EX_LENGTH); + network->rates_ex_len = min(info_element->len, + MAX_RATES_EX_LENGTH); for (i = 0; i < network->rates_ex_len; i++) { network->rates_ex[i] = info_element->data[i]; #ifdef CONFIG_IEEE80211_DEBUG - p += snprintf(p, - sizeof(rates_str) - (p - - rates_str), - "%02X ", network->rates[i]); + p += snprintf(p, sizeof(rates_str) - + (p - rates_str), "%02X ", + network->rates[i]); #endif if (ieee80211_is_ofdm_rate (info_element->data[i])) { @@ -893,40 +1010,51 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, } } - IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES_EX: '%s' (%d)\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES_EX: '%s' (%d)\n", rates_str, network->rates_ex_len); break; case MFIE_TYPE_DS_SET: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_DS_SET: %d\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_DS_SET: %d\n", info_element->data[0]); - if (stats->freq == IEEE80211_24GHZ_BAND) - network->channel = info_element->data[0]; + network->channel = info_element->data[0]; break; case MFIE_TYPE_FH_SET: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_FH_SET: ignored\n"); + IEEE80211_DEBUG_MGMT("MFIE_TYPE_FH_SET: ignored\n"); break; case MFIE_TYPE_CF_SET: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_CF_SET: ignored\n"); + IEEE80211_DEBUG_MGMT("MFIE_TYPE_CF_SET: ignored\n"); break; case MFIE_TYPE_TIM: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_TIM: ignored\n"); + IEEE80211_DEBUG_MGMT("MFIE_TYPE_TIM: ignored\n"); + break; + + case MFIE_TYPE_ERP_INFO: + network->erp_value = info_element->data[0]; + IEEE80211_DEBUG_MGMT("MFIE_TYPE_ERP_SET: %d\n", + network->erp_value); break; case MFIE_TYPE_IBSS_SET: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_IBSS_SET: ignored\n"); + network->atim_window = info_element->data[0]; + IEEE80211_DEBUG_MGMT("MFIE_TYPE_IBSS_SET: %d\n", + network->atim_window); break; case MFIE_TYPE_CHALLENGE: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_CHALLENGE: ignored\n"); + IEEE80211_DEBUG_MGMT("MFIE_TYPE_CHALLENGE: ignored\n"); break; case MFIE_TYPE_GENERIC: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_GENERIC: %d bytes\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_GENERIC: %d bytes\n", info_element->len); + if (!ieee80211_parse_qos_info_param_IE(info_element, + network)) + break; + if (info_element->len >= 4 && info_element->data[0] == 0x00 && info_element->data[1] == 0x50 && @@ -940,7 +1068,7 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, break; case MFIE_TYPE_RSN: - IEEE80211_DEBUG_SCAN("MFIE_TYPE_RSN: %d bytes\n", + IEEE80211_DEBUG_MGMT("MFIE_TYPE_RSN: %d bytes\n", info_element->len); network->rsn_ie_len = min(info_element->len + 2, MAX_WPA_IE_LEN); @@ -948,18 +1076,127 @@ static inline int ieee80211_network_init(struct ieee80211_device *ieee, network->rsn_ie_len); break; + case MFIE_TYPE_QOS_PARAMETER: + printk(KERN_ERR + "QoS Error need to parse QOS_PARAMETER IE\n"); + break; + default: - IEEE80211_DEBUG_SCAN("unsupported IE %d\n", + IEEE80211_DEBUG_MGMT("unsupported IE %d\n", info_element->id); break; } - left -= sizeof(struct ieee80211_info_element_hdr) + - info_element->len; - info_element = (struct ieee80211_info_element *) - &info_element->data[info_element->len]; + length -= sizeof(*info_element) + info_element->len; + info_element = + (struct ieee80211_info_element *)&info_element-> + data[info_element->len]; + } + + return 0; +} + +static int ieee80211_handle_assoc_resp(struct ieee80211_device *ieee, struct ieee80211_assoc_response + *frame, struct ieee80211_rx_stats *stats) +{ + struct ieee80211_network network_resp; + struct ieee80211_network *network = &network_resp; + struct net_device *dev = ieee->dev; + + network->flags = 0; + network->qos_data.active = 0; + network->qos_data.supported = 0; + network->qos_data.param_count = 0; + network->qos_data.old_param_count = 0; + + //network->atim_window = le16_to_cpu(frame->aid) & (0x3FFF); + network->atim_window = le16_to_cpu(frame->aid); + network->listen_interval = le16_to_cpu(frame->status); + memcpy(network->bssid, frame->header.addr3, ETH_ALEN); + network->capability = le16_to_cpu(frame->capability); + network->last_scanned = jiffies; + network->rates_len = network->rates_ex_len = 0; + network->last_associate = 0; + network->ssid_len = 0; + network->erp_value = + (network->capability & WLAN_CAPABILITY_IBSS) ? 0x3 : 0x0; + + if (stats->freq == IEEE80211_52GHZ_BAND) { + /* for A band (No DS info) */ + network->channel = stats->received_channel; + } else + network->flags |= NETWORK_HAS_CCK; + + network->wpa_ie_len = 0; + network->rsn_ie_len = 0; + + if (ieee80211_parse_info_param + (frame->info_element, stats->len - sizeof(*frame), network)) + return 1; + + network->mode = 0; + if (stats->freq == IEEE80211_52GHZ_BAND) + network->mode = IEEE_A; + else { + if (network->flags & NETWORK_HAS_OFDM) + network->mode |= IEEE_G; + if (network->flags & NETWORK_HAS_CCK) + network->mode |= IEEE_B; } + if (ieee80211_is_empty_essid(network->ssid, network->ssid_len)) + network->flags |= NETWORK_EMPTY_ESSID; + + memcpy(&network->stats, stats, sizeof(network->stats)); + + if (ieee->handle_assoc_response != NULL) + ieee->handle_assoc_response(dev, frame, network); + + return 0; +} + +/***************************************************/ + +static inline int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee80211_probe_response + *beacon, + struct ieee80211_network *network, + struct ieee80211_rx_stats *stats) +{ + network->qos_data.active = 0; + network->qos_data.supported = 0; + network->qos_data.param_count = 0; + network->qos_data.old_param_count = 0; + + /* Pull out fixed field data */ + memcpy(network->bssid, beacon->header.addr3, ETH_ALEN); + network->capability = le16_to_cpu(beacon->capability); + network->last_scanned = jiffies; + network->time_stamp[0] = le32_to_cpu(beacon->time_stamp[0]); + network->time_stamp[1] = le32_to_cpu(beacon->time_stamp[1]); + network->beacon_interval = le16_to_cpu(beacon->beacon_interval); + /* Where to pull this? beacon->listen_interval; */ + network->listen_interval = 0x0A; + network->rates_len = network->rates_ex_len = 0; + network->last_associate = 0; + network->ssid_len = 0; + network->flags = 0; + network->atim_window = 0; + network->erp_value = (network->capability & WLAN_CAPABILITY_IBSS) ? + 0x3 : 0x0; + + if (stats->freq == IEEE80211_52GHZ_BAND) { + /* for A band (No DS info) */ + network->channel = stats->received_channel; + } else + network->flags |= NETWORK_HAS_CCK; + + network->wpa_ie_len = 0; + network->rsn_ie_len = 0; + + if (ieee80211_parse_info_param + (beacon->info_element, stats->len - sizeof(*beacon), network)) + return 1; + network->mode = 0; if (stats->freq == IEEE80211_52GHZ_BAND) network->mode = IEEE_A; @@ -1002,6 +1239,9 @@ static inline int is_same_network(struct ieee80211_network *src, static inline void update_network(struct ieee80211_network *dst, struct ieee80211_network *src) { + int qos_active; + u8 old_param; + memcpy(&dst->stats, &src->stats, sizeof(struct ieee80211_rx_stats)); dst->capability = src->capability; memcpy(dst->rates, src->rates, src->rates_len); @@ -1017,6 +1257,7 @@ static inline void update_network(struct ieee80211_network *dst, dst->beacon_interval = src->beacon_interval; dst->listen_interval = src->listen_interval; dst->atim_window = src->atim_window; + dst->erp_value = src->erp_value; memcpy(dst->wpa_ie, src->wpa_ie, src->wpa_ie_len); dst->wpa_ie_len = src->wpa_ie_len; @@ -1024,22 +1265,48 @@ static inline void update_network(struct ieee80211_network *dst, dst->rsn_ie_len = src->rsn_ie_len; dst->last_scanned = jiffies; + qos_active = src->qos_data.active; + old_param = dst->qos_data.old_param_count; + if (dst->flags & NETWORK_HAS_QOS_MASK) + memcpy(&dst->qos_data, &src->qos_data, + sizeof(struct ieee80211_qos_data)); + else { + dst->qos_data.supported = src->qos_data.supported; + dst->qos_data.param_count = src->qos_data.param_count; + } + + if (dst->qos_data.supported == 1) { + if (dst->ssid_len) + IEEE80211_DEBUG_QOS + ("QoS the network %s is QoS supported\n", + dst->ssid); + else + IEEE80211_DEBUG_QOS + ("QoS the network is QoS supported\n"); + } + dst->qos_data.active = qos_active; + dst->qos_data.old_param_count = old_param; + /* dst->last_associate is not overwritten */ } +static inline int is_beacon(int fc) +{ + return (WLAN_FC_GET_STYPE(le16_to_cpu(fc)) == IEEE80211_STYPE_BEACON); +} + static inline void ieee80211_process_probe_response(struct ieee80211_device - *ieee, - struct + *ieee, struct ieee80211_probe_response - *beacon, - struct ieee80211_rx_stats + *beacon, struct ieee80211_rx_stats *stats) { + struct net_device *dev = ieee->dev; struct ieee80211_network network; struct ieee80211_network *target; struct ieee80211_network *oldest = NULL; #ifdef CONFIG_IEEE80211_DEBUG - struct ieee80211_info_element *info_element = &beacon->info_element; + struct ieee80211_info_element *info_element = beacon->info_element; #endif unsigned long flags; @@ -1070,10 +1337,10 @@ static inline void ieee80211_process_probe_response(struct ieee80211_device escape_essid(info_element->data, info_element->len), MAC_ARG(beacon->header.addr3), - WLAN_FC_GET_STYPE(beacon->header. - frame_ctl) == - IEEE80211_STYPE_PROBE_RESP ? - "PROBE RESPONSE" : "BEACON"); + is_beacon(le16_to_cpu + (beacon->header. + frame_ctl)) ? + "BEACON" : "PROBE RESPONSE"); return; } @@ -1122,10 +1389,10 @@ static inline void ieee80211_process_probe_response(struct ieee80211_device escape_essid(network.ssid, network.ssid_len), MAC_ARG(network.bssid), - WLAN_FC_GET_STYPE(beacon->header. - frame_ctl) == - IEEE80211_STYPE_PROBE_RESP ? - "PROBE RESPONSE" : "BEACON"); + is_beacon(le16_to_cpu + (beacon->header. + frame_ctl)) ? + "BEACON" : "PROBE RESPONSE"); #endif memcpy(target, &network, sizeof(*target)); list_add_tail(&target->list, &ieee->network_list); @@ -1134,34 +1401,60 @@ static inline void ieee80211_process_probe_response(struct ieee80211_device escape_essid(target->ssid, target->ssid_len), MAC_ARG(target->bssid), - WLAN_FC_GET_STYPE(beacon->header. - frame_ctl) == - IEEE80211_STYPE_PROBE_RESP ? - "PROBE RESPONSE" : "BEACON"); + is_beacon(le16_to_cpu + (beacon->header. + frame_ctl)) ? + "BEACON" : "PROBE RESPONSE"); update_network(target, &network); } spin_unlock_irqrestore(&ieee->lock, flags); + + if (is_beacon(le16_to_cpu(beacon->header.frame_ctl))) { + if (ieee->handle_beacon != NULL) + ieee->handle_beacon(dev, beacon, &network); + } else { + if (ieee->handle_probe_response != NULL) + ieee->handle_probe_response(dev, beacon, &network); + } } void ieee80211_rx_mgt(struct ieee80211_device *ieee, - struct ieee80211_hdr *header, + struct ieee80211_hdr_4addr *header, struct ieee80211_rx_stats *stats) { - switch (WLAN_FC_GET_STYPE(header->frame_ctl)) { + switch (WLAN_FC_GET_STYPE(le16_to_cpu(header->frame_ctl))) { case IEEE80211_STYPE_ASSOC_RESP: IEEE80211_DEBUG_MGMT("received ASSOCIATION RESPONSE (%d)\n", - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); + ieee80211_handle_assoc_resp(ieee, + (struct ieee80211_assoc_response *) + header, stats); break; case IEEE80211_STYPE_REASSOC_RESP: IEEE80211_DEBUG_MGMT("received REASSOCIATION RESPONSE (%d)\n", - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); + break; + + case IEEE80211_STYPE_PROBE_REQ: + IEEE80211_DEBUG_MGMT("recieved auth (%d)\n", + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); + + if (ieee->handle_probe_request != NULL) + ieee->handle_probe_request(ieee->dev, + (struct + ieee80211_probe_request *) + header, stats); break; case IEEE80211_STYPE_PROBE_RESP: IEEE80211_DEBUG_MGMT("received PROBE RESPONSE (%d)\n", - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); IEEE80211_DEBUG_SCAN("Probe response\n"); ieee80211_process_probe_response(ieee, (struct @@ -1171,20 +1464,46 @@ void ieee80211_rx_mgt(struct ieee80211_device *ieee, case IEEE80211_STYPE_BEACON: IEEE80211_DEBUG_MGMT("received BEACON (%d)\n", - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); IEEE80211_DEBUG_SCAN("Beacon\n"); ieee80211_process_probe_response(ieee, (struct ieee80211_probe_response *) header, stats); break; + case IEEE80211_STYPE_AUTH: + IEEE80211_DEBUG_MGMT("recieved auth (%d)\n", + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); + + if (ieee->handle_auth != NULL) + ieee->handle_auth(ieee->dev, + (struct ieee80211_auth *)header); + break; + + case IEEE80211_STYPE_DISASSOC: + if (ieee->handle_disassoc != NULL) + ieee->handle_disassoc(ieee->dev, + (struct ieee80211_disassoc *) + header); + break; + + case IEEE80211_STYPE_DEAUTH: + printk("DEAUTH from AP\n"); + if (ieee->handle_deauth != NULL) + ieee->handle_deauth(ieee->dev, (struct ieee80211_auth *) + header); + break; default: IEEE80211_DEBUG_MGMT("received UNKNOWN (%d)\n", - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); IEEE80211_WARNING("%s: Unknown management packet: %d\n", ieee->dev->name, - WLAN_FC_GET_STYPE(header->frame_ctl)); + WLAN_FC_GET_STYPE(le16_to_cpu + (header->frame_ctl))); break; } } diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c index eed07bbbe6b..445f206e65e 100644 --- a/net/ieee80211/ieee80211_tx.c +++ b/net/ieee80211/ieee80211_tx.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright(c) 2003 - 2004 Intel Corporation. All rights reserved. + Copyright(c) 2003 - 2005 Intel Corporation. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -38,7 +38,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> @@ -128,7 +127,7 @@ payload of each frame is reduced to 492 bytes. static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 }; static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 }; -static inline int ieee80211_put_snap(u8 * data, u16 h_proto) +static inline int ieee80211_copy_snap(u8 * data, u16 h_proto) { struct ieee80211_snap_hdr *snap; u8 *oui; @@ -157,31 +156,14 @@ static inline int ieee80211_encrypt_fragment(struct ieee80211_device *ieee, struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx]; int res; -#ifdef CONFIG_IEEE80211_CRYPT_TKIP - struct ieee80211_hdr *header; - - if (ieee->tkip_countermeasures && - crypt && crypt->ops && strcmp(crypt->ops->name, "TKIP") == 0) { - header = (struct ieee80211_hdr *)frag->data; - if (net_ratelimit()) { - printk(KERN_DEBUG "%s: TKIP countermeasures: dropped " - "TX packet to " MAC_FMT "\n", - ieee->dev->name, MAC_ARG(header->addr1)); - } + if (crypt == NULL) return -1; - } -#endif + /* To encrypt, frame format is: * IV (4 bytes), clear payload (including SNAP), ICV (4 bytes) */ - - // PR: FIXME: Copied from hostap. Check fragmentation/MSDU/MPDU encryption. - /* Host-based IEEE 802.11 fragmentation for TX is not yet supported, so - * call both MSDU and MPDU encryption functions from here. */ atomic_inc(&crypt->refcnt); res = 0; - if (crypt->ops->encrypt_msdu) - res = crypt->ops->encrypt_msdu(frag, hdr_len, crypt->priv); - if (res == 0 && crypt->ops->encrypt_mpdu) + if (crypt->ops && crypt->ops->encrypt_mpdu) res = crypt->ops->encrypt_mpdu(frag, hdr_len, crypt->priv); atomic_dec(&crypt->refcnt); @@ -207,7 +189,7 @@ void ieee80211_txb_free(struct ieee80211_txb *txb) } static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size, - gfp_t gfp_mask) + int headroom, gfp_t gfp_mask) { struct ieee80211_txb *txb; int i; @@ -221,11 +203,13 @@ static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size, txb->frag_size = txb_size; for (i = 0; i < nr_frags; i++) { - txb->fragments[i] = dev_alloc_skb(txb_size); + txb->fragments[i] = __dev_alloc_skb(txb_size + headroom, + gfp_mask); if (unlikely(!txb->fragments[i])) { i--; break; } + skb_reserve(txb->fragments[i], headroom); } if (unlikely(i != nr_frags)) { while (i >= 0) @@ -236,25 +220,31 @@ static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size, return txb; } -/* SKBs are added to the ieee->tx_queue. */ +/* Incoming skb is converted to a txb which consists of + * a block of 802.11 fragment packets (stored as skbs) */ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_device *ieee = netdev_priv(dev); struct ieee80211_txb *txb = NULL; - struct ieee80211_hdr *frag_hdr; - int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size; + struct ieee80211_hdr_3addr *frag_hdr; + int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size, + rts_required; unsigned long flags; struct net_device_stats *stats = &ieee->stats; - int ether_type, encrypt; + int ether_type, encrypt, host_encrypt, host_encrypt_msdu, host_build_iv; int bytes, fc, hdr_len; struct sk_buff *skb_frag; - struct ieee80211_hdr header = { /* Ensure zero initialized */ + struct ieee80211_hdr_3addr header = { /* Ensure zero initialized */ .duration_id = 0, .seq_ctl = 0 }; u8 dest[ETH_ALEN], src[ETH_ALEN]; - struct ieee80211_crypt_data *crypt; + int priority = skb->priority; + int snapped = 0; + + if (ieee->is_queue_full && (*ieee->is_queue_full) (dev, priority)) + return NETDEV_TX_BUSY; spin_lock_irqsave(&ieee->lock, flags); @@ -276,7 +266,11 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) crypt = ieee->crypt[ieee->tx_keyidx]; encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) && - ieee->host_encrypt && crypt && crypt->ops; + ieee->sec.encrypt; + + host_encrypt = ieee->host_encrypt && encrypt && crypt; + host_encrypt_msdu = ieee->host_encrypt_msdu && encrypt && crypt; + host_build_iv = ieee->host_build_iv && encrypt && crypt; if (!encrypt && ieee->ieee802_1x && ieee->drop_unencrypted && ether_type != ETH_P_PAE) { @@ -285,8 +279,8 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) } /* Save source and destination addresses */ - memcpy(&dest, skb->data, ETH_ALEN); - memcpy(&src, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(dest, skb->data, ETH_ALEN); + memcpy(src, skb->data + ETH_ALEN, ETH_ALEN); /* Advance the SKB to the start of the payload */ skb_pull(skb, sizeof(struct ethhdr)); @@ -294,7 +288,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) /* Determine total amount of storage required for TXB packets */ bytes = skb->len + SNAP_SIZE + sizeof(u16); - if (encrypt) + if (host_encrypt) fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA | IEEE80211_FCTL_PROTECTED; else @@ -302,70 +296,144 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) if (ieee->iw_mode == IW_MODE_INFRA) { fc |= IEEE80211_FCTL_TODS; - /* To DS: Addr1 = BSSID, Addr2 = SA, - Addr3 = DA */ - memcpy(&header.addr1, ieee->bssid, ETH_ALEN); - memcpy(&header.addr2, &src, ETH_ALEN); - memcpy(&header.addr3, &dest, ETH_ALEN); + /* To DS: Addr1 = BSSID, Addr2 = SA, Addr3 = DA */ + memcpy(header.addr1, ieee->bssid, ETH_ALEN); + memcpy(header.addr2, src, ETH_ALEN); + memcpy(header.addr3, dest, ETH_ALEN); } else if (ieee->iw_mode == IW_MODE_ADHOC) { - /* not From/To DS: Addr1 = DA, Addr2 = SA, - Addr3 = BSSID */ - memcpy(&header.addr1, dest, ETH_ALEN); - memcpy(&header.addr2, src, ETH_ALEN); - memcpy(&header.addr3, ieee->bssid, ETH_ALEN); + /* not From/To DS: Addr1 = DA, Addr2 = SA, Addr3 = BSSID */ + memcpy(header.addr1, dest, ETH_ALEN); + memcpy(header.addr2, src, ETH_ALEN); + memcpy(header.addr3, ieee->bssid, ETH_ALEN); } header.frame_ctl = cpu_to_le16(fc); hdr_len = IEEE80211_3ADDR_LEN; - /* Determine fragmentation size based on destination (multicast - * and broadcast are not fragmented) */ - if (is_multicast_ether_addr(dest) || is_broadcast_ether_addr(dest)) - frag_size = MAX_FRAG_THRESHOLD; - else - frag_size = ieee->fts; + /* Encrypt msdu first on the whole data packet. */ + if ((host_encrypt || host_encrypt_msdu) && + crypt && crypt->ops && crypt->ops->encrypt_msdu) { + int res = 0; + int len = bytes + hdr_len + crypt->ops->extra_msdu_prefix_len + + crypt->ops->extra_msdu_postfix_len; + struct sk_buff *skb_new = dev_alloc_skb(len); + + if (unlikely(!skb_new)) + goto failed; + + skb_reserve(skb_new, crypt->ops->extra_msdu_prefix_len); + memcpy(skb_put(skb_new, hdr_len), &header, hdr_len); + snapped = 1; + ieee80211_copy_snap(skb_put(skb_new, SNAP_SIZE + sizeof(u16)), + ether_type); + memcpy(skb_put(skb_new, skb->len), skb->data, skb->len); + res = crypt->ops->encrypt_msdu(skb_new, hdr_len, crypt->priv); + if (res < 0) { + IEEE80211_ERROR("msdu encryption failed\n"); + dev_kfree_skb_any(skb_new); + goto failed; + } + dev_kfree_skb_any(skb); + skb = skb_new; + bytes += crypt->ops->extra_msdu_prefix_len + + crypt->ops->extra_msdu_postfix_len; + skb_pull(skb, hdr_len); + } - /* Determine amount of payload per fragment. Regardless of if - * this stack is providing the full 802.11 header, one will - * eventually be affixed to this fragment -- so we must account for - * it when determining the amount of payload space. */ - bytes_per_frag = frag_size - IEEE80211_3ADDR_LEN; - if (ieee->config & - (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS)) - bytes_per_frag -= IEEE80211_FCS_LEN; - - /* Each fragment may need to have room for encryptiong pre/postfix */ - if (encrypt) - bytes_per_frag -= crypt->ops->extra_prefix_len + - crypt->ops->extra_postfix_len; - - /* Number of fragments is the total bytes_per_frag / - * payload_per_fragment */ - nr_frags = bytes / bytes_per_frag; - bytes_last_frag = bytes % bytes_per_frag; - if (bytes_last_frag) + if (host_encrypt || ieee->host_open_frag) { + /* Determine fragmentation size based on destination (multicast + * and broadcast are not fragmented) */ + if (is_multicast_ether_addr(dest) || + is_broadcast_ether_addr(dest)) + frag_size = MAX_FRAG_THRESHOLD; + else + frag_size = ieee->fts; + + /* Determine amount of payload per fragment. Regardless of if + * this stack is providing the full 802.11 header, one will + * eventually be affixed to this fragment -- so we must account + * for it when determining the amount of payload space. */ + bytes_per_frag = frag_size - IEEE80211_3ADDR_LEN; + if (ieee->config & + (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS)) + bytes_per_frag -= IEEE80211_FCS_LEN; + + /* Each fragment may need to have room for encryptiong + * pre/postfix */ + if (host_encrypt) + bytes_per_frag -= crypt->ops->extra_mpdu_prefix_len + + crypt->ops->extra_mpdu_postfix_len; + + /* Number of fragments is the total + * bytes_per_frag / payload_per_fragment */ + nr_frags = bytes / bytes_per_frag; + bytes_last_frag = bytes % bytes_per_frag; + if (bytes_last_frag) + nr_frags++; + else + bytes_last_frag = bytes_per_frag; + } else { + nr_frags = 1; + bytes_per_frag = bytes_last_frag = bytes; + frag_size = bytes + IEEE80211_3ADDR_LEN; + } + + rts_required = (frag_size > ieee->rts + && ieee->config & CFG_IEEE80211_RTS); + if (rts_required) nr_frags++; - else - bytes_last_frag = bytes_per_frag; /* When we allocate the TXB we allocate enough space for the reserve * and full fragment bytes (bytes_per_frag doesn't include prefix, * postfix, header, FCS, etc.) */ - txb = ieee80211_alloc_txb(nr_frags, frag_size, GFP_ATOMIC); + txb = ieee80211_alloc_txb(nr_frags, frag_size, + ieee->tx_headroom, GFP_ATOMIC); if (unlikely(!txb)) { printk(KERN_WARNING "%s: Could not allocate TXB\n", ieee->dev->name); goto failed; } txb->encrypted = encrypt; - txb->payload_size = bytes; + if (host_encrypt) + txb->payload_size = frag_size * (nr_frags - 1) + + bytes_last_frag; + else + txb->payload_size = bytes; + + if (rts_required) { + skb_frag = txb->fragments[0]; + frag_hdr = + (struct ieee80211_hdr_3addr *)skb_put(skb_frag, hdr_len); + + /* + * Set header frame_ctl to the RTS. + */ + header.frame_ctl = + cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS); + memcpy(frag_hdr, &header, hdr_len); - for (i = 0; i < nr_frags; i++) { + /* + * Restore header frame_ctl to the original data setting. + */ + header.frame_ctl = cpu_to_le16(fc); + + if (ieee->config & + (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS)) + skb_put(skb_frag, 4); + + txb->rts_included = 1; + i = 1; + } else + i = 0; + + for (; i < nr_frags; i++) { skb_frag = txb->fragments[i]; - if (encrypt) - skb_reserve(skb_frag, crypt->ops->extra_prefix_len); + if (host_encrypt || host_build_iv) + skb_reserve(skb_frag, + crypt->ops->extra_mpdu_prefix_len); - frag_hdr = (struct ieee80211_hdr *)skb_put(skb_frag, hdr_len); + frag_hdr = + (struct ieee80211_hdr_3addr *)skb_put(skb_frag, hdr_len); memcpy(frag_hdr, &header, hdr_len); /* If this is not the last fragment, then add the MOREFRAGS @@ -379,11 +447,10 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) bytes = bytes_last_frag; } - /* Put a SNAP header on the first fragment */ - if (i == 0) { - ieee80211_put_snap(skb_put - (skb_frag, SNAP_SIZE + sizeof(u16)), - ether_type); + if (i == 0 && !snapped) { + ieee80211_copy_snap(skb_put + (skb_frag, SNAP_SIZE + sizeof(u16)), + ether_type); bytes -= SNAP_SIZE + sizeof(u16); } @@ -394,8 +461,19 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) /* Encryption routine will move the header forward in order * to insert the IV between the header and the payload */ - if (encrypt) + if (host_encrypt) ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len); + else if (host_build_iv) { + struct ieee80211_crypt_data *crypt; + + crypt = ieee->crypt[ieee->tx_keyidx]; + atomic_inc(&crypt->refcnt); + if (crypt->ops->build_iv) + crypt->ops->build_iv(skb_frag, hdr_len, + crypt->priv); + atomic_dec(&crypt->refcnt); + } + if (ieee->config & (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS)) skb_put(skb_frag, 4); @@ -407,11 +485,20 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) dev_kfree_skb_any(skb); if (txb) { - if ((*ieee->hard_start_xmit) (txb, dev) == 0) { + int ret = (*ieee->hard_start_xmit) (txb, dev, priority); + if (ret == 0) { stats->tx_packets++; stats->tx_bytes += txb->payload_size; return 0; } + + if (ret == NETDEV_TX_BUSY) { + printk(KERN_ERR "%s: NETDEV_TX_BUSY returned; " + "driver should report queue full via " + "ieee_device->is_queue_full.\n", + ieee->dev->name); + } + ieee80211_txb_free(txb); } @@ -422,7 +509,72 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); stats->tx_errors++; return 1; +} + +/* Incoming 802.11 strucure is converted to a TXB + * a block of 802.11 fragment packets (stored as skbs) */ +int ieee80211_tx_frame(struct ieee80211_device *ieee, + struct ieee80211_hdr *frame, int len) +{ + struct ieee80211_txb *txb = NULL; + unsigned long flags; + struct net_device_stats *stats = &ieee->stats; + struct sk_buff *skb_frag; + int priority = -1; + + spin_lock_irqsave(&ieee->lock, flags); + /* If there is no driver handler to take the TXB, dont' bother + * creating it... */ + if (!ieee->hard_start_xmit) { + printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name); + goto success; + } + + if (unlikely(len < 24)) { + printk(KERN_WARNING "%s: skb too small (%d).\n", + ieee->dev->name, len); + goto success; + } + + /* When we allocate the TXB we allocate enough space for the reserve + * and full fragment bytes (bytes_per_frag doesn't include prefix, + * postfix, header, FCS, etc.) */ + txb = ieee80211_alloc_txb(1, len, ieee->tx_headroom, GFP_ATOMIC); + if (unlikely(!txb)) { + printk(KERN_WARNING "%s: Could not allocate TXB\n", + ieee->dev->name); + goto failed; + } + txb->encrypted = 0; + txb->payload_size = len; + + skb_frag = txb->fragments[0]; + + memcpy(skb_put(skb_frag, len), frame, len); + + if (ieee->config & + (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS)) + skb_put(skb_frag, 4); + + success: + spin_unlock_irqrestore(&ieee->lock, flags); + + if (txb) { + if ((*ieee->hard_start_xmit) (txb, ieee->dev, priority) == 0) { + stats->tx_packets++; + stats->tx_bytes += txb->payload_size; + return 0; + } + ieee80211_txb_free(txb); + } + return 0; + + failed: + spin_unlock_irqrestore(&ieee->lock, flags); + stats->tx_errors++; + return 1; } +EXPORT_SYMBOL(ieee80211_tx_frame); EXPORT_SYMBOL(ieee80211_txb_free); diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 94882f39b07..181755f2aa8 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright(c) 2004 Intel Corporation. All rights reserved. + Copyright(c) 2004-2005 Intel Corporation. All rights reserved. Portions of this file are based on the WEP enablement code provided by the Host AP project hostap-drivers v0.1.3 @@ -32,6 +32,7 @@ #include <linux/kmod.h> #include <linux/module.h> +#include <linux/jiffies.h> #include <net/ieee80211.h> #include <linux/wireless.h> @@ -140,18 +141,43 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, start = iwe_stream_add_point(start, stop, &iwe, custom); /* Add quality statistics */ - /* TODO: Fix these values... */ iwe.cmd = IWEVQUAL; - iwe.u.qual.qual = network->stats.signal; - iwe.u.qual.level = network->stats.rssi; - iwe.u.qual.noise = network->stats.noise; - iwe.u.qual.updated = network->stats.mask & IEEE80211_STATMASK_WEMASK; - if (!(network->stats.mask & IEEE80211_STATMASK_RSSI)) - iwe.u.qual.updated |= IW_QUAL_LEVEL_INVALID; - if (!(network->stats.mask & IEEE80211_STATMASK_NOISE)) + iwe.u.qual.updated = IW_QUAL_QUAL_UPDATED | IW_QUAL_LEVEL_UPDATED | + IW_QUAL_NOISE_UPDATED; + + if (!(network->stats.mask & IEEE80211_STATMASK_RSSI)) { + iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID | + IW_QUAL_LEVEL_INVALID; + iwe.u.qual.qual = 0; + iwe.u.qual.level = 0; + } else { + iwe.u.qual.level = network->stats.rssi; + if (ieee->perfect_rssi == ieee->worst_rssi) + iwe.u.qual.qual = 100; + else + iwe.u.qual.qual = + (100 * + (ieee->perfect_rssi - ieee->worst_rssi) * + (ieee->perfect_rssi - ieee->worst_rssi) - + (ieee->perfect_rssi - network->stats.rssi) * + (15 * (ieee->perfect_rssi - ieee->worst_rssi) + + 62 * (ieee->perfect_rssi - + network->stats.rssi))) / + ((ieee->perfect_rssi - + ieee->worst_rssi) * (ieee->perfect_rssi - + ieee->worst_rssi)); + if (iwe.u.qual.qual > 100) + iwe.u.qual.qual = 100; + else if (iwe.u.qual.qual < 1) + iwe.u.qual.qual = 0; + } + + if (!(network->stats.mask & IEEE80211_STATMASK_NOISE)) { iwe.u.qual.updated |= IW_QUAL_NOISE_INVALID; - if (!(network->stats.mask & IEEE80211_STATMASK_SIGNAL)) - iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID; + iwe.u.qual.noise = 0; + } else { + iwe.u.qual.noise = network->stats.noise; + } start = iwe_stream_add_event(start, stop, &iwe, IW_EV_QUAL_LEN); @@ -162,7 +188,7 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, if (iwe.u.data.length) start = iwe_stream_add_point(start, stop, &iwe, custom); - if (ieee->wpa_enabled && network->wpa_ie_len) { + if (network->wpa_ie_len) { char buf[MAX_WPA_IE_LEN * 2 + 30]; u8 *p = buf; @@ -177,7 +203,7 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, start = iwe_stream_add_point(start, stop, &iwe, buf); } - if (ieee->wpa_enabled && network->rsn_ie_len) { + if (network->rsn_ie_len) { char buf[MAX_WPA_IE_LEN * 2 + 30]; u8 *p = buf; @@ -197,8 +223,8 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, iwe.cmd = IWEVCUSTOM; p = custom; p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), - " Last beacon: %lums ago", - (jiffies - network->last_scanned) / (HZ / 100)); + " Last beacon: %dms ago", + jiffies_to_msecs(jiffies - network->last_scanned)); iwe.u.data.length = p - custom; if (iwe.u.data.length) start = iwe_stream_add_point(start, stop, &iwe, custom); @@ -228,13 +254,13 @@ int ieee80211_wx_get_scan(struct ieee80211_device *ieee, ev = ipw2100_translate_scan(ieee, ev, stop, network); else IEEE80211_DEBUG_SCAN("Not showing network '%s (" - MAC_FMT ")' due to age (%lums).\n", + MAC_FMT ")' due to age (%dms).\n", escape_essid(network->ssid, network->ssid_len), MAC_ARG(network->bssid), - (jiffies - - network->last_scanned) / (HZ / - 100)); + jiffies_to_msecs(jiffies - + network-> + last_scanned)); } spin_unlock_irqrestore(&ieee->lock, flags); @@ -258,6 +284,7 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee, }; int i, key, key_provided, len; struct ieee80211_crypt_data **crypt; + int host_crypto = ieee->host_encrypt || ieee->host_decrypt; IEEE80211_DEBUG_WX("SET_ENCODE\n"); @@ -298,15 +325,17 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee, if (i == WEP_KEYS) { sec.enabled = 0; + sec.encrypt = 0; sec.level = SEC_LEVEL_0; - sec.flags |= SEC_ENABLED | SEC_LEVEL; + sec.flags |= SEC_ENABLED | SEC_LEVEL | SEC_ENCRYPT; } goto done; } sec.enabled = 1; - sec.flags |= SEC_ENABLED; + sec.encrypt = 1; + sec.flags |= SEC_ENABLED | SEC_ENCRYPT; if (*crypt != NULL && (*crypt)->ops != NULL && strcmp((*crypt)->ops->name, "WEP") != 0) { @@ -315,7 +344,7 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee, ieee80211_crypt_delayed_deinit(ieee, crypt); } - if (*crypt == NULL) { + if (*crypt == NULL && host_crypto) { struct ieee80211_crypt_data *new_crypt; /* take WEP into use */ @@ -355,49 +384,56 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee, key, escape_essid(sec.keys[key], len), erq->length, len); sec.key_sizes[key] = len; - (*crypt)->ops->set_key(sec.keys[key], len, NULL, - (*crypt)->priv); + if (*crypt) + (*crypt)->ops->set_key(sec.keys[key], len, NULL, + (*crypt)->priv); sec.flags |= (1 << key); /* This ensures a key will be activated if no key is * explicitely set */ if (key == sec.active_key) sec.flags |= SEC_ACTIVE_KEY; + } else { - len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN, - NULL, (*crypt)->priv); - if (len == 0) { - /* Set a default key of all 0 */ - IEEE80211_DEBUG_WX("Setting key %d to all zero.\n", - key); - memset(sec.keys[key], 0, 13); - (*crypt)->ops->set_key(sec.keys[key], 13, NULL, - (*crypt)->priv); - sec.key_sizes[key] = 13; - sec.flags |= (1 << key); + if (host_crypto) { + len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN, + NULL, (*crypt)->priv); + if (len == 0) { + /* Set a default key of all 0 */ + IEEE80211_DEBUG_WX("Setting key %d to all " + "zero.\n", key); + memset(sec.keys[key], 0, 13); + (*crypt)->ops->set_key(sec.keys[key], 13, NULL, + (*crypt)->priv); + sec.key_sizes[key] = 13; + sec.flags |= (1 << key); + } } - /* No key data - just set the default TX key index */ if (key_provided) { - IEEE80211_DEBUG_WX - ("Setting key %d to default Tx key.\n", key); + IEEE80211_DEBUG_WX("Setting key %d to default Tx " + "key.\n", key); ieee->tx_keyidx = key; sec.active_key = key; sec.flags |= SEC_ACTIVE_KEY; } } - - done: - ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED); - sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN : WLAN_AUTH_SHARED_KEY; - sec.flags |= SEC_AUTH_MODE; - IEEE80211_DEBUG_WX("Auth: %s\n", sec.auth_mode == WLAN_AUTH_OPEN ? - "OPEN" : "SHARED KEY"); + if (erq->flags & (IW_ENCODE_OPEN | IW_ENCODE_RESTRICTED)) { + ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED); + sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN : + WLAN_AUTH_SHARED_KEY; + sec.flags |= SEC_AUTH_MODE; + IEEE80211_DEBUG_WX("Auth: %s\n", + sec.auth_mode == WLAN_AUTH_OPEN ? + "OPEN" : "SHARED KEY"); + } /* For now we just support WEP, so only set that security level... * TODO: When WPA is added this is one place that needs to change */ sec.flags |= SEC_LEVEL; sec.level = SEC_LEVEL_1; /* 40 and 104 bit WEP */ + sec.encode_alg[key] = SEC_ALG_WEP; + done: if (ieee->set_security) ieee->set_security(dev, &sec); @@ -422,6 +458,7 @@ int ieee80211_wx_get_encode(struct ieee80211_device *ieee, struct iw_point *erq = &(wrqu->encoding); int len, key; struct ieee80211_crypt_data *crypt; + struct ieee80211_security *sec = &ieee->sec; IEEE80211_DEBUG_WX("GET_ENCODE\n"); @@ -436,23 +473,16 @@ int ieee80211_wx_get_encode(struct ieee80211_device *ieee, crypt = ieee->crypt[key]; erq->flags = key + 1; - if (crypt == NULL || crypt->ops == NULL) { + if (!sec->enabled) { erq->length = 0; erq->flags |= IW_ENCODE_DISABLED; return 0; } - if (strcmp(crypt->ops->name, "WEP") != 0) { - /* only WEP is supported with wireless extensions, so just - * report that encryption is used */ - erq->length = 0; - erq->flags |= IW_ENCODE_ENABLED; - return 0; - } + len = sec->key_sizes[key]; + memcpy(keybuf, sec->keys[key], len); - len = crypt->ops->get_key(keybuf, WEP_KEY_LEN, NULL, crypt->priv); erq->length = (len >= 0 ? len : 0); - erq->flags |= IW_ENCODE_ENABLED; if (ieee->open_wep) @@ -463,6 +493,242 @@ int ieee80211_wx_get_encode(struct ieee80211_device *ieee, return 0; } +int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + struct net_device *dev = ieee->dev; + struct iw_point *encoding = &wrqu->encoding; + struct iw_encode_ext *ext = (struct iw_encode_ext *)extra; + int i, idx, ret = 0; + int group_key = 0; + const char *alg, *module; + struct ieee80211_crypto_ops *ops; + struct ieee80211_crypt_data **crypt; + + struct ieee80211_security sec = { + .flags = 0, + }; + + idx = encoding->flags & IW_ENCODE_INDEX; + if (idx) { + if (idx < 1 || idx > WEP_KEYS) + return -EINVAL; + idx--; + } else + idx = ieee->tx_keyidx; + + if (ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) { + crypt = &ieee->crypt[idx]; + group_key = 1; + } else { + /* some Cisco APs use idx>0 for unicast in dynamic WEP */ + if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP) + return -EINVAL; + if (ieee->iw_mode == IW_MODE_INFRA) + crypt = &ieee->crypt[idx]; + else + return -EINVAL; + } + + sec.flags |= SEC_ENABLED | SEC_ENCRYPT; + if ((encoding->flags & IW_ENCODE_DISABLED) || + ext->alg == IW_ENCODE_ALG_NONE) { + if (*crypt) + ieee80211_crypt_delayed_deinit(ieee, crypt); + + for (i = 0; i < WEP_KEYS; i++) + if (ieee->crypt[i] != NULL) + break; + + if (i == WEP_KEYS) { + sec.enabled = 0; + sec.encrypt = 0; + sec.level = SEC_LEVEL_0; + sec.flags |= SEC_LEVEL; + } + goto done; + } + + sec.enabled = 1; + sec.encrypt = 1; + + if (group_key ? !ieee->host_mc_decrypt : + !(ieee->host_encrypt || ieee->host_decrypt || + ieee->host_encrypt_msdu)) + goto skip_host_crypt; + + switch (ext->alg) { + case IW_ENCODE_ALG_WEP: + alg = "WEP"; + module = "ieee80211_crypt_wep"; + break; + case IW_ENCODE_ALG_TKIP: + alg = "TKIP"; + module = "ieee80211_crypt_tkip"; + break; + case IW_ENCODE_ALG_CCMP: + alg = "CCMP"; + module = "ieee80211_crypt_ccmp"; + break; + default: + IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n", + dev->name, ext->alg); + ret = -EINVAL; + goto done; + } + + ops = ieee80211_get_crypto_ops(alg); + if (ops == NULL) { + request_module(module); + ops = ieee80211_get_crypto_ops(alg); + } + if (ops == NULL) { + IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n", + dev->name, ext->alg); + ret = -EINVAL; + goto done; + } + + if (*crypt == NULL || (*crypt)->ops != ops) { + struct ieee80211_crypt_data *new_crypt; + + ieee80211_crypt_delayed_deinit(ieee, crypt); + + new_crypt = (struct ieee80211_crypt_data *) + kmalloc(sizeof(*new_crypt), GFP_KERNEL); + if (new_crypt == NULL) { + ret = -ENOMEM; + goto done; + } + memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data)); + new_crypt->ops = ops; + if (new_crypt->ops && try_module_get(new_crypt->ops->owner)) + new_crypt->priv = new_crypt->ops->init(idx); + if (new_crypt->priv == NULL) { + kfree(new_crypt); + ret = -EINVAL; + goto done; + } + *crypt = new_crypt; + } + + if (ext->key_len > 0 && (*crypt)->ops->set_key && + (*crypt)->ops->set_key(ext->key, ext->key_len, ext->rx_seq, + (*crypt)->priv) < 0) { + IEEE80211_DEBUG_WX("%s: key setting failed\n", dev->name); + ret = -EINVAL; + goto done; + } + + skip_host_crypt: + if (ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY) { + ieee->tx_keyidx = idx; + sec.active_key = idx; + sec.flags |= SEC_ACTIVE_KEY; + } + + if (ext->alg != IW_ENCODE_ALG_NONE) { + memcpy(sec.keys[idx], ext->key, ext->key_len); + sec.key_sizes[idx] = ext->key_len; + sec.flags |= (1 << idx); + if (ext->alg == IW_ENCODE_ALG_WEP) { + sec.encode_alg[idx] = SEC_ALG_WEP; + sec.flags |= SEC_LEVEL; + sec.level = SEC_LEVEL_1; + } else if (ext->alg == IW_ENCODE_ALG_TKIP) { + sec.encode_alg[idx] = SEC_ALG_TKIP; + sec.flags |= SEC_LEVEL; + sec.level = SEC_LEVEL_2; + } else if (ext->alg == IW_ENCODE_ALG_CCMP) { + sec.encode_alg[idx] = SEC_ALG_CCMP; + sec.flags |= SEC_LEVEL; + sec.level = SEC_LEVEL_3; + } + /* Don't set sec level for group keys. */ + if (group_key) + sec.flags &= ~SEC_LEVEL; + } + done: + if (ieee->set_security) + ieee->set_security(ieee->dev, &sec); + + /* + * Do not reset port if card is in Managed mode since resetting will + * generate new IEEE 802.11 authentication which may end up in looping + * with IEEE 802.1X. If your hardware requires a reset after WEP + * configuration (for example... Prism2), implement the reset_port in + * the callbacks structures used to initialize the 802.11 stack. + */ + if (ieee->reset_on_keychange && + ieee->iw_mode != IW_MODE_INFRA && + ieee->reset_port && ieee->reset_port(dev)) { + IEEE80211_DEBUG_WX("%s: reset_port failed\n", dev->name); + return -EINVAL; + } + + return ret; +} + +int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + struct iw_point *encoding = &wrqu->encoding; + struct iw_encode_ext *ext = (struct iw_encode_ext *)extra; + struct ieee80211_security *sec = &ieee->sec; + int idx, max_key_len; + + max_key_len = encoding->length - sizeof(*ext); + if (max_key_len < 0) + return -EINVAL; + + idx = encoding->flags & IW_ENCODE_INDEX; + if (idx) { + if (idx < 1 || idx > WEP_KEYS) + return -EINVAL; + idx--; + } else + idx = ieee->tx_keyidx; + + if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY && + ext->alg != IW_ENCODE_ALG_WEP) + if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) + return -EINVAL; + + encoding->flags = idx + 1; + memset(ext, 0, sizeof(*ext)); + + if (!sec->enabled) { + ext->alg = IW_ENCODE_ALG_NONE; + ext->key_len = 0; + encoding->flags |= IW_ENCODE_DISABLED; + } else { + if (sec->encode_alg[idx] == SEC_ALG_WEP) + ext->alg = IW_ENCODE_ALG_WEP; + else if (sec->encode_alg[idx] == SEC_ALG_TKIP) + ext->alg = IW_ENCODE_ALG_TKIP; + else if (sec->encode_alg[idx] == SEC_ALG_CCMP) + ext->alg = IW_ENCODE_ALG_CCMP; + else + return -EINVAL; + + ext->key_len = sec->key_sizes[idx]; + memcpy(ext->key, sec->keys[idx], ext->key_len); + encoding->flags |= IW_ENCODE_ENABLED; + if (ext->key_len && + (ext->alg == IW_ENCODE_ALG_TKIP || + ext->alg == IW_ENCODE_ALG_CCMP)) + ext->ext_flags |= IW_ENCODE_EXT_TX_SEQ_VALID; + + } + + return 0; +} + +EXPORT_SYMBOL(ieee80211_wx_set_encodeext); +EXPORT_SYMBOL(ieee80211_wx_get_encodeext); + EXPORT_SYMBOL(ieee80211_wx_get_scan); EXPORT_SYMBOL(ieee80211_wx_set_encode); EXPORT_SYMBOL(ieee80211_wx_get_encode); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a9d84f93442..eaa150c33b0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -147,8 +147,7 @@ void inet_sock_destruct(struct sock *sk) BUG_TRAP(!sk->sk_wmem_queued); BUG_TRAP(!sk->sk_forward_alloc); - if (inet->opt) - kfree(inet->opt); + kfree(inet->opt); dst_release(sk->sk_dst_cache); sk_refcnt_debug_dec(sk); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 74f2207e131..4ec4b2ca6ab 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -715,6 +715,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { + u32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); @@ -728,7 +729,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == - (ifa->ifa_local|~ifa->ifa_mask))) { + (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e61bc7177eb..2267c1fad87 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -266,8 +266,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) if (tb) err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); } - if (rta.rta_mx) - kfree(rta.rta_mx); + kfree(rta.rta_mx); } rtnl_unlock(); return err; @@ -591,7 +590,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, break; case NETDEV_DOWN: fib_del_ifaddr(ifa); - if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. Disable IP. */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0093ea08c7f..66247f38b37 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2404,7 +2404,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); list_for_each_entry_rcu(fa, &li->falh, fa_list) { - const struct fib_info *fi = rcu_dereference(fa->fa_info); + const struct fib_info *fi = fa->fa_info; unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); if (fa->fa_type == RTN_BROADCAST diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 90dca711ac9..e3eceecd049 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) @@ -1108,12 +1108,9 @@ void __init icmp_init(struct net_proto_family *ops) struct inet_sock *inet; int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { int err; - if (!cpu_possible(i)) - continue; - err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, &per_cpu(__icmp_socket, i)); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8b6d3939e1e..c04607b4921 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; @@ -1908,8 +1917,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); goto done; } - } else + } else { newpsl = NULL; + (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, + msf->imsf_fmode, 0, NULL, 0); + } psl = pmc->sflist; if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b..3fe021f1a56 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; do { - rover++; - if (rover > high) - rover = low; head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 71f3c7350c6..39061ed53cf 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -724,12 +724,6 @@ done: return skb->len; } -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - static __inline__ int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) goto err_inval; } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); + inet_diag_dump, NULL); } else { return inet_diag_get_exact(skb, nlh); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c..8ce0ce2ee48 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53..4e9c74b54b1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bce4e875193..dbe12da8d8b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -510,8 +510,7 @@ static int ip_options_get_finish(struct ip_options **optp, kfree(opt); return -EINVAL; } - if (*optp) - kfree(*optp); + kfree(*optp); *optp = opt; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1ad5202e556..11c2f68254f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -275,7 +275,8 @@ int ip_output(struct sk_buff *skb) { IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size) + if (skb->len > dst_mtu(skb->dst) && + !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) return ip_fragment(skb, ip_finish_output); else return ip_finish_output(skb); @@ -352,7 +353,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->tso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -688,6 +690,60 @@ csum_page(struct page *page, int offset, int copy) return csum; } +inline int ip_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags) +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP fragmentation offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb->nh.raw = skb->data; + + /* initialize protocol header pointer */ + skb->h.raw = skb->data + fragheaderlen; + + skb->ip_summed = CHECKSUM_HW; + skb->csum = 0; + sk->sk_sndmsg_off = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + /* specify the length of each IP datagram fragment*/ + skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UFO , + * so follow normal path + */ + kfree_skb(skb); + return err; +} + /* * ip_append_data() and ip_append_page() can make one large IP datagram * from many pieces of data. Each pieces will be holded on the socket @@ -777,6 +833,15 @@ int ip_append_data(struct sock *sk, csummode = CHECKSUM_HW; inet->cork.length += length; + if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + + if(ip_ufo_append_data(sk, getfrag, from, length, hh_len, + fragheaderlen, transhdrlen, mtu, flags)) + goto error; + + return 0; + } /* So, what's going on in the loop below? * @@ -1008,14 +1073,23 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return -EINVAL; inet->cork.length += size; + if ((sk->sk_protocol == IPPROTO_UDP) && + (rt->u.dst.dev->features & NETIF_F_UFO)) + skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + while (size > 0) { int i; - /* Check if the remaining data fits into current packet. */ - len = mtu - skb->len; - if (len < size) - len = maxfraglen - skb->len; + if (skb_shinfo(skb)->ufo_size) + len = size; + else { + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + } if (len <= 0) { struct sk_buff *skb_prev; char *data; @@ -1023,10 +1097,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, int alloclen; skb_prev = skb; - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; - else - fraggap = 0; + fraggap = skb_prev->len - maxfraglen; alloclen = fragheaderlen + hh_len + fraggap + 15; skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); @@ -1192,10 +1263,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1219,10 +1288,8 @@ void ip_flush_pending_frames(struct sock *sk) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2f0b47da5b3..4f2d8725730 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -202,8 +202,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s if (ra->sk == sk) { if (on) { write_unlock_bh(&ip_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; @@ -446,8 +445,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, #endif } opt = xchg(&inet->opt, opt); - if (opt) - kfree(opt); + kfree(opt); break; } case IP_PKTINFO: @@ -828,10 +826,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_mc_msfilter(sk, msf, ifindex); mc_msf_out: - if (msf) - kfree(msf); - if (gsf) - kfree(gsf); + kfree(msf); + kfree(gsf); break; } case IP_ROUTER_ALERT: diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index fc6f95aaa96..d7eb680101c 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -110,8 +110,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) return 0; out: - if (inc->timeout_table) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); return ret; } @@ -136,8 +135,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) list_del(&inc->a_list); - if (inc->timeout_table != NULL) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); } diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 981cc3244ef..1a0843cd58a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1009,11 +1009,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, if (sysctl_ip_vs_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); - } else { - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); return NF_DROP; } diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c index bd7d75b6abe..d34a9fa608e 100644 --- a/net/ipv4/multipath_wrandom.c +++ b/net/ipv4/multipath_wrandom.c @@ -207,16 +207,12 @@ static void wrandom_select_route(const struct flowi *flp, decision = mpc->rt; last_power = mpc->power; - if (last_mpc) - kfree(last_mpc); - + kfree(last_mpc); last_mpc = mpc; } - if (last_mpc) { - /* concurrent __multipath_flush may lead to !last_mpc */ - kfree(last_mpc); - } + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); decision->u.dst.__use++; *rp = decision; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d917e4ce1d..9d3c8b5f327 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,6 +5,20 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_CONNTRACK_IPV4 + tristate "IPv4 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv4 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + # connection tracking, helpers and protocols config IP_NF_CONNTRACK tristate "Connection tracking (required for masq/NAT)" @@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE tristate "Packet type match support" depends on IP_NF_IPTABLES help - Packet type matching allows you to match a packet by - its "class", eg. BROADCAST, MULTICAST, ... + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... Typical usage: iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG @@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS config IP_NF_MATCH_HELPER tristate "Helper match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp @@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER config IP_NF_MATCH_STATE tristate "Connection state match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE config IP_NF_MATCH_CONNTRACK tristate "Connection tracking match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help This is a general conntrack match module, a superset of the state match. @@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' - depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + depends on IP_NF_MANGLE + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -782,7 +803,7 @@ config IP_NF_RAW config IP_NF_TARGET_NOTRACK tristate 'NOTRACK target support' depends on IP_NF_RAW - depends on IP_NF_CONNTRACK + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index dab4b58dd31..058c48e258f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o + +# objects for l3 independent conntrack +nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a7969286e6e..3c2e9639bba 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -347,58 +347,106 @@ unsigned int arpt_do_table(struct sk_buff **pskb, return verdict; } -static inline void *find_inlist_lock_noload(struct list_head *head, - const char *name, - int *error, - struct semaphore *mutex) +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use try_then_request_module(). + */ + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ +static inline struct arpt_table *find_table_lock(const char *name) { - void *ret; + struct arpt_table *t; - *error = down_interruptible(mutex); - if (*error != 0) - return NULL; + if (down_interruptible(&arpt_mutex) != 0) + return ERR_PTR(-EINTR); - ret = list_named_find(head, name); - if (!ret) { - *error = -ENOENT; - up(mutex); - } - return ret; + list_for_each_entry(t, &arpt_tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&arpt_mutex); + return NULL; } -#ifndef CONFIG_KMOD -#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) -#else -static void * -find_inlist_lock(struct list_head *head, - const char *name, - const char *prefix, - int *error, - struct semaphore *mutex) + +/* Find target, grabs ref. Returns ERR_PTR() on error. */ +static inline struct arpt_target *find_target(const char *name, u8 revision) { - void *ret; + struct arpt_target *t; + int err = 0; - ret = find_inlist_lock_noload(head, name, error, mutex); - if (!ret) { - duprintf("find_inlist: loading `%s%s'.\n", prefix, name); - request_module("%s%s", prefix, name); - ret = find_inlist_lock_noload(head, name, error, mutex); + if (down_interruptible(&arpt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &arpt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + up(&arpt_mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } } + up(&arpt_mutex); + return ERR_PTR(err); +} - return ret; +struct arpt_target *arpt_find_target(const char *name, u8 revision) +{ + struct arpt_target *target; + + target = try_then_request_module(find_target(name, revision), + "arpt_%s", name); + if (IS_ERR(target) || !target) + return NULL; + return target; } -#endif -static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex) +static int target_revfn(const char *name, u8 revision, int *bestp) { - return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex); + struct arpt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &arpt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev =1; + } + } + return have_rev; } -static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex) +/* Returns true or false (if no such extension at all) */ +static inline int find_revision(const char *name, u8 revision, + int (*revfn)(const char *, u8, int *), + int *err) { - return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex); + int have_rev, best = -1; + + if (down_interruptible(&arpt_mutex) != 0) { + *err = -EINTR; + return 1; + } + have_rev = revfn(name, revision, &best); + up(&arpt_mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; } + /* All zeroes == unconditional rule. */ static inline int unconditional(const struct arpt_arp *arp) { @@ -544,17 +592,15 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i } t = arpt_get_target(e); - target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex); - if (!target) { + target = try_then_request_module(find_target(t->u.user.name, + t->u.user.revision), + "arpt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { duprintf("check_entry: `%s' not found\n", t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; goto out; } - if (!try_module_get((target->me))) { - ret = -ENOENT; - goto out_unlock; - } t->u.kernel.target = target; - up(&arpt_mutex); if (t->u.kernel.target == &arpt_standard_target) { if (!standard_check(t, size)) { @@ -576,8 +622,6 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i (*i)++; return 0; -out_unlock: - up(&arpt_mutex); out: return ret; } @@ -846,8 +890,8 @@ static int get_entries(const struct arpt_get_entries *entries, int ret; struct arpt_table *t; - t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex); - if (t) { + t = find_table_lock(entries->name); + if (t || !IS_ERR(t)) { duprintf("t->private->number = %u\n", t->private->number); if (entries->size == t->private->size) @@ -859,10 +903,10 @@ static int get_entries(const struct arpt_get_entries *entries, entries->size); ret = -EINVAL; } + module_put(t->me); up(&arpt_mutex); } else - duprintf("get_entries: Can't find %s!\n", - entries->name); + ret = t ? PTR_ERR(t) : -ENOENT; return ret; } @@ -913,22 +957,19 @@ static int do_replace(void __user *user, unsigned int len) duprintf("arp_tables: Translated table\n"); - t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); - if (!t) + t = try_then_request_module(find_table_lock(tmp.name), + "arptable_%s", tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; + } /* You lied! */ if (tmp.valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", tmp.valid_hooks, t->valid_hooks); ret = -EINVAL; - goto free_newinfo_counters_untrans_unlock; - } - - /* Get a reference in advance, we're not allowed fail later */ - if (!try_module_get(t->me)) { - ret = -EBUSY; - goto free_newinfo_counters_untrans_unlock; + goto put_module; } oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); @@ -959,7 +1000,6 @@ static int do_replace(void __user *user, unsigned int len) put_module: module_put(t->me); - free_newinfo_counters_untrans_unlock: up(&arpt_mutex); free_newinfo_counters_untrans: ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); @@ -989,7 +1029,7 @@ static int do_add_counters(void __user *user, unsigned int len) unsigned int i; struct arpt_counters_info tmp, *paddc; struct arpt_table *t; - int ret; + int ret = 0; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1006,9 +1046,11 @@ static int do_add_counters(void __user *user, unsigned int len) goto free; } - t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); - if (!t) + t = find_table_lock(tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; goto free; + } write_lock_bh(&t->lock); if (t->private->number != paddc->num_counters) { @@ -1025,6 +1067,7 @@ static int do_add_counters(void __user *user, unsigned int len) unlock_up_free: write_unlock_bh(&t->lock); up(&arpt_mutex); + module_put(t->me); free: vfree(paddc); @@ -1079,8 +1122,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; } name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; - t = arpt_find_table_lock(name, &ret, &arpt_mutex); - if (t) { + + t = try_then_request_module(find_table_lock(name), + "arptable_%s", name); + if (t && !IS_ERR(t)) { struct arpt_getinfo info; info.valid_hooks = t->valid_hooks; @@ -1096,9 +1141,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len ret = -EFAULT; else ret = 0; - up(&arpt_mutex); - } + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; } break; @@ -1119,6 +1165,24 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; } + case ARPT_SO_GET_REVISION_TARGET: { + struct arpt_get_revision rev; + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + + try_then_request_module(find_revision(rev.name, rev.revision, + target_revfn, &ret), + "arpt_%s", rev.name); + break; + } + default: duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; @@ -1136,12 +1200,9 @@ int arpt_register_target(struct arpt_target *target) if (ret != 0) return ret; - if (!list_named_insert(&arpt_target, target)) { - duprintf("arpt_register_target: `%s' already in list!\n", - target->name); - ret = -EINVAL; - } + list_add(&target->list, &arpt_target); up(&arpt_mutex); + return ret; } diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 07a80b56e8d..422ab68ee7f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -50,7 +50,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/listhelp.h> -#define IP_CONNTRACK_VERSION "2.3" +#define IP_CONNTRACK_VERSION "2.4" #if 0 #define DEBUGP printk @@ -148,16 +148,20 @@ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; -static u_int32_t -hash_conntrack(const struct ip_conntrack_tuple *tuple) +static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) { -#if 0 - dump_tuple(tuple); -#endif return (jhash_3words(tuple->src.ip, (tuple->dst.ip ^ tuple->dst.protonum), (tuple->src.u.all | (tuple->dst.u.all << 16)), - ip_conntrack_hash_rnd) % ip_conntrack_htable_size); + rnd) % size); +} + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, ip_conntrack_htable_size, + ip_conntrack_hash_rnd); } int @@ -1341,14 +1345,13 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } -static void free_conntrack_hash(void) +static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { - if (ip_conntrack_vmalloc) - vfree(ip_conntrack_hash); + if (vmalloced) + vfree(hash); else - free_pages((unsigned long)ip_conntrack_hash, - get_order(sizeof(struct list_head) - * ip_conntrack_htable_size)); + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); } void ip_conntrack_flush() @@ -1378,12 +1381,83 @@ void ip_conntrack_cleanup(void) ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); nf_unregister_sockopt(&so_getorigdst); } -static int hashsize; -module_param(hashsize, int, 0400); +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct ip_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!ip_conntrack_htable_size) + return param_set_int(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehash for the new table anyway, so we also can + * use a new random seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; i++) { + while (!list_empty(&ip_conntrack_hash[i])) { + h = list_entry(ip_conntrack_hash[i].next, + struct ip_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = ip_conntrack_htable_size; + old_vmalloced = ip_conntrack_vmalloc; + old_hash = ip_conntrack_hash; + + ip_conntrack_htable_size = hashsize; + ip_conntrack_vmalloc = vmalloced; + ip_conntrack_hash = hash; + ip_conntrack_hash_rnd = rnd; + write_unlock_bh(&ip_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &ip_conntrack_htable_size, 0600); int __init ip_conntrack_init(void) { @@ -1392,9 +1466,7 @@ int __init ip_conntrack_init(void) /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ - if (hashsize) { - ip_conntrack_htable_size = hashsize; - } else { + if (!ip_conntrack_htable_size) { ip_conntrack_htable_size = (((num_physpages << PAGE_SHIFT) / 16384) / sizeof(struct list_head)); @@ -1416,20 +1488,8 @@ int __init ip_conntrack_init(void) return ret; } - /* AK: the hash table is twice as big than needed because it - uses list_head. it would be much nicer to caches to use a - single pointer list head here. */ - ip_conntrack_vmalloc = 0; - ip_conntrack_hash - =(void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - *ip_conntrack_htable_size)); - if (!ip_conntrack_hash) { - ip_conntrack_vmalloc = 1; - printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); - ip_conntrack_hash = vmalloc(sizeof(struct list_head) - * ip_conntrack_htable_size); - } + ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, + &ip_conntrack_vmalloc); if (!ip_conntrack_hash) { printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); goto err_unreg_sockopt; @@ -1461,9 +1521,6 @@ int __init ip_conntrack_init(void) ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; write_unlock_bh(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) - INIT_LIST_HEAD(&ip_conntrack_hash[i]); - /* For use by ipt_REJECT */ ip_ct_attach = ip_conntrack_attach; @@ -1478,7 +1535,8 @@ int __init ip_conntrack_init(void) err_free_conntrack_slab: kmem_cache_destroy(ip_conntrack_cachep); err_free_hash: - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); err_unreg_sockopt: nf_unregister_sockopt(&so_getorigdst); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f8..59e12b02b22 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 926a6684643..4108a5e12b3 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master, exp_orig->expectfn = pptp_expectfn; exp_orig->flags = 0; - exp_orig->dir = IP_CT_DIR_ORIGINAL; - /* both expectations are identical apart from tuple */ memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); - exp_reply->dir = !exp_orig->dir; - if (ip_nat_pptp_hook_exp_gre) ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); else { diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f..2dea1db1440 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include <linux/moduleparam.h> #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 166e6069f12..de9f4464438 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -28,11 +28,8 @@ #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> -#include <linux/rtnetlink.h> #include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> @@ -58,14 +55,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct ip_conntrack_tuple *tuple) { struct ip_conntrack_protocol *proto; + int ret = 0; NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (proto && proto->tuple_to_nfattr) - return proto->tuple_to_nfattr(skb, tuple); + if (likely(proto && proto->tuple_to_nfattr)) { + ret = proto->tuple_to_nfattr(skb, tuple); + ip_conntrack_proto_put(proto); + } - return 0; + return ret; nfattr_failure: return -1; @@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nfattr *nest_count = NFA_NEST(skb, type); - u_int64_t tmp; + u_int32_t tmp; tmp = htonl(ct->counters[dir].packets); NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); @@ -467,7 +467,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -479,9 +479,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - - if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_IP_MAX, attr); if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; @@ -497,12 +495,9 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } -static const int cta_min_proto[CTA_PROTO_MAX] = { +static const size_t cta_min_proto[CTA_PROTO_MAX] = { [CTA_PROTO_NUM-1] = sizeof(u_int16_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), @@ -521,8 +516,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; @@ -539,9 +533,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, } return ret; - -nfattr_failure: - return -1; } static inline int @@ -555,8 +546,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, memset(tuple, 0, sizeof(*tuple)); - if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); if (!tb[CTA_TUPLE_IP-1]) return -EINVAL; @@ -583,13 +573,10 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; @@ -603,11 +590,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - goto nfattr_failure; + return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) @@ -626,11 +612,13 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -642,8 +630,10 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -665,9 +655,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #endif @@ -678,8 +665,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); if (!tb[CTA_HELP_NAME-1]) return -EINVAL; @@ -687,11 +673,16 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); return 0; - -nfattr_failure: - return -1; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -703,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -785,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -804,7 +801,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, ct = tuplehash_to_ctrack(h); err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) { ip_conntrack_put(ct); return -ENOMEM; @@ -815,7 +812,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, IPCTNL_MSG_CT_NEW, 1, ct); ip_conntrack_put(ct); if (err <= 0) - goto out; + goto free; err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); if (err < 0) @@ -824,10 +821,10 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("leaving\n"); return 0; +free: + kfree_skb(skb2); out: - if (skb2) - kfree_skb(skb2); - return -1; + return err; } static inline int @@ -957,8 +954,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; int err = 0; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); if (!proto) @@ -969,9 +965,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) ip_conntrack_proto_put(proto); return err; - -nfattr_failure: - return -ENOMEM; } static int @@ -1005,6 +998,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) return err; } +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("all done\n"); return 0; } @@ -1048,6 +1046,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], if (ct->helper) ip_conntrack_helper_put(ct->helper); +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("conntrack with id %u inserted\n", ct->id); return 0; @@ -1066,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1271,6 +1277,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1282,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1312,6 +1326,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (!exp) return -ENOENT; + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + err = -ENOMEM; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1322,21 +1344,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, 1, exp); if (err <= 0) - goto out; + goto free; ip_conntrack_expect_put(exp); - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto free; - - return err; + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +free: + kfree_skb(skb2); out: ip_conntrack_expect_put(exp); -free: - if (skb2) - kfree_skb(skb2); return err; } @@ -1349,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1392,7 +1412,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, ip_conntrack_expect_put(exp); } } - write_unlock(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); @@ -1478,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) @@ -1520,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { @@ -1559,6 +1575,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .cb = ctnl_exp_cb, }; +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + static int __init ctnetlink_init(void) { int ret; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 98f0015dd25..e4d6b268e8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include <linux/in.h> #include <linux/icmp.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <net/ip.h> #include <net/checksum.h> #include <linux/netfilter.h> @@ -151,13 +152,13 @@ icmp_error_message(struct sk_buff *skb, /* Not enough header? */ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); if (inside == NULL) - return NF_ACCEPT; + return -NF_ACCEPT; /* Ignore ICMP's containing fragments (shouldn't happen) */ if (inside->ip.frag_off & htons(IP_OFFSET)) { DEBUGP("icmp_error_track: fragment of proto %u\n", inside->ip.protocol); - return NF_ACCEPT; + return -NF_ACCEPT; } innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); @@ -166,7 +167,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Ordinarily, we'd expect the inverted tupleproto, but it's @@ -174,7 +175,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } ip_conntrack_proto_put(innerproto); @@ -190,7 +191,7 @@ icmp_error_message(struct sk_buff *skb, if (!h) { DEBUGP("icmp_error_track: no match\n"); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Reverse direction from that found */ if (DIRECTION(h) != IP_CT_DIR_REPLY) @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: @@ -296,7 +293,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], struct ip_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1]) + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) return -1; tuple->dst.u.icmp.type = @@ -304,7 +302,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], tuple->dst.u.icmp.code = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); tuple->src.u.icmp.id = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); return 0; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index d6701cafbcc..ee3b7d6c4d2 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -357,13 +357,24 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0) - goto nfattr_failure; + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!attr) + return 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; @@ -374,9 +385,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) write_unlock_bh(&tcp_lock); return 0; - -nfattr_failure: - return -1; } #endif @@ -813,6 +821,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525..d3c5a371f99 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index c5e3abd2467..762f4d93936 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum) * removed until we've grabbed the reference */ preempt_disable(); p = __ip_nat_proto_find(protonum); - if (p) { - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - } + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; preempt_enable(); return p; diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index 3cdd0684d30..e546203f566 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct, struct ip_conntrack_tuple t; struct ip_ct_pptp_master *ct_pptp_info; struct ip_nat_pptp *nat_pptp_info; + struct ip_nat_range range; ct_pptp_info = &master->help.ct_pptp_info; nat_pptp_info = &master->nat.help.nat_pptp_info; @@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct, DEBUGP("not found!\n"); } - ip_nat_follow_master(ct, exp); + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + if (exp->dir == IP_CT_DIR_ORIGINAL) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + if (exp->dir == IP_CT_DIR_REPLY) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); } /* outbound packets == from PNS to PAC */ @@ -213,9 +237,10 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, /* alter expectation for PNS->PAC direction */ invert_tuplepr(&inv_t, &expect_orig->tuple); - expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); + expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); + expect_orig->dir = IP_CT_DIR_ORIGINAL; inv_t.src.ip = reply_t->src.ip; inv_t.dst.ip = reply_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); @@ -233,6 +258,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); + expect_reply->dir = IP_CT_DIR_REPLY; inv_t.src.ip = orig_t->src.ip; inv_t.dst.ip = orig_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index 7c128540167..f7cad7cf1ae 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb, break; case GRE_VERSION_PPTP: DEBUGP("call_id -> 0x%04x\n", - ntohl(tuple->dst.u.gre.key)); - pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); + ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; break; default: DEBUGP("can't nat unknown GRE version\n"); diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 99bbef56f84..f0099a646a0 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) struct ip_nat_protocol ip_nat_unknown_protocol = { .name = "unknown", - .me = THIS_MODULE, + /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, .unique_tuple = unknown_unique_tuple, diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 93b2c5111bb..8acb7ed40b4 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1161,8 +1161,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (!snmp_object_decode(&ctx, obj)) { if (*obj) { - if ((*obj)->id) - kfree((*obj)->id); + kfree((*obj)->id); kfree(*obj); } kfree(obj); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9bcb398fbc1..45c52d8f4d9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,7 +29,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #define CLUSTERIP_VERSION "0.8" @@ -316,14 +316,14 @@ target(struct sk_buff **pskb, { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - u_int32_t hash; + u_int32_t *mark, hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - if (!ct) { + mark = nf_ct_get_mark((*pskb), &ctinfo); + if (mark == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -346,7 +346,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + *mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -363,7 +363,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, *mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 13463802133..8acac5a40a9 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -29,7 +29,7 @@ MODULE_LICENSE("GPL"); #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CONNMARK.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static unsigned int target(struct sk_buff **pskb, @@ -43,24 +43,24 @@ target(struct sk_buff **pskb, u_int32_t diff; u_int32_t nfmark; u_int32_t newmark; + u_int32_t ctinfo; + u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - if (ct) { + if (ctmark) { switch(markinfo->mode) { case IPT_CONNMARK_SET: - newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; + if (newmark != *ctmark) + *ctmark = newmark; break; case IPT_CONNMARK_SAVE: - newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); - if (ct->mark != newmark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (*ctmark != newmark) + *ctmark = newmark; break; case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; - diff = (ct->mark ^ nfmark) & markinfo->mask; + diff = (*ctmark ^ nfmark) & markinfo->mask; if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; break; @@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = { static int __init init(void) { + need_ip_conntrack(); return ipt_register_target(&ipt_connmark_reg); } diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c index a4bb9b3bc29..e3c69d072c6 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -5,7 +5,7 @@ #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static unsigned int target(struct sk_buff **pskb, @@ -23,7 +23,7 @@ target(struct sk_buff **pskb, If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + nf_ct_untrack(*pskb); (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index f5909a4c3fc..e19c2a52d00 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -48,7 +48,7 @@ static int checkentry(const char *tablename, const struct ipt_ip *ip, unsigned int hook_mask) { if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) { - printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.", + printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n", matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info))); return 0; } diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index df4a42c6da2..d68a048b717 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -10,7 +10,7 @@ */ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_connbytes.h> @@ -46,60 +46,59 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connbytes_info *sinfo = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct; u_int64_t what = 0; /* initialize to make gcc happy */ + const struct ip_conntrack_counter *counters; - if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + if (!(counters = nf_ct_get_counters(skb))) return 0; /* no match */ switch (sinfo->what) { case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_REPLY].packets; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; - what += ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; + what += counters[IP_CT_DIR_REPLY].packets; break; } break; case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_REPLY].bytes; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; - what += ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; + what += counters[IP_CT_DIR_REPLY].bytes; break; } break; case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, - ct->counters[IP_CT_DIR_ORIGINAL].packets); + what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, + counters[IP_CT_DIR_ORIGINAL].packets); break; case IPT_CONNBYTES_DIR_REPLY: - what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, - ct->counters[IP_CT_DIR_REPLY].packets); + what = div64_64(counters[IP_CT_DIR_REPLY].bytes, + counters[IP_CT_DIR_REPLY].packets); break; case IPT_CONNBYTES_DIR_BOTH: { u_int64_t bytes; u_int64_t pkts; - bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + - ct->counters[IP_CT_DIR_REPLY].bytes; - pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ - ct->counters[IP_CT_DIR_REPLY].packets; + bytes = counters[IP_CT_DIR_ORIGINAL].bytes + + counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets+ + counters[IP_CT_DIR_REPLY].packets; /* FIXME_THEORETICAL: what to do if sum * overflows ? */ diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index bf8de47ce00..5306ef293b9 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -28,7 +28,7 @@ MODULE_LICENSE("GPL"); #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_connmark.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static int match(const struct sk_buff *skb, @@ -39,12 +39,12 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connmark_info *info = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - if (!ct) + u_int32_t ctinfo; + const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); + if (!ctmark) return 0; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return (((*ctmark) & info->mask) == info->mark) ^ info->invert; } static int diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c index c1d22801b7c..c8d18705469 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -10,7 +10,14 @@ #include <linux/module.h> #include <linux/skbuff.h> + +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> +#else +#include <net/netfilter/nf_conntrack.h> +#endif + #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_conntrack.h> @@ -18,6 +25,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables connection tracking match module"); +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) + static int match(const struct sk_buff *skb, const struct net_device *in, @@ -102,6 +111,93 @@ match(const struct sk_buff *skb, return 1; } +#else /* CONFIG_IP_NF_CONNTRACK */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &nf_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +#endif /* CONFIG_NF_IP_CONNTRACK */ + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 3e7dd014de4..bf14e1c7798 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c @@ -13,9 +13,15 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#else +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#endif #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_helper.h> @@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module"); #define DEBUGP(format, args...) #endif +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) static int match(const struct sk_buff *skb, const struct net_device *in, @@ -73,6 +80,53 @@ out_unlock: return ret; } +#else /* CONFIG_IP_NF_CONNTRACK */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + read_lock_bh(&nf_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + read_unlock_bh(&nf_conntrack_lock); + return ret; +} +#endif + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index b1511b97ea5..4d7f16b70ce 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_state.h> @@ -30,9 +30,9 @@ match(const struct sk_buff *skb, enum ip_conntrack_info ctinfo; unsigned int statebit; - if (skb->nfct == &ip_conntrack_untracked.ct_general) + if (nf_ct_is_untracked(skb)) statebit = IPT_STATE_UNTRACKED; - else if (!ip_conntrack_get(skb, &ctinfo)) + else if (!nf_ct_get_ctinfo(skb, &ctinfo)) statebit = IPT_STATE_INVALID; else statebit = IPT_STATE_BIT(ctinfo); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 00000000000..8202c1c0afa --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -0,0 +1,571 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - move L3 protocol dependent part to this file. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add get_features() to support various size of conntrack + * structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[2], *ap; + ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), + sizeof(u_int32_t) * 2, _addrs); + if (ap == NULL) + return 0; + + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + + return 1; +} + +static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u3.ip = orig->dst.u3.ip; + tuple->dst.u3.ip = orig->src.u3.ip; + + return 1; +} + +static int ipv4_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.u3.ip), + NIPQUAD(tuple->dst.u3.ip)); +} + +static int ipv4_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns new sk_buff, or NULL */ +static struct sk_buff * +nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb_orphan(skb); + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (skb) + ip_send_check(skb->nh.iph); + + return skb; +} + +static int +ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return -NF_DROP; + } + + *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; + *protonum = (*pskb)->nh.iph->protocol; + + return NF_ACCEPT; +} + +int nat_module_is_loaded = 0; +static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple) +{ + if (nat_module_is_loaded) + return NF_CT_F_NAT; + + return NF_CT_F_BASIC; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(pskb); +} + +static unsigned int ipv4_conntrack_help(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, + (*pskb)->nh.raw - (*pskb)->data + + (*pskb)->nh.iph->ihl*4, + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = nf_ct_ipv4_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +static unsigned int ipv4_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv4_conntrack_defrag_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_in_ops = { + .hook = ipv4_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_local_out_ops = { + .hook = ipv4_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* helpers */ +static struct nf_hook_ops ipv4_conntrack_helper_out_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + +static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv4_conntrack_out_ops = { + .hook = ipv4_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +static struct nf_hook_ops ipv4_conntrack_local_in_ops = { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +#ifdef CONFIG_SYSCTL +/* From nf_conntrack_proto_icmp.c */ +extern unsigned long nf_ct_icmp_timeout; +static struct ctl_table_header *nf_ct_ipv4_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "nf_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + NF_CT_TUPLE_U_BLANK(&tuple); + tuple.src.u3.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.u3.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { + .l3proto = PF_INET, + .name = "ipv4", + .pkt_to_tuple = ipv4_pkt_to_tuple, + .invert_tuple = ipv4_invert_tuple, + .print_tuple = ipv4_print_tuple, + .print_conntrack = ipv4_print_conntrack, + .prepare = ipv4_prepare, + .get_features = ipv4_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp; +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + goto cleanup_nothing; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register tcp.\n"); + goto cleanup_sockopt; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register icmp.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register ipv4\n"); + goto cleanup_icmp; + } + + ret = nf_register_hook(&ipv4_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n"); + goto cleanup_ipv4; + } + ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv4_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local helper hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } + + ret = nf_register_hook(&ipv4_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv4_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + /* For use by REJECT target */ + ip_ct_attach = __nf_conntrack_attach; + + return ret; + + cleanup: + synchronize_net(); + ip_ct_attach = NULL; +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv4_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv4_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv4_conntrack_out_ops); + cleanup_helperoutops: + nf_unregister_hook(&ipv4_conntrack_helper_out_ops); + cleanup_helperinops: + nf_unregister_hook(&ipv4_conntrack_helper_in_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv4_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv4_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv4_conntrack_defrag_ops); + cleanup_ipv4: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + cleanup_icmp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 00000000000..7ddb5c08f7b --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,301 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/icmp.h> +#include <linux/seq_file.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> + +unsigned long nf_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, unsigned int dataoff) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct nf_conntrack_protocol *innerproto; + struct nf_conntrack_tuple_hash *h; + int dataoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return -NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_message: fragment of proto %u\n", + inside->ip.protocol); + return -NF_ACCEPT; + } + + innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, + inside->ip.protocol, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: ! get_tuple p=%u", + inside->ip.protocol); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = nf_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + /* Reverse direction from that found */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, + "nf_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmp = +{ + .list = { NULL, NULL }, + .l3proto = PF_INET, + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmp); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f7943ba1f43..a65e508fbd4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -90,9 +90,7 @@ fold_field(void *mib[], int offt) unsigned long res = 0; int i; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_cpu(i) { res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a958..9ac7a4f46bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2112,7 +2113,6 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 6d80e063c18..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -27,7 +27,7 @@ */ static int fast_convergence = 1; -static int max_increment = 32; +static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ static int low_utilization_threshold = 153; @@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..63cf7e54084 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..bf2e23086bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. @@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smooth things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; i<num_sacks; i++, sp++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? It is possible on ofo queue. */ @@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive @@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4372,6 +4471,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474..4d5021e1929 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; static int tcp_v4_get_port(struct sock *sk, unsigned short snum) @@ -825,8 +823,7 @@ out: */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - if (inet_rsk(req)->opt) - kfree(inet_rsk(req)->opt); + kfree(inet_rsk(req)->opt); } static inline void syn_flood_warning(struct sk_buff *skb) @@ -1113,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1219,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4..1b66a2ac432 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; @@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f..029c70dfb58 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. This works because @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf552..26d7486ee50 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c..e1880959614 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f..b7d296a8ac6 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } } - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0..2422a5f7195 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a970b4727ce..a16064ba0ca 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -35,6 +35,9 @@ * YOSHIFUJI Hideaki @USAGI : ARCnet support * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to * seq_file. + * YOSHIFUJI Hideaki @USAGI : improved source address + * selection; consider scope, + * status etc. */ #include <linux/config.h> @@ -75,7 +78,7 @@ #ifdef CONFIG_IPV6_PRIVACY #include <linux/random.h> #include <linux/crypto.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #endif #include <asm/uaccess.h> @@ -193,46 +196,51 @@ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; #endif const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; -int ipv6_addr_type(const struct in6_addr *addr) +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + +static inline unsigned ipv6_addr_scope2type(unsigned scope) +{ + switch(scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + +int __ipv6_addr_type(const struct in6_addr *addr) { - int type; u32 st; st = addr->s6_addr32[0]; - if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { - type = IPV6_ADDR_MULTICAST; - - switch((st & htonl(0x00FF0000))) { - case __constant_htonl(0x00010000): - type |= IPV6_ADDR_LOOPBACK; - break; - - case __constant_htonl(0x00020000): - type |= IPV6_ADDR_LINKLOCAL; - break; - - case __constant_htonl(0x00050000): - type |= IPV6_ADDR_SITELOCAL; - break; - }; - return type; - } - - type = IPV6_ADDR_UNICAST; - /* Consider all addresses with the first three bits different of - 000 and 111 as finished. + 000 and 111 as unicasts. */ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) - return type; - - if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) - return (IPV6_ADDR_LINKLOCAL | type); + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) - return (IPV6_ADDR_SITELOCAL | type); + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { @@ -240,24 +248,20 @@ int ipv6_addr_type(const struct in6_addr *addr) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) - return (IPV6_ADDR_LOOPBACK | type); + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ - return (IPV6_ADDR_COMPATv4 | type); + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) - return IPV6_ADDR_MAPPED; + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } - st &= htonl(0xFF000000); - if (st == 0) - return IPV6_ADDR_RESERVED; - st &= htonl(0xFE000000); - if (st == htonl(0x02000000)) - return IPV6_ADDR_RESERVED; /* for NSAP */ - if (st == htonl(0x04000000)) - return IPV6_ADDR_RESERVED; /* for IPX */ - return type; + return (IPV6_ADDR_RESERVED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } static void addrconf_del_timer(struct inet6_ifaddr *ifp) @@ -805,138 +809,276 @@ out: #endif /* - * Choose an appropriate source address - * should do: - * i) get an address with an appropriate scope - * ii) see if there is a specific route for the destination and use - * an address of the attached interface - * iii) don't use deprecated addresses + * Choose an appropriate source address (RFC3484) */ -static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +struct ipv6_saddr_score { + int addr_type; + unsigned int attrs; + int matchlen; + unsigned int scope; + unsigned int rule; +}; + +#define IPV6_SADDR_SCORE_LOCAL 0x0001 +#define IPV6_SADDR_SCORE_PREFERRED 0x0004 +#define IPV6_SADDR_SCORE_HOA 0x0008 +#define IPV6_SADDR_SCORE_OIF 0x0010 +#define IPV6_SADDR_SCORE_LABEL 0x0020 +#define IPV6_SADDR_SCORE_PRIVACY 0x0040 + +static int inline ipv6_saddr_preferred(int type) { - int pref; - pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2; -#ifdef CONFIG_IPV6_PRIVACY - pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; -#endif - return pref; + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + return 1; + return 0; } -#ifdef CONFIG_IPV6_PRIVACY -#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) -#else -#define IPV6_GET_SADDR_MAXSCORE(score) (score) -#endif +/* static matching label */ +static int inline ipv6_saddr_label(const struct in6_addr *addr, int type) +{ + /* + * prefix (longest match) label + * ----------------------------- + * ::1/128 0 + * ::/0 1 + * 2002::/16 2 + * ::/96 3 + * ::ffff:0:0/96 4 + */ + if (type & IPV6_ADDR_LOOPBACK) + return 0; + else if (type & IPV6_ADDR_COMPATv4) + return 3; + else if (type & IPV6_ADDR_MAPPED) + return 4; + else if (addr->s6_addr16[0] == htons(0x2002)) + return 2; + return 1; +} -int ipv6_dev_get_saddr(struct net_device *dev, +int ipv6_dev_get_saddr(struct net_device *daddr_dev, struct in6_addr *daddr, struct in6_addr *saddr) { - struct inet6_ifaddr *ifp = NULL; - struct inet6_ifaddr *match = NULL; - struct inet6_dev *idev; - int scope; - int err; - int hiscore = -1, score; + struct ipv6_saddr_score hiscore; + struct inet6_ifaddr *ifa_result = NULL; + int daddr_type = __ipv6_addr_type(daddr); + int daddr_scope = __ipv6_addr_src_scope(daddr_type); + u32 daddr_label = ipv6_saddr_label(daddr, daddr_type); + struct net_device *dev; - scope = ipv6_addr_scope(daddr); + memset(&hiscore, 0, sizeof(hiscore)); - /* - * known dev - * search dev and walk through dev addresses - */ + read_lock(&dev_base_lock); + read_lock(&addrconf_lock); - if (dev) { - if (dev->flags & IFF_LOOPBACK) - scope = IFA_HOST; + for (dev = dev_base; dev; dev=dev->next) { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + /* Rule 0: Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + */ + if ((daddr_type & IPV6_ADDR_MULTICAST || + daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + daddr_dev && dev != daddr_dev) + continue; - read_lock(&addrconf_lock); idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + if (!idev) + continue; - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + read_lock_bh(&idev->lock); + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct ipv6_saddr_score score; - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - read_unlock(&addrconf_lock); - goto out; - } + score.addr_type = __ipv6_addr_type(&ifa->addr); + + /* Rule 0: Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if (unlikely(score.addr_type == IPV6_ADDR_ANY || + score.addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ADDRCONF: unspecified / multicast address" + "assigned as unicast address on %s", + dev->name); + continue; + } + + score.attrs = 0; + score.matchlen = 0; + score.scope = 0; + score.rule = 0; + + if (ifa_result == NULL) { + /* record it if the first available entry */ + goto record_it; + } + + /* Rule 1: Prefer same address */ + if (hiscore.rule < 1) { + if (ipv6_addr_equal(&ifa_result->addr, daddr)) + hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL; + hiscore.rule++; + } + if (ipv6_addr_equal(&ifa->addr, daddr)) { + score.attrs |= IPV6_SADDR_SCORE_LOCAL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) { + score.rule = 1; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL) + continue; } - read_unlock_bh(&idev->lock); - } - read_unlock(&addrconf_lock); - } - if (scope == IFA_LINK) - goto out; + /* Rule 2: Prefer appropriate scope */ + if (hiscore.rule < 2) { + hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type); + hiscore.rule++; + } + score.scope = __ipv6_addr_src_scope(score.addr_type); + if (hiscore.scope < score.scope) { + if (hiscore.scope < daddr_scope) { + score.rule = 2; + goto record_it; + } else + continue; + } else if (score.scope < hiscore.scope) { + if (score.scope < daddr_scope) + continue; + else { + score.rule = 2; + goto record_it; + } + } - /* - * dev == NULL or search failed for specified dev - */ + /* Rule 3: Avoid deprecated address */ + if (hiscore.rule < 3) { + if (ipv6_saddr_preferred(hiscore.addr_type) || + !(ifa_result->flags & IFA_F_DEPRECATED)) + hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED; + hiscore.rule++; + } + if (ipv6_saddr_preferred(score.addr_type) || + !(ifa->flags & IFA_F_DEPRECATED)) { + score.attrs |= IPV6_SADDR_SCORE_PREFERRED; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) { + score.rule = 3; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED) + continue; + } - read_lock(&dev_base_lock); - read_lock(&addrconf_lock); - for (dev = dev_base; dev; dev=dev->next) { - idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + /* Rule 4: Prefer home address -- not implemented yet */ - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + /* Rule 5: Prefer outgoing interface */ + if (hiscore.rule < 5) { + if (daddr_dev == NULL || + daddr_dev == ifa_result->idev->dev) + hiscore.attrs |= IPV6_SADDR_SCORE_OIF; + hiscore.rule++; + } + if (daddr_dev == NULL || + daddr_dev == ifa->idev->dev) { + score.attrs |= IPV6_SADDR_SCORE_OIF; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) { + score.rule = 5; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_OIF) + continue; + } - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - goto out_unlock_base; - } + /* Rule 6: Prefer matching label */ + if (hiscore.rule < 6) { + if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label) + hiscore.attrs |= IPV6_SADDR_SCORE_LABEL; + hiscore.rule++; + } + if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) { + score.attrs |= IPV6_SADDR_SCORE_LABEL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) { + score.rule = 6; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL) + continue; } - read_unlock_bh(&idev->lock); + +#ifdef CONFIG_IPV6_PRIVACY + /* Rule 7: Prefer public address + * Note: prefer temprary address if use_tempaddr >= 2 + */ + if (hiscore.rule < 7) { + if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^ + (ifa_result->idev->cnf.use_tempaddr >= 2)) + hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY; + hiscore.rule++; + } + if ((!(ifa->flags & IFA_F_TEMPORARY)) ^ + (ifa->idev->cnf.use_tempaddr >= 2)) { + score.attrs |= IPV6_SADDR_SCORE_PRIVACY; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) { + score.rule = 7; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) + continue; + } +#endif + /* Rule 8: Use longest matching prefix */ + if (hiscore.rule < 8) { + hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); + hiscore.rule++; + } + score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); + if (score.matchlen > hiscore.matchlen) { + score.rule = 8; + goto record_it; + } +#if 0 + else if (score.matchlen < hiscore.matchlen) + continue; +#endif + + /* Final Rule: choose first available one */ + continue; +record_it: + if (ifa_result) + in6_ifa_put(ifa_result); + in6_ifa_hold(ifa); + ifa_result = ifa; + hiscore = score; } + read_unlock_bh(&idev->lock); } - -out_unlock_base: read_unlock(&addrconf_lock); read_unlock(&dev_base_lock); -out: - err = -EADDRNOTAVAIL; - if (match) { - ipv6_addr_copy(saddr, &match->addr); - err = 0; - in6_ifa_put(match); - } - - return err; + if (!ifa_result) + return -EADDRNOTAVAIL; + + ipv6_addr_copy(saddr, &ifa_result->addr); + in6_ifa_put(ifa_result); + return 0; } @@ -1217,12 +1359,8 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev) struct net_device *dev; struct scatterlist sg[2]; - sg[0].page = virt_to_page(idev->entropy); - sg[0].offset = offset_in_page(idev->entropy); - sg[0].length = 8; - sg[1].page = virt_to_page(idev->work_eui64); - sg[1].offset = offset_in_page(idev->work_eui64); - sg[1].length = 8; + sg_set_buf(&sg[0], idev->entropy, 8); + sg_set_buf(&sg[1], idev->work_eui64, 8); dev = idev->dev; @@ -2167,7 +2305,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Step 5: netlink notification of this interface */ idev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, idev); + inet6_ifinfo_notify(RTM_DELLINK, idev); /* Shot the device (if unregistered) */ @@ -2489,7 +2627,7 @@ static void addrconf_verify(unsigned long foo) for (i=0; i < IN6_ADDR_HSIZE; i++) { restart: - write_lock(&addrconf_hash_lock); + read_lock(&addrconf_hash_lock); for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { unsigned long age; #ifdef CONFIG_IPV6_PRIVACY @@ -2511,7 +2649,7 @@ restart: if (age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; } else if (age >= ifp->prefered_lft) { @@ -2530,7 +2668,7 @@ restart: if (deprecate) { in6_ifa_hold(ifp); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); @@ -2548,7 +2686,7 @@ restart: in6_ifa_hold(ifp); in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); @@ -2565,7 +2703,7 @@ restart: spin_unlock(&ifp->lock); } } - write_unlock(&addrconf_hash_lock); + read_unlock(&addrconf_hash_lock); } addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; @@ -2954,8 +3092,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nlmsg_failure: rtattr_failure: - if (array) - kfree(array); + kfree(array); skb_trim(skb, b - skb->data); return -1; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4f8795af2ed..c63b8ce0e1b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -699,12 +699,14 @@ static int __init inet6_init(void) /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ - (void) sock_register(&inet6_family_ops); + err = sock_register(&inet6_family_ops); + if (err) + goto out_unregister_raw_proto; /* Initialise ipv6 mibs */ err = init_ipv6_mibs(); if (err) - goto out_unregister_raw_proto; + goto out_unregister_sock; /* * ipngwg API draft makes clear that the correct semantics @@ -796,6 +798,8 @@ icmp_fail: ipv6_sysctl_unregister(); #endif cleanup_ipv6_mibs(); +out_unregister_sock: + sock_unregister(PF_INET6); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udp_proto: diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b7185fb3377..1bdf0fb8bf8 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) daddr = &skb->nh.ipv6h->daddr; /* Perform checksum. */ - if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) { - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb_checksum(skb, 0, skb->len, 0))) { + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0); + if (__skb_checksum_complete(skb)) { LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", NIP6(*saddr), NIP6(*daddr)); goto discard_it; @@ -700,10 +699,7 @@ int __init icmpv6_init(struct net_proto_family *ops) struct sock *sk; int err, i, j; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; - + for_each_cpu(i) { err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &per_cpu(__icmpv6_socket, i)); if (err < 0) { @@ -749,9 +745,7 @@ void icmpv6_cleanup(void) { int i; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_cpu(i) { sock_release(per_cpu(__icmpv6_socket, i)); } inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4fcc5a7acf6..1bf6d9a769e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -127,56 +127,6 @@ static __inline__ int addr_bit_set(void *token, int fn_bit) return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; } -/* - * find the first different bit between two addresses - * length of address must be a multiple of 32bits - */ - -static __inline__ int addr_diff(void *token1, void *token2, int addrlen) -{ - __u32 *a1 = token1; - __u32 *a2 = token2; - int i; - - addrlen >>= 2; - - for (i = 0; i < addrlen; i++) { - __u32 xb; - - xb = a1[i] ^ a2[i]; - - if (xb) { - int j = 31; - - xb = ntohl(xb); - - while ((xb & (1 << j)) == 0) - j--; - - return (i * 32 + 31 - j); - } - } - - /* - * we should *never* get to this point since that - * would mean the addrs are equal - * - * However, we do get to it 8) And exacly, when - * addresses are equal 8) - * - * ip route add 1111::/128 via ... - * ip route add 1111::/64 via ... - * and we are here. - * - * Ideally, this function should stop comparison - * at prefix length. It does not, but it is still OK, - * if returned value is greater than prefix length. - * --ANK (980803) - */ - - return addrlen<<5; -} - static __inline__ struct fib6_node * node_alloc(void) { struct fib6_node *fn; @@ -296,11 +246,11 @@ insert_above: /* find 1st bit in difference between the 2 addrs. - See comment in addr_diff: bit may be an invalid value, + See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ - bit = addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, addrlen); /* * (intermediate)[in] diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6e348042693..a6026d2787d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -176,6 +176,11 @@ resubmit: if (ipprot->flags & INET6_PROTO_FINAL) { struct ipv6hdr *hdr; + /* Free reference early: we don't need it any more, + and it may hold ip_conntrack module loaded + indefinitely. */ + nf_reset(skb); + skb_postpull_rcsum(skb, skb->nh.raw, skb->h.raw - skb->nh.raw); hdr = skb->nh.ipv6h; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 563b442ffab..c1fa693511a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -147,7 +147,8 @@ static int ip6_output2(struct sk_buff *skb) int ip6_output(struct sk_buff *skb) { - if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) + if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) || + dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else return ip6_output2(skb); @@ -440,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(to->nfct_reasm); + to->nfct_reasm = from->nfct_reasm; + nf_conntrack_get_reasm(to->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; @@ -586,8 +593,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb->next = NULL; } - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err == 0) { IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); @@ -768,6 +774,65 @@ out_err_release: *dst = NULL; return err; } +inline int ip6_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags) + +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP large send offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + if (skb == NULL) + return -ENOMEM; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb->nh.raw = skb->data; + + /* initialize protocol header pointer */ + skb->h.raw = skb->data + fragheaderlen; + + skb->ip_summed = CHECKSUM_HW; + skb->csum = 0; + sk->sk_sndmsg_off = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + struct frag_hdr fhdr; + + /* specify the length of each IP datagram fragment*/ + skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - + sizeof(struct frag_hdr); + ipv6_select_ident(skb, &fhdr); + skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UPD LSO, + * so follow normal path + */ + kfree_skb(skb); + + return err; +} int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), @@ -860,6 +925,15 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, */ inet->cork.length += length; + if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && + (rt->u.dst.dev->features & NETIF_F_UFO)) { + + if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len, + fragheaderlen, transhdrlen, mtu, flags)) + goto error; + + return 0; + } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) goto alloc_new_skb; @@ -1117,10 +1191,8 @@ int ip6_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; @@ -1145,10 +1217,8 @@ void ip6_flush_pending_frames(struct sock *sk) inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cf94372d1af..e315d0f80af 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ip6ip6_lock); kfree_skb(skb); return 0; } @@ -756,8 +757,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } ip6_tnl_dst_store(t, dst); - if (opt) - kfree(opt); + kfree(opt); t->recursion--; return 0; @@ -766,8 +766,7 @@ tx_err_link_failure: dst_link_failure(skb); tx_err_dst_release: dst_release(dst); - if (opt) - kfree(opt); + kfree(opt); tx_err: stats->tx_errors++; stats->tx_dropped++; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 85bfbc69b2c..55917fb1709 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -130,8 +130,7 @@ static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, s out_put_cpu: put_cpu(); out: - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err) goto error_out; return nexthdr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8567873d0dd..25757ade989 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -80,8 +80,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) if (ra->sk == sk) { if (sel>=0) { write_unlock_bh(&ip6_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } @@ -288,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; if (optlen == 0) - optval = 0; + optval = NULL; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 37a4a99c9fe..16482785bdf 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c @@ -7,7 +7,7 @@ #include <net/ip6_route.h> #include <net/xfrm.h> -EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(__ipv6_addr_type); EXPORT_SYMBOL(icmpv6_send); EXPORT_SYMBOL(icmpv6_statistics); EXPORT_SYMBOL(icmpv6_err_convert); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 39a96c76810..f15e04ad026 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -164,7 +164,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define MLDV2_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ - ((MLDV2_MASK(value, nbmant) | (1<<(nbmant+nbexp))) << \ + ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \ (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) #define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) @@ -545,8 +545,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); goto done; } - } else + } else { newpsl = NULL; + (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + } psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -1087,7 +1089,7 @@ static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, int igmp6_event_query(struct sk_buff *skb) { - struct mld2_query *mlh2 = (struct mld2_query *) skb->h.raw; + struct mld2_query *mlh2 = NULL; struct ifmcaddr6 *ma; struct in6_addr *group; unsigned long max_delay; @@ -1140,6 +1142,13 @@ int igmp6_event_query(struct sk_buff *skb) /* clear deleted report items */ mld_clear_delrec(idev); } else if (len >= 28) { + int srcs_offset = sizeof(struct mld2_query) - + sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, srcs_offset)) { + in6_dev_put(idev); + return -EINVAL; + } + mlh2 = (struct mld2_query *) skb->h.raw; max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; if (!max_delay) max_delay = 1; @@ -1156,7 +1165,15 @@ int igmp6_event_query(struct sk_buff *skb) return 0; } /* mark sources to include, if group & source-specific */ - mark = mlh2->nsrcs != 0; + if (mlh2->nsrcs != 0) { + if (!pskb_may_pull(skb, srcs_offset + + mlh2->nsrcs * sizeof(struct in6_addr))) { + in6_dev_put(idev); + return -EINVAL; + } + mlh2 = (struct mld2_query *) skb->h.raw; + mark = 1; + } } else { in6_dev_put(idev); return -EINVAL; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb7ccfe33f2..060d6120241 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,20 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" depends on INET && IPV6 && NETFILTER && EXPERIMENTAL -#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK -#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK -#fi +config NF_CONNTRACK_IPV6 + tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_QUEUE tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- @@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES config IP6_NF_MATCH_MARK tristate "netfilter MARK match support" depends on IP6_NF_IPTABLES @@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES -# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES -# fi -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES -# fi # The targets config IP6_NF_FILTER tristate "Packet filtering" @@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N. -# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then -# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER -# fi -# fi config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES @@ -236,7 +230,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE config IP6_NF_TARGET_MARK tristate "MARK target support" depends on IP6_NF_MANGLE @@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL To compile it as a module, choose M here. If unsure, say N. -#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2b2c370e8b1..9ab5b2ca1f5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o + +# objects for l3 independent conntrack +nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 21deec25a12..7d492226c16 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -2,7 +2,7 @@ * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling - * Copyright (C) 2000-2002 Netfilter core team <coreteam@netfilter.org> + * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,7 +23,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmpv6.h> -#include <net/ip.h> #include <net/ipv6.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -80,13 +79,12 @@ static DECLARE_MUTEX(ip6t_mutex); #define inline #endif -/* Locking is simple: we assume at worst case there will be one packet - in user context and one from bottom halves (or soft irq if Alexey's - softnet patch was applied). - +/* We keep a set of rules for each CPU, so we can avoid write-locking - them; doing a readlock_bh() stops packets coming through if we're - in user context. + them in the softirq when updating the counters and therefore + only need to read-lock in the softirq; doing a write_lock_bh() in user + context stops packets coming through and allows user context to read + the counters or update the rules. To be cache friendly on SMP, we arrange them like so: [ n-entries ] @@ -356,7 +354,7 @@ ip6t_do_table(struct sk_buff **pskb, struct ip6t_table *table, void *userdata) { - static const char nulldevname[IFNAMSIZ]; + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); int offset = 0; unsigned int protoff = 0; int hotdrop = 0; @@ -369,7 +367,6 @@ ip6t_do_table(struct sk_buff **pskb, /* Initialization */ indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask @@ -497,75 +494,145 @@ ip6t_do_table(struct sk_buff **pskb, #endif } -/* If it succeeds, returns element and locks mutex */ -static inline void * -find_inlist_lock_noload(struct list_head *head, - const char *name, - int *error, - struct semaphore *mutex) +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use try_then_request_module(). + */ + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ +static inline struct ip6t_table *find_table_lock(const char *name) { - void *ret; + struct ip6t_table *t; -#if 1 - duprintf("find_inlist: searching for `%s' in %s.\n", - name, head == &ip6t_target ? "ip6t_target" - : head == &ip6t_match ? "ip6t_match" - : head == &ip6t_tables ? "ip6t_tables" : "UNKNOWN"); -#endif + if (down_interruptible(&ip6t_mutex) != 0) + return ERR_PTR(-EINTR); - *error = down_interruptible(mutex); - if (*error != 0) - return NULL; + list_for_each_entry(t, &ip6t_tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&ip6t_mutex); + return NULL; +} + +/* Find match, grabs ref. Returns ERR_PTR() on error. */ +static inline struct ip6t_match *find_match(const char *name, u8 revision) +{ + struct ip6t_match *m; + int err = 0; - ret = list_named_find(head, name); - if (!ret) { - *error = -ENOENT; - up(mutex); + if (down_interruptible(&ip6t_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &ip6t_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + up(&ip6t_mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } } - return ret; + up(&ip6t_mutex); + return ERR_PTR(err); } -#ifndef CONFIG_KMOD -#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) -#else -static void * -find_inlist_lock(struct list_head *head, - const char *name, - const char *prefix, - int *error, - struct semaphore *mutex) +/* Find target, grabs ref. Returns ERR_PTR() on error. */ +static inline struct ip6t_target *find_target(const char *name, u8 revision) { - void *ret; + struct ip6t_target *t; + int err = 0; - ret = find_inlist_lock_noload(head, name, error, mutex); - if (!ret) { - duprintf("find_inlist: loading `%s%s'.\n", prefix, name); - request_module("%s%s", prefix, name); - ret = find_inlist_lock_noload(head, name, error, mutex); + if (down_interruptible(&ip6t_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &ip6t_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + up(&ip6t_mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } } + up(&ip6t_mutex); + return ERR_PTR(err); +} - return ret; +struct ip6t_target *ip6t_find_target(const char *name, u8 revision) +{ + struct ip6t_target *target; + + target = try_then_request_module(find_target(name, revision), + "ip6t_%s", name); + if (IS_ERR(target) || !target) + return NULL; + return target; } -#endif -static inline struct ip6t_table * -ip6t_find_table_lock(const char *name, int *error, struct semaphore *mutex) +static int match_revfn(const char *name, u8 revision, int *bestp) { - return find_inlist_lock(&ip6t_tables, name, "ip6table_", error, mutex); + struct ip6t_match *m; + int have_rev = 0; + + list_for_each_entry(m, &ip6t_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + return have_rev; } -static inline struct ip6t_match * -find_match_lock(const char *name, int *error, struct semaphore *mutex) +static int target_revfn(const char *name, u8 revision, int *bestp) { - return find_inlist_lock(&ip6t_match, name, "ip6t_", error, mutex); + struct ip6t_target *t; + int have_rev = 0; + + list_for_each_entry(t, &ip6t_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + return have_rev; } -static struct ip6t_target * -ip6t_find_target_lock(const char *name, int *error, struct semaphore *mutex) +/* Returns true or fals (if no such extension at all) */ +static inline int find_revision(const char *name, u8 revision, + int (*revfn)(const char *, u8, int *), + int *err) { - return find_inlist_lock(&ip6t_target, name, "ip6t_", error, mutex); + int have_rev, best = -1; + + if (down_interruptible(&ip6t_mutex) != 0) { + *err = -EINTR; + return 1; + } + have_rev = revfn(name, revision, &best); + up(&ip6t_mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; } + /* All zeroes == unconditional rule. */ static inline int unconditional(const struct ip6t_ip6 *ipv6) @@ -725,20 +792,16 @@ check_match(struct ip6t_entry_match *m, unsigned int hookmask, unsigned int *i) { - int ret; struct ip6t_match *match; - match = find_match_lock(m->u.user.name, &ret, &ip6t_mutex); - if (!match) { - // duprintf("check_match: `%s' not found\n", m->u.name); - return ret; - } - if (!try_module_get(match->me)) { - up(&ip6t_mutex); - return -ENOENT; + match = try_then_request_module(find_match(m->u.user.name, + m->u.user.revision), + "ip6t_%s", m->u.user.name); + if (IS_ERR(match) || !match) { + duprintf("check_match: `%s' not found\n", m->u.user.name); + return match ? PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; - up(&ip6t_mutex); if (m->u.kernel.match->checkentry && !m->u.kernel.match->checkentry(name, ipv6, m->data, @@ -776,22 +839,16 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size, goto cleanup_matches; t = ip6t_get_target(e); - target = ip6t_find_target_lock(t->u.user.name, &ret, &ip6t_mutex); - if (!target) { + target = try_then_request_module(find_target(t->u.user.name, + t->u.user.revision), + "ip6t_%s", t->u.user.name); + if (IS_ERR(target) || !target) { duprintf("check_entry: `%s' not found\n", t->u.user.name); - goto cleanup_matches; - } - if (!try_module_get(target->me)) { - up(&ip6t_mutex); - ret = -ENOENT; + ret = target ? PTR_ERR(target) : -ENOENT; goto cleanup_matches; } t->u.kernel.target = target; - up(&ip6t_mutex); - if (!t->u.kernel.target) { - ret = -EBUSY; - goto cleanup_matches; - } + if (t->u.kernel.target == &ip6t_standard_target) { if (!standard_check(t, size)) { ret = -EINVAL; @@ -1118,8 +1175,8 @@ get_entries(const struct ip6t_get_entries *entries, int ret; struct ip6t_table *t; - t = ip6t_find_table_lock(entries->name, &ret, &ip6t_mutex); - if (t) { + t = find_table_lock(entries->name); + if (t && !IS_ERR(t)) { duprintf("t->private->number = %u\n", t->private->number); if (entries->size == t->private->size) @@ -1131,10 +1188,10 @@ get_entries(const struct ip6t_get_entries *entries, entries->size); ret = -EINVAL; } + module_put(t->me); up(&ip6t_mutex); } else - duprintf("get_entries: Can't find %s!\n", - entries->name); + ret = t ? PTR_ERR(t) : -ENOENT; return ret; } @@ -1182,22 +1239,19 @@ do_replace(void __user *user, unsigned int len) duprintf("ip_tables: Translated table\n"); - t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); - if (!t) + t = try_then_request_module(find_table_lock(tmp.name), + "ip6table_%s", tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; + } /* You lied! */ if (tmp.valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", tmp.valid_hooks, t->valid_hooks); ret = -EINVAL; - goto free_newinfo_counters_untrans_unlock; - } - - /* Get a reference in advance, we're not allowed fail later */ - if (!try_module_get(t->me)) { - ret = -EBUSY; - goto free_newinfo_counters_untrans_unlock; + goto put_module; } oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); @@ -1219,7 +1273,6 @@ do_replace(void __user *user, unsigned int len) /* Decrease module usage counts and free resource */ IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); vfree(oldinfo); - /* Silent error: too late now. */ if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1229,7 +1282,6 @@ do_replace(void __user *user, unsigned int len) put_module: module_put(t->me); - free_newinfo_counters_untrans_unlock: up(&ip6t_mutex); free_newinfo_counters_untrans: IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); @@ -1268,7 +1320,7 @@ do_add_counters(void __user *user, unsigned int len) unsigned int i; struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; - int ret; + int ret = 0; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1285,9 +1337,11 @@ do_add_counters(void __user *user, unsigned int len) goto free; } - t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); - if (!t) + t = find_table_lock(tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; goto free; + } write_lock_bh(&t->lock); if (t->private->number != paddc->num_counters) { @@ -1304,6 +1358,7 @@ do_add_counters(void __user *user, unsigned int len) unlock_up_free: write_unlock_bh(&t->lock); up(&ip6t_mutex); + module_put(t->me); free: vfree(paddc); @@ -1360,8 +1415,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; } name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; - t = ip6t_find_table_lock(name, &ret, &ip6t_mutex); - if (t) { + + t = try_then_request_module(find_table_lock(name), + "ip6table_%s", name); + if (t && !IS_ERR(t)) { struct ip6t_getinfo info; info.valid_hooks = t->valid_hooks; @@ -1377,9 +1434,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; else ret = 0; - up(&ip6t_mutex); - } + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; } break; @@ -1400,6 +1458,31 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; } + case IP6T_SO_GET_REVISION_MATCH: + case IP6T_SO_GET_REVISION_TARGET: { + struct ip6t_get_revision rev; + int (*revfn)(const char *, u8, int *); + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + + if (cmd == IP6T_SO_GET_REVISION_TARGET) + revfn = target_revfn; + else + revfn = match_revfn; + + try_then_request_module(find_revision(rev.name, rev.revision, + revfn, &ret), + "ip6t_%s", rev.name); + break; + } + default: duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; @@ -1417,12 +1500,7 @@ ip6t_register_target(struct ip6t_target *target) ret = down_interruptible(&ip6t_mutex); if (ret != 0) return ret; - - if (!list_named_insert(&ip6t_target, target)) { - duprintf("ip6t_register_target: `%s' already in list!\n", - target->name); - ret = -EINVAL; - } + list_add(&target->list, &ip6t_target); up(&ip6t_mutex); return ret; } @@ -1444,11 +1522,7 @@ ip6t_register_match(struct ip6t_match *match) if (ret != 0) return ret; - if (!list_named_insert(&ip6t_match, match)) { - duprintf("ip6t_register_match: `%s' already in list!\n", - match->name); - ret = -EINVAL; - } + list_add(&match->list, &ip6t_match); up(&ip6t_mutex); return ret; diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index 81924fcc585..eab8fb864ee 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -56,8 +56,12 @@ checkentry(const char *tablename, return 1; } -static struct ip6t_target ip6t_mark_reg -= { { NULL, NULL }, "MARK", target, checkentry, NULL, THIS_MODULE }; +static struct ip6t_target ip6t_mark_reg = { + .name = "MARK", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE +}; static int __init init(void) { diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 00000000000..753a3ae8502 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -0,0 +1,556 @@ +/* + * Copyright (C)2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - support Layer 3 protocol independent connection tracking. + * Based on the original ip_conntrack code which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add get_features() to support various size of conntrack + * structures. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[8], *ap; + + ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), + sizeof(_addrs), _addrs); + if (ap == NULL) + return 0; + + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ", + NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), + NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); +} + +static int ipv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* + * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * if success, *nexthdr is updated by type/protocol of this header. + * + * NOTES: - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns -1. + * - if packet is fragmented, return pointer of the fragment header. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + */ + +int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, + int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + break; + if (nexthdr == NEXTHDR_FRAGMENT) + break; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +static int +ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + unsigned int extoff; + unsigned char pnum; + int protoff; + + extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; + pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + + /* + * (protoff == (*pskb)->len) mean that the packet doesn't have no data + * except of IPv6 & ext headers. but it's tracked anyway. - YK + */ + if ((protoff < 0) || (protoff > (*pskb)->len)) { + DEBUGP("ip6_conntrack_core: can't find proto in pkt\n"); + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -NF_ACCEPT; + } + + *dataoff = protoff; + *protonum = pnum; + return NF_ACCEPT; +} + +static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple) +{ + return NF_CT_F_BASIC; +} + +static unsigned int ipv6_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret, protoff; + unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) + - (*pskb)->data; + unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + if (protoff < 0 || protoff > (*pskb)->len || + pnum == NEXTHDR_FRAGMENT) { + DEBUGP("proto header not found\n"); + return NF_ACCEPT; + } + + ret = ct->helper->help(pskb, protoff, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + + return nf_conntrack_confirm(pskb); +} + +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); +extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, + struct net_device *out, + int (*okfn)(struct sk_buff *)); +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + + /* Previously seen (loopback)? */ + if ((*pskb)->nfct) + return NF_ACCEPT; + + reasm = nf_ct_frag6_gather(*pskb); + + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occured or not fragmented */ + if (reasm == *pskb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm = (*pskb)->nfct_reasm; + + /* This packet is fragmented and has reassembled packet. */ + if (reasm) { + /* Reassembled packet isn't parsed yet ? */ + if (!reasm->nfct) { + unsigned int ret; + + ret = nf_conntrack_in(PF_INET6, hooknum, &reasm); + if (ret != NF_ACCEPT) + return ret; + } + nf_conntrack_get(reasm->nfct); + (*pskb)->nfct = reasm->nfct; + return NF_ACCEPT; + } + + return nf_conntrack_in(PF_INET6, hooknum, pskb); +} + +static unsigned int ipv6_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct ipv6hdr)) { + if (net_ratelimit()) + printk("ipv6_conntrack_local: packet too short\n"); + return NF_ACCEPT; + } + return ipv6_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv6_conntrack_defrag_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv6_conntrack_in_ops = { + .hook = ipv6_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_local_out_ops = { + .hook = ipv6_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv6_conntrack_out_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, +}; + +static struct nf_hook_ops ipv6_conntrack_local_in_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_LAST-1, +}; + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_proto_icmpv6.c */ +extern unsigned long nf_ct_icmpv6_timeout; + +/* From nf_conntrack_frag6.c */ +extern unsigned long nf_ct_frag6_timeout; +extern unsigned int nf_ct_frag6_low_thresh; +extern unsigned int nf_ct_frag6_high_thresh; + +static struct ctl_table_header *nf_ct_ipv6_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT, + .procname = "nf_conntrack_icmpv6_timeout", + .data = &nf_ct_icmpv6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT, + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_ct_frag6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_ct_frag6_low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_ct_frag6_high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { + .l3proto = PF_INET6, + .name = "ipv6", + .pkt_to_tuple = ipv6_pkt_to_tuple, + .invert_tuple = ipv6_invert_tuple, + .print_tuple = ipv6_print_tuple, + .print_conntrack = ipv6_print_conntrack, + .prepare = ipv6_prepare, + .get_features = ipv6_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6; +extern int nf_ct_frag6_init(void); +extern void nf_ct_frag6_cleanup(void); +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + goto cleanup_nothing; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register tcp.\n"); + goto cleanup_frag6; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register icmpv6.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register ipv6\n"); + goto cleanup_icmpv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_ipv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local_out defrag " + "hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv6_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv6_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv6_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + return ret; + + cleanup: + synchronize_net(); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv6_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv6_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv6_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv6_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv6_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv6_conntrack_defrag_ops); + cleanup_ipv6: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + cleanup_icmpv6: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6); + cleanup_frag6: + nf_ct_frag6_cleanup(); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip6_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip6_conntrack); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 00000000000..c0f1da5497a --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,272 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - ICMPv6 tracking support. Derived from the original ip_conntrack code + * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/ip6_checksum.h> +#include <linux/seq_file.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> + +unsigned long nf_ct_icmpv6_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmp6hdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return 1; +} + +static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 + }; + + __u8 type = orig->dst.u.icmp.type - 128; + if (type >= sizeof(invmap) || !invmap[type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmpv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmpv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmpv6_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + static u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + /* Can't create a new ICMPv6 `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern int +nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len); +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +static int +icmpv6_error_message(struct sk_buff *skb, + unsigned int icmp6off, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple intuple, origtuple; + struct nf_conntrack_tuple_hash *h; + struct icmp6hdr _hdr, *hp; + unsigned int inip6off; + struct nf_conntrack_protocol *inproto; + u_int8_t inprotonum; + unsigned int inprotoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr); + if (hp == NULL) { + DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n"); + return -NF_ACCEPT; + } + + inip6off = icmp6off + sizeof(_hdr); + if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr), + &inprotonum, sizeof(inprotonum)) != 0) { + DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n"); + return -NF_ACCEPT; + } + inprotoff = nf_ct_ipv6_skip_exthdr(skb, + inip6off + sizeof(struct ipv6hdr), + &inprotonum, + skb->len - inip6off + - sizeof(struct ipv6hdr)); + + if ((inprotoff < 0) || (inprotoff > skb->len) || + (inprotonum == NEXTHDR_FRAGMENT)) { + DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n"); + return -NF_ACCEPT; + } + + inproto = nf_ct_find_proto(PF_INET6, inprotonum); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, + &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&intuple, &origtuple, + &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&intuple, NULL); + if (!h) { + DEBUGP("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +static int +icmpv6_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmp6hdr _ih, *icmp6h; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + if (LOG_INVALID(IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: short packet "); + return -NF_ACCEPT; + } + + if (hooknum != NF_IP6_PRE_ROUTING) + goto skipped; + + /* Ignore it if the checksum's bogus. */ + if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_ICMPV6, + skb_checksum(skb, dataoff, + skb->len - dataoff, 0))) { + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed\n"); + return -NF_ACCEPT; + } + +skipped: + + /* is not error message ? */ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_ICMPV6, + .name = "icmpv6", + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .print_tuple = icmpv6_print_tuple, + .print_conntrack = icmpv6_print_conntrack, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 00000000000..c2c52af9e56 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -0,0 +1,897 @@ +/* + * IPv6 fragment reassembly for connection tracking + * + * Copyright (C)2004 USAGI/WIDE Project + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/net.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> +#include <linux/jhash.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <linux/sysctl.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ +#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ +#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT + +unsigned int nf_ct_frag6_high_thresh = 256*1024; +unsigned int nf_ct_frag6_low_thresh = 192*1024; +unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; + +struct nf_ct_frag6_skb_cb +{ + struct inet6_skb_parm h; + int offset; + struct sk_buff *orig; +}; + +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) + +struct nf_ct_frag6_queue +{ + struct nf_ct_frag6_queue *next; + struct list_head lru_list; /* lru list member */ + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* expire timer */ + struct sk_buff *fragments; + int len; + int meat; + struct timeval stamp; + unsigned int csum; + __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + __u16 nhoffset; + struct nf_ct_frag6_queue **pprev; +}; + +/* Hash table. */ + +#define FRAG6Q_HASHSZ 64 + +static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; +static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static u32 nf_ct_frag6_hash_rnd; +static LIST_HEAD(nf_ct_frag6_lru_list); +int nf_ct_frag6_nqueues = 0; + +static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq) +{ + if (fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + list_del(&fq->lru_list); + nf_ct_frag6_nqueues--; +} + +static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq) +{ + write_lock(&nf_ct_frag6_lock); + __fq_unlink(fq); + write_unlock(&nf_ct_frag6_lock); +} + +static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + u32 a, b, c; + + a = saddr->s6_addr32[0]; + b = saddr->s6_addr32[1]; + c = saddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += nf_ct_frag6_hash_rnd; + __jhash_mix(a, b, c); + + a += saddr->s6_addr32[3]; + b += daddr->s6_addr32[0]; + c += daddr->s6_addr32[1]; + __jhash_mix(a, b, c); + + a += daddr->s6_addr32[2]; + b += daddr->s6_addr32[3]; + c += id; + __jhash_mix(a, b, c); + + return c & (FRAG6Q_HASHSZ - 1); +} + +static struct timer_list nf_ct_frag6_secret_timer; +int nf_ct_frag6_secret_interval = 10 * 60 * HZ; + +static void nf_ct_frag6_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&nf_ct_frag6_lock); + get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32)); + for (i = 0; i < FRAG6Q_HASHSZ; i++) { + struct nf_ct_frag6_queue *q; + + q = nf_ct_frag6_hash[i]; + while (q) { + struct nf_ct_frag6_queue *next = q->next; + unsigned int hval = ip6qhashfn(q->id, + &q->saddr, + &q->daddr); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = nf_ct_frag6_hash[hval]) != NULL) + q->next->pprev = &q->next; + nf_ct_frag6_hash[hval] = q; + q->pprev = &nf_ct_frag6_hash[hval]; + } + + q = next; + } + } + write_unlock(&nf_ct_frag6_lock); + + mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval); +} + +atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); + +/* Memory Tracking Functions. */ +static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &nf_ct_frag6_mem); + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); + + kfree_skb(skb); +} + +static inline void frag_free_queue(struct nf_ct_frag6_queue *fq, + unsigned int *work) +{ + if (work) + *work -= sizeof(struct nf_ct_frag6_queue); + atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + kfree(fq); +} + +static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) +{ + struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC); + + if (!fq) + return NULL; + atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + return fq; +} + +/* Destruction primitives. */ + +/* Complete destruction of fq. */ +static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq, + unsigned int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); + + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + frag_free_queue(fq, work); +} + +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work) +{ + if (atomic_dec_and_test(&fq->refcnt)) + nf_ct_frag6_destroy(fq, work); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + } +} + +static void nf_ct_frag6_evictor(void) +{ + struct nf_ct_frag6_queue *fq; + struct list_head *tmp; + unsigned int work; + + work = atomic_read(&nf_ct_frag6_mem); + if (work <= nf_ct_frag6_low_thresh) + return; + + work -= nf_ct_frag6_low_thresh; + while (work > 0) { + read_lock(&nf_ct_frag6_lock); + if (list_empty(&nf_ct_frag6_lru_list)) { + read_unlock(&nf_ct_frag6_lock); + return; + } + tmp = nf_ct_frag6_lru_list.next; + BUG_ON(tmp == NULL); + fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq, &work); + } +} + +static void nf_ct_frag6_expire(unsigned long data) +{ + struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data; + + spin_lock(&fq->lock); + + if (fq->last_in & COMPLETE) + goto out; + + fq_kill(fq); + +out: + spin_unlock(&fq->lock); + fq_put(fq, NULL); +} + +/* Creation primitives. */ + + +static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, + struct nf_ct_frag6_queue *fq_in) +{ + struct nf_ct_frag6_queue *fq; + + write_lock(&nf_ct_frag6_lock); +#ifdef CONFIG_SMP + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && + !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&nf_ct_frag6_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in, NULL); + return fq; + } + } +#endif + fq = fq_in; + + if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout)) + atomic_inc(&fq->refcnt); + + atomic_inc(&fq->refcnt); + if ((fq->next = nf_ct_frag6_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + nf_ct_frag6_hash[hash] = fq; + fq->pprev = &nf_ct_frag6_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + nf_ct_frag6_nqueues++; + write_unlock(&nf_ct_frag6_lock); + return fq; +} + + +static struct nf_ct_frag6_queue * +nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + + if ((fq = frag_alloc_queue()) == NULL) { + DEBUGP("Can't alloc new queue\n"); + goto oom; + } + + memset(fq, 0, sizeof(struct nf_ct_frag6_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); + + init_timer(&fq->timer); + fq->timer.function = nf_ct_frag6_expire; + fq->timer.data = (long) fq; + fq->lock = SPIN_LOCK_UNLOCKED; + atomic_set(&fq->refcnt, 1); + + return nf_ct_frag6_intern(hash, fq); + +oom: + return NULL; +} + +static __inline__ struct nf_ct_frag6_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&nf_ct_frag6_lock); + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + !ipv6_addr_cmp(src, &fq->saddr) && + !ipv6_addr_cmp(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + return fq; + } + } + read_unlock(&nf_ct_frag6_lock); + + return nf_ct_frag6_create(hash, id, src, dst); +} + + +static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->last_in & COMPLETE) { + DEBUGP("Allready completed\n"); + goto err; + } + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + DEBUGP("offset is too large.\n"); + return -1; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, + (u8*)(fhdr + 1) - skb->nh.raw, + 0)); + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) { + DEBUGP("already received last fragment\n"); + goto err; + } + fq->last_in |= LAST_IN; + fq->len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + DEBUGP("the end of this fragment is not rounded to 8 bytes.\n"); + return -1; + } + if (end > fq->len) { + /* Some bits beyond end -> corruption. */ + if (fq->last_in & LAST_IN) { + DEBUGP("last packet already reached.\n"); + goto err; + } + fq->len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { + DEBUGP("queue: message is too short.\n"); + goto err; + } + if (end-offset < skb->len) { + if (pskb_trim(skb, end - offset)) { + DEBUGP("Can't trim\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for (next = fq->fragments; next != NULL; next = next->next) { + if (NFCT_FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) { + DEBUGP("overlap\n"); + goto err; + } + if (!pskb_pull(skb, i)) { + DEBUGP("Can't pull\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && NFCT_FRAG6_CB(next)->offset < end) { + /* overlap is 'i' bytes */ + int i = end - NFCT_FRAG6_CB(next)->offset; + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + DEBUGP("Eat head of the overlapped parts.: %d", i); + if (!pskb_pull(next, i)) + goto err; + + /* next fragment */ + NFCT_FRAG6_CB(next)->offset += i; + fq->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + NFCT_FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; + + skb->dev = NULL; + skb_get_timestamp(skb, &fq->stamp); + fq->meat += skb->len; + atomic_add(skb->truesize, &nf_ct_frag6_mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->last_in |= FIRST_IN; + } + write_lock(&nf_ct_frag6_lock); + list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + write_unlock(&nf_ct_frag6_lock); + return 0; + +err: + return -1; +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static struct sk_buff * +nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +{ + struct sk_buff *fp, *op, *head = fq->fragments; + int payload_len; + + fq_kill(fq); + + BUG_TRAP(head != NULL); + BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) { + DEBUGP("payload len is too large.\n"); + goto out_oversize; + } + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + DEBUGP("skb is cloned but can't expand head"); + goto out_oom; + } + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { + DEBUGP("Can't alloc skb\n"); + goto out_oom; + } + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; i<skb_shinfo(head)->nr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; + atomic_add(clone->truesize, &nf_ct_frag6_mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + head->nh.raw[fq->nhoffset] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + head->h.raw = head->data; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &nf_ct_frag6_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &nf_ct_frag6_mem); + } + + head->next = NULL; + head->dev = dev; + skb_set_timestamp(head, &fq->stamp); + head->nh.ipv6h->payload_len = htons(payload_len); + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + + fq->fragments = NULL; + + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ + fp = skb_shinfo(head)->frag_list; + if (NFCT_FRAG6_CB(fp)->orig == NULL) + /* at above code, head skb is divided into two skbs. */ + fp = fp->next; + + op = NFCT_FRAG6_CB(head)->orig; + for (; fp; fp = fp->next) { + struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; + + op->next = orig; + op = orig; + NFCT_FRAG6_CB(fp)->orig = NULL; + } + + return head; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); +out_fail: + return NULL; +} + +/* + * find the header just before Fragment Header. + * + * if success return 0 and set ... + * (*prevhdrp): the value of "Next Header Field" in the header + * just before Fragment Header. + * (*prevhoff): the offset of "Next Header Field" in the header + * just before Fragment Header. + * (*fhoff) : the offset of Fragment Header. + * + * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c + * + */ +static int +find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) +{ + u8 nexthdr = skb->nh.ipv6h->nexthdr; + u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data; + int start = (u8 *)(skb->nh.ipv6h+1) - skb->data; + int len = skb->len - start; + u8 prevhdr = NEXTHDR_IPV6; + + while (nexthdr != NEXTHDR_FRAGMENT) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (!ipv6_ext_hdr(nexthdr)) { + return -1; + } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + DEBUGP("too short\n"); + return -1; + } + if (nexthdr == NEXTHDR_NONE) { + DEBUGP("next header is none\n"); + return -1; + } + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + prevhdr = nexthdr; + prev_nhoff = start; + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + if (len < 0) + return -1; + + *prevhdrp = prevhdr; + *prevhoff = prev_nhoff; + *fhoff = start; + + return 0; +} + +struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) +{ + struct sk_buff *clone; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct nf_ct_frag6_queue *fq; + struct ipv6hdr *hdr; + int fhoff, nhoff; + u8 prevhdr; + struct sk_buff *ret_skb = NULL; + + /* Jumbo payload inhibits frag. header */ + if (skb->nh.ipv6h->payload_len == 0) { + DEBUGP("payload len = 0\n"); + return skb; + } + + if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) + return skb; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone == NULL) { + DEBUGP("Can't clone skb\n"); + return skb; + } + + NFCT_FRAG6_CB(clone)->orig = skb; + + if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { + DEBUGP("message is too short.\n"); + goto ret_orig; + } + + clone->h.raw = clone->data + fhoff; + hdr = clone->nh.ipv6h; + fhdr = (struct frag_hdr *)clone->h.raw; + + if (!(fhdr->frag_off & htons(0xFFF9))) { + DEBUGP("Invalid fragment offset\n"); + /* It is not a fragmented frame */ + goto ret_orig; + } + + if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh) + nf_ct_frag6_evictor(); + + fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq == NULL) { + DEBUGP("Can't find and can't create new queue\n"); + goto ret_orig; + } + + spin_lock(&fq->lock); + + if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + spin_unlock(&fq->lock); + DEBUGP("Can't insert skb to queue\n"); + fq_put(fq, NULL); + goto ret_orig; + } + + if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) { + ret_skb = nf_ct_frag6_reasm(fq, dev); + if (ret_skb == NULL) + DEBUGP("Can't reassemble fragmented packets\n"); + } + spin_unlock(&fq->lock); + + fq_put(fq, NULL); + return ret_skb; + +ret_orig: + kfree_skb(clone); + return skb; +} + +void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s;) { + nf_conntrack_put_reasm(s->nfct_reasm); + nf_conntrack_get_reasm(skb); + s->nfct_reasm = skb; + + s2 = s->next; + NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, + NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + s = s2; + } + nf_conntrack_put_reasm(skb); +} + +int nf_ct_frag6_kfree_frags(struct sk_buff *skb) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) { + + s2 = s->next; + kfree_skb(s); + } + + kfree_skb(skb); + + return 0; +} + +int nf_ct_frag6_init(void) +{ + nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild; + nf_ct_frag6_secret_timer.expires = jiffies + + nf_ct_frag6_secret_interval; + add_timer(&nf_ct_frag6_secret_timer); + + return 0; +} + +void nf_ct_frag6_cleanup(void) +{ + del_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_low_thresh = 0; + nf_ct_frag6_evictor(); +} diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 334a5967831..50a13e75d70 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -140,9 +140,7 @@ fold_field(void *mib[], int offt) unsigned long res = 0; int i; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_cpu(i) { res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt); res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index d77d3352c96..a66900cda2a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ - if (clone) + if (clone) { + nf_reset(clone); rawv6_rcv(sk, clone); + } } sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, IP6CB(skb)->iif); @@ -296,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { if ((raw6_sk(sk)->checksum || sk->sk_filter) && - skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - /* FIXME: increment a raw6 drops counter here */ - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_complete(skb)) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; } /* Charge it to the socket. */ @@ -335,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (skb->ip_summed == CHECKSUM_HW) { - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); + if (skb->ip_summed == CHECKSUM_HW) { + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, 0); } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); if (inet->hdrincl) { - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + if (skb_checksum_complete(skb)) { /* FIXME: increment a raw6 drops counter here */ kfree_skb(skb); return 0; } - skb->ip_summed = CHECKSUM_UNNECESSARY; } rawv6_rcv_skb(sk, skb); @@ -405,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e4fe9ee484d..5d316cb72ec 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -74,7 +74,7 @@ struct ip6frag_skb_cb struct frag_queue { - struct frag_queue *next; + struct hlist_node list; struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ @@ -95,14 +95,13 @@ struct frag_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; - struct frag_queue **pprev; }; /* Hash table. */ #define IP6Q_HASHSZ 64 -static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ]; static DEFINE_RWLOCK(ip6_frag_lock); static u32 ip6_frag_hash_rnd; static LIST_HEAD(ip6_frag_lru_list); @@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0; static __inline__ void __fq_unlink(struct frag_queue *fq) { - if(fq->next) - fq->next->pprev = fq->pprev; - *fq->pprev = fq->next; + hlist_del(&fq->list); list_del(&fq->lru_list); ip6_frag_nqueues--; } @@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy) get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); for (i = 0; i < IP6Q_HASHSZ; i++) { struct frag_queue *q; + struct hlist_node *p, *n; - q = ip6_frag_hash[i]; - while (q) { - struct frag_queue *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) { unsigned int hval = ip6qhashfn(q->id, &q->saddr, &q->daddr); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ip6_frag_hash[hval]) != NULL) - q->next->pprev = &q->next; - ip6_frag_hash[hval] = q; - q->pprev = &ip6_frag_hash[hval]; - } + hlist_add_head(&q->list, + &ip6_frag_hash[hval]); - q = next; + } } } write_unlock(&ip6_frag_lock); @@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, struct frag_queue *fq_in) { struct frag_queue *fq; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ip6_frag_lock); #ifdef CONFIG_SMP - for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == fq_in->id && ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { @@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, atomic_inc(&fq->refcnt); atomic_inc(&fq->refcnt); - if((fq->next = ip6_frag_hash[hash]) != NULL) - fq->next->pprev = &fq->next; - ip6_frag_hash[hash] = fq; - fq->pprev = &ip6_frag_hash[hash]; + hlist_add_head(&fq->list, &ip6_frag_hash[hash]); INIT_LIST_HEAD(&fq->lru_list); list_add_tail(&fq->lru_list, &ip6_frag_lru_list); ip6_frag_nqueues++; @@ -401,10 +391,11 @@ static __inline__ struct frag_queue * fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) { struct frag_queue *fq; + struct hlist_node *n; unsigned int hash = ip6qhashfn(id, src, dst); read_lock(&ip6_frag_lock); - for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == id && ipv6_addr_equal(src, &fq->saddr) && ipv6_addr_equal(dst, &fq->daddr)) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5d5bbb49ec7..a7a537b5059 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -483,7 +483,7 @@ restart: goto out; } - rt = rt6_device_match(rt, skb->dev->ifindex, 0); + rt = rt6_device_match(rt, skb->dev->ifindex, strict); BACKTRACK(); if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { @@ -1701,16 +1701,14 @@ static void fib6_dump_end(struct netlink_callback *cb) fib6_walker_unlink(w); kfree(w); } - if (cb->args[1]) { - cb->done = (void*)cb->args[1]; - cb->args[1] = 0; - } + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); - return cb->done(cb); + return cb->done ? cb->done(cb) : 0; } int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d693cb988b7..62c0e5bd931 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&tcp_hashinfo.portalloc_lock); - if (tcp_hashinfo.port_rover < low) - rover = low; - else - rover = tcp_hashinfo.port_rover; - do { rover++; - if (rover > high) - rover = low; + do { head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - tcp_hashinfo.port_rover = rover; - spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -1408,20 +1401,18 @@ out: static int tcp_v6_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb->csum)) + &skb->nh.ipv6h->daddr,skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); + } } + + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, 0); + if (skb->len <= 76) { - if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,0); + return __skb_checksum_complete(skb); } return 0; } @@ -1582,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) goto discard_it; if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v6_checksum_init(skb) < 0)) + tcp_v6_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2b87cc68b7..5cc8731eb55 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -248,7 +248,7 @@ try_again: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); @@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return -1; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - UDP6_INC_STATS_BH(UDP_MIB_INERRORS); - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; } if (sock_queue_rcv_skb(sk,skb)<0) { @@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) uh = skb->h.uh; } - if (skb->ip_summed==CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_HW && + !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); @@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (skb_checksum_complete(skb)) goto discard; UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); diff --git a/net/irda/discovery.c b/net/irda/discovery.c index c4ba5fa1446..3fefc822c1c 100644 --- a/net/irda/discovery.c +++ b/net/irda/discovery.c @@ -194,8 +194,7 @@ void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force) /* Remove it from the log */ curr = hashbin_remove_this(log, (irda_queue_t *) curr); - if (curr) - kfree(curr); + kfree(curr); } } diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c index 6fec428b451..75f2666e863 100644 --- a/net/irda/irias_object.c +++ b/net/irda/irias_object.c @@ -122,8 +122,7 @@ static void __irias_delete_attrib(struct ias_attrib *attrib) IRDA_ASSERT(attrib != NULL, return;); IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); - if (attrib->name) - kfree(attrib->name); + kfree(attrib->name); irias_delete_value(attrib->value); attrib->magic = ~IAS_ATTRIB_MAGIC; @@ -136,8 +135,7 @@ void __irias_delete_object(struct ias_object *obj) IRDA_ASSERT(obj != NULL, return;); IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); - if (obj->name) - kfree(obj->name); + kfree(obj->name); hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib); @@ -562,14 +560,12 @@ void irias_delete_value(struct ias_value *value) /* No need to deallocate */ break; case IAS_STRING: - /* If string, deallocate string */ - if (value->t.string != NULL) - kfree(value->t.string); + /* Deallocate string */ + kfree(value->t.string); break; case IAS_OCT_SEQ: - /* If byte stream, deallocate byte stream */ - if (value->t.oct_seq != NULL) - kfree(value->t.oct_seq); + /* Deallocate byte stream */ + kfree(value->t.oct_seq); break; default: IRDA_DEBUG(0, "%s(), Unknown value type!\n", __FUNCTION__); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 59d02cbbeb9..c3f0b078345 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) struct llc_sock* llc = llc_sk(sk); int rc = 0; - if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) { + if (unlikely(llc_data_accept_state(llc->state) || + llc->remote_busy_flag || + llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); rc = llc_ui_wait_for_busy_core(sk, timeout); @@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) if (sk_wait_event(sk, &timeout, (sk->sk_shutdown & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && + !llc->remote_busy_flag && !llc->p_flag))) break; rc = -ERESTARTSYS; diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index b0bcfb1f12d..8169f24ed33 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) llc->ack_must_be_send = 1; llc->ack_pf = pf_bit & 1; } - if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) { + if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) + % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0; llc->ack_pf = 0; @@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; - if (llc->npta > 127) - llc->npta = 127 ; + if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; } else --llc->inc_cntr; return 0; @@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); - llc->k -= unacked_pdu; - if (llc->k < 2) - llc->k = 2; + if (llc->k - unacked_pdu < 1) + llc->k = 1; + else + llc->k -= unacked_pdu; return 0; } @@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); llc->k += 1; - if (llc->k > 128) - llc->k = 128 ; + if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; return 0; } @@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { - llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128; + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8296b38bf27..a84f9221e5f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,3 +1,6 @@ +menu "Core Netfilter Configuration" + depends on NET && NETFILTER + config NETFILTER_NETLINK tristate "Netfilter netlink interface" help @@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. +config NF_CONNTRACK + tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK=n + default n + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + depends on NF_CONNTRACK + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified aboutchanges in the connection tracking state. + + If unsure, say `N'. + +config NF_CT_PROTO_SCTP + tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL && NF_CONNTRACK + default n + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + Documentation/modules.txt. If unsure, say `N'. + +config NF_CONNTRACK_FTP + tristate "FTP support on new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + This is FTP support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +endmenu diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index b3b44f8b415..55f019ad2c0 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o + +nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o + +obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o + +# SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 00000000000..1da678303d7 --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c @@ -0,0 +1,1545 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - generalize L3 protocol denendent part. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add support various size of conntrack structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_core.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/socket.h> + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#define NF_CONNTRACK_VERSION "0.4.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DEFINE_RWLOCK(nf_conntrack_lock); + +/* nf_conntrack_standalone needs this */ +atomic_t nf_conntrack_count = ATOMIC_INIT(0); + +void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL; +LIST_HEAD(nf_conntrack_expect_list); +struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; +struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX]; +static LIST_HEAD(helpers); +unsigned int nf_conntrack_htable_size = 0; +int nf_conntrack_max; +struct list_head *nf_conntrack_hash; +static kmem_cache_t *nf_conntrack_expect_cachep; +struct nf_conn nf_conntrack_untracked; +unsigned int nf_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int nf_conntrack_vmalloc; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +struct notifier_block *nf_conntrack_chain; +struct notifier_block *nf_conntrack_expect_chain; + +DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) +{ + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) + && ecache->events) + notifier_call_chain(&nf_conntrack_chain, ecache->events, + ecache->ct); + + ecache->events = 0; + nf_ct_put(ecache->ct); + ecache->ct = NULL; +} + +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling for freeing the skb */ +void nf_ct_deliver_cached_events(const struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ecache->ct == ct) + __nf_ct_deliver_cached_events(ecache); + local_bh_enable(); +} + +/* Deliver cached events for old pending events, if current conntrack != old */ +void __nf_ct_event_cache_init(struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(nf_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __nf_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called + * while packets are still passing through the code */ +static void nf_ct_event_cache_flush(void) +{ + struct nf_conntrack_ecache *ecache; + int cpu; + + for_each_cpu(cpu) { + ecache = &per_cpu(nf_conntrack_ecache, cpu); + if (ecache->ct) + nf_ct_put(ecache->ct); + } +} +#else +static inline void nf_ct_event_cache_flush(void) {} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); + +/* + * This scheme offers various size of "struct nf_conn" dependent on + * features(helper, nat, ...) + */ + +#define NF_CT_FEATURES_NAMELEN 256 +static struct { + /* name of slab cache. printed in /proc/slabinfo */ + char *name; + + /* size of slab cache */ + size_t size; + + /* slab cache pointer */ + kmem_cache_t *cachep; + + /* allocated slab cache + modules which uses this slab cache */ + int use; + + /* Initialization */ + int (*init_conntrack)(struct nf_conn *, u_int32_t); + +} nf_ct_cache[NF_CT_F_NUM]; + +/* protect members of nf_ct_cache except of "use" */ +DEFINE_RWLOCK(nf_ct_cache_lock); + +/* This avoids calling kmem_cache_create() with same name simultaneously */ +DECLARE_MUTEX(nf_ct_cache_mutex); + +extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; +struct nf_conntrack_protocol * +nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) +{ + if (unlikely(nf_ct_protos[l3proto] == NULL)) + return &nf_conntrack_generic_protocol; + + return nf_ct_protos[l3proto][protocol]; +} + +static int nf_conntrack_hash_rnd_initted; +static unsigned int nf_conntrack_hash_rnd; + +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) +{ + unsigned int a, b; + a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all), + ((tuple->src.l3num) << 16) | tuple->dst.protonum); + b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all), + (tuple->src.u.all << 16) | tuple->dst.u.all); + + return jhash_2words(a, b, rnd) % size; +} + +static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, nf_conntrack_htable_size, + nf_conntrack_hash_rnd); +} + +/* Initialize "struct nf_conn" which has spaces for helper */ +static int +init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features) +{ + + conntrack->help = (union nf_conntrack_help *) + (((unsigned long)conntrack->data + + (__alignof__(union nf_conntrack_help) - 1)) + & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1)))); + return 0; +} + +int nf_conntrack_register_cache(u_int32_t features, const char *name, + size_t size, + int (*init)(struct nf_conn *, u_int32_t)) +{ + int ret = 0; + char *cache_name; + kmem_cache_t *cachep; + + DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n", + features, name, size); + + if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) { + DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n", + features); + return -EINVAL; + } + + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + /* e.g: multiple helpers are loaded */ + if (nf_ct_cache[features].use > 0) { + DEBUGP("nf_conntrack_register_cache: already resisterd.\n"); + if ((!strncmp(nf_ct_cache[features].name, name, + NF_CT_FEATURES_NAMELEN)) + && nf_ct_cache[features].size == size + && nf_ct_cache[features].init_conntrack == init) { + DEBUGP("nf_conntrack_register_cache: reusing.\n"); + nf_ct_cache[features].use++; + ret = 0; + } else + ret = -EBUSY; + + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return ret; + } + write_unlock_bh(&nf_ct_cache_lock); + + /* + * The memory space for name of slab cache must be alive until + * cache is destroyed. + */ + cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC); + if (cache_name == NULL) { + DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n"); + ret = -ENOMEM; + goto out_up_mutex; + } + + if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN) + >= NF_CT_FEATURES_NAMELEN) { + printk("nf_conntrack_register_cache: name too long\n"); + ret = -EINVAL; + goto out_free_name; + } + + cachep = kmem_cache_create(cache_name, size, 0, 0, + NULL, NULL); + if (!cachep) { + printk("nf_conntrack_register_cache: Can't create slab cache " + "for the features = 0x%x\n", features); + ret = -ENOMEM; + goto out_free_name; + } + + write_lock_bh(&nf_ct_cache_lock); + nf_ct_cache[features].use = 1; + nf_ct_cache[features].size = size; + nf_ct_cache[features].init_conntrack = init; + nf_ct_cache[features].cachep = cachep; + nf_ct_cache[features].name = cache_name; + write_unlock_bh(&nf_ct_cache_lock); + + goto out_up_mutex; + +out_free_name: + kfree(cache_name); +out_up_mutex: + up(&nf_ct_cache_mutex); + return ret; +} + +/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */ +void nf_conntrack_unregister_cache(u_int32_t features) +{ + kmem_cache_t *cachep; + char *name; + + /* + * This assures that kmem_cache_create() isn't called before destroying + * slab cache. + */ + DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features); + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + if (--nf_ct_cache[features].use > 0) { + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return; + } + cachep = nf_ct_cache[features].cachep; + name = nf_ct_cache[features].name; + nf_ct_cache[features].cachep = NULL; + nf_ct_cache[features].name = NULL; + nf_ct_cache[features].init_conntrack = NULL; + nf_ct_cache[features].size = 0; + write_unlock_bh(&nf_ct_cache_lock); + + synchronize_net(); + + kmem_cache_destroy(cachep); + kfree(name); + + up(&nf_ct_cache_mutex); +} + +int +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(tuple); + + tuple->src.l3num = l3num; + if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + return 0; + + tuple->dst.protonum = protonum; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(inverse); + + inverse->src.l3num = orig->src.l3num; + if (l3proto->invert_tuple(inverse, orig) == 0) + return 0; + + inverse->dst.dir = !orig->dst.dir; + + inverse->dst.protonum = orig->dst.protonum; + return protocol->invert_tuple(inverse, orig); +} + +/* nf_conntrack_expect helper functions */ +static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +{ + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(!timer_pending(&exp->timeout)); + list_del(&exp->list); + NF_CT_STAT_INC(expect_delete); + exp->master->expecting--; + nf_conntrack_expect_put(exp); +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct nf_conntrack_expect *exp = (void *)ul_expect; + + write_lock_bh(&nf_conntrack_lock); + nf_ct_unlink_expect(exp); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct nf_conntrack_expect * +find_expectation(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && nf_ct_is_confirmed(i->master)) { + if (i->flags & NF_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + return i; + } + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct nf_conn *ct) +{ + struct nf_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. */ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + } +} + +static void +clean_from_lists(struct nf_conn *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + NF_CT_ASSERT(!timer_pending(&ct->timeout)); + + nf_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to nf_conntrack_lock!!! -HW */ + l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + if (l3proto && l3proto->destroy) + l3proto->destroy(ct); + + proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (nf_conntrack_destroyed) + nf_conntrack_destroyed(ct); + + write_lock_bh(&nf_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + NF_CT_STAT_INC(delete); + write_unlock_bh(&nf_conntrack_lock); + + if (ct->master) + nf_ct_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + nf_conntrack_free(ct); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + + write_lock_bh(&nf_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + NF_CT_STAT_INC(delete_list); + clean_from_lists(ct); + write_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + ASSERT_READ_LOCK(&nf_conntrack_lock); + return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack + && nf_ct_tuple_equal(tuple, &i->tuple); +} + +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + ASSERT_READ_LOCK(&nf_conntrack_lock); + list_for_each_entry(h, &nf_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + NF_CT_STAT_INC(found); + return h; + } + NF_CT_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. */ +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + read_unlock_bh(&nf_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__nf_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(*pskb, &ctinfo); + + /* ipt_REJECT uses nf_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + write_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&nf_conntrack_hash[hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&nf_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&nf_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&nf_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + NF_CT_STAT_INC(insert); + write_unlock_bh(&nf_conntrack_lock); + if (ct->helper) + nf_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + nf_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; + } + + NF_CT_STAT_INC(insert_failed); + write_unlock_bh(&nf_conntrack_lock); + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + read_unlock_bh(&nf_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. */ +static inline int unreplied(const struct nf_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, + &nf_ct_tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct = NULL; + int dropped = 0; + + read_lock_bh(&nf_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + read_unlock_bh(&nf_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + NF_CT_STAT_INC(early_drop); + } + nf_ct_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct nf_conntrack_helper *i, + const struct nf_conntrack_tuple *rtuple) +{ + return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct nf_conntrack_helper * +nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct nf_conntrack_helper *, + tuple); +} + +static struct nf_conn * +__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + const struct nf_conntrack_l3proto *l3proto) +{ + struct nf_conn *conntrack = NULL; + u_int32_t features = 0; + + if (!nf_conntrack_hash_rnd_initted) { + get_random_bytes(&nf_conntrack_hash_rnd, 4); + nf_conntrack_hash_rnd_initted = 1; + } + + if (nf_conntrack_max + && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. */ + if (!early_drop(&nf_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + /* find features needed by this conntrack. */ + features = l3proto->get_features(orig); + read_lock_bh(&nf_conntrack_lock); + if (nf_ct_find_helper(repl) != NULL) + features |= NF_CT_F_HELP; + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("nf_conntrack_alloc: features=0x%x\n", features); + + read_lock_bh(&nf_ct_cache_lock); + + if (!nf_ct_cache[features].use) { + DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n", + features); + goto out; + } + + conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC); + if (conntrack == NULL) { + DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n"); + goto out; + } + + memset(conntrack, 0, nf_ct_cache[features].size); + conntrack->features = features; + if (nf_ct_cache[features].init_conntrack && + nf_ct_cache[features].init_conntrack(conntrack, features) < 0) { + DEBUGP("nf_conntrack_alloc: failed to init\n"); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + conntrack = NULL; + goto out; + } + + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + atomic_inc(&nf_conntrack_count); +out: + read_unlock_bh(&nf_ct_cache_lock); + return conntrack; +} + +struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_conntrack_l3proto *l3proto; + + l3proto = nf_ct_find_l3proto(orig->src.l3num); + return __nf_conntrack_alloc(orig, repl, l3proto); +} + +void nf_conntrack_free(struct nf_conn *conntrack) +{ + u_int32_t features = conntrack->features; + NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM); + DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features, + conntrack); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + atomic_dec(&nf_conntrack_count); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct nf_conntrack_tuple_hash * +init_conntrack(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *protocol, + struct sk_buff *skb, + unsigned int dataoff) +{ + struct nf_conn *conntrack; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_expect *exp; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto); + if (conntrack == NULL || IS_ERR(conntrack)) { + DEBUGP("Can't allocate conntrack.\n"); + return (struct nf_conntrack_tuple_hash *)conntrack; + } + + if (!protocol->new(conntrack, skb, dataoff)) { + nf_conntrack_free(conntrack); + DEBUGP("init conntrack: can't track with proto module\n"); + return NULL; + } + + write_lock_bh(&nf_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#ifdef CONFIG_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + NF_CT_STAT_INC(expect_new); + } else { + conntrack->helper = nf_ct_find_helper(&repl_tuple); + + NF_CT_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + write_unlock_bh(&nf_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + nf_conntrack_expect_put(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct nf_conn * +resolve_normal_ct(struct sk_buff *skb, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto, + int *set_reply, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data), + dataoff, l3num, protonum, &tuple, l3proto, + proto)) { + DEBUGP("resolve_normal_ct: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, l3proto, proto, skb, dataoff); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = nf_ct_tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: normal packet for %p\n", ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: related packet for %p\n", ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("nf_conntrack_in: new packet for %p\n", ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +unsigned int +nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + unsigned int dataoff; + u_int8_t protonum; + int set_reply = 0; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + NF_CT_STAT_INC(ignore); + return NF_ACCEPT; + } + + l3proto = nf_ct_find_l3proto((u_int16_t)pf); + if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { + DEBUGP("not prepared to track yet or error occured\n"); + return -ret; + } + + proto = nf_ct_find_proto((u_int16_t)pf, protonum); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL && + (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -ret; + } + + ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto, + &set_reply, &ctinfo); + if (!ct) { + /* Not valid part of a connection */ + NF_CT_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + NF_CT_STAT_INC(drop); + return NF_DROP; + } + + NF_CT_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do */ + DEBUGP("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + NF_CT_STAT_INC(invalid); + return -ret; + } + + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_STATUS, *pskb); + + return ret; +} + +int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) +{ + return nf_ct_invert_tuple(inverse, orig, + nf_ct_find_l3proto(orig->src.l3num), + nf_ct_find_proto(orig->src.l3num, + orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct nf_conntrack_tuple intersect_mask; + int count; + + intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num; + intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; + intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all; + intersect_mask.dst.protonum = a->mask.dst.protonum + & b->mask.dst.protonum; + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.src.u3.all[count] = + a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; + } + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.dst.u3.all[count] = + a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count]; + } + + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + return a->master == b->master + && nf_ct_tuple_equal(&a->tuple, &b->tuple) + && nf_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. */ +void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_expect *i; + + write_lock_bh(&nf_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(i); + return; + } + } + write_unlock_bh(&nf_conntrack_lock); +} + +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) +{ + struct nf_conntrack_expect *new; + + new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = me; + atomic_set(&new->use, 1); + return new; +} + +void nf_conntrack_expect_put(struct nf_conntrack_expect *exp) +{ + if (atomic_dec_and_test(&exp->use)) + kmem_cache_free(nf_conntrack_expect_cachep, exp); +} + +static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) +{ + atomic_inc(&exp->use); + exp->master->expecting++; + list_add(&exp->list, &nf_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + + atomic_inc(&exp->use); + NF_CT_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. */ +static void evict_oldest_expect(struct nf_conn *master) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + break; + } + } +} + +static inline int refresh_timer(struct nf_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) +{ + struct nf_conntrack_expect *i; + int ret; + + DEBUGP("nf_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask); + + write_lock_bh(&nf_conntrack_lock); + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + nf_conntrack_expect_insert(expect); + nf_conntrack_expect_event(IPEXP_NEW, expect); + ret = 0; +out: + write_unlock_bh(&nf_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __nf_conntrack_confirm */ +void nf_conntrack_alter_reply(struct nf_conn *conntrack, + const struct nf_conntrack_tuple *newreply) +{ + write_lock_bh(&nf_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + NF_CT_DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = nf_ct_find_helper(newreply); + write_unlock_bh(&nf_conntrack_lock); +} + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + int ret; + BUG_ON(me->timeout == 0); + + ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help", + sizeof(struct nf_conn) + + sizeof(union nf_conntrack_help) + + __alignof__(union nf_conntrack_help), + init_conntrack_for_helper); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n"); + return ret; + } + write_lock_bh(&nf_conntrack_lock); + list_prepend(&helpers, me); + write_unlock_bh(&nf_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_helper *me) +{ + if (nf_ct_tuplehash_to_ctrack(i)->helper == me) { + nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i)); + nf_ct_tuplehash_to_ctrack(i)->helper = NULL; + } + return 0; +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + unsigned int i; + struct nf_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + write_lock_bh(&nf_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me); + for (i = 0; i < nf_conntrack_htable_size; i++) + LIST_FIND_W(&nf_conntrack_hash[i], unhelp, + struct nf_conntrack_tuple_hash *, me); + write_unlock_bh(&nf_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. */ + synchronize_net(); +} + +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) +{ + int event = 0; + + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + NF_CT_ASSERT(skb); + + write_lock_bh(&nf_conntrack_lock); + + /* If not in hash table, timer will not be active yet */ + if (!nf_ct_is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + event = IPCT_REFRESH; + } else { + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + event = IPCT_REFRESH; + } + } + +#ifdef CONFIG_NF_CT_ACCT + if (do_acct) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + skb->len - (unsigned int)(skb->nh.raw - skb->data); + if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) + || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) + event |= IPCT_COUNTER_FILLING; + } +#endif + + write_unlock_bh(&nf_conntrack_lock); + + /* must be unlocked when calling event cache */ + if (event) + nf_conntrack_event_cache(event, skb); +} + +/* Used by ipt_REJECT and ip6t_REJECT. */ +void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = nf_ct_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct nf_conntrack_tuple_hash *i, + int (*iter)(struct nf_conn *i, void *data), + void *data) +{ + return iter(nf_ct_tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct nf_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct nf_conn *i, void *data), + void *data, unsigned int *bucket) +{ + struct nf_conntrack_tuple_hash *h = NULL; + + write_lock_bh(&nf_conntrack_lock); + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + write_unlock_bh(&nf_conntrack_lock); + + return h; +} + +void +nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + nf_ct_put(ct); + } +} + +static int kill_all(struct nf_conn *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) +{ + if (vmalloced) + vfree(hash); + else + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(void) +{ + int i; + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + nf_ct_event_cache_flush(); + i_see_dead_people: + nf_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&nf_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + for (i = 0; i < NF_CT_F_NUM; i++) { + if (nf_ct_cache[i].use == 0) + continue; + + NF_CT_ASSERT(nf_ct_cache[i].use == 1); + nf_ct_cache[i].use = 1; + nf_conntrack_unregister_cache(i); + } + kmem_cache_destroy(nf_conntrack_expect_cachep); + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + if (nf_ct_protos[i]) { + kfree(nf_ct_protos[i]); + nf_ct_protos[i] = NULL; + } +} + +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct nf_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehahs for the new table anyway, so we also can + * use a newrandom seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!list_empty(&nf_conntrack_hash[i])) { + h = list_entry(nf_conntrack_hash[i].next, + struct nf_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = nf_conntrack_htable_size; + old_vmalloced = nf_conntrack_vmalloc; + old_hash = nf_conntrack_hash; + + nf_conntrack_htable_size = hashsize; + nf_conntrack_vmalloc = vmalloced; + nf_conntrack_hash = hash; + nf_conntrack_hash_rnd = rnd; + write_unlock_bh(&nf_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +int __init nf_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 8192; + if (nf_conntrack_htable_size < 16) + nf_conntrack_htable_size = 16; + } + nf_conntrack_max = 8 * nf_conntrack_htable_size; + + printk("nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); + + nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size, + &nf_conntrack_vmalloc); + if (!nf_conntrack_hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_out; + } + + ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic", + sizeof(struct nf_conn), NULL); + if (ret < 0) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_free_hash; + } + + nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL, NULL); + if (!nf_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create nf_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < PF_MAX; i++) + nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + nf_conntrack_unregister_cache(NF_CT_F_BASIC); +err_free_hash: + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); +err_out: + return -ENOMEM; +} diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 00000000000..65080e269f2 --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c @@ -0,0 +1,698 @@ +/* FTP extension for connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * - track EPRT and EPSV commands with IPv6 address. + * + * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/ctype.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_ftp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. --RR */ +static char *ftp_buffer; + +static DEFINE_SPINLOCK(nf_ftp_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, + char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +/* This code is based on inet_pton() in glibc-2.2.4 */ +static int +get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) +{ + static const char xdigits[] = "0123456789abcdef"; + u_int8_t tmp[16], *tp, *endp, *colonp; + int ch, saw_xdigit; + u_int32_t val; + size_t clen = 0; + + tp = memset(tmp, '\0', sizeof(tmp)); + endp = tp + sizeof(tmp); + colonp = NULL; + + /* Leading :: requires some special handling. */ + if (*src == ':'){ + if (*++src != ':') { + DEBUGP("invalid \":\" at the head of addr\n"); + return 0; + } + clen++; + } + + saw_xdigit = 0; + val = 0; + while ((clen < dlen) && (*src != term)) { + const char *pch; + + ch = tolower(*src++); + clen++; + + pch = strchr(xdigits, ch); + if (pch != NULL) { + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return 0; + + saw_xdigit = 1; + continue; + } + if (ch != ':') { + DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch); + return 0; + } + + if (!saw_xdigit) { + if (colonp) { + DEBUGP("invalid location of \"::\".\n"); + return 0; + } + colonp = tp; + continue; + } else if (*src == term) { + DEBUGP("trancated IPv6 addr\n"); + return 0; + } + + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + + saw_xdigit = 0; + val = 0; + continue; + } + if (saw_xdigit) { + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + } + if (colonp != NULL) { + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. + */ + const int n = tp - colonp; + int i; + + if (tp == endp) + return 0; + + for (i = 1; i <= n; i++) { + endp[- i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp || (*src != term)) + return 0; + + memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr)); + return clen; +} + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + int length; + u_int32_t array[6]; + + length = try_number(data, dlen, array, 6, ',', term); + if (length == 0) + return 0; + + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + (array[2] << 8) | array[3]); + cmd->u.tcp.port = htons((array[4] << 8) | array[5]); + return length; +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int16_t *port) +{ + u_int16_t tmp_port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (tmp_port == 0) + break; + *port = htons(tmp_port); + DEBUGP("get_port: return %d\n", tmp_port); + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + tmp_port = tmp_port*10 + data[i] - '0'; + else { /* Some other crap */ + DEBUGP("get_port: invalid char.\n"); + break; + } + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ +static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, + then delimiter again. */ + if (dlen <= 3) { + DEBUGP("EPRT: too short\n"); + return 0; + } + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { + DEBUGP("try_eprt: invalid delimitter.\n"); + return 0; + } + + if ((cmd->l3num == PF_INET && data[1] != '1') || + (cmd->l3num == PF_INET6 && data[1] != '2')) { + DEBUGP("EPRT: invalid protocol number.\n"); + return 0; + } + + DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim); + + if (data[1] == '1') { + u_int32_t array[4]; + + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length != 0) + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } else { + /* Now we have IPv6 address. */ + length = get_ipv6_addr(data + 3, dlen - 3, + (struct in6_addr *)cmd->u3.ip6, delim); + } + + if (length == 0) + return 0; + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, + struct nf_conntrack_man *, char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, cmd, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][oldest] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } +} + +static int help(struct sk_buff **pskb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info; + struct nf_conntrack_expect *exp; + struct nf_conntrack_man cmd = {}; + + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = protoff + th->doff * 4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP/IPv6 addr to expected address (it's not mentioned + in EPSV responses) */ + cmd.l3num = ct->tuplehash[dir].tuple.src.l3num; + memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all)); + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, datalen, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + &cmd, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + (int)matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); + + exp = nf_conntrack_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3; + + /* Update the ftp info */ + if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) && + memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all))) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + if (cmd.l3num == PF_INET) { + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + NIPQUAD(cmd.u3.ip), + NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); + } else { + DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n", + NIP6(*((struct in6_addr *)cmd.u3.ip6)), + NIP6(*((struct in6_addr *)ct->tuplehash[dir] + .tuple.src.u3.ip6))); + } + + /* Thanks to Cristiano Lincoln Mattos + <lincoln@cesar.org.br> for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + goto out_put_expect; + } + memcpy(&exp->tuple.dst.u3, &cmd.u3.all, + sizeof(exp->tuple.dst.u3)); + } + + exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3; + exp->tuple.src.l3num = cmd.l3num; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.u.tcp.port = cmd.u.tcp.port; + exp->tuple.dst.protonum = IPPROTO_TCP; + + exp->mask = (struct nf_conntrack_tuple) + { .src = { .l3num = 0xFFFF, + .u = { .tcp = { 0 }}, + }, + .dst = { .protonum = 0xFF, + .u = { .tcp = { 0xFFFF }}, + }, + }; + if (cmd.l3num == PF_INET) { + exp->mask.src.u3.ip = 0xFFFFFFFF; + exp->mask.dst.u3.ip = 0xFFFFFFFF; + } else { + memset(exp->mask.src.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + memset(exp->mask.dst.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + } + + exp->expectfn = NULL; + exp->flags = 0; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (nf_nat_ftp_hook) + ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (nf_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + +out_put_expect: + nf_conntrack_expect_put(exp); + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info, dir, *pskb); + out: + spin_unlock_bh(&nf_ftp_lock); + return ret; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2]; +static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")]; + +/* don't make this __exit, since it's called from __init ! */ +static void fini(void) +{ + int i, j; + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + if (ftp[i][j].me == NULL) + continue; + + DEBUGP("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&ftp[i][j]); + } + } + + kfree(ftp_buffer); +} + +static int __init init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 FTP connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + memset(&ftp[i], 0, sizeof(struct nf_conntrack_helper)); + + ftp[i][0].tuple.src.l3num = PF_INET; + ftp[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; + ftp[i][j].mask.src.u.tcp.port = 0xFFFF; + ftp[i][j].mask.dst.protonum = 0xFF; + ftp[i][j].max_expected = 1; + ftp[i][j].timeout = 5 * 60; /* 5 Minutes */ + ftp[i][j].me = THIS_MODULE; + ftp[i][j].help = help; + tmpname = &ftp_names[i][j][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i][j].name = tmpname; + + DEBUGP("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&ftp[i][j]); + if (ret) { + printk("nf_ct_ftp: failed to register helper " + " for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + fini(); + return ret; + } + } + } + + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 00000000000..7de4f06c63c --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -0,0 +1,98 @@ +/* + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * Based largely upon the original ip_conntrack code which + * had the following copyright information: + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +static int +generic_prepare(struct sk_buff **pskb, unsigned int hooknum, + unsigned int *dataoff, u_int8_t *protonum) +{ + /* Never track !!! */ + return -NF_ACCEPT; +} + + +static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple) + +{ + return NF_CT_F_BASIC; +} + +struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = { + .l3proto = PF_UNSPEC, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .prepare = generic_prepare, + .get_features = generic_get_features, + .me = THIS_MODULE, +}; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 00000000000..36425f6c833 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +unsigned long nf_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_generic_protocol = +{ + .l3proto = PF_UNSPEC, + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 00000000000..3a600f77b4e --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -0,0 +1,670 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/ip_conntrack_sctp.c + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> +#include <linux/seq_file.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DEFINE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long nf_ct_sctp_timeout_closed = 10 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long nf_ct_sctp_timeout_established = 5 DAYS; +static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. */ +static int sctp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + read_lock_bh(&sctp_lock); + state = conntrack->proto.sctp.state; + read_unlock_bh(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ +for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int sctp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return -1; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + write_lock_bh(&sctp_lock); + + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch->flags & 1))) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch->type, oldsctpstate); + write_unlock_bh(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) { + write_unlock_bh(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + ih->init_tag, !CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + write_unlock_bh(&sctp_lock); + } + + nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + enum sctp_conntrack newconntrack; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = { + .l3proto = PF_INET, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = { + .l3proto = PF_INET6, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "nf_conntrack_sctp_timeout_closed", + .data = &nf_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .data = &nf_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .data = &nf_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_sctp_timeout_established", + .data = &nf_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .data = &nf_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .data = &nf_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &nf_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *nf_ct_sysctl_header; +#endif + +int __init init(void) +{ + int ret; + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4); + if (ret) { + printk("nf_conntrack_proto_sctp4: protocol register failed\n"); + goto out; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6); + if (ret) { + printk("nf_conntrack_proto_sctp6: protocol register failed\n"); + goto cleanup_sctp4; + } + +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); +#endif + cleanup_sctp4: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? "failed": "succeeded"); + return ret; +} + +void __exit fini(void) +{ + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 00000000000..5a6fcf349bd --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -0,0 +1,1169 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - genelized Layer 3 protocol part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c + * + * version 2.2 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> + +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.tcp */ +static DEFINE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int nf_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int nf_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int nf_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long nf_ct_tcp_timeout_established = 5 DAYS; +unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. + * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum tcp_conntrack state; + + read_lock_bh(&tcp_lock); + state = conntrack->proto.tcp.state; + read_unlock_bh(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + unsigned int dataoff, + struct tcphdr *tcph) +{ + /* XXX Should I use payload length field in IP/IPv6 header ? + * - YK */ + return (seq + len - dataoff - tcph->doff*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, + struct tcphdr *tcph, __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode = *ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + memcpy(&tmp, (__u32 *)(ptr + i) + 1, + sizeof(__u32)); + tmp = ntohl(tmp); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + int pf) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, dataoff, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = nf_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +/* Caller must linearize skb at tcp header. */ +void nf_conntrack_tcp_update(struct sk_buff *skb, + unsigned int dataoff, + struct nf_conn *conntrack, + int dir) +{ + struct tcphdr *tcph = (void *)skb->data + dataoff; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); + + write_lock_bh(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... + */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + write_unlock_bh(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, + [TH_SYN|TH_ACK|TH_PUSH] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int(*csum)(const struct sk_buff *,unsigned int)) +{ + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - dataoff; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int tcp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int tcp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + write_lock_bh(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); + + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packed ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. */ + write_unlock_bh(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } else { + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid SYN"); + return -NF_ACCEPT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, dataoff, th, pf)) { + write_unlock_bh(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans + ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int tcp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + enum tcp_conntrack new_state; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("nf_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? */ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); +EXPORT_SYMBOL(nf_conntrack_protocol_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 00000000000..3cae7ce420d --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -0,0 +1,216 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +unsigned long nf_ct_udp_timeout = 30*HZ; +unsigned long nf_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + nf_ct_refresh_acct(conntrack, ctinfo, skb, + nf_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + nf_conntrack_event_cache(IPCT_STATUS, skb); + } else + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int (*csum)(const struct sk_buff *, unsigned int)) +{ + unsigned int udplen = skb->len - dataoff; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int udp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int udp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_udp4); +EXPORT_SYMBOL(nf_conntrack_protocol_udp6); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 00000000000..5af381f9fe3 --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c @@ -0,0 +1,869 @@ +/* This file contains all the functions required for the standalone + nf_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - generalize L3 protocol dependent part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#include <linux/netdevice.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t nf_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int kill_l3proto(struct nf_conn *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + ((struct nf_conntrack_l3proto *)data)->l3proto); +} + +static int kill_proto(struct nf_conn *i, void *data) +{ + struct nf_conntrack_protocol *proto; + proto = (struct nf_conntrack_protocol *)data; + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + proto->proto) && + (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + proto->l3proto); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto) +{ + return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < nf_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&nf_conntrack_hash[st->bucket])) + return nf_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &nf_conntrack_hash[st->bucket]) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; + head = nf_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&nf_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +/* return 0 on success, 1 in case of error */ +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct nf_conntrack_tuple_hash *hash = v; + const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + ASSERT_READ_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return 0; + + l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num); + + NF_CT_ASSERT(l3proto); + proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + NF_CT_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %-8s %u %ld ", + l3proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) + return -ENOSPC; + + if (l3proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%u ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &nf_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + read_lock_bh(&nf_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &nf_conntrack_expect_list) + return NULL; + } + return e; +} + +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + ++*pos; + e = e->next; + + if (e == &nf_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + seq_printf(s, "l3proto = %u proto=%u ", + expect->tuple.src.l3num, + expect->tuple.dst.protonum); + print_tuple(s, &expect->tuple, + nf_ct_find_l3proto(expect->tuple.src.l3num), + nf_ct_find_proto(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_PROC_FS */ + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_core.c */ +extern int nf_conntrack_max; +extern unsigned int nf_conntrack_htable_size; + +/* From nf_conntrack_proto_tcp.c */ +extern unsigned long nf_ct_tcp_timeout_syn_sent; +extern unsigned long nf_ct_tcp_timeout_syn_recv; +extern unsigned long nf_ct_tcp_timeout_established; +extern unsigned long nf_ct_tcp_timeout_fin_wait; +extern unsigned long nf_ct_tcp_timeout_close_wait; +extern unsigned long nf_ct_tcp_timeout_last_ack; +extern unsigned long nf_ct_tcp_timeout_time_wait; +extern unsigned long nf_ct_tcp_timeout_close; +extern unsigned long nf_ct_tcp_timeout_max_retrans; +extern int nf_ct_tcp_loose; +extern int nf_ct_tcp_be_liberal; +extern int nf_ct_tcp_max_retrans; + +/* From nf_conntrack_proto_udp.c */ +extern unsigned long nf_ct_udp_timeout; +extern unsigned long nf_ct_udp_timeout_stream; + +/* From nf_conntrack_proto_generic.c */ +extern unsigned long nf_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *nf_ct_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_COUNT, + .procname = "nf_conntrack_count", + .data = &nf_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_BUCKETS, + .procname = "nf_conntrack_buckets", + .data = &nf_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .data = &nf_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .data = &nf_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_tcp_timeout_established", + .data = &nf_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .data = &nf_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "nf_conntrack_tcp_timeout_close_wait", + .data = &nf_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "nf_conntrack_tcp_timeout_last_ack", + .data = &nf_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "nf_conntrack_tcp_timeout_time_wait", + .data = &nf_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "nf_conntrack_tcp_timeout_close", + .data = &nf_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "nf_conntrack_udp_timeout", + .data = &nf_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "nf_conntrack_udp_timeout_stream", + .data = &nf_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "nf_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, + .procname = "nf_conntrack_log_invalid", + .data = &nf_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .data = &nf_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, + .procname = "nf_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "nf_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + + { .ctl_name = 0 } +}; + +#define NET_NF_CONNTRACK_MAX 2089 + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +EXPORT_SYMBOL(nf_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_proc_stat; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); + cleanup_proc_stat: +#endif +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("nf_conntrack_expect"); + cleanup_proc: + proc_net_remove("nf_conntrack"); + cleanup_init: +#endif /* CNFIG_PROC_FS */ + nf_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) { + ret = -EBUSY; + goto out; + } + nf_ct_l3protos[proto->l3proto] = proto; +out: + write_unlock_bh(&nf_conntrack_lock); + + return ret; +} + +void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_l3proto, proto); +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto) +{ + int ret = 0; + +retry: + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + if (nf_ct_protos[proto->l3proto][proto->proto] + != &nf_conntrack_generic_protocol) { + ret = -EBUSY; + goto out_unlock; + } + } else { + /* l3proto may be loaded latter. */ + struct nf_conntrack_protocol **proto_array; + int i; + + write_unlock_bh(&nf_conntrack_lock); + + proto_array = (struct nf_conntrack_protocol **) + kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_protocol *), + GFP_KERNEL); + if (proto_array == NULL) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MAX_NF_CT_PROTO; i++) + proto_array[i] = &nf_conntrack_generic_protocol; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + /* bad timing, but no problem */ + write_unlock_bh(&nf_conntrack_lock); + kfree(proto_array); + } else { + nf_ct_protos[proto->l3proto] = proto_array; + write_unlock_bh(&nf_conntrack_lock); + } + + /* + * Just once because array is never freed until unloading + * nf_conntrack.ko + */ + goto retry; + } + + nf_ct_protos[proto->l3proto][proto->proto] = proto; + +out_unlock: + write_unlock_bh(&nf_conntrack_lock); +out: + return ret; +} + +void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_protos[proto->l3proto][proto->proto] + = &nf_conntrack_generic_protocol; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_proto, proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_nf_conntrack(void) +{ +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(nf_conntrack_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); +EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +#endif +EXPORT_SYMBOL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL(nf_conntrack_protocol_register); +EXPORT_SYMBOL(nf_conntrack_protocol_unregister); +EXPORT_SYMBOL(nf_ct_invert_tuplepr); +EXPORT_SYMBOL(nf_conntrack_alter_reply); +EXPORT_SYMBOL(nf_conntrack_destroyed); +EXPORT_SYMBOL(need_nf_conntrack); +EXPORT_SYMBOL(nf_conntrack_helper_register); +EXPORT_SYMBOL(nf_conntrack_helper_unregister); +EXPORT_SYMBOL(nf_ct_iterate_cleanup); +EXPORT_SYMBOL(__nf_ct_refresh_acct); +EXPORT_SYMBOL(nf_ct_protos); +EXPORT_SYMBOL(nf_ct_find_proto); +EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL(nf_conntrack_expect_alloc); +EXPORT_SYMBOL(nf_conntrack_expect_put); +EXPORT_SYMBOL(nf_conntrack_expect_related); +EXPORT_SYMBOL(nf_conntrack_unexpect_related); +EXPORT_SYMBOL(nf_conntrack_tuple_taken); +EXPORT_SYMBOL(nf_conntrack_htable_size); +EXPORT_SYMBOL(nf_conntrack_lock); +EXPORT_SYMBOL(nf_conntrack_hash); +EXPORT_SYMBOL(nf_conntrack_untracked); +EXPORT_SYMBOL_GPL(nf_conntrack_find_get); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(nf_conntrack_tcp_update); +#endif +EXPORT_SYMBOL(__nf_conntrack_confirm); +EXPORT_SYMBOL(nf_ct_get_tuple); +EXPORT_SYMBOL(nf_ct_invert_tuple); +EXPORT_SYMBOL(nf_conntrack_in); +EXPORT_SYMBOL(__nf_conntrack_attach); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d10d552d9c4..d3a4f30a7f2 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb, /* QUEUE == DROP if noone is waiting, to be safe. */ read_lock(&queue_handler_lock); - if (!queue_handler[pf]->outfn) { + if (!queue_handler[pf] || !queue_handler[pf]->outfn) { read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4bc27a6334c..a60c59b9763 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); } -int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) { memset(tb, 0, sizeof(struct nfattr *) * maxattr); @@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) tb[flavor-1] = nfa; nfa = NFA_NEXT(nfa, len); } - - return 0; } /** @@ -225,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, NFNL_SUBSYS_ID(nlh->nlmsg_type), NFNL_MSG_TYPE(nlh->nlmsg_type)); + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + DEBUGP("missing CAP_NET_ADMIN\n"); + *errp = -EPERM; + return -1; + } + /* Only requests are handled by kernel now. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { DEBUGP("received non-request message\n"); @@ -250,7 +254,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, ss = nfnetlink_get_subsys(type); if (!ss) #endif - goto err_inval; + goto err_inval; } nc = nfnetlink_find_client(type, ss); @@ -259,13 +263,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, goto err_inval; } - if (nc->cap_required && - !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { - DEBUGP("permission denied for type %d\n", type); - *errp = -EPERM; - return -1; - } - { u_int16_t attr_count = ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index efcd10f996b..cba63729313 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; } - inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) goto out_unlock; - memset(inst, 0, sizeof(*inst)); INIT_HLIST_NODE(&inst->hlist); inst->lock = SPIN_LOCK_UNLOCKED; /* needs to be two, since we _put() after creation */ @@ -863,11 +862,9 @@ out_put: static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, - .cap_required = CAP_NET_ADMIN, }, + .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFULA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfulnl_subsys = { @@ -962,10 +959,9 @@ static int nful_open(struct inode *inode, struct file *file) struct iter_state *is; int ret; - is = kmalloc(sizeof(*is), GFP_KERNEL); + is = kzalloc(sizeof(*is), GFP_KERNEL); if (!is) return -ENOMEM; - memset(is, 0, sizeof(*is)); ret = seq_open(file, &nful_seq_ops); if (ret < 0) goto out_free; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index eaa44c49567..f28460b61e4 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid) goto out_unlock; } - inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) goto out_unlock; - memset(inst, 0, sizeof(*inst)); inst->queue_num = queue_num; inst->peer_pid = pid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; @@ -932,14 +931,11 @@ out_put: static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfqnl_subsys = { @@ -1036,10 +1032,9 @@ static int nfqnl_open(struct inode *inode, struct file *file) struct iter_state *is; int ret; - is = kmalloc(sizeof(*is), GFP_KERNEL); + is = kzalloc(sizeof(*is), GFP_KERNEL); if (!is) return -ENOMEM; - memset(is, 0, sizeof(*is)); ret = seq_open(file, &nfqnl_seq_ops); if (ret < 0) goto out_free; diff --git a/net/netlink/Makefile b/net/netlink/Makefile index 39d9c2dcd03..e3589c2de49 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -2,4 +2,4 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o +obj-y := af_netlink.o attr.o genetlink.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 678c3f2c0d0..8c38ee6d255 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -58,6 +58,7 @@ #include <net/sock.h> #include <net/scm.h> +#include <net/netlink.h> #define Nprintk(a...) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -427,7 +428,8 @@ static int netlink_release(struct socket *sock) spin_lock(&nlk->cb_lock); if (nlk->cb) { - nlk->cb->done(nlk->cb); + if (nlk->cb->done) + nlk->cb->done(nlk->cb); netlink_destroy_callback(nlk->cb); nlk->cb = NULL; } @@ -740,11 +742,8 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long t int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) { - struct netlink_sock *nlk; int len = skb->len; - nlk = nlk_sk(sk); - skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); sock_put(sk); @@ -827,7 +826,7 @@ struct netlink_broadcast_data { int failure; int congested; int delivered; - unsigned int allocation; + gfp_t allocation; struct sk_buff *skb, *skb2; }; @@ -1325,7 +1324,8 @@ static int netlink_dump(struct sock *sk) skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); - cb->done(cb); + if (cb->done) + cb->done(cb); nlk->cb = NULL; spin_unlock(&nlk->cb_lock); @@ -1412,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } +static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, int *)) +{ + unsigned int total_len; + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + nlh = (struct nlmsghdr *) skb->data; + + if (skb->len < nlh->nlmsg_len) + return 0; + + total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); + + if (cb(skb, nlh, &err) < 0) { + /* Not an error, but we have to interrupt processing + * here. Note: that in this case we do not pull + * message from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + + skb_pull(skb, total_len); + } + + return 0; +} + +/** + * nelink_run_queue - Process netlink receive queue. + * @sk: Netlink socket containing the queue + * @qlen: Place to store queue length upon entry + * @cb: Callback function invoked for each netlink message found + * + * Processes as much as there was in the queue upon entry and invokes + * a callback function for each netlink message found. The callback + * function may refuse a message by returning a negative error code + * but setting the error pointer to 0 in which case this function + * returns with a qlen != 0. + * + * qlen must be initialized to 0 before the initial entry, afterwards + * the function may be called repeatedly until qlen reaches 0. + */ +void netlink_run_queue(struct sock *sk, unsigned int *qlen, + int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) +{ + struct sk_buff *skb; + + if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (netlink_rcv_skb(skb, cb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { + kfree_skb(skb); + (*qlen)--; + } + break; + } + + kfree_skb(skb); + } +} + +/** + * netlink_queue_skip - Skip netlink message while processing queue. + * @nlh: Netlink message to be skipped + * @skb: Socket buffer containing the netlink messages. + * + * Pulls the given netlink message off the socket buffer so the next + * call to netlink_queue_run() will not reconsider the message. + */ +void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) +{ + int msglen = NLMSG_ALIGN(nlh->nlmsg_len); + + if (msglen > skb->len) + msglen = skb->len; + + skb_pull(skb, msglen); +} #ifdef CONFIG_PROC_FS struct nl_seq_iter { @@ -1660,6 +1748,8 @@ out: core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); +EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_queue_skip); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); diff --git a/net/netlink/attr.c b/net/netlink/attr.c new file mode 100644 index 00000000000..fffef4ab276 --- /dev/null +++ b/net/netlink/attr.c @@ -0,0 +1,328 @@ +/* + * NETLINK Netlink attributes + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> +#include <net/netlink.h> + +static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { + [NLA_U8] = sizeof(u8), + [NLA_U16] = sizeof(u16), + [NLA_U32] = sizeof(u32), + [NLA_U64] = sizeof(u64), + [NLA_STRING] = 1, + [NLA_NESTED] = NLA_HDRLEN, +}; + +static int validate_nla(struct nlattr *nla, int maxtype, + struct nla_policy *policy) +{ + struct nla_policy *pt; + int minlen = 0; + + if (nla->nla_type <= 0 || nla->nla_type > maxtype) + return 0; + + pt = &policy[nla->nla_type]; + + BUG_ON(pt->type > NLA_TYPE_MAX); + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (pt->type == NLA_FLAG && nla_len(nla) > 0) + return -ERANGE; + + if (nla_len(nla) < minlen) + return -ERANGE; + + return 0; +} + +/** + * nla_validate - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Attributes with a type exceeding maxtype will be + * ignored. See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +int nla_validate(struct nlattr *head, int len, int maxtype, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + nla_for_each_attr(nla, head, len, rem) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + err = 0; +errout: + return err; +} + +/** + * nla_parse - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessable via the attribute type. Attributes with a type + * exceeding maxtype will be silently ignored for backwards compatibility + * reasons. policy may be set to NULL if no validation is required. + * + * Returns 0 on success or a negative error code. + */ +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + nla_for_each_attr(nla, head, len, rem) { + u16 type = nla->nla_type; + + if (type > 0 && type <= maxtype) { + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + tb[type] = nla; + } + } + + if (unlikely(rem > 0)) + printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + "attributes.\n", rem); + + err = 0; +errout: + return err; +} + +/** + * nla_find - Find a specific attribute in a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @attrtype: type of attribute to look for + * + * Returns the first attribute in the stream matching the specified type. + */ +struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) +{ + struct nlattr *nla; + int rem; + + nla_for_each_attr(nla, head, len, rem) + if (nla->nla_type == attrtype) + return nla; + + return NULL; +} + +/** + * nla_strlcpy - Copy string attribute payload into a sized buffer + * @dst: where to copy the string to + * @src: attribute to copy the string from + * @dstsize: size of destination buffer + * + * Copies at most dstsize - 1 bytes into the destination buffer. + * The result is always a valid NUL-terminated string. Unlike + * strlcpy the destination buffer is always padded out. + * + * Returns the length of the source buffer. + */ +size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla); + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + if (dstsize > 0) { + size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen; + + memset(dst, 0, dstsize); + memcpy(dst, src, len); + } + + return srclen; +} + +/** + * nla_memcpy - Copy a netlink attribute into another memory area + * @dest: where to copy to memcpy + * @src: netlink attribute to copy from + * @count: size of the destination area + * + * Note: The number of bytes copied is limited by the length of + * attribute's payload. memcpy + * + * Returns the number of bytes copied. + */ +int nla_memcpy(void *dest, struct nlattr *src, int count) +{ + int minlen = min_t(int, count, nla_len(src)); + + memcpy(dest, nla_data(src), minlen); + + return minlen; +} + +/** + * nla_memcmp - Compare an attribute with sized memory area + * @nla: netlink attribute + * @data: memory area + * @size: size of memory area + */ +int nla_memcmp(const struct nlattr *nla, const void *data, + size_t size) +{ + int d = nla_len(nla) - size; + + if (d == 0) + d = memcmp(nla_data(nla), data, size); + + return d; +} + +/** + * nla_strcmp - Compare a string attribute against a string + * @nla: netlink string attribute + * @str: another string + */ +int nla_strcmp(const struct nlattr *nla, const char *str) +{ + int len = strlen(str) + 1; + int d = nla_len(nla) - len; + + if (d == 0) + d = memcmp(nla_data(nla), str, len); + + return d; +} + +/** + * __nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + struct nlattr *nla; + + nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen)); + nla->nla_type = attrtype; + nla->nla_len = nla_attr_size(attrlen); + + memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); + + return nla; +} + +/** + * nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return NULL; + + return __nla_reserve(skb, attrtype, attrlen); +} + +/** + * __nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nlattr *nla; + + nla = __nla_reserve(skb, attrtype, attrlen); + memcpy(nla_data(nla), data, attrlen); +} + + +/** + * nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * Returns -1 if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return -1; + + __nla_put(skb, attrtype, attrlen, data); + return 0; +} + + +EXPORT_SYMBOL(nla_validate); +EXPORT_SYMBOL(nla_parse); +EXPORT_SYMBOL(nla_find); +EXPORT_SYMBOL(nla_strlcpy); +EXPORT_SYMBOL(__nla_reserve); +EXPORT_SYMBOL(nla_reserve); +EXPORT_SYMBOL(__nla_put); +EXPORT_SYMBOL(nla_put); +EXPORT_SYMBOL(nla_memcpy); +EXPORT_SYMBOL(nla_memcmp); +EXPORT_SYMBOL(nla_strcmp); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c new file mode 100644 index 00000000000..287cfcc5695 --- /dev/null +++ b/net/netlink/genetlink.c @@ -0,0 +1,579 @@ +/* + * NETLINK Generic Netlink Family + * + * Authors: Jamal Hadi Salim + * Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/genetlink.h> + +struct sock *genl_sock = NULL; + +static DECLARE_MUTEX(genl_sem); /* serialization of message processing */ + +static void genl_lock(void) +{ + down(&genl_sem); +} + +static int genl_trylock(void) +{ + return down_trylock(&genl_sem); +} + +static void genl_unlock(void) +{ + up(&genl_sem); + + if (genl_sock && genl_sock->sk_receive_queue.qlen) + genl_sock->sk_data_ready(genl_sock, 0); +} + +#define GENL_FAM_TAB_SIZE 16 +#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) + +static struct list_head family_ht[GENL_FAM_TAB_SIZE]; + +static int genl_ctrl_event(int event, void *data); + +static inline unsigned int genl_family_hash(unsigned int id) +{ + return id & GENL_FAM_TAB_MASK; +} + +static inline struct list_head *genl_family_chain(unsigned int id) +{ + return &family_ht[genl_family_hash(id)]; +} + +static struct genl_family *genl_family_find_byid(unsigned int id) +{ + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(id), family_list) + if (f->id == id) + return f; + + return NULL; +} + +static struct genl_family *genl_family_find_byname(char *name) +{ + struct genl_family *f; + int i; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + list_for_each_entry(f, genl_family_chain(i), family_list) + if (strcmp(f->name, name) == 0) + return f; + + return NULL; +} + +static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +{ + struct genl_ops *ops; + + list_for_each_entry(ops, &family->ops_list, ops_list) + if (ops->cmd == cmd) + return ops; + + return NULL; +} + +/* Of course we are going to have problems once we hit + * 2^16 alive types, but that can only happen by year 2K +*/ +static inline u16 genl_generate_id(void) +{ + static u16 id_gen_idx; + int overflowed = 0; + + do { + if (id_gen_idx == 0) + id_gen_idx = GENL_MIN_ID; + + if (++id_gen_idx > GENL_MAX_ID) { + if (!overflowed) { + overflowed = 1; + id_gen_idx = 0; + continue; + } else + return 0; + } + + } while (genl_family_find_byid(id_gen_idx)); + + return id_gen_idx; +} + +/** + * genl_register_ops - register generic netlink operations + * @family: generic netlink family + * @ops: operations to be registered + * + * Registers the specified operations and assigns them to the specified + * family. Either a doit or dumpit callback must be specified or the + * operation will fail. Only one operation structure per command + * identifier may be registered. + * + * See include/net/genetlink.h for more documenation on the operations + * structure. + * + * Returns 0 on success or a negative error code. + */ +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) +{ + int err = -EINVAL; + + if (ops->dumpit == NULL && ops->doit == NULL) + goto errout; + + if (genl_get_cmd(ops->cmd, family)) { + err = -EEXIST; + goto errout; + } + + genl_lock(); + list_add_tail(&ops->ops_list, &family->ops_list); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWOPS, ops); + err = 0; +errout: + return err; +} + +/** + * genl_unregister_ops - unregister generic netlink operations + * @family: generic netlink family + * @ops: operations to be unregistered + * + * Unregisters the specified operations and unassigns them from the + * specified family. The operation blocks until the current message + * processing has finished and doesn't start again until the + * unregister process has finished. + * + * Note: It is not necessary to unregister all operations before + * unregistering the family, unregistering the family will cause + * all assigned operations to be unregistered automatically. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) +{ + struct genl_ops *rc; + + genl_lock(); + list_for_each_entry(rc, &family->ops_list, ops_list) { + if (rc == ops) { + list_del(&ops->ops_list); + genl_unlock(); + genl_ctrl_event(CTRL_CMD_DELOPS, ops); + return 0; + } + } + genl_unlock(); + + return -ENOENT; +} + +/** + * genl_register_family - register a generic netlink family + * @family: generic netlink family + * + * Registers the specified family after validating it first. Only one + * family may be registered with the same family name or identifier. + * The family id may equal GENL_ID_GENERATE causing an unique id to + * be automatically generated and assigned. + * + * Return 0 on success or a negative error code. + */ +int genl_register_family(struct genl_family *family) +{ + int err = -EINVAL; + + if (family->id && family->id < GENL_MIN_ID) + goto errout; + + if (family->id > GENL_MAX_ID) + goto errout; + + INIT_LIST_HEAD(&family->ops_list); + + genl_lock(); + + if (genl_family_find_byname(family->name)) { + err = -EEXIST; + goto errout_locked; + } + + if (genl_family_find_byid(family->id)) { + err = -EEXIST; + goto errout_locked; + } + + if (!try_module_get(family->owner)) { + err = -EBUSY; + goto errout_locked; + } + + if (family->id == GENL_ID_GENERATE) { + u16 newid = genl_generate_id(); + + if (!newid) { + err = -ENOMEM; + goto errout_locked; + } + + family->id = newid; + } + + if (family->maxattr) { + family->attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (family->attrbuf == NULL) { + err = -ENOMEM; + goto errout; + } + } else + family->attrbuf = NULL; + + list_add_tail(&family->family_list, genl_family_chain(family->id)); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); + + return 0; + +errout_locked: + genl_unlock(); +errout: + return err; +} + +/** + * genl_unregister_family - unregister generic netlink family + * @family: generic netlink family + * + * Unregisters the specified family. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_family(struct genl_family *family) +{ + struct genl_family *rc; + + genl_lock(); + + list_for_each_entry(rc, genl_family_chain(family->id), family_list) { + if (family->id != rc->id || strcmp(rc->name, family->name)) + continue; + + list_del(&rc->family_list); + INIT_LIST_HEAD(&family->ops_list); + genl_unlock(); + + module_put(family->owner); + kfree(family->attrbuf); + genl_ctrl_event(CTRL_CMD_DELFAMILY, family); + return 0; + } + + genl_unlock(); + + return -ENOENT; +} + +static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + int *errp) +{ + struct genl_ops *ops; + struct genl_family *family; + struct genl_info info; + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen, err = -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto ignore; + + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ignore; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) { + err = -ENOENT; + goto errout; + } + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + goto errout; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) { + err = -EPERM; + goto errout; + } + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (ops->dumpit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + *errp = err = netlink_dump_start(genl_sock, skb, nlh, + ops->dumpit, NULL); + if (err == 0) + skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len), + skb->len)); + return -1; + } + + if (ops->doit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if (family->attrbuf) { + err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + ops->policy); + if (err < 0) + goto errout; + } + + info.snd_seq = nlh->nlmsg_seq; + info.snd_pid = NETLINK_CB(skb).pid; + info.nlhdr = nlh; + info.genlhdr = nlmsg_data(nlh); + info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; + info.attrs = family->attrbuf; + + *errp = err = ops->doit(skb, &info); + return err; + +ignore: + return 0; + +errout: + *errp = err; + return -1; +} + +static void genl_rcv(struct sock *sk, int len) +{ + unsigned int qlen = 0; + + do { + if (genl_trylock()) + return; + netlink_run_queue(sk, &qlen, &genl_rcv_msg); + genl_unlock(); + } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen); +} + +/************************************************************************** + * Controller + **************************************************************************/ + +static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd, + family->version); + if (hdr == NULL) + return -1; + + NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); + NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); + + return genlmsg_end(skb, hdr); + +nla_put_failure: + return genlmsg_cancel(skb, hdr); +} + +static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) +{ + + int i, n = 0; + struct genl_family *rt; + int chains_to_skip = cb->args[0]; + int fams_to_skip = cb->args[1]; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { + if (i < chains_to_skip) + continue; + n = 0; + list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (++n < fams_to_skip) + continue; + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY) < 0) + goto errout; + } + + fams_to_skip = 0; + } + +errout: + cb->args[0] = i; + cb->args[1] = n; + + return skb->len; +} + +static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, + int seq, int cmd) +{ + struct sk_buff *skb; + int err; + + skb = nlmsg_new(NLMSG_GOODSIZE); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); + if (err < 0) { + nlmsg_free(skb); + return ERR_PTR(err); + } + + return skb; +} + +static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = { + [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, + [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING }, +}; + +static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct genl_family *res = NULL; + int err = -EINVAL; + + if (info->attrs[CTRL_ATTR_FAMILY_ID]) { + u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); + res = genl_family_find_byid(id); + } + + if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { + char name[GENL_NAMSIZ]; + + if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME], + GENL_NAMSIZ) >= GENL_NAMSIZ) + goto errout; + + res = genl_family_find_byname(name); + } + + if (res == NULL) { + err = -ENOENT; + goto errout; + } + + msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq, + CTRL_CMD_NEWFAMILY); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto errout; + } + + err = genlmsg_unicast(msg, info->snd_pid); +errout: + return err; +} + +static int genl_ctrl_event(int event, void *data) +{ + struct sk_buff *msg; + + if (genl_sock == NULL) + return 0; + + switch (event) { + case CTRL_CMD_NEWFAMILY: + case CTRL_CMD_DELFAMILY: + msg = ctrl_build_msg(data, 0, 0, event); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + genlmsg_multicast(msg, 0, GENL_ID_CTRL); + break; + } + + return 0; +} + +static struct genl_ops genl_ctrl_ops = { + .cmd = CTRL_CMD_GETFAMILY, + .doit = ctrl_getfamily, + .dumpit = ctrl_dumpfamily, + .policy = ctrl_policy, +}; + +static struct genl_family genl_ctrl = { + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x1, + .maxattr = CTRL_ATTR_MAX, + .owner = THIS_MODULE, +}; + +static int __init genl_init(void) +{ + int i, err; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + INIT_LIST_HEAD(&family_ht[i]); + + err = genl_register_family(&genl_ctrl); + if (err < 0) + goto errout; + + err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); + if (err < 0) + goto errout_register; + + netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); + genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID, + genl_rcv, THIS_MODULE); + if (genl_sock == NULL) { + panic("GENL: Cannot initialize generic netlink\n"); + return -ENOMEM; + } + + return 0; + +errout_register: + genl_unregister_family(&genl_ctrl); +errout: + panic("GENL: Cannot register controller: %d\n", err); + return err; +} + +subsys_initcall(genl_init); + +EXPORT_SYMBOL(genl_sock); +EXPORT_SYMBOL(genl_register_ops); +EXPORT_SYMBOL(genl_unregister_ops); +EXPORT_SYMBOL(genl_register_family); +EXPORT_SYMBOL(genl_unregister_family); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index e556d92c0bc..8631b65a731 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -240,8 +240,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } @@ -250,8 +249,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if (s->next == rose_neigh) { s->next = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } @@ -727,7 +725,7 @@ int rose_rt_ioctl(unsigned int cmd, void __user *arg) } if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ return -EINVAL; - if (rose_route.ndigis > 8) /* No more than 8 digipeats */ + if (rose_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; err = rose_add_node(&rose_route, dev); dev_put(dev); diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 50ae0371dab..b6c8f38cc26 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -138,6 +138,7 @@ static void rose_heartbeat_expiry(unsigned long param) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + bh_unlock_sock(sk); rose_destroy_socket(sk); return; } diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2d..dbe6105e83a 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -23,6 +23,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> +#include <linux/skbuff.h> #include <net/sock.h> #include <net/ip.h> #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) /* we'll probably need to checksum it (didn't call * sock_recvmsg) */ - if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short) - csum_fold(skb_checksum(pkt, 0, pkt->len, - pkt->csum))) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" CSUM failed"); - return; - } + if (skb_checksum_complete(pkt)) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" CSUM failed"); + return; } addr = pkt->nh.iph->saddr; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 81510da3179..55cd5327fbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -2,13 +2,15 @@ # Traffic control configuration. # -menuconfig NET_SCHED +menu "QoS and/or fair queueing" + +config NET_SCHED bool "QoS and/or fair queueing" ---help--- When the kernel has several packets to send out over a network device, it has to decide which ones to send first, which ones to - delay, and which ones to drop. This is the job of the packet - scheduler, and several different algorithms for how to do this + delay, and which ones to drop. This is the job of the queueing + disciplines, several different algorithms for how to do this "fairly" have been proposed. If you say N here, you will get the standard packet scheduler, which @@ -23,13 +25,13 @@ menuconfig NET_SCHED To administer these schedulers, you'll need the user-level utilities from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>. That package also contains some documentation; for more, check out - <http://snafu.freedom.org/linux2.2/iproute-notes.html>. + <http://linux-net.osdl.org/index.php/Iproute2>. This Quality of Service (QoS) support will enable you to use Differentiated Services (diffserv) and Resource Reservation Protocol - (RSVP) on your Linux router if you also say Y to "QoS support", - "Packet classifier API" and to some classifiers below. Documentation - and software is at <http://diffserv.sourceforge.net/>. + (RSVP) on your Linux router if you also say Y to the corresponding + classifiers below. Documentation and software is at + <http://diffserv.sourceforge.net/>. If you say Y here and to "/proc file system" below, you will be able to read status information about packet schedulers from the file @@ -38,11 +40,12 @@ menuconfig NET_SCHED The available schedulers are listed in the following questions; you can say Y to as many as you like. If unsure, say N now. +if NET_SCHED + choice prompt "Packet scheduler clock source" - depends on NET_SCHED default NET_SCH_CLK_JIFFIES - help + ---help--- Packet schedulers need a monotonic clock that increments at a static rate. The kernel provides several suitable interfaces, each with different properties: @@ -56,7 +59,7 @@ choice config NET_SCH_CLK_JIFFIES bool "Timer interrupt" - help + ---help--- Say Y here if you want to use the timer interrupt (jiffies) as clock source. This clock source is fast, synchronized on all processors and handles cpu clock frequency changes, but its resolution is too low @@ -64,7 +67,7 @@ config NET_SCH_CLK_JIFFIES config NET_SCH_CLK_GETTIMEOFDAY bool "gettimeofday" - help + ---help--- Say Y here if you want to use gettimeofday as clock source. This clock source has high resolution, is synchronized on all processors and handles cpu clock frequency changes, but it is slow. @@ -77,7 +80,7 @@ config NET_SCH_CLK_GETTIMEOFDAY config NET_SCH_CLK_CPU bool "CPU cycle counter" depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 - help + ---help--- Say Y here if you want to use the CPU's cycle counter as clock source. This is a cheap and high resolution clock source, but on some architectures it is not synchronized on all processors and doesn't @@ -95,134 +98,119 @@ config NET_SCH_CLK_CPU endchoice +comment "Queueing/Scheduling" + config NET_SCH_CBQ - tristate "CBQ packet scheduler" - depends on NET_SCHED + tristate "Class Based Queueing (CBQ)" ---help--- Say Y here if you want to use the Class-Based Queueing (CBQ) packet - scheduling algorithm for some of your network devices. This - algorithm classifies the waiting packets into a tree-like hierarchy - of classes; the leaves of this tree are in turn scheduled by - separate algorithms (called "disciplines" in this context). + scheduling algorithm. This algorithm classifies the waiting packets + into a tree-like hierarchy of classes; the leaves of this tree are + in turn scheduled by separate algorithms. - See the top of <file:net/sched/sch_cbq.c> for references about the - CBQ algorithm. + See the top of <file:net/sched/sch_cbq.c> for more details. CBQ is a commonly used scheduler, so if you're unsure, you should say Y here. Then say Y to all the queueing algorithms below that you - want to use as CBQ disciplines. Then say Y to "Packet classifier - API" and say Y to all the classifiers you want to use; a classifier - is a routine that allows you to sort your outgoing traffic into - classes based on a certain criterion. + want to use as leaf disciplines. To compile this code as a module, choose M here: the module will be called sch_cbq. config NET_SCH_HTB - tristate "HTB packet scheduler" - depends on NET_SCHED + tristate "Hierarchical Token Bucket (HTB)" ---help--- Say Y here if you want to use the Hierarchical Token Buckets (HTB) - packet scheduling algorithm for some of your network devices. See + packet scheduling algorithm. See <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and in-depth articles. - HTB is very similar to the CBQ regarding its goals however is has + HTB is very similar to CBQ regarding its goals however is has different properties and different algorithm. To compile this code as a module, choose M here: the module will be called sch_htb. config NET_SCH_HFSC - tristate "HFSC packet scheduler" - depends on NET_SCHED + tristate "Hierarchical Fair Service Curve (HFSC)" ---help--- Say Y here if you want to use the Hierarchical Fair Service Curve - (HFSC) packet scheduling algorithm for some of your network devices. + (HFSC) packet scheduling algorithm. To compile this code as a module, choose M here: the module will be called sch_hfsc. -#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ config NET_SCH_ATM - tristate "ATM pseudo-scheduler" - depends on NET_SCHED && ATM + tristate "ATM Virtual Circuits (ATM)" + depends on ATM ---help--- Say Y here if you want to use the ATM pseudo-scheduler. This - provides a framework for invoking classifiers (aka "filters"), which - in turn select classes of this queuing discipline. Each class maps - the flow(s) it is handling to a given virtual circuit (see the top of - <file:net/sched/sch_atm.c>). + provides a framework for invoking classifiers, which in turn + select classes of this queuing discipline. Each class maps + the flow(s) it is handling to a given virtual circuit. + + See the top of <file:net/sched/sch_atm.c>) for more details. To compile this code as a module, choose M here: the module will be called sch_atm. config NET_SCH_PRIO - tristate "The simplest PRIO pseudoscheduler" - depends on NET_SCHED - help + tristate "Multi Band Priority Queueing (PRIO)" + ---help--- Say Y here if you want to use an n-band priority queue packet - "scheduler" for some of your network devices or as a leaf discipline - for the CBQ scheduling algorithm. If unsure, say Y. + scheduler. To compile this code as a module, choose M here: the module will be called sch_prio. config NET_SCH_RED - tristate "RED queue" - depends on NET_SCHED - help + tristate "Random Early Detection (RED)" + ---help--- Say Y here if you want to use the Random Early Detection (RED) - packet scheduling algorithm for some of your network devices (see - the top of <file:net/sched/sch_red.c> for details and references - about the algorithm). + packet scheduling algorithm. + + See the top of <file:net/sched/sch_red.c> for more details. To compile this code as a module, choose M here: the module will be called sch_red. config NET_SCH_SFQ - tristate "SFQ queue" - depends on NET_SCHED + tristate "Stochastic Fairness Queueing (SFQ)" ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) - packet scheduling algorithm for some of your network devices or as a - leaf discipline for the CBQ scheduling algorithm (see the top of - <file:net/sched/sch_sfq.c> for details and references about the SFQ - algorithm). + packet scheduling algorithm . + + See the top of <file:net/sched/sch_sfq.c> for more details. To compile this code as a module, choose M here: the module will be called sch_sfq. config NET_SCH_TEQL - tristate "TEQL queue" - depends on NET_SCHED + tristate "True Link Equalizer (TEQL)" ---help--- Say Y here if you want to use the True Link Equalizer (TLE) packet - scheduling algorithm for some of your network devices or as a leaf - discipline for the CBQ scheduling algorithm. This queueing - discipline allows the combination of several physical devices into - one virtual device. (see the top of <file:net/sched/sch_teql.c> for - details). + scheduling algorithm. This queueing discipline allows the combination + of several physical devices into one virtual device. + + See the top of <file:net/sched/sch_teql.c> for more details. To compile this code as a module, choose M here: the module will be called sch_teql. config NET_SCH_TBF - tristate "TBF queue" - depends on NET_SCHED - help - Say Y here if you want to use the Simple Token Bucket Filter (TBF) - packet scheduling algorithm for some of your network devices or as a - leaf discipline for the CBQ scheduling algorithm (see the top of - <file:net/sched/sch_tbf.c> for a description of the TBF algorithm). + tristate "Token Bucket Filter (TBF)" + ---help--- + Say Y here if you want to use the Token Bucket Filter (TBF) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_tbf.c> for more details. To compile this code as a module, choose M here: the module will be called sch_tbf. config NET_SCH_GRED - tristate "GRED queue" - depends on NET_SCHED - help + tristate "Generic Random Early Detection (GRED)" + ---help--- Say Y here if you want to use the Generic Random Early Detection (GRED) packet scheduling algorithm for some of your network devices (see the top of <file:net/sched/sch_red.c> for details and @@ -232,9 +220,8 @@ config NET_SCH_GRED module will be called sch_gred. config NET_SCH_DSMARK - tristate "Diffserv field marker" - depends on NET_SCHED - help + tristate "Differentiated Services marker (DSMARK)" + ---help--- Say Y if you want to schedule packets according to the Differentiated Services architecture proposed in RFC 2475. Technical information on this method, with pointers to associated @@ -244,9 +231,8 @@ config NET_SCH_DSMARK module will be called sch_dsmark. config NET_SCH_NETEM - tristate "Network emulator" - depends on NET_SCHED - help + tristate "Network emulator (NETEM)" + ---help--- Say Y if you want to emulate network delay, loss, and packet re-ordering. This is often useful to simulate networks when testing applications or protocols. @@ -258,59 +244,21 @@ config NET_SCH_NETEM config NET_SCH_INGRESS tristate "Ingress Qdisc" - depends on NET_SCHED - help - If you say Y here, you will be able to police incoming bandwidth - and drop packets when this bandwidth exceeds your desired rate. + ---help--- + Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. To compile this code as a module, choose M here: the module will be called sch_ingress. -config NET_QOS - bool "QoS support" - depends on NET_SCHED - ---help--- - Say Y here if you want to include Quality Of Service scheduling - features, which means that you will be able to request certain - rate-of-flow limits for your network devices. - - This Quality of Service (QoS) support will enable you to use - Differentiated Services (diffserv) and Resource Reservation Protocol - (RSVP) on your Linux router if you also say Y to "Packet classifier - API" and to some classifiers below. Documentation and software is at - <http://diffserv.sourceforge.net/>. - - Note that the answer to this question won't directly affect the - kernel: saying N will just cause the configurator to skip all - the questions about QoS support. - -config NET_ESTIMATOR - bool "Rate estimator" - depends on NET_QOS - help - In order for Quality of Service scheduling to work, the current - rate-of-flow for a network device has to be estimated; if you say Y - here, the kernel will do just that. +comment "Classification" config NET_CLS - bool "Packet classifier API" - depends on NET_SCHED - ---help--- - The CBQ scheduling algorithm requires that network packets which are - scheduled to be sent out over a network device be classified - according to some criterion. If you say Y here, you will get a - choice of several different packet classifiers with the following - questions. - - This will enable you to use Differentiated Services (diffserv) and - Resource Reservation Protocol (RSVP) on your Linux router. - Documentation and software is at - <http://diffserv.sourceforge.net/>. + boolean config NET_CLS_BASIC - tristate "Basic classifier" - depends on NET_CLS + tristate "Elementary classification (BASIC)" + select NET_CLS ---help--- Say Y here if you want to be able to classify packets using only extended matches and actions. @@ -319,85 +267,67 @@ config NET_CLS_BASIC module will be called cls_basic. config NET_CLS_TCINDEX - tristate "TC index classifier" - depends on NET_CLS - help - If you say Y here, you will be able to classify outgoing packets - according to the tc_index field of the skb. You will want this - feature if you want to implement Differentiated Services using - sch_dsmark. If unsure, say Y. + tristate "Traffic-Control Index (TCINDEX)" + select NET_CLS + ---help--- + Say Y here if you want to be able to classify packets based on + traffic control indices. You will want this feature if you want + to implement Differentiated Services together with DSMARK. To compile this code as a module, choose M here: the module will be called cls_tcindex. config NET_CLS_ROUTE4 - tristate "Routing table based classifier" - depends on NET_CLS + tristate "Routing decision (ROUTE)" select NET_CLS_ROUTE - help - If you say Y here, you will be able to classify outgoing packets - according to the route table entry they matched. If unsure, say Y. + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets + according to the route table entry they matched. To compile this code as a module, choose M here: the module will be called cls_route. config NET_CLS_ROUTE bool - default n config NET_CLS_FW - tristate "Firewall based classifier" - depends on NET_CLS - help - If you say Y here, you will be able to classify outgoing packets - according to firewall criteria you specified. + tristate "Netfilter mark (FW)" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets + according to netfilter/firewall marks. To compile this code as a module, choose M here: the module will be called cls_fw. config NET_CLS_U32 - tristate "U32 classifier" - depends on NET_CLS - help - If you say Y here, you will be able to classify outgoing packets - according to their destination address. If unsure, say Y. + tristate "Universal 32bit comparisons w/ hashing (U32)" + select NET_CLS + ---help--- + Say Y here to be able to classify packetes using a universal + 32bit pieces based comparison scheme. To compile this code as a module, choose M here: the module will be called cls_u32. config CLS_U32_PERF - bool "U32 classifier performance counters" + bool "Performance counters support" depends on NET_CLS_U32 - help - gathers stats that could be used to tune u32 classifier performance. - Requires a new iproute2 - You MUST NOT turn this on if you dont have an update iproute2. - -config NET_CLS_IND - bool "classify input device (slows things u32/fw) " - depends on NET_CLS_U32 || NET_CLS_FW - help - This option will be killed eventually when a - metadata action appears because it slows things a little - Available only for u32 and fw classifiers. - Requires a new iproute2 - You MUST NOT turn this on if you dont have an update iproute2. + ---help--- + Say Y here to make u32 gather additional statistics useful for + fine tuning u32 classifiers. config CLS_U32_MARK - bool "Use nfmark as a key in U32 classifier" + bool "Netfilter marks support" depends on NET_CLS_U32 && NETFILTER - help - This allows you to match mark in a u32 filter. - Example: - tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \ - match mark 0x0090 0xffff \ - match ip dst 4.4.4.4 \ - flowid 1:90 - You must use a new iproute2 to use this feature. + ---help--- + Say Y here to be able to use netfilter marks as u32 key. config NET_CLS_RSVP - tristate "Special RSVP classifier" - depends on NET_CLS && NET_QOS + tristate "IPv4 Resource Reservation Protocol (RSVP)" + select NET_CLS + select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this @@ -410,31 +340,31 @@ config NET_CLS_RSVP module will be called cls_rsvp. config NET_CLS_RSVP6 - tristate "Special RSVP classifier for IPv6" - depends on NET_CLS && NET_QOS + tristate "IPv6 Resource Reservation Protocol (RSVP6)" + select NET_CLS + select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this is important for real time data such as streaming sound or video. Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the new Internet Protocol - IPv6 as opposed to the older and more common IPv4. + on their RSVP requests and you are using the IPv6. To compile this code as a module, choose M here: the module will be called cls_rsvp6. config NET_EMATCH bool "Extended Matches" - depends on NET_CLS + select NET_CLS ---help--- Say Y here if you want to use extended matches on top of classifiers and select the extended matches below. Extended matches are small classification helpers not worth writing - a separate classifier. + a separate classifier for. - You must have a recent version of the iproute2 tools in order to use + A recent version of the iproute2 package is required to use extended matches. config NET_EMATCH_STACK @@ -468,7 +398,7 @@ config NET_EMATCH_NBYTE module will be called em_nbyte. config NET_EMATCH_U32 - tristate "U32 hashing key" + tristate "U32 key" depends on NET_EMATCH ---help--- Say Y here if you want to be able to classify packets using @@ -496,76 +426,121 @@ config NET_EMATCH_TEXT select TEXTSEARCH_BM select TEXTSEARCH_FSM ---help--- - Say Y here if you want to be ablt to classify packets based on + Say Y here if you want to be able to classify packets based on textsearch comparisons. To compile this code as a module, choose M here: the module will be called em_text. config NET_CLS_ACT - bool "Packet ACTION" - depends on EXPERIMENTAL && NET_CLS && NET_QOS + bool "Actions" + depends on EXPERIMENTAL + select NET_ESTIMATOR ---help--- - This option requires you have a new iproute2. It enables - tc extensions which can be used with tc classifiers. - You MUST NOT turn this on if you dont have an update iproute2. + Say Y here if you want to use traffic control actions. Actions + get attached to classifiers and are invoked after a successful + classification. They are used to overwrite the classification + result, instantly drop or redirect packets, etc. + + A recent version of the iproute2 package is required to use + extended matches. config NET_ACT_POLICE - tristate "Policing Actions" + tristate "Traffic Policing" depends on NET_CLS_ACT ---help--- - If you are using a newer iproute2 select this one, otherwise use one - below to select a policer. - You MUST NOT turn this on if you dont have an update iproute2. + Say Y here if you want to do traffic policing, i.e. strict + bandwidth limiting. This action replaces the existing policing + module. + + To compile this code as a module, choose M here: the + module will be called police. config NET_ACT_GACT - tristate "generic Actions" + tristate "Generic actions" depends on NET_CLS_ACT ---help--- - You must have new iproute2 to use this feature. - This adds simple filtering actions like drop, accept etc. + Say Y here to take generic actions such as dropping and + accepting packets. + + To compile this code as a module, choose M here: the + module will be called gact. config GACT_PROB - bool "generic Actions probability" + bool "Probability support" depends on NET_ACT_GACT ---help--- - Allows generic actions to be randomly or deterministically used. + Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED - tristate "Packet In/Egress redirecton/mirror Actions" + tristate "Redirecting and Mirroring" depends on NET_CLS_ACT ---help--- - requires new iproute2 - This allows packets to be mirrored or redirected to netdevices + Say Y here to allow packets to be mirrored or redirected to + other devices. + + To compile this code as a module, choose M here: the + module will be called mirred. config NET_ACT_IPT - tristate "iptables Actions" + tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES ---help--- - requires new iproute2 - This allows iptables targets to be used by tc filters + Say Y here to be able to invoke iptables targets after succesful + classification. + + To compile this code as a module, choose M here: the + module will be called ipt. config NET_ACT_PEDIT - tristate "Generic Packet Editor Actions" + tristate "Packet Editing" depends on NET_CLS_ACT ---help--- - requires new iproute2 - This allows for packets to be generically edited + Say Y here if you want to mangle the content of packets. -config NET_CLS_POLICE - bool "Traffic policing (needed for in/egress)" - depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y - help - Say Y to support traffic policing (bandwidth limits). Needed for - ingress and egress rate limiting. + To compile this code as a module, choose M here: the + module will be called pedit. config NET_ACT_SIMP - tristate "Simple action" + tristate "Simple Example (Debug)" depends on NET_CLS_ACT ---help--- - You must have new iproute2 to use this feature. - This adds a very simple action for demonstration purposes - The idea is to give action authors a basic example to look at. - All this action will do is print on the console the configured - policy string followed by _ then packet count. + Say Y here to add a simple action for demonstration purposes. + It is meant as an example and for debugging purposes. It will + print a configured policy string followed by the packet count + to the console for every packet that passes by. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called simple. + +config NET_CLS_POLICE + bool "Traffic Policing (obsolete)" + depends on NET_CLS_ACT!=y + select NET_ESTIMATOR + ---help--- + Say Y here if you want to do traffic policing, i.e. strict + bandwidth limiting. This option is obsoleted by the traffic + policer implemented as action, it stays here for compatibility + reasons. + +config NET_CLS_IND + bool "Incoming device classification" + depends on NET_CLS_U32 || NET_CLS_FW + ---help--- + Say Y here to extend the u32 and fw classifier to support + classification based on the incoming device. This option is + likely to disappear in favour of the metadata ematch. + +config NET_ESTIMATOR + bool "Rate estimator" + ---help--- + Say Y here to allow using rate estimators to estimate the current + rate-of-flow for network devices, queues, etc. This module is + automaticaly selected if needed but can be selected manually for + statstical purposes. + +endif # NET_SCHED +endmenu diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 29d8b9a4d16..75470486e40 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -298,8 +298,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 02996ac05c7..520ff716dab 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -525,8 +525,7 @@ reinsert: return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 006168d6937..572f06be3b0 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -555,8 +555,7 @@ insert: goto insert; errout: - if (f) - kfree(f); + kfree(f); errout2: tcf_exts_destroy(tp, &e); return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 404d9d83a7f..9f921174c8a 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -194,8 +194,7 @@ found: } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - if (f) - kfree(f); + kfree(f); return 0; } @@ -442,10 +441,8 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = &tcindex_destroy_element; tcindex_walk(tp,&walker); - if (p->perfect) - kfree(p->perfect); - if (p->h) - kfree(p->h); + kfree(p->perfect); + kfree(p->h); kfree(p); tp->root = NULL; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 364b87d8645..2b670479dde 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -347,7 +347,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); @@ -680,7 +680,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index cf68a59fdc5..700844d49d7 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -561,8 +561,7 @@ static int meta_var_change(struct meta_value *dst, struct rtattr *rta) static void meta_var_destroy(struct meta_value *v) { - if (v->val) - kfree((void *) v->val); + kfree((void *) v->val); } static void meta_var_apply_extras(struct meta_value *v, diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ebfe2e7d21b..64b047c6556 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -298,6 +298,11 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; + if (!rta) { + memset(tree, 0, sizeof(*tree)); + return 0; + } + if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) goto errout; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 25c171c3271..29a2dd9f302 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -15,247 +15,281 @@ * from Ren Liu * - More error checks * - * - * - * For all the glorious comments look at Alexey's sch_red.c + * For all the glorious comments look at include/net/red.h */ #include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/pkt_sched.h> +#include <net/red.h> -#if 1 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif +#define GRED_DEF_PRIO (MAX_DPs / 2) +#define GRED_VQ_MASK (MAX_DPs - 1) struct gred_sched_data; struct gred_sched; struct gred_sched_data { -/* Parameters */ u32 limit; /* HARD maximal queue length */ - u32 qth_min; /* Min average length threshold: A scaled */ - u32 qth_max; /* Max average length threshold: A scaled */ u32 DP; /* the drop pramaters */ - char Wlog; /* log(W) */ - char Plog; /* random number bits */ - u32 Scell_max; - u32 Rmask; u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ - u32 forced; /* packets dropped for exceeding limits */ - u32 early; /* packets dropped as a warning */ - u32 other; /* packets dropped by invoking drop() */ - u32 pdrop; /* packets dropped because we exceeded physical queue limits */ - char Scell_log; - u8 Stab[256]; - u8 prio; /* the prio of this vq */ - -/* Variables */ - unsigned long qave; /* Average queue length: A scaled */ - int qcount; /* Packets since last random number generation */ - u32 qR; /* Cached random number */ - - psched_time_t qidlestart; /* Start of idle period */ + u8 prio; /* the prio of this vq */ + + struct red_parms parms; + struct red_stats stats; +}; + +enum { + GRED_WRED_MODE = 1, + GRED_RIO_MODE, }; struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; - u32 DPs; - u32 def; - u8 initd; - u8 grio; - u8 eqp; + unsigned long flags; + u32 red_flags; + u32 DPs; + u32 def; + struct red_parms wred_set; }; -static int -gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static inline int gred_wred_mode(struct gred_sched *table) { - psched_time_t now; - struct gred_sched_data *q=NULL; - struct gred_sched *t= qdisc_priv(sch); - unsigned long qave=0; - int i=0; + return test_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_enable_wred_mode(struct gred_sched *table) +{ + __set_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_disable_wred_mode(struct gred_sched *table) +{ + __clear_bit(GRED_WRED_MODE, &table->flags); +} + +static inline int gred_rio_mode(struct gred_sched *table) +{ + return test_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_enable_rio_mode(struct gred_sched *table) +{ + __set_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_disable_rio_mode(struct gred_sched *table) +{ + __clear_bit(GRED_RIO_MODE, &table->flags); +} + +static inline int gred_wred_mode_check(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; - if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { - D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); - goto do_enqueue; + /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + int n; + + if (q == NULL) + continue; + + for (n = 0; n < table->DPs; n++) + if (table->tab[n] && table->tab[n] != q && + table->tab[n]->prio == q->prio) + return 1; } + return 0; +} + +static inline unsigned int gred_backlog(struct gred_sched *table, + struct gred_sched_data *q, + struct Qdisc *sch) +{ + if (gred_wred_mode(table)) + return sch->qstats.backlog; + else + return q->backlog; +} + +static inline u16 tc_index_to_dp(struct sk_buff *skb) +{ + return skb->tc_index & GRED_VQ_MASK; +} + +static inline void gred_load_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + q->parms.qavg = table->wred_set.qavg; + q->parms.qidlestart = table->wred_set.qidlestart; +} + +static inline void gred_store_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + table->wred_set.qavg = q->parms.qavg; +} + +static inline int gred_use_ecn(struct gred_sched *t) +{ + return t->red_flags & TC_RED_ECN; +} - if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { - printk("GRED: setting to default (%d)\n ",t->def); - if (!(q=t->tab[t->def])) { - DPRINTK("GRED: setting to default FAILED! dropping!! " - "(%d)\n ", t->def); - goto drop; +static inline int gred_use_harddrop(struct gred_sched *t) +{ + return t->red_flags & TC_RED_HARDDROP; +} + +static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct gred_sched_data *q=NULL; + struct gred_sched *t= qdisc_priv(sch); + unsigned long qavg = 0; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + dp = t->def; + + if ((q = t->tab[dp]) == NULL) { + /* Pass through packets not assigned to a DP + * if no default DP has been configured. This + * allows for DP flows to be left untouched. + */ + if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) + return qdisc_enqueue_tail(skb, sch); + else + goto drop; } + /* fix tc_index? --could be controvesial but needed for requeueing */ - skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; + skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " - "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, - sch->qstats.backlog); - /* sum up all the qaves of prios <= to ours to get the new qave*/ - if (!t->eqp && t->grio) { - for (i=0;i<t->DPs;i++) { - if ((!t->tab[i]) || (i==q->DP)) - continue; - - if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) - qave +=t->tab[i]->qave; + /* sum up all the qaves of prios <= to ours to get the new qave */ + if (!gred_wred_mode(t) && gred_rio_mode(t)) { + int i; + + for (i = 0; i < t->DPs; i++) { + if (t->tab[i] && t->tab[i]->prio < q->prio && + !red_is_idling(&t->tab[i]->parms)) + qavg +=t->tab[i]->parms.qavg; } - + } q->packetsin++; - q->bytesin+=skb->len; + q->bytesin += skb->len; - if (t->eqp && t->grio) { - qave=0; - q->qave=t->tab[t->def]->qave; - q->qidlestart=t->tab[t->def]->qidlestart; - } + if (gred_wred_mode(t)) + gred_load_wred_set(t, q); - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long us_idle; - PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - PSCHED_SET_PASTPERFECT(q->qidlestart); + q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); - q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; - } else { - if (t->eqp) { - q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); - } else { - q->qave += q->backlog - (q->qave >> q->Wlog); - } + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - } - - - if (t->eqp && t->grio) - t->tab[t->def]->qave=q->qave; - - if ((q->qave+qave) < q->qth_min) { - q->qcount = -1; -enqueue: - if (q->backlog + skb->len <= q->limit) { - q->backlog += skb->len; -do_enqueue: - __skb_queue_tail(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; - } else { - q->pdrop++; - } + if (gred_wred_mode(t)) + gred_store_wred_set(t, q); -drop: - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } - if ((q->qave+qave) >= q->qth_max) { - q->qcount = -1; - sch->qstats.overlimits++; - q->forced++; - goto drop; + switch (red_action(&q->parms, q->parms.qavg + qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; } - if (++q->qcount) { - if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) - goto enqueue; - q->qcount = 0; - q->qR = net_random()&q->Rmask; - sch->qstats.overlimits++; - q->early++; - goto drop; + + if (q->backlog + skb->len <= q->limit) { + q->backlog += skb->len; + return qdisc_enqueue_tail(skb, sch); } - q->qR = net_random()&q->Rmask; - goto enqueue; + + q->stats.pdrop++; +drop: + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; } -static int -gred_requeue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch) { + struct gred_sched *t = qdisc_priv(sch); struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); - q= t->tab[(skb->tc_index&0xf)]; -/* error checking here -- probably unnecessary */ - PSCHED_SET_PASTPERFECT(q->qidlestart); - - __skb_queue_head(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->qstats.requeues++; - q->backlog += skb->len; - return 0; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x " + "for requeue, screwing up backlog.\n", + tc_index_to_dp(skb)); + } else { + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); + q->backlog += skb->len; + } + + return qdisc_requeue(skb, sch); } -static struct sk_buff * -gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc* sch) { struct sk_buff *skb; - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched *t = qdisc_priv(sch); + + skb = qdisc_dequeue_head(sch); - skb = __skb_dequeue(&sch->q); if (skb) { - sch->qstats.backlog -= skb->len; - q= t->tab[(skb->tc_index&0xf)]; - if (q) { - q->backlog -= skb->len; - if (!q->backlog && !t->eqp) - PSCHED_GET_TIME(q->qidlestart); + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate " + "VQ 0x%x after dequeue, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { - D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + q->backlog -= skb->len; + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); } + return skb; } - if (t->eqp) { - q= t->tab[t->def]; - if (!q) - D2PRINTK("no default VQ set: Results will be " - "screwed up\n"); - else - PSCHED_GET_TIME(q->qidlestart); - } + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); return NULL; } @@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch) static unsigned int gred_drop(struct Qdisc* sch) { struct sk_buff *skb; + struct gred_sched *t = qdisc_priv(sch); - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); - - skb = __skb_dequeue_tail(&sch->q); + skb = qdisc_dequeue_tail(sch); if (skb) { unsigned int len = skb->len; - sch->qstats.backlog -= len; - sch->qstats.drops++; - q= t->tab[(skb->tc_index&0xf)]; - if (q) { - q->backlog -= len; - q->other++; - if (!q->backlog && !t->eqp) - PSCHED_GET_TIME(q->qidlestart); + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate " + "VQ 0x%x while dropping, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { - D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + q->backlog -= len; + q->stats.other++; + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); } - kfree_skb(skb); + qdisc_drop(skb, sch); return len; } - q=t->tab[t->def]; - if (!q) { - D2PRINTK("no default VQ set: Results might be screwed up\n"); - return 0; - } + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); - PSCHED_GET_TIME(q->qidlestart); return 0; } @@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch) static void gred_reset(struct Qdisc* sch) { int i; - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched *t = qdisc_priv(sch); + + qdisc_reset_queue(sch); - __skb_queue_purge(&sch->q); + for (i = 0; i < t->DPs; i++) { + struct gred_sched_data *q = t->tab[i]; - sch->qstats.backlog = 0; + if (!q) + continue; - for (i=0;i<t->DPs;i++) { - q= t->tab[i]; - if (!q) - continue; - PSCHED_SET_PASTPERFECT(q->qidlestart); - q->qave = 0; - q->qcount = -1; + red_restart(&q->parms); q->backlog = 0; - q->other=0; - q->forced=0; - q->pdrop=0; - q->early=0; } } -static int gred_change(struct Qdisc *sch, struct rtattr *opt) +static inline void gred_destroy_vq(struct gred_sched_data *q) +{ + kfree(q); +} + +static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) { struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; - struct tc_gred_qopt *ctl; struct tc_gred_sopt *sopt; - struct rtattr *tb[TCA_GRED_STAB]; - struct rtattr *tb2[TCA_GRED_DPS]; int i; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { - rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + sopt = RTA_DATA(dps); + + if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) + return -EINVAL; - if (tb2[TCA_GRED_DPS-1] == 0) - return -EINVAL; + sch_tree_lock(sch); + table->DPs = sopt->DPs; + table->def = sopt->def_DP; + table->red_flags = sopt->flags; + + /* + * Every entry point to GRED is synchronized with the above code + * and the DP is checked against DPs, i.e. shadowed VQs can no + * longer be found so we can unlock right here. + */ + sch_tree_unlock(sch); + + if (sopt->grio) { + gred_enable_rio_mode(table); + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); + } else { + gred_disable_rio_mode(table); + gred_disable_wred_mode(table); + } - sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); - table->DPs=sopt->DPs; - table->def=sopt->def_DP; - table->grio=sopt->grio; - table->initd=0; - /* probably need to clear all the table DP entries as well */ - return 0; - } + for (i = table->DPs; i < MAX_DPs; i++) { + if (table->tab[i]) { + printk(KERN_WARNING "GRED: Warning: Destroying " + "shadowed VQ 0x%x\n", i); + gred_destroy_vq(table->tab[i]); + table->tab[i] = NULL; + } + } + return 0; +} - if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || - RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || - RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) - return -EINVAL; +static inline int gred_change_vq(struct Qdisc *sch, int dp, + struct tc_gred_qopt *ctl, int prio, u8 *stab) +{ + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; - ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); - if (ctl->DP > MAX_DPs-1 ) { - /* misbehaving is punished! Put in the default drop probability */ - DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " - "set to default at %d\n",ctl->DP,table->def); - ctl->DP=table->def; - } - - if (table->tab[ctl->DP] == NULL) { - table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), - GFP_KERNEL); - if (NULL == table->tab[ctl->DP]) + if (table->tab[dp] == NULL) { + table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); + if (table->tab[dp] == NULL) return -ENOMEM; - memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); - } - q= table->tab[ctl->DP]; - - if (table->grio) { - if (ctl->prio <=0) { - if (table->def && table->tab[table->def]) { - DPRINTK("\nGRED: DP %u does not have a prio" - "setting default to %d\n",ctl->DP, - table->tab[table->def]->prio); - q->prio=table->tab[table->def]->prio; - } else { - DPRINTK("\nGRED: DP %u does not have a prio" - " setting default to 8\n",ctl->DP); - q->prio=8; - } - } else { - q->prio=ctl->prio; - } - } else { - q->prio=8; + memset(table->tab[dp], 0, sizeof(*q)); } - - q->DP=ctl->DP; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; + q = table->tab[dp]; + q->DP = dp; + q->prio = prio; q->limit = ctl->limit; - q->Scell_log = ctl->Scell_log; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; - q->qave=0; - q->backlog=0; - q->qcount = -1; - q->other=0; - q->forced=0; - q->pdrop=0; - q->early=0; - - PSCHED_SET_PASTPERFECT(q->qidlestart); - memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); - - if ( table->initd && table->grio) { - /* this looks ugly but it's not in the fast path */ - for (i=0;i<table->DPs;i++) { - if ((!table->tab[i]) || (i==q->DP) ) - continue; - if (table->tab[i]->prio == q->prio ){ - /* WRED mode detected */ - table->eqp=1; - break; - } - } - } - if (!table->initd) { - table->initd=1; - /* - the first entry also goes into the default until - over-written - */ - - if (table->tab[table->def] == NULL) { - table->tab[table->def]= - kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); - if (NULL == table->tab[table->def]) - return -ENOMEM; - - memset(table->tab[table->def], 0, - (sizeof(struct gred_sched_data))); - } - q= table->tab[table->def]; - q->DP=table->def; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; - q->limit = ctl->limit; - q->Scell_log = ctl->Scell_log; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; - - if (table->grio) - q->prio=table->tab[ctl->DP]->prio; - else - q->prio=8; - - q->qcount = -1; - PSCHED_SET_PASTPERFECT(q->qidlestart); - memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); - } - return 0; + if (q->backlog == 0) + red_end_of_idle_period(&q->parms); + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, + ctl->Scell_log, stab); + + return 0; } -static int gred_init(struct Qdisc *sch, struct rtattr *opt) +static int gred_change(struct Qdisc *sch, struct rtattr *opt) { struct gred_sched *table = qdisc_priv(sch); - struct tc_gred_sopt *sopt; - struct rtattr *tb[TCA_GRED_STAB]; - struct rtattr *tb2[TCA_GRED_DPS]; + struct tc_gred_qopt *ctl; + struct rtattr *tb[TCA_GRED_MAX]; + int err = -EINVAL, prio = GRED_DEF_PRIO; + u8 *stab; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { - rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) + return gred_change_table_def(sch, opt); + + if (tb[TCA_GRED_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || + tb[TCA_GRED_STAB-1] == NULL || + RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); + stab = RTA_DATA(tb[TCA_GRED_STAB-1]); + + if (ctl->DP >= table->DPs) + goto errout; - if (tb2[TCA_GRED_DPS-1] == 0) - return -EINVAL; + if (gred_rio_mode(table)) { + if (ctl->prio == 0) { + int def_prio = GRED_DEF_PRIO; - sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); - table->DPs=sopt->DPs; - table->def=sopt->def_DP; - table->grio=sopt->grio; - table->initd=0; - return 0; + if (table->tab[table->def]) + def_prio = table->tab[table->def]->prio; + + printk(KERN_DEBUG "GRED: DP %u does not have a prio " + "setting default to %d\n", ctl->DP, def_prio); + + prio = def_prio; + } else + prio = ctl->prio; + } + + sch_tree_lock(sch); + + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + if (err < 0) + goto errout_locked; + + if (gred_rio_mode(table)) { + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); } - DPRINTK("\n GRED_INIT error!\n"); - return -EINVAL; + err = 0; + +errout_locked: + sch_tree_unlock(sch); +errout: + return err; } -static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +static int gred_init(struct Qdisc *sch, struct rtattr *opt) { - unsigned long qave; - struct rtattr *rta; - struct tc_gred_qopt *opt = NULL ; - struct tc_gred_qopt *dst; - struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; - int i; - unsigned char *b = skb->tail; + struct rtattr *tb[TCA_GRED_MAX]; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + return -EINVAL; - opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); + if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) + return -EINVAL; - if (opt == NULL) { - DPRINTK("gred_dump:failed to malloc for %Zd\n", - sizeof(struct tc_gred_qopt)*MAX_DPs); - goto rtattr_failure; - } + return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); +} - memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); +static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct gred_sched *table = qdisc_priv(sch); + struct rtattr *parms, *opts = NULL; + int i; + struct tc_gred_sopt sopt = { + .DPs = table->DPs, + .def_DP = table->def, + .grio = gred_rio_mode(table), + .flags = table->red_flags, + }; - if (!table->initd) { - DPRINTK("NO GRED Queues setup!\n"); - } + opts = RTA_NEST(skb, TCA_OPTIONS); + RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + parms = RTA_NEST(skb, TCA_GRED_PARMS); + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct tc_gred_qopt opt; - for (i=0;i<MAX_DPs;i++) { - dst= &opt[i]; - q= table->tab[i]; + memset(&opt, 0, sizeof(opt)); if (!q) { /* hack -- fix at some point with proper message This is how we indicate to tc that there is no VQ at this DP */ - dst->DP=MAX_DPs+i; - continue; + opt.DP = MAX_DPs + i; + goto append_opt; } - dst->limit=q->limit; - dst->qth_min=q->qth_min>>q->Wlog; - dst->qth_max=q->qth_max>>q->Wlog; - dst->DP=q->DP; - dst->backlog=q->backlog; - if (q->qave) { - if (table->eqp && table->grio) { - q->qidlestart=table->tab[table->def]->qidlestart; - q->qave=table->tab[table->def]->qave; - } - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long idle; - psched_time_t now; - PSCHED_GET_TIME(now); - idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; - dst->qave = qave >> q->Wlog; - - } else { - dst->qave = q->qave >> q->Wlog; - } - } else { - dst->qave = 0; + opt.limit = q->limit; + opt.DP = q->DP; + opt.backlog = q->backlog; + opt.prio = q->prio; + opt.qth_min = q->parms.qth_min >> q->parms.Wlog; + opt.qth_max = q->parms.qth_max >> q->parms.Wlog; + opt.Wlog = q->parms.Wlog; + opt.Plog = q->parms.Plog; + opt.Scell_log = q->parms.Scell_log; + opt.other = q->stats.other; + opt.early = q->stats.prob_drop; + opt.forced = q->stats.forced_drop; + opt.pdrop = q->stats.pdrop; + opt.packets = q->packetsin; + opt.bytesin = q->bytesin; + + if (gred_wred_mode(table)) { + q->parms.qidlestart = + table->tab[table->def]->parms.qidlestart; + q->parms.qavg = table->tab[table->def]->parms.qavg; } - - - dst->Wlog = q->Wlog; - dst->Plog = q->Plog; - dst->Scell_log = q->Scell_log; - dst->other = q->other; - dst->forced = q->forced; - dst->early = q->early; - dst->pdrop = q->pdrop; - dst->prio = q->prio; - dst->packets=q->packetsin; - dst->bytesin=q->bytesin; + + opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + +append_opt: + RTA_APPEND(skb, sizeof(opt), &opt); } - RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); - rta->rta_len = skb->tail - b; + RTA_NEST_END(skb, parms); - kfree(opt); - return skb->len; + return RTA_NEST_END(skb, opts); rtattr_failure: - if (opt) - kfree(opt); - DPRINTK("gred_dump: FAILURE!!!!\n"); - -/* also free the opt struct here */ - skb_trim(skb, b - skb->data); - return -1; + return RTA_NEST_CANCEL(skb, opts); } static void gred_destroy(struct Qdisc *sch) @@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch) struct gred_sched *table = qdisc_priv(sch); int i; - for (i = 0;i < table->DPs; i++) { + for (i = 0; i < table->DPs; i++) { if (table->tab[i]) - kfree(table->tab[i]); + gred_destroy_vq(table->tab[i]); } } static struct Qdisc_ops gred_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, @@ -621,10 +599,13 @@ static int __init gred_module_init(void) { return register_qdisc(&gred_qdisc_ops); } -static void __exit gred_module_exit(void) + +static void __exit gred_module_exit(void) { unregister_qdisc(&gred_qdisc_ops); } + module_init(gred_module_init) module_exit(gred_module_exit) + MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bb9bf8d5003..82fb07aa06a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,6 +25,8 @@ #include <net/pkt_sched.h> +#define VERSION "1.1" + /* Network Emulation Queuing algorithm. ==================================== @@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; + psched_tdiff_t delay; + + delay = tabledist(q->latency, q->jitter, + &q->delay_cor, q->delay_dist); + PSCHED_GET_TIME(now); - PSCHED_TADD2(now, tabledist(q->latency, q->jitter, - &q->delay_cor, q->delay_dist), - cb->time_to_send); + PSCHED_TADD2(now, delay, cb->time_to_send); ++q->counter; ret = q->qdisc->enqueue(skb, q->qdisc); } else { @@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; psched_time_t now; - long delay; /* if more time remaining? */ PSCHED_GET_TIME(now); - delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); - pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); - if (delay <= 0) { + + if (PSCHED_TLESS(cb->time_to_send, now)) { pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; sch->flags &= ~TCQ_F_THROTTLED; return skb; - } + } else { + psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); + + if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; - mod_timer(&q->timer, jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; + /* After this qlen is confused */ + printk(KERN_ERR "netem: queue discpline %s could not requeue\n", + q->qdisc->ops->id); - if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) - sch->qstats.drops++; + sch->q.qlen--; + } + + mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); + sch->flags |= TCQ_F_THROTTLED; + } } return NULL; @@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch) del_timer_sync(&q->timer); } +/* Pass size change message down to embedded FIFO */ static int set_fifo_limit(struct Qdisc *q, int limit) { struct rtattr *rta; int ret = -ENOMEM; + /* Hack to avoid sending change message to non-FIFO */ + if (strncmp(q->ops->id + 1, "fifo", 4) != 0) + return 0; + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (rta) { rta->rta_type = RTM_NEWQDISC; @@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) return 0; } +/* + * Special case version of FIFO queue for use by netem. + * It queues in order based on timestamps in skb's + */ +struct fifo_sched_data { + u32 limit; +}; + +static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *list = &sch->q; + const struct netem_skb_cb *ncb + = (const struct netem_skb_cb *)nskb->cb; + struct sk_buff *skb; + + if (likely(skb_queue_len(list) < q->limit)) { + skb_queue_reverse_walk(list, skb) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + + if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) + break; + } + + __skb_queue_after(list, skb, nskb); + + sch->qstats.backlog += nskb->len; + sch->bstats.bytes += nskb->len; + sch->bstats.packets++; + + return NET_XMIT_SUCCESS; + } + + return qdisc_drop(nskb, sch); +} + +static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (opt) { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } else + q->limit = max_t(u32, sch->dev->tx_queue_len, 1); + + return 0; +} + +static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct tc_fifo_qopt opt = { .limit = q->limit }; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + return -1; +} + +static struct Qdisc_ops tfifo_qdisc_ops = { + .id = "tfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = tfifo_enqueue, + .dequeue = qdisc_dequeue_head, + .requeue = qdisc_requeue, + .drop = qdisc_queue_drop, + .init = tfifo_init, + .reset = qdisc_reset_queue, + .change = tfifo_init, + .dump = tfifo_dump, +}; + static int netem_init(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) q->timer.function = netem_watchdog; q->timer.data = (unsigned long) sch; - q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); if (!q->qdisc) { pr_debug("netem: qdisc create failed\n"); return -ENOMEM; @@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = { static int __init netem_module_init(void) { + pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7845d045eec..dccfa44c2d7 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -9,76 +9,23 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: - * J Hadi Salim <hadi@nortel.com> 980914: computation fixes + * J Hadi Salim 980914: computation fixes * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. - * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support + * J Hadi Salim 980816: ECN support */ #include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#include <net/dsfield.h> +#include <net/red.h> -/* Random Early Detection (RED) algorithm. - ======================================= - - Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways - for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. - - This file codes a "divisionless" version of RED algorithm - as written down in Fig.17 of the paper. - -Short description. ------------------- - - When a new packet arrives we calculate the average queue length: - - avg = (1-W)*avg + W*current_queue_len, - - W is the filter time constant (chosen as 2^(-Wlog)), it controls - the inertia of the algorithm. To allow larger bursts, W should be - decreased. - - if (avg > th_max) -> packet marked (dropped). - if (avg < th_min) -> packet passes. - if (th_min < avg < th_max) we calculate probability: - - Pb = max_P * (avg - th_min)/(th_max-th_min) - - and mark (drop) packet with this probability. - Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). - max_P should be small (not 1), usually 0.01..0.02 is good value. - - max_P is chosen as a number, so that max_P/(th_max-th_min) - is a negative power of two in order arithmetics to contain - only shifts. - - - Parameters, settable by user: +/* Parameters, settable by user: ----------------------------- limit - bytes (must be > qth_max + burst) @@ -89,243 +36,93 @@ Short description. arbitrarily high (well, less than ram size) Really, this limit will never be reached if RED works correctly. - - qth_min - bytes (should be < qth_max/2) - qth_max - bytes (should be at least 2*qth_min and less limit) - Wlog - bits (<32) log(1/W). - Plog - bits (<32) - - Plog is related to max_P by formula: - - max_P = (qth_max-qth_min)/2^Plog; - - F.e. if qth_max=128K and qth_min=32K, then Plog=22 - corresponds to max_P=0.02 - - Scell_log - Stab - - Lookup table for log((1-W)^(t/t_ave). - - -NOTES: - -Upper bound on W. ------------------ - - If you want to allow bursts of L packets of size S, - you should choose W: - - L + 1 - th_min/S < (1-(1-W)^L)/W - - th_min/S = 32 th_min/S = 4 - - log(W) L - -1 33 - -2 35 - -3 39 - -4 46 - -5 57 - -6 75 - -7 101 - -8 135 - -9 190 - etc. */ struct red_sched_data { -/* Parameters */ - u32 limit; /* HARD maximal queue length */ - u32 qth_min; /* Min average length threshold: A scaled */ - u32 qth_max; /* Max average length threshold: A scaled */ - u32 Rmask; - u32 Scell_max; - unsigned char flags; - char Wlog; /* log(W) */ - char Plog; /* random number bits */ - char Scell_log; - u8 Stab[256]; - -/* Variables */ - unsigned long qave; /* Average queue length: A scaled */ - int qcount; /* Packets since last random number generation */ - u32 qR; /* Cached random number */ - - psched_time_t qidlestart; /* Start of idle period */ - struct tc_red_xstats st; + u32 limit; /* HARD maximal queue length */ + unsigned char flags; + struct red_parms parms; + struct red_stats stats; }; -static int red_ecn_mark(struct sk_buff *skb) +static inline int red_use_ecn(struct red_sched_data *q) { - if (skb->nh.raw + 20 > skb->tail) - return 0; - - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - if (INET_ECN_is_not_ect(skb->nh.iph->tos)) - return 0; - IP_ECN_set_ce(skb->nh.iph); - return 1; - case __constant_htons(ETH_P_IPV6): - if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) - return 0; - IP6_ECN_set_ce(skb->nh.ipv6h); - return 1; - default: - return 0; - } + return q->flags & TC_RED_ECN; } -static int -red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static inline int red_use_harddrop(struct red_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - psched_time_t now; + q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog); - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long us_idle; - int shift; + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - PSCHED_SET_PASTPERFECT(q->qidlestart); + switch (red_action(&q->parms, q->parms.qavg)) { + case RED_DONT_MARK: + break; -/* - The problem: ideally, average length queue recalcultion should - be done over constant clock intervals. This is too expensive, so that - the calculation is driven by outgoing packets. - When the queue is idle we have to model this clock by hand. - - SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) - dummy packets as a burst after idle time, i.e. - - q->qave *= (1-W)^m - - This is an apparently overcomplicated solution (f.e. we have to precompute - a table to make this calculation in reasonable time) - I believe that a simpler model may be used here, - but it is field for experiments. -*/ - shift = q->Stab[us_idle>>q->Scell_log]; - - if (shift) { - q->qave >>= shift; - } else { - /* Approximate initial part of exponent - with linear function: - (1-W)^m ~= 1-mW + ... - - Seems, it is the best solution to - problem of too coarce exponent tabulation. - */ - - us_idle = (q->qave * us_idle)>>q->Scell_log; - if (us_idle < q->qave/2) - q->qave -= us_idle; - else - q->qave >>= 1; - } - } else { - q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); - /* NOTE: - q->qave is fixed point number with point at Wlog. - The formulae above is equvalent to floating point - version: - - qave = qave*(1-W) + sch->qstats.backlog*W; - --ANK (980924) - */ - } + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } - if (q->qave < q->qth_min) { - q->qcount = -1; -enqueue: - if (sch->qstats.backlog + skb->len <= q->limit) { - __skb_queue_tail(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return NET_XMIT_SUCCESS; - } else { - q->st.pdrop++; - } - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } - if (q->qave >= q->qth_max) { - q->qcount = -1; - sch->qstats.overlimits++; -mark: - if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { - q->st.early++; - goto drop; - } - q->st.marked++; - goto enqueue; - } + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } - if (++q->qcount) { - /* The formula used below causes questions. - - OK. qR is random number in the interval 0..Rmask - i.e. 0..(2^Plog). If we used floating point - arithmetics, it would be: (2^Plog)*rnd_num, - where rnd_num is less 1. - - Taking into account, that qave have fixed - point at Wlog, and Plog is related to max_P by - max_P = (qth_max-qth_min)/2^Plog; two lines - below have the following floating point equivalent: - - max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount - - Any questions? --ANK (980924) - */ - if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) - goto enqueue; - q->qcount = 0; - q->qR = net_random()&q->Rmask; - sch->qstats.overlimits++; - goto mark; + q->stats.forced_mark++; + break; } - q->qR = net_random()&q->Rmask; - goto enqueue; -drop: - kfree_skb(skb); - sch->qstats.drops++; + if (sch->qstats.backlog + skb->len <= q->limit) + return qdisc_enqueue_tail(skb, sch); + + q->stats.pdrop++; + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); return NET_XMIT_CN; } -static int -red_requeue(struct sk_buff *skb, struct Qdisc* sch) +static int red_requeue(struct sk_buff *skb, struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - PSCHED_SET_PASTPERFECT(q->qidlestart); + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - __skb_queue_head(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->qstats.requeues++; - return 0; + return qdisc_requeue(skb, sch); } -static struct sk_buff * -red_dequeue(struct Qdisc* sch) +static struct sk_buff * red_dequeue(struct Qdisc* sch) { struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); - skb = __skb_dequeue(&sch->q); - if (skb) { - sch->qstats.backlog -= skb->len; - return skb; - } - PSCHED_GET_TIME(q->qidlestart); - return NULL; + skb = qdisc_dequeue_head(sch); + + if (skb == NULL && !red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + + return skb; } static unsigned int red_drop(struct Qdisc* sch) @@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch) struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); - skb = __skb_dequeue_tail(&sch->q); + skb = qdisc_dequeue_tail(sch); if (skb) { unsigned int len = skb->len; - sch->qstats.backlog -= len; - sch->qstats.drops++; - q->st.other++; - kfree_skb(skb); + q->stats.other++; + qdisc_drop(skb, sch); return len; } - PSCHED_GET_TIME(q->qidlestart); + + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + return 0; } @@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - __skb_queue_purge(&sch->q); - sch->qstats.backlog = 0; - PSCHED_SET_PASTPERFECT(q->qidlestart); - q->qave = 0; - q->qcount = -1; + qdisc_reset_queue(sch); + red_restart(&q->parms); } static int red_change(struct Qdisc *sch, struct rtattr *opt) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_RED_STAB]; + struct rtattr *tb[TCA_RED_MAX]; struct tc_red_qopt *ctl; - if (opt == NULL || - rtattr_parse_nested(tb, TCA_RED_STAB, opt) || - tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) + return -EINVAL; + + if (tb[TCA_RED_PARMS-1] == NULL || RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || - RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) + tb[TCA_RED_STAB-1] == NULL || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) return -EINVAL; ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); sch_tree_lock(sch); q->flags = ctl->flags; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_log = ctl->Scell_log; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; q->limit = ctl->limit; - memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); - q->qcount = -1; + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + RTA_DATA(tb[TCA_RED_STAB-1])); + if (skb_queue_empty(&sch->q)) - PSCHED_SET_PASTPERFECT(q->qidlestart); + red_end_of_idle_period(&q->parms); + sch_tree_unlock(sch); return 0; } @@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt) static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; - struct tc_red_qopt opt; - - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - opt.limit = q->limit; - opt.qth_min = q->qth_min>>q->Wlog; - opt.qth_max = q->qth_max>>q->Wlog; - opt.Wlog = q->Wlog; - opt.Plog = q->Plog; - opt.Scell_log = q->Scell_log; - opt.flags = q->flags; + struct rtattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = RTA_NEST(skb, TCA_OPTIONS); RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - - return skb->len; + return RTA_NEST_END(skb, opts); rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + return RTA_NEST_CANCEL(skb, opts); } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); - - return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); + struct tc_red_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .marked = q->stats.prob_mark + q->stats.forced_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops red_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, .id = "red", .priv_size = sizeof(struct red_sched_data), .enqueue = red_enqueue, @@ -450,10 +243,13 @@ static int __init red_module_init(void) { return register_qdisc(&red_qdisc_ops); } -static void __exit red_module_exit(void) + +static void __exit red_module_exit(void) { unregister_qdisc(&red_qdisc_ops); } + module_init(red_module_init) module_exit(red_module_exit) + MODULE_LICENSE("GPL"); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 12b0f582a66..dec68a60477 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->max_burst = sctp_max_burst; - /* Copy things from the endpoint. */ + /* initialize association timers */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * asoc->rto_max; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + sp->autoclose * HZ; + + /* Initilizes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - asoc->timeouts[i] = ep->timeouts[i]; init_timer(&asoc->timers[i]); asoc->timers[i].function = sctp_timer_events[i]; asoc->timers[i].data = (unsigned long) asoc; @@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a * RFC 6 - A SCTP receiver MUST be able to receive a minimum of * 1500 bytes in one SCTP packet. */ - if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) + if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) asoc->rwnd = SCTP_DEFAULT_MINWINDOW; else - asoc->rwnd = sk->sk_rcvbuf; + asoc->rwnd = sk->sk_rcvbuf/2; asoc->a_rwnd = asoc->rwnd; @@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Set the sndbuf size for transmit. */ asoc->sndbuf_used = 0; + /* Initialize the receive memory counter */ + atomic_set(&asoc->rmem_alloc, 0); + init_waitqueue_head(&asoc->wait); asoc->c.my_vtag = sctp_generate_tag(ep); @@ -344,9 +367,7 @@ void sctp_association_free(struct sctp_association *asoc) } /* Free peer's cached cookie. */ - if (asoc->peer.cookie) { - kfree(asoc->peer.cookie); - } + kfree(asoc->peer.cookie); /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { @@ -382,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) spin_unlock_bh(&sctp_assocs_id_lock); } + BUG_TRAP(!atomic_read(&asoc->rmem_alloc)); + if (asoc->base.malloced) { kfree(asoc); SCTP_DBG_OBJCNT_DEC(assoc); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 96984f7a2d6..67bd53070ee 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { - struct sctp_sock *sp = sctp_sk(sk); memset(ep, 0, sizeof(struct sctp_endpoint)); /* Initialize the base structure. */ @@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); - /* Set up the base timeout information. */ - ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; - - /* sctpimpguide-05 Section 2.12.2 - * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the - * recommended value of 5 times 'RTO.Max'. - */ - ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] - = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); - - ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; - ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; - /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + /* Get the receive buffer policy for this endpoint */ + ep->rcvbuf_policy = sctp_rcvbuf_policy; + /* Initialize the secret key used with cookie. */ get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); ep->last_key = ep->current_key = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 28f32243397..b24ff2c1aef 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) return 0; } -/* The free routine for skbuffs that sctp receives */ -static void sctp_rfree(struct sk_buff *skb) -{ - atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); - sock_rfree(skb); -} - -/* The ownership wrapper routine to do receive buffer accounting */ -static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_set_owner_r(skb,sk); - skb->destructor = sctp_rfree; - atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); -} - struct sctp_input_cb { union { struct inet_skb_parm h4; @@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb) rcvr = &ep->base; } - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - goto discard_release; - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb) } SCTP_INPUT_CB(skb)->chunk = chunk; - sctp_rcv_set_owner_r(skb,sk); - /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index b74f7772b57..6e4dc28874d 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -69,9 +69,7 @@ fold_field(void *mib[], int nr) unsigned long res = 0; int i; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_cpu(i) { res += *((unsigned long *) (((void *) per_cpu_ptr(mib[0], i)) + sizeof (unsigned long) * nr)); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 26de4d3e1bd..f775d78aa59 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc, { struct rtable *rt = (struct rtable *)dst; + if (!asoc) + return; + if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = asoc->base.bind_addr.port; @@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void) /* Sendbuffer growth - do per-socket accounting */ sctp_sndbuf_policy = 0; + /* Rcvbuffer growth - do per-socket accounting */ + sctp_rcvbuf_policy = 0; + /* HB.interval - 30 seconds */ sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 10e82ec2ebd..f9573eba5c7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -254,8 +254,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, aiparam.adaption_ind = htonl(sp->adaption_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); nodata: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } @@ -347,8 +346,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, nomem_chunk: kfree(cookie); nomem_cookie: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } @@ -554,7 +552,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, dp.ppid = sinfo->sinfo_ppid; /* Set the flags for an unordered send. */ - if (sinfo->sinfo_flags & MSG_UNORDERED) { + if (sinfo->sinfo_flags & SCTP_UNORDERED) { flags |= SCTP_DATA_UNORDERED; dp.ssn = 0; } else diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f84173ea8ec..823947170a3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { NULL, sctp_generate_t4_rto_event, sctp_generate_t5_shutdown_guard_event, - sctp_generate_heartbeat_event, + NULL, sctp_generate_sack_event, sctp_generate_autoclose_event, }; @@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, * increased due to timer expirations. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; + asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; + asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED) || diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 505c7de10c5..475bfb4972d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, sctp_verb_t deliver; int tmp; __u32 tsn; + int account_value; + struct sock *sk = asoc->base.sk; data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); @@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* ASSERT: Now skb->data is really the user data. */ + /* + * if we are established, and we have used up our receive + * buffer memory, drop the frame + */ + if (asoc->state == SCTP_STATE_ESTABLISHED) { + /* + * If the receive buffer policy is 1, then each + * association can allocate up to sk_rcvbuf bytes + * otherwise, all the associations in aggregate + * may allocate up to sk_rcvbuf bytes + */ + if (asoc->ep->rcvbuf_policy) + account_value = atomic_read(&asoc->rmem_alloc); + else + account_value = atomic_read(&sk->sk_rmem_alloc); + + if (account_value > sk->sk_rcvbuf) + return SCTP_IERROR_IGNORE_TSN; + } + /* Process ECN based congestion. * * Since the chunk structure is reused for all chunks within diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 02e068d3450..abab81f3818 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1010,6 +1010,19 @@ static int __sctp_connect(struct sock* sk, err = -EAGAIN; goto out_free; } + } else { + /* + * If an unprivileged user inherits a 1-many + * style socket with open associations on a + * privileged port, it MAY be permitted to + * accept new associations, but it SHOULD NOT + * be permitted to open new associations. + */ + if (ep->base.bind_addr.port < PROT_SOCK && + !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto out_free; + } } scope = sctp_scope(&to); @@ -1389,27 +1402,27 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, SCTP_DEBUG_PRINTK("msg_len: %zu, sinfo_flags: 0x%x\n", msg_len, sinfo_flags); - /* MSG_EOF or MSG_ABORT cannot be set on a TCP-style socket. */ - if (sctp_style(sk, TCP) && (sinfo_flags & (MSG_EOF | MSG_ABORT))) { + /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */ + if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) { err = -EINVAL; goto out_nounlock; } - /* If MSG_EOF is set, no data can be sent. Disallow sending zero - * length messages when MSG_EOF|MSG_ABORT is not set. - * If MSG_ABORT is set, the message length could be non zero with + /* If SCTP_EOF is set, no data can be sent. Disallow sending zero + * length messages when SCTP_EOF|SCTP_ABORT is not set. + * If SCTP_ABORT is set, the message length could be non zero with * the msg_iov set to the user abort reason. */ - if (((sinfo_flags & MSG_EOF) && (msg_len > 0)) || - (!(sinfo_flags & (MSG_EOF|MSG_ABORT)) && (msg_len == 0))) { + if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) || + (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) { err = -EINVAL; goto out_nounlock; } - /* If MSG_ADDR_OVER is set, there must be an address + /* If SCTP_ADDR_OVER is set, there must be an address * specified in msg_name. */ - if ((sinfo_flags & MSG_ADDR_OVER) && (!msg->msg_name)) { + if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) { err = -EINVAL; goto out_nounlock; } @@ -1458,14 +1471,14 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, goto out_unlock; } - if (sinfo_flags & MSG_EOF) { + if (sinfo_flags & SCTP_EOF) { SCTP_DEBUG_PRINTK("Shutting down association: %p\n", asoc); sctp_primitive_SHUTDOWN(asoc, NULL); err = 0; goto out_unlock; } - if (sinfo_flags & MSG_ABORT) { + if (sinfo_flags & SCTP_ABORT) { SCTP_DEBUG_PRINTK("Aborting association: %p\n", asoc); sctp_primitive_ABORT(asoc, msg); err = 0; @@ -1477,7 +1490,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, if (!asoc) { SCTP_DEBUG_PRINTK("There is no association yet.\n"); - if (sinfo_flags & (MSG_EOF | MSG_ABORT)) { + if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) { err = -EINVAL; goto out_unlock; } @@ -1515,6 +1528,19 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, err = -EAGAIN; goto out_unlock; } + } else { + /* + * If an unprivileged user inherits a one-to-many + * style socket with open associations on a privileged + * port, it MAY be permitted to accept new associations, + * but it SHOULD NOT be permitted to open new + * associations. + */ + if (ep->base.bind_addr.port < PROT_SOCK && + !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto out_unlock; + } } scope = sctp_scope(&to); @@ -1611,10 +1637,10 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, /* If an address is passed with the sendto/sendmsg call, it is used * to override the primary destination address in the TCP model, or - * when MSG_ADDR_OVER flag is set in the UDP model. + * when SCTP_ADDR_OVER flag is set in the UDP model. */ if ((sctp_style(sk, TCP) && msg_name) || - (sinfo_flags & MSG_ADDR_OVER)) { + (sinfo_flags & SCTP_ADDR_OVER)) { chunk_tp = sctp_assoc_lookup_paddr(asoc, &to); if (!chunk_tp) { err = -EINVAL; @@ -1906,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; - sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; return 0; } @@ -2306,16 +2331,14 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; - if ((val < 8) || (val > SCTP_MAX_CHUNK_LEN)) + if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))) return -EINVAL; sp->user_frag = val; - if (val) { - /* Update the frag_point of the existing associations. */ - list_for_each(pos, &(sp->ep->asocs)) { - asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); - } + /* Update the frag_point of the existing associations. */ + list_for_each(pos, &(sp->ep->asocs)) { + asoc = list_entry(pos, struct sctp_association, asocs); + asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); } return 0; @@ -2384,14 +2407,14 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva static int sctp_setsockopt_adaption_layer(struct sock *sk, char __user *optval, int optlen) { - __u32 val; + struct sctp_setadaption adaption; - if (optlen < sizeof(__u32)) + if (optlen != sizeof(struct sctp_setadaption)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(__u32))) + if (copy_from_user(&adaption, optval, optlen)) return -EFAULT; - sctp_sk(sk)->adaption_ind = val; + sctp_sk(sk)->adaption_ind = adaption.ssb_adaption_ind; return 0; } @@ -3672,17 +3695,15 @@ static int sctp_getsockopt_primary_addr(struct sock *sk, int len, static int sctp_getsockopt_adaption_layer(struct sock *sk, int len, char __user *optval, int __user *optlen) { - __u32 val; + struct sctp_setadaption adaption; - if (len < sizeof(__u32)) + if (len != sizeof(struct sctp_setadaption)) return -EINVAL; - len = sizeof(__u32); - val = sctp_sk(sk)->adaption_ind; - if (put_user(len, optlen)) - return -EFAULT; - if (copy_to_user(optval, &val, len)) + adaption.ssb_adaption_ind = sctp_sk(sk)->adaption_ind; + if (copy_to_user(optval, &adaption, len)) return -EFAULT; + return 0; } @@ -4640,8 +4661,8 @@ SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg, /* Minimally, validate the sinfo_flags. */ if (cmsgs->info->sinfo_flags & - ~(MSG_UNORDERED | MSG_ADDR_OVER | - MSG_ABORT | MSG_EOF)) + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -5093,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); + skb_set_owner_r(skb, newsk); } } @@ -5122,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); + skb_set_owner_r(skb, newsk); } } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 75b28dd634f..fcd7096c953 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -121,6 +121,14 @@ static ctl_table sctp_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_SCTP_RCVBUF_POLICY, + .procname = "rcvbuf_policy", + .data = &sctp_rcvbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_SCTP_PATH_MAX_RETRANS, .procname = "path_max_retrans", .data = &sctp_max_retrans_path, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 057e7fac3af..ba97f974f57 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); -/* Stub skb destructor. */ -static void sctp_stub_rfree(struct sk_buff *skb) -{ -/* WARNING: This function is just a warning not to use the - * skb destructor. If the skb is shared, we may get the destructor - * callback on some processor that does not own the sock_lock. This - * was occuring with PACKET socket applications that were monitoring - * our skbs. We can't take the sock_lock, because we can't risk - * recursing if we do really own the sock lock. Instead, do all - * of our rwnd manipulation while we own the sock_lock outright. - */ -} - /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) { @@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); - skb->sk = asoc->base.sk; event->asoc = (struct sctp_association *)asoc; - skb->destructor = sctp_stub_rfree; + atomic_add(skb->truesize, &event->asoc->rmem_alloc); + skb_set_owner_r(skb, asoc->base.sk); } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { - sctp_association_put(event->asoc); + struct sctp_association *asoc = event->asoc; + struct sk_buff *skb = sctp_event2skb(event); + + atomic_sub(skb->truesize, &asoc->rmem_alloc); + sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. @@ -698,7 +689,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, event->ssn = ntohs(chunk->subh.data_hdr->ssn); event->ppid = chunk->subh.data_hdr->ppid; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { - event->flags |= MSG_UNORDERED; + event->flags |= SCTP_UNORDERED; event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); } event->tsn = ntohl(chunk->subh.data_hdr->tsn); @@ -824,7 +815,7 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, * * recvmsg() flags: * - * MSG_UNORDERED - This flag is present when the message was sent + * SCTP_UNORDERED - This flag is present when the message was sent * non-ordered. */ sinfo.sinfo_flags = event->flags; @@ -839,7 +830,7 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, * This field will hold the current cumulative TSN as * known by the underlying SCTP layer. Note this field is * ignored when sending and only valid for a receive - * operation when sinfo_flags are set to MSG_UNORDERED. + * operation when sinfo_flags are set to SCTP_UNORDERED. */ sinfo.sinfo_cumtsn = event->cumtsn; /* sinfo_assoc_id: sizeof (sctp_assoc_t) @@ -922,7 +913,6 @@ done: /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. - * See comments in sctp_stub_rfree(). */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 46a2ce00a29..cdcab9ca4c6 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -sunrpc-y := clnt.o xprt.o sched.o \ +sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 505e2d4b3d6..8c7756036e9 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> -#include <linux/socket.h> #include <linux/sunrpc/clnt.h> #include <linux/spinlock.h> @@ -300,11 +299,10 @@ put_rpccred(struct rpc_cred *cred) void rpcauth_unbindcred(struct rpc_task *task) { - struct rpc_auth *auth = task->tk_auth; struct rpc_cred *cred = task->tk_msg.rpc_cred; dprintk("RPC: %4d releasing %s cred %p\n", - task->tk_pid, auth->au_ops->au_name, cred); + task->tk_pid, task->tk_auth->au_ops->au_name, cred); put_rpccred(cred); task->tk_msg.rpc_cred = NULL; @@ -313,22 +311,22 @@ rpcauth_unbindcred(struct rpc_task *task) u32 * rpcauth_marshcred(struct rpc_task *task, u32 *p) { - struct rpc_auth *auth = task->tk_auth; struct rpc_cred *cred = task->tk_msg.rpc_cred; dprintk("RPC: %4d marshaling %s cred %p\n", - task->tk_pid, auth->au_ops->au_name, cred); + task->tk_pid, task->tk_auth->au_ops->au_name, cred); + return cred->cr_ops->crmarshal(task, p); } u32 * rpcauth_checkverf(struct rpc_task *task, u32 *p) { - struct rpc_auth *auth = task->tk_auth; struct rpc_cred *cred = task->tk_msg.rpc_cred; dprintk("RPC: %4d validating %s cred %p\n", - task->tk_pid, auth->au_ops->au_name, cred); + task->tk_pid, task->tk_auth->au_ops->au_name, cred); + return cred->cr_ops->crvalidate(task, p); } @@ -364,12 +362,12 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, int rpcauth_refreshcred(struct rpc_task *task) { - struct rpc_auth *auth = task->tk_auth; struct rpc_cred *cred = task->tk_msg.rpc_cred; int err; dprintk("RPC: %4d refreshing %s cred %p\n", - task->tk_pid, auth->au_ops->au_name, cred); + task->tk_pid, task->tk_auth->au_ops->au_name, cred); + err = cred->cr_ops->crrefresh(task); if (err < 0) task->tk_status = err; diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index fe1b874084b..f3431a7e33d 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o + gss_krb5_seqnum.o gss_krb5_wrap.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 2f7b867161d..f44f46f1d8e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -42,9 +42,8 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> -#include <linux/socket.h> -#include <linux/in.h> #include <linux/sched.h> +#include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> @@ -846,10 +845,8 @@ gss_marshal(struct rpc_task *task, u32 *p) /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ - iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + req->rq_snd_buf.head[0].iov_base); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); @@ -857,9 +854,7 @@ gss_marshal(struct rpc_task *task, u32 *p) *p++ = htonl(RPC_AUTH_GSS); mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, - &verf_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) { cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; } else if (maj_stat != 0) { @@ -890,10 +885,8 @@ static u32 * gss_validate(struct rpc_task *task, u32 *p) { struct rpc_cred *cred = task->tk_msg.rpc_cred; - struct gss_cred *gss_cred = container_of(cred, struct gss_cred, - gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); - u32 seq, qop_state; + u32 seq; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; @@ -914,23 +907,14 @@ gss_validate(struct rpc_task *task, u32 *p) mic.data = (u8 *)p; mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat) goto out_bad; - switch (gss_cred->gc_service) { - case RPC_GSS_SVC_NONE: - /* verifier data, flavor, length: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2; - break; - case RPC_GSS_SVC_INTEGRITY: - /* verifier data, flavor, length, length, sequence number: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; - break; - case RPC_GSS_SVC_PRIVACY: - goto out_bad; - } + /* We leave it to unwrap to calculate au_rslack. For now we just + * calculate the length of the verifier: */ + task->tk_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n", task->tk_pid); @@ -975,8 +959,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, p = iov->iov_base + iov->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, &integ_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); status = -EIO; /* XXX? */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; @@ -990,6 +973,113 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static void +priv_release_snd_buf(struct rpc_rqst *rqstp) +{ + int i; + + for (i=0; i < rqstp->rq_enc_pages_num; i++) + __free_page(rqstp->rq_enc_pages[i]); + kfree(rqstp->rq_enc_pages); +} + +static int +alloc_enc_pages(struct rpc_rqst *rqstp) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + int first, last, i; + + if (snd_buf->page_len == 0) { + rqstp->rq_enc_pages_num = 0; + return 0; + } + + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; + rqstp->rq_enc_pages_num = last - first + 1 + 1; + rqstp->rq_enc_pages + = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), + GFP_NOFS); + if (!rqstp->rq_enc_pages) + goto out; + for (i=0; i < rqstp->rq_enc_pages_num; i++) { + rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); + if (rqstp->rq_enc_pages[i] == NULL) + goto out_free; + } + rqstp->rq_release_snd_buf = priv_release_snd_buf; + return 0; +out_free: + for (i--; i >= 0; i--) { + __free_page(rqstp->rq_enc_pages[i]); + } +out: + return -EAGAIN; +} + +static inline int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + u32 offset; + u32 maj_stat; + int status; + u32 *opaque_len; + struct page **inpages; + int first; + int pad; + struct kvec *iov; + char *tmp; + + opaque_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) + return status; + + status = alloc_enc_pages(rqstp); + if (status) + return status; + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; + /* Give the tail its own page, in case we need extra space in the + * head when wrapping: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); + /* RPC_SLACK_SPACE should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was + * done anyway, so it's safe to put the request on the wire: */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) + return status; + + *opaque_len = htonl(snd_buf->len - offset); + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + pad = 3 - ((snd_buf->len - offset - 1) & 3); + memset(p, 0, pad); + iov->iov_len += pad; + snd_buf->len += pad; + + return 0; +} + static int gss_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, u32 *p, void *obj) @@ -1017,6 +1107,8 @@ gss_wrap_req(struct rpc_task *task, rqstp, p, obj); break; case RPC_GSS_SVC_PRIVACY: + status = gss_wrap_req_priv(cred, ctx, encode, + rqstp, p, obj); break; } out: @@ -1054,8 +1146,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) return status; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, - &mic, NULL); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) @@ -1063,6 +1154,35 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static inline int +gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + u32 offset; + u32 opaque_len; + u32 maj_stat; + int status = -EIO; + + opaque_len = ntohl(*(*p)++); + offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + if (offset + opaque_len > rcv_buf->len) + return status; + /* remove padding: */ + rcv_buf->len = offset + opaque_len; + + maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + return 0; +} + + static int gss_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, u32 *p, void *obj) @@ -1071,6 +1191,9 @@ gss_unwrap_resp(struct rpc_task *task, struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 *savedp = p; + struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head; + int savedlen = head->iov_len; int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) @@ -1084,8 +1207,14 @@ gss_unwrap_resp(struct rpc_task *task, goto out; break; case RPC_GSS_SVC_PRIVACY: + status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); + if (status) + goto out; break; } + /* take into account extra slack for integrity and privacy cases: */ + task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp) + + (savedlen - head->iov_len); out_decode: status = decode(rqstp, p, obj); out: diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index ee6ae74cd1b..97c981fa6b8 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -37,7 +37,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/slab.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <linux/crypto.h> #include <linux/highmem.h> #include <linux/pagemap.h> @@ -75,9 +75,7 @@ krb5_encrypt( memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg[0].page = virt_to_page(out); - sg[0].offset = offset_in_page(out); - sg[0].length = length; + sg_set_buf(sg, out, length); ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); @@ -117,9 +115,7 @@ krb5_decrypt( memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg[0].page = virt_to_page(out); - sg[0].offset = offset_in_page(out); - sg[0].length = length; + sg_set_buf(sg, out, length); ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); @@ -132,24 +128,91 @@ out: EXPORT_SYMBOL(krb5_decrypt); -static void -buf_to_sg(struct scatterlist *sg, char *ptr, int len) { - sg->page = virt_to_page(ptr); - sg->offset = offset_in_page(ptr); - sg->length = len; +static int +process_xdr_buf(struct xdr_buf *buf, int offset, int len, + int (*actor)(struct scatterlist *, void *), void *data) +{ + int i, page_len, thislen, page_offset, ret = 0; + struct scatterlist sg[1]; + + if (offset >= buf->head[0].iov_len) { + offset -= buf->head[0].iov_len; + } else { + thislen = buf->head[0].iov_len - offset; + if (thislen > len) + thislen = len; + sg_set_buf(sg, buf->head[0].iov_base + offset, thislen); + ret = actor(sg, data); + if (ret) + goto out; + offset = 0; + len -= thislen; + } + if (len == 0) + goto out; + + if (offset >= buf->page_len) { + offset -= buf->page_len; + } else { + page_len = buf->page_len - offset; + if (page_len > len) + page_len = len; + len -= page_len; + page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); + i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - page_offset; + do { + if (thislen > page_len) + thislen = page_len; + sg->page = buf->pages[i]; + sg->offset = page_offset; + sg->length = thislen; + ret = actor(sg, data); + if (ret) + goto out; + page_len -= thislen; + i++; + page_offset = 0; + thislen = PAGE_CACHE_SIZE; + } while (page_len != 0); + offset = 0; + } + if (len == 0) + goto out; + + if (offset < buf->tail[0].iov_len) { + thislen = buf->tail[0].iov_len - offset; + if (thislen > len) + thislen = len; + sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen); + ret = actor(sg, data); + len -= thislen; + } + if (len != 0) + ret = -EINVAL; +out: + return ret; +} + +static int +checksummer(struct scatterlist *sg, void *data) +{ + struct crypto_tfm *tfm = (struct crypto_tfm *)data; + + crypto_digest_update(tfm, sg, 1); + + return 0; } /* checksum the plaintext data and hdrlen bytes of the token header */ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, - struct xdr_netobj *cksum) + int body_offset, struct xdr_netobj *cksum) { char *cksumname; struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ struct scatterlist sg[1]; u32 code = GSS_S_FAILURE; - int len, thislen, offset; - int i; switch (cksumtype) { case CKSUMTYPE_RSA_MD5: @@ -167,35 +230,10 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, goto out; crypto_digest_init(tfm); - buf_to_sg(sg, header, hdrlen); + sg_set_buf(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); - if (body->head[0].iov_len) { - buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } - - len = body->page_len; - if (len != 0) { - offset = body->page_base & (PAGE_CACHE_SIZE - 1); - i = body->page_base >> PAGE_CACHE_SHIFT; - thislen = PAGE_CACHE_SIZE - offset; - do { - if (thislen > len) - thislen = len; - sg->page = body->pages[i]; - sg->offset = offset; - sg->length = thislen; - crypto_digest_update(tfm, sg, 1); - len -= thislen; - i++; - offset = 0; - thislen = PAGE_CACHE_SIZE; - } while(len != 0); - } - if (body->tail[0].iov_len) { - buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } + process_xdr_buf(body, body_offset, body->len - body_offset, + checksummer, tfm); crypto_digest_final(tfm, cksum->data); code = 0; out: @@ -204,3 +242,154 @@ out: } EXPORT_SYMBOL(make_checksum); + +struct encryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + int pos; + struct xdr_buf *outbuf; + struct page **pages; + struct scatterlist infrags[4]; + struct scatterlist outfrags[4]; + int fragno; + int fraglen; +}; + +static int +encryptor(struct scatterlist *sg, void *data) +{ + struct encryptor_desc *desc = data; + struct xdr_buf *outbuf = desc->outbuf; + struct page *in_page; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + int page_pos; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->infrags[desc->fragno] = *sg; + desc->outfrags[desc->fragno] = *sg; + + page_pos = desc->pos - outbuf->head[0].iov_len; + if (page_pos >= 0 && page_pos < outbuf->page_len) { + /* pages are not in place: */ + int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; + in_page = desc->pages[i]; + } else { + in_page = sg->page; + } + desc->infrags[desc->fragno].page = in_page; + desc->fragno++; + desc->fraglen += sg->length; + desc->pos += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->outfrags[0].page = sg->page; + desc->outfrags[0].offset = sg->offset + sg->length - fraglen; + desc->outfrags[0].length = fraglen; + desc->infrags[0] = desc->outfrags[0]; + desc->infrags[0].page = in_page; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, + struct page **pages) +{ + int ret; + struct encryptor_desc desc; + + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.pos = offset; + desc.outbuf = buf; + desc.pages = pages; + desc.fragno = 0; + desc.fraglen = 0; + + ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); + return ret; +} + +EXPORT_SYMBOL(gss_encrypt_xdr_buf); + +struct decryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + struct scatterlist frags[4]; + int fragno; + int fraglen; +}; + +static int +decryptor(struct scatterlist *sg, void *data) +{ + struct decryptor_desc *desc = data; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->frags[desc->fragno] = *sg; + desc->fragno++; + desc->fraglen += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->frags[0].page = sg->page; + desc->frags[0].offset = sg->offset + sg->length - fraglen; + desc->frags[0].length = fraglen; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) +{ + struct decryptor_desc desc; + + /* XXXJBF: */ + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.fragno = 0; + desc.fraglen = 0; + return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); +} + +EXPORT_SYMBOL(gss_decrypt_xdr_buf); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 606a8a82caf..5f1f806a0b1 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -39,7 +39,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/sunrpc/auth.h> -#include <linux/in.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> #include <linux/crypto.h> @@ -191,43 +190,12 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { kfree(kctx); } -static u32 -gss_verify_mic_kerberos(struct gss_ctx *ctx, - struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) { - u32 maj_stat = 0; - int qop_state; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, - KG_TOK_MIC_MSG); - if (!maj_stat && qop_state) - *qstate = qop_state; - - dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); - return maj_stat; -} - -static u32 -gss_get_mic_kerberos(struct gss_ctx *ctx, - u32 qop, - struct xdr_buf *message, - struct xdr_netobj *mic_token) { - u32 err = 0; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); - - dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); - - return err; -} - static struct gss_api_ops gss_kerberos_ops = { .gss_import_sec_context = gss_import_sec_context_kerberos, .gss_get_mic = gss_get_mic_kerberos, .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, .gss_delete_sec_context = gss_delete_sec_context_kerberos, }; @@ -242,6 +210,11 @@ static struct pf_desc gss_kerberos_pfs[] = { .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", }, + [2] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .service = RPC_GSS_SVC_PRIVACY, + .name = "krb5p", + }, }; static struct gss_api_mech gss_kerberos_mech = { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index afeeb8715a7..d0dfdfd5e79 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,22 +70,13 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static inline int -gss_krb5_padding(int blocksize, int length) { - /* Most of the code is block-size independent but in practice we - * use only 8: */ - BUG_ON(blocksize != 8); - return 8 - (length & 7); -} - u32 -krb5_make_token(struct krb5_ctx *ctx, int qop_req, - struct xdr_buf *text, struct xdr_netobj *token, - int toktype) +gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, + struct xdr_netobj *token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; s32 checksum_type; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; - int blocksize = 0, tmsglen; unsigned char *ptr, *krb5_hdr, *msg_start; s32 now; @@ -93,9 +84,6 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, now = get_seconds(); - if (qop_req != 0) - goto out_err; - switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: checksum_type = CKSUMTYPE_RSA_MD5; @@ -111,21 +99,13 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, goto out_err; } - if (toktype == KG_TOK_WRAP_MSG) { - blocksize = crypto_tfm_alg_blocksize(ctx->enc); - tmsglen = blocksize + text->len - + gss_krb5_padding(blocksize, blocksize + text->len); - } else { - tmsglen = 0; - } - - token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); + token->len = g_token_size(&ctx->mech_used, 22); ptr = token->data; - g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); + g_make_token_header(&ctx->mech_used, 22, &ptr); - *ptr++ = (unsigned char) ((toktype>>8)&0xff); - *ptr++ = (unsigned char) (toktype&0xff); + *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ krb5_hdr = ptr - 2; @@ -133,17 +113,9 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); memset(krb5_hdr + 4, 0xff, 4); - if (toktype == KG_TOK_WRAP_MSG) - *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX removing support for now */ - goto out_err; - } else { /* Sign only. */ - if (make_checksum(checksum_type, krb5_hdr, 8, text, - &md5cksum)) + if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) goto out_err; - } switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: @@ -171,6 +143,6 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); out_err: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return GSS_S_FAILURE; } diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 8767fc53183..db055fd7d77 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -68,21 +68,14 @@ #endif -/* message_buffer is an input if toktype is MIC and an output if it is WRAP: - * If toktype is MIC: read_token is a mic token, and message_buffer is the - * data that the mic was supposedly taken over. - * If toktype is WRAP: read_token is a wrap token, and message_buffer is used - * to return the decrypted data. - */ +/* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. */ -/* XXX will need to change prototype and/or just split into a separate function - * when we add privacy (because read_token will be in pages too). */ u32 -krb5_read_token(struct krb5_ctx *ctx, - struct xdr_netobj *read_token, - struct xdr_buf *message_buffer, - int *qop_state, int toktype) +gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; int signalg; int sealalg; s32 checksum_type; @@ -100,16 +93,12 @@ krb5_read_token(struct krb5_ctx *ctx, read_token->len)) goto out; - if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) + if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || + (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) goto out; /* XXX sanity-check bodysize?? */ - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX gone */ - goto out; - } - /* get the sign and seal algorithms */ signalg = ptr[0] + (ptr[1] << 8); @@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx, if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) goto out; - if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || - ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) - goto out; - - /* in the current spec, there is only one valid seal algorithm per - key type, so a simple comparison is ok */ - - if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) + if (sealalg != 0xffff) goto out; /* there are several mappings of seal algorithms to sign algorithms, @@ -154,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx, switch (signalg) { case SGN_ALG_DES_MAC_MD5: ret = make_checksum(checksum_type, ptr - 2, 8, - message_buffer, &md5cksum); + message_buffer, 0, &md5cksum); if (ret) goto out; @@ -175,9 +157,6 @@ krb5_read_token(struct krb5_ctx *ctx, /* it got through unscathed. Make sure the context is unexpired */ - if (qop_state) - *qop_state = GSS_C_QOP_DEFAULT; - now = get_seconds(); ret = GSS_S_CONTEXT_EXPIRED; @@ -197,6 +176,6 @@ krb5_read_token(struct krb5_ctx *ctx, ret = GSS_S_COMPLETE; out: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c new file mode 100644 index 00000000000..af777cf9f25 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -0,0 +1,363 @@ +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_krb5.h> +#include <linux/random.h> +#include <linux/pagemap.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) +{ + /* Most of the code is block-size independent but currently we + * use only 8: */ + BUG_ON(blocksize != 8); + return 8 - (length & 7); +} + +static inline void +gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) +{ + int padding = gss_krb5_padding(blocksize, buf->len - offset); + char *p; + struct kvec *iov; + + if (buf->page_len || buf->tail[0].iov_len) + iov = &buf->tail[0]; + else + iov = &buf->head[0]; + p = iov->iov_base + iov->iov_len; + iov->iov_len += padding; + buf->len += padding; + memset(p, padding, padding); +} + +static inline int +gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) +{ + u8 *ptr; + u8 pad; + int len = buf->len; + + if (len <= buf->head[0].iov_len) { + pad = *(u8 *)(buf->head[0].iov_base + len - 1); + if (pad > buf->head[0].iov_len) + return -EINVAL; + buf->head[0].iov_len -= pad; + goto out; + } else + len -= buf->head[0].iov_len; + if (len <= buf->page_len) { + int last = (buf->page_base + len - 1) + >>PAGE_CACHE_SHIFT; + int offset = (buf->page_base + len - 1) + & (PAGE_CACHE_SIZE - 1); + ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); + pad = *(ptr + offset); + kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); + goto out; + } else + len -= buf->page_len; + BUG_ON(len > buf->tail[0].iov_len); + pad = *(u8 *)(buf->tail[0].iov_base + len - 1); +out: + /* XXX: NOTE: we do not adjust the page lengths--they represent + * a range of data in the real filesystem page cache, and we need + * to know that range so the xdr code can properly place read data. + * However adjusting the head length, as we do above, is harmless. + * In the case of a request that fits into a single page, the server + * also uses length and head length together to determine the original + * start of the request to copy the request for deferal; so it's + * easier on the server if we adjust head and tail length in tandem. + * It's not really a problem that we don't fool with the page and + * tail lengths, though--at worst badly formed xdr might lead the + * server to attempt to parse the padding. + * XXX: Document all these weird requirements for gss mechanism + * wrap/unwrap functions. */ + if (pad > blocksize) + return -EINVAL; + if (buf->len > pad) + buf->len -= pad; + else + return -EINVAL; + return 0; +} + +static inline void +make_confounder(char *p, int blocksize) +{ + static u64 i = 0; + u64 *q = (u64 *)p; + + /* rfc1964 claims this should be "random". But all that's really + * necessary is that it be unique. And not even that is necessary in + * our case since our "gssapi" implementation exists only to support + * rpcsec_gss, so we know that the only buffers we will ever encrypt + * already begin with a unique sequence number. Just to hedge my bets + * I'll make a half-hearted attempt at something unique, but ensuring + * uniqueness would mean worrying about atomicity and rollover, and I + * don't care enough. */ + + BUG_ON(blocksize != 8); + *q = i++; +} + +/* Assumptions: the head and tail of inbuf are ours to play with. + * The pages, however, may be real pages in the page cache and we replace + * them with scratch pages from **pages before writing to them. */ +/* XXX: obviously the above should be documentation of wrap interface, + * and shouldn't be in this kerberos-specific file. */ + +/* XXX factor out common code with seal/unseal. */ + +u32 +gss_wrap_kerberos(struct gss_ctx *ctx, int offset, + struct xdr_buf *buf, struct page **pages) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + int blocksize = 0, plainlen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + + dprintk("RPC: gss_wrap_kerberos\n"); + + now = get_seconds(); + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" + " supported\n", kctx->signalg); + goto out_err; + } + if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { + dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", + kctx->sealalg); + goto out_err; + } + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); + plainlen = blocksize + buf->len - offset; + + headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - + (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ + /* XXX Would be cleverer to encrypt while copying. */ + /* XXX bounds checking, slack, etc. */ + memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); + buf->head[0].iov_len += headlen; + buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); + + + *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); + + *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); + + make_confounder(msg_start, blocksize); + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; + if (make_checksum(checksum_type, krb5_hdr, 8, buf, + offset + headlen - blocksize, &md5cksum)) + goto out_err; + buf->pages = tmp_pages; + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + dprintk("RPC: make_seal_token: cksum data: \n"); + print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); + break; + default: + BUG(); + } + + kfree(md5cksum.data); + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ + if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, + kctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, + pages)) + goto out_err; + + kctx->seq_send++; + + return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) kfree(md5cksum.data); + return GSS_S_FAILURE; +} + +u32 +gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr; + int bodysize; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + void *data_start, *orig_start; + int data_len; + int blocksize; + + dprintk("RPC: gss_unwrap_kerberos\n"); + + ptr = (u8 *)buf->head[0].iov_base + offset; + if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, + buf->len - offset)) + goto out; + + if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || + (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? */ + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (sealalg == 0xffff) + goto out; + + /* in the current spec, there is only one valid seal algorithm per + key type, so a simple comparison is ok */ + + if (sealalg != kctx->sealalg) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (kctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + if (gss_decrypt_xdr_buf(kctx->enc, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. Make sure the context is unexpired */ + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > kctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) + goto out; + + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + data_start = ptr + 22 + blocksize; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); + buf->head[0].iov_len -= (data_start - orig_start); + buf->len -= (data_start - orig_start); + + ret = GSS_S_DEFECTIVE_TOKEN; + if (gss_krb5_remove_padding(buf, blocksize)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) kfree(md5cksum.data); + return ret; +} diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 9dfb68377d6..f8bac6ccd52 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -35,7 +35,6 @@ #include <linux/types.h> #include <linux/slab.h> -#include <linux/socket.h> #include <linux/module.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/gss_asn1.h> @@ -61,8 +60,7 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; - if (pf->auth_domain_name) - kfree(pf->auth_domain_name); + kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } } @@ -251,13 +249,11 @@ gss_import_sec_context(const void *input_token, size_t bufsize, u32 gss_get_mic(struct gss_ctx *context_handle, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_get_mic(context_handle, - qop, message, mic_token); } @@ -267,16 +263,34 @@ gss_get_mic(struct gss_ctx *context_handle, u32 gss_verify_mic(struct gss_ctx *context_handle, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) + struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_verify_mic(context_handle, message, - mic_token, - qstate); + mic_token); } +u32 +gss_wrap(struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *buf, + struct page **inpages) +{ + return ctx_id->mech_type->gm_ops + ->gss_wrap(ctx_id, offset, buf, inpages); +} + +u32 +gss_unwrap(struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *buf) +{ + return ctx_id->mech_type->gm_ops + ->gss_unwrap(ctx_id, offset, buf); +} + + /* gss_delete_sec_context: free all resources associated with context_handle. * Note this differs from the RFC 2744-specified prototype in that we don't * bother returning an output token, since it would never be used anyway. */ diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c index 6c97d61baa9..39b3edc1469 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -224,18 +224,13 @@ gss_delete_sec_context_spkm3(void *internal_ctx) { static u32 gss_verify_mic_spkm3(struct gss_ctx *ctx, struct xdr_buf *signbuf, - struct xdr_netobj *checksum, - u32 *qstate) { + struct xdr_netobj *checksum) +{ u32 maj_stat = 0; - int qop_state = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); - maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, - SPKM_MIC_TOK); - - if (!maj_stat && qop_state) - *qstate = qop_state; + maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK); dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); return maj_stat; @@ -243,15 +238,15 @@ gss_verify_mic_spkm3(struct gss_ctx *ctx, static u32 gss_get_mic_spkm3(struct gss_ctx *ctx, - u32 qop, struct xdr_buf *message_buffer, - struct xdr_netobj *message_token) { + struct xdr_netobj *message_token) +{ u32 err = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_get_mic_spkm3\n"); - err = spkm3_make_token(sctx, qop, message_buffer, + err = spkm3_make_token(sctx, message_buffer, message_token, SPKM_MIC_TOK); return err; } @@ -264,8 +259,8 @@ static struct gss_api_ops gss_spkm3_ops = { }; static struct pf_desc gss_spkm3_pfs[] = { - {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"}, - {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, + {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"}, + {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, }; static struct gss_api_mech gss_spkm3_mech = { diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 25339868d46..d1e12b25d6e 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -51,7 +51,7 @@ */ u32 -spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, +spkm3_make_token(struct spkm3_ctx *ctx, struct xdr_buf * text, struct xdr_netobj * token, int toktype) { @@ -68,8 +68,6 @@ spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, dprintk("RPC: spkm3_make_token\n"); now = jiffies; - if (qop_req != 0) - goto out_err; if (ctx->ctx_id.len != 16) { dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", @@ -124,8 +122,7 @@ spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, return GSS_S_COMPLETE; out_err: - if (md5cksum.data) - kfree(md5cksum.data); + kfree(md5cksum.data); token->data = NULL; token->len = 0; return GSS_S_FAILURE; diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c index 46c08a0710f..1f824578d77 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -259,8 +259,7 @@ spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **ck ret = GSS_S_COMPLETE; out: - if (spkm3_ctx_id.data) - kfree(spkm3_ctx_id.data); + kfree(spkm3_ctx_id.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index 65ce81bf0bc..241d5b30dfc 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -52,7 +52,7 @@ u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, /* checksum */ struct xdr_buf *message_buffer, /* signbuf */ - int *qop_state, int toktype) + int toktype) { s32 code; struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; @@ -120,9 +120,7 @@ spkm3_read_token(struct spkm3_ctx *ctx, /* XXX: need to add expiration and sequencing */ ret = GSS_S_COMPLETE; out: - if (md5cksum.data) - kfree(md5cksum.data); - if (wire_cksum.data) - kfree(wire_cksum.data); + kfree(md5cksum.data); + kfree(wire_cksum.data); return ret; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e3308195374..e4ada15ed85 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -566,8 +566,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; - if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL) - != GSS_S_COMPLETE) { + if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { *authp = rpcsec_gsserr_credproblem; return SVC_DENIED; } @@ -604,7 +603,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) xdr_buf_from_iov(&iov, &verf_data); p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic); + maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); if (maj_stat != GSS_S_COMPLETE) return -1; *p++ = htonl(mic.len); @@ -710,7 +709,7 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) goto out; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) goto out; - maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) goto out; if (ntohl(svc_getu32(&buf->head[0])) != seq) @@ -1012,7 +1011,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) resv = &resbuf->tail[0]; } mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; - if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; svc_putu32(resv, htonl(mic.len)); memset(mic.data + mic.len, 0, diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 9b72d3abf82..f56767aaa92 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -7,9 +7,7 @@ */ #include <linux/types.h> -#include <linux/socket.h> #include <linux/module.h> -#include <linux/in.h> #include <linux/utsname.h> #include <linux/sunrpc/clnt.h> #include <linux/sched.h> diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4ff297a9b15..890fb5ea0dc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -9,8 +9,6 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/module.h> -#include <linux/socket.h> -#include <linux/in.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f17e6153b68..61c3abeacca 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/rpcclnt.c + * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. * It is modeled as a finite state machine to support both synchronous @@ -27,7 +27,6 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/in.h> #include <linux/utsname.h> #include <linux/sunrpc/clnt.h> @@ -53,8 +52,10 @@ static void call_allocate(struct rpc_task *task); static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); +static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); +static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_timeout(struct rpc_task *task); @@ -517,15 +518,8 @@ void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt = clnt->cl_xprt; - - xprt->sndsize = 0; - if (sndsize) - xprt->sndsize = sndsize + RPC_SLACK_SPACE; - xprt->rcvsize = 0; - if (rcvsize) - xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - if (xprt_connected(xprt)) - xprt_sock_setbufsize(xprt); + if (xprt->ops->set_buffer_size) + xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); } /* @@ -679,19 +673,29 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } +static inline int +rpc_task_need_encode(struct rpc_task *task) +{ + return task->tk_rqstp->rq_snd_buf.len == 0; +} + +static inline void +rpc_task_force_reencode(struct rpc_task *task) +{ + task->tk_rqstp->rq_snd_buf.len = 0; +} + /* * 3. Encode arguments of an RPC call */ static void call_encode(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_buf *sndbuf = &req->rq_snd_buf; struct xdr_buf *rcvbuf = &req->rq_rcv_buf; unsigned int bufsiz; kxdrproc_t encode; - int status; u32 *p; dprintk("RPC: %4d call_encode (status %d)\n", @@ -719,11 +723,15 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode && (status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp)) < 0) { - printk(KERN_WARNING "%s: can't encode arguments: %d\n", - clnt->cl_protname, -status); - rpc_exit(task, status); + if (encode == NULL) + return; + + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); + if (task->tk_status == -ENOMEM) { + /* XXX: Is this sane? */ + rpc_delay(task, 3*HZ); + task->tk_status = -EAGAIN; } } @@ -734,43 +742,95 @@ static void call_bind(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; - - dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, - xprt, (xprt_connected(xprt) ? "is" : "is not")); - task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; + dprintk("RPC: %4d call_bind (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_action = call_connect; if (!clnt->cl_port) { - task->tk_action = call_connect; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_action = call_bind_status; + task->tk_timeout = task->tk_xprt->bind_timeout; rpc_getport(task, clnt); } } /* - * 4a. Connect to the RPC server (TCP case) + * 4a. Sort out bind result + */ +static void +call_bind_status(struct rpc_task *task) +{ + int status = -EACCES; + + if (task->tk_status >= 0) { + dprintk("RPC: %4d call_bind_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; + task->tk_action = call_connect; + return; + } + + switch (task->tk_status) { + case -EACCES: + dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n", + task->tk_pid); + rpc_delay(task, 3*HZ); + goto retry_bind; + case -ETIMEDOUT: + dprintk("RPC: %4d rpcbind request timed out\n", + task->tk_pid); + if (RPC_IS_SOFT(task)) { + status = -EIO; + break; + } + goto retry_bind; + case -EPFNOSUPPORT: + dprintk("RPC: %4d remote rpcbind service unavailable\n", + task->tk_pid); + break; + case -EPROTONOSUPPORT: + dprintk("RPC: %4d remote rpcbind version 2 unavailable\n", + task->tk_pid); + break; + default: + dprintk("RPC: %4d unrecognized rpcbind error (%d)\n", + task->tk_pid, -task->tk_status); + status = -EIO; + break; + } + + rpc_exit(task, status); + return; + +retry_bind: + task->tk_status = 0; + task->tk_action = call_bind; + return; +} + +/* + * 4b. Connect to the RPC server */ static void call_connect(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = task->tk_xprt; - dprintk("RPC: %4d call_connect status %d\n", - task->tk_pid, task->tk_status); + dprintk("RPC: %4d call_connect xprt %p %s connected\n", + task->tk_pid, xprt, + (xprt_connected(xprt) ? "is" : "is not")); - if (xprt_connected(clnt->cl_xprt)) { - task->tk_action = call_transmit; - return; + task->tk_action = call_transmit; + if (!xprt_connected(xprt)) { + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); } - task->tk_action = call_connect_status; - if (task->tk_status < 0) - return; - xprt_connect(task); } /* - * 4b. Sort out connect result + * 4c. Sort out connect result */ static void call_connect_status(struct rpc_task *task) @@ -778,6 +838,9 @@ call_connect_status(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; + dprintk("RPC: %5u call_connect_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; if (status >= 0) { clnt->cl_stats->netreconn++; @@ -785,17 +848,19 @@ call_connect_status(struct rpc_task *task) return; } - /* Something failed: we may have to rebind */ + /* Something failed: remote service port may have changed */ if (clnt->cl_autobind) clnt->cl_port = 0; + switch (status) { case -ENOTCONN: case -ETIMEDOUT: case -EAGAIN: - task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect; + task->tk_action = call_bind; break; default: rpc_exit(task, -EIO); + break; } } @@ -815,10 +880,14 @@ call_transmit(struct rpc_task *task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (!task->tk_rqstp->rq_bytes_sent) + if (rpc_task_need_encode(task)) { + task->tk_rqstp->rq_bytes_sent = 0; call_encode(task); - if (task->tk_status < 0) - return; + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) + goto out_nosend; + } + task->tk_action = call_transmit_status; xprt_transmit(task); if (task->tk_status < 0) return; @@ -826,6 +895,11 @@ call_transmit(struct rpc_task *task) task->tk_action = NULL; rpc_wake_up_task(task); } + return; +out_nosend: + /* release socket write lock before attempting to handle error */ + xprt_abort_transmit(task); + rpc_task_force_reencode(task); } /* @@ -857,7 +931,6 @@ call_status(struct rpc_task *task) break; case -ECONNREFUSED: case -ENOTCONN: - req->rq_bytes_sent = 0; if (clnt->cl_autobind) clnt->cl_port = 0; task->tk_action = call_bind; @@ -879,7 +952,18 @@ call_status(struct rpc_task *task) } /* - * 6a. Handle RPC timeout + * 6a. Handle transmission errors. + */ +static void +call_transmit_status(struct rpc_task *task) +{ + if (task->tk_status != -EAGAIN) + rpc_task_force_reencode(task); + call_status(task); +} + +/* + * 6b. Handle RPC timeout * We do not release the request slot, so we keep using the * same XID for all retransmits. */ @@ -1020,13 +1104,12 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + + p = xprt_skip_transport_header(task->tk_xprt, p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index 4e81f276692..a398575f94b 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -26,7 +26,7 @@ #define PMAP_GETPORT 3 static struct rpc_procinfo pmap_procedures[]; -static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int); +static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int, int); static void pmap_getport_done(struct rpc_task *); static struct rpc_program pmap_program; static DEFINE_SPINLOCK(pmap_lock); @@ -65,7 +65,7 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) map->pm_binding = 1; spin_unlock(&pmap_lock); - pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot, 0); if (IS_ERR(pmap_clnt)) { task->tk_status = PTR_ERR(pmap_clnt); goto bailout; @@ -112,7 +112,7 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); - pmap_clnt = pmap_create(hostname, sin, prot); + pmap_clnt = pmap_create(hostname, sin, prot, 0); if (IS_ERR(pmap_clnt)) return PTR_ERR(pmap_clnt); @@ -171,7 +171,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1); if (IS_ERR(pmap_clnt)) { error = PTR_ERR(pmap_clnt); dprintk("RPC: couldn't create pmap client. Error = %d\n", error); @@ -198,7 +198,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) } static struct rpc_clnt * -pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) +pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileged) { struct rpc_xprt *xprt; struct rpc_clnt *clnt; @@ -208,6 +208,8 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); + if (!privileged) + xprt->resvport = 0; /* printk("pmap: create clnt\n"); */ clnt = rpc_new_client(xprt, hostname, diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11e..81e00a6c19d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -76,25 +76,35 @@ int rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) { struct rpc_inode *rpci = RPC_I(inode); - int res = 0; + int res = -EPIPE; down(&inode->i_sem); + if (rpci->ops == NULL) + goto out; if (rpci->nreaders) { list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; + res = 0; } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { if (list_empty(&rpci->pipe)) schedule_delayed_work(&rpci->queue_timeout, RPC_UPCALL_TIMEOUT); list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; - } else - res = -EPIPE; + res = 0; + } +out: up(&inode->i_sem); wake_up(&rpci->waitq); return res; } +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + static void rpc_close_pipes(struct inode *inode) { @@ -111,15 +121,10 @@ rpc_close_pipes(struct inode *inode) rpci->ops->release_pipe(inode); rpci->ops = NULL; } + rpc_inode_setowner(inode, NULL); up(&inode->i_sem); } -static inline void -rpc_inode_setowner(struct inode *inode, void *private) -{ - RPC_I(inode)->private = private; -} - static struct inode * rpc_alloc_inode(struct super_block *sb) { @@ -501,7 +506,6 @@ repeat: dentry = dvec[--n]; if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); simple_unlink(dir, dentry); } dput(dentry); @@ -576,10 +580,8 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry) int error; shrink_dcache_parent(dentry); - if (dentry->d_inode) { + if (dentry->d_inode) rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - } if ((error = simple_rmdir(dir, dentry)) != 0) return error; if (!error) { @@ -601,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) return ERR_PTR(error); dir = nd->dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto out_err; if (dentry->d_inode) { @@ -663,7 +665,7 @@ rpc_rmdir(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; @@ -724,7 +726,7 @@ rpc_unlink(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; @@ -732,7 +734,6 @@ rpc_unlink(char *path) d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); error = simple_unlink(dir, dentry); } dput(dentry); diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c new file mode 100644 index 00000000000..eb330d4f66d --- /dev/null +++ b/net/sunrpc/socklib.c @@ -0,0 +1,180 @@ +/* + * linux/net/sunrpc/socklib.c + * + * Common socket helper routines for RPC client and server + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/pagemap.h> +#include <linux/udp.h> +#include <linux/sunrpc/xdr.h> + + +/** + * skb_read_bits - copy some data bits from skb to internal buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Possibly called several times to iterate over an sk_buff and copy + * data out of it. + */ +static size_t skb_read_bits(skb_reader_t *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, to, len)) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * skb_read_and_csum_bits - copy and checksum from skb to buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Same as skb_read_bits, but calculate a checksum at the same time. + */ +static size_t skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) +{ + unsigned int csum2, pos; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * xdr_partial_copy_from_skb - copy data out of an skb + * @xdr: target XDR buffer + * @base: starting offset + * @desc: sk_buff copy helper + * @copy_actor: virtual method for copying data + * + */ +ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, skb_reader_t *desc, skb_read_actor_t copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + ssize_t copied = 0; + int ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + copied += ret; + if (ret != len || !desc->count) + goto out; + base = 0; + } else + base -= len; + + if (unlikely(pglen == 0)) + goto copy_tail; + if (unlikely(base >= pglen)) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. */ + if (unlikely(*ppage == NULL)) { + *ppage = alloc_page(GFP_ATOMIC); + if (unlikely(*ppage == NULL)) { + if (copied == 0) + copied = -ENOMEM; + goto out; + } + } + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + copied += ret; + if (ret != len || !desc->count) + goto out; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +out: + return copied; +} + +/** + * csum_partial_copy_to_xdr - checksum and copy data + * @xdr: target XDR buffer + * @skb: source skb + * + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. -DaveM + */ +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + skb_reader_t desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) + return -1; + if (desc.offset != skb->len) { + unsigned int csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if ((unsigned short)csum_fold(desc.csum)) + return -1; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); + return 0; +no_checksum: + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ed48ff022d3..a03d4b600c9 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -10,7 +10,6 @@ #include <linux/module.h> #include <linux/types.h> -#include <linux/socket.h> #include <linux/sched.h> #include <linux/uio.h> #include <linux/unistd.h> @@ -64,8 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe); /* Client transport */ EXPORT_SYMBOL(xprt_create_proto); EXPORT_SYMBOL(xprt_set_timeout); -EXPORT_SYMBOL(xprt_udp_slot_table_entries); -EXPORT_SYMBOL(xprt_tcp_slot_table_entries); /* Client credential cache */ EXPORT_SYMBOL(rpcauth_register); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e9bd91265f7..e4296c8b861 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -196,12 +196,9 @@ svc_exit_thread(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; svc_release_buffer(rqstp); - if (rqstp->rq_resp) - kfree(rqstp->rq_resp); - if (rqstp->rq_argp) - kfree(rqstp->rq_argp); - if (rqstp->rq_auth_data) - kfree(rqstp->rq_auth_data); + kfree(rqstp->rq_resp); + kfree(rqstp->rq_argp); + kfree(rqstp->rq_auth_data); kfree(rqstp); /* Release the server */ @@ -313,6 +310,11 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ progp = serv->sv_program; + + for (progp = serv->sv_program; progp; progp = progp->pg_next) + if (prog == progp->pg_prog) + break; + /* * Decode auth data, and add verifier to reply buffer. * We do this before anything else in order to get a decent @@ -320,7 +322,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) */ auth_res = svc_authenticate(rqstp, &auth_stat); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK) { + if (auth_res == SVC_OK && progp) { auth_stat = rpc_autherr_badcred; auth_res = progp->pg_authenticate(rqstp); } @@ -340,10 +342,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) case SVC_COMPLETE: goto sendit; } - - for (progp = serv->sv_program; progp; progp = progp->pg_next) - if (prog == progp->pg_prog) - break; + if (progp == NULL) goto err_bad_prog; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 691dea4a58e..c6a51911e71 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -548,9 +548,6 @@ svc_write_space(struct sock *sk) /* * Receive a datagram from a UDP socket. */ -extern int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); - static int svc_udp_recvfrom(struct svc_rqst *rqstp) { @@ -626,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; } rqstp->rq_skbuff = skb; } @@ -1184,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->tail[0].iov_len = 0; try_to_freeze(); + cond_resched(); if (signalled()) return -EINTR; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 1b9616a12e2..1065904841f 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -119,8 +119,11 @@ done: return 0; } + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static ctl_table debug_table[] = { { @@ -177,6 +180,28 @@ static ctl_table debug_table[] = { .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, + { + .ctl_name = CTL_MIN_RESVPORT, + .procname = "min_resvport", + .data = &xprt_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .ctl_name = CTL_MAX_RESVPORT, + .procname = "max_resvport", + .data = &xprt_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, { .ctl_name = 0 } }; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fde16f40a58..aaf08cdd19f 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -6,15 +6,12 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ +#include <linux/module.h> #include <linux/types.h> -#include <linux/socket.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/pagemap.h> #include <linux/errno.h> -#include <linux/in.h> -#include <linux/net.h> -#include <net/sock.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> @@ -176,178 +173,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, xdr->buflen += len; } -ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, - skb_reader_t *desc, - skb_read_actor_t copy_actor) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - int ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - do { - char *kaddr; - - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { - *ppage = alloc_page(GFP_ATOMIC); - if (unlikely(*ppage == NULL)) { - if (copied == 0) - copied = -ENOMEM; - goto out; - } - } - - len = PAGE_CACHE_SIZE; - kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } - flush_dcache_page(*ppage); - kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); - copied += ret; - if (ret != len || !desc->count) - goto out; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: - return copied; -} - - -int -xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, - struct xdr_buf *xdr, unsigned int base, int msgflags) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - int err, ret = 0; - ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); - - len = xdr->head[0].iov_len; - if (base < len || (addr != NULL && base == 0)) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = msgflags, - }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (iov.iov_len != 0) - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - else - err = kernel_sendmsg(sock, &msg, NULL, 0, 0); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != iov.iov_len) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - - sendpage = sock->ops->sendpage ? : sock_no_sendpage; - do { - int flags = msgflags; - - len = PAGE_CACHE_SIZE; - if (base) - len -= base; - if (pglen < len) - len = pglen; - - if (pglen != len || xdr->tail[0].iov_len != 0) - flags |= MSG_MORE; - - /* Hmm... We might be dealing with highmem pages */ - if (PageHighMem(*ppage)) - sendpage = sock_no_sendpage; - err = sendpage(sock, *ppage, base, len, flags); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != len) - goto out; - base = 0; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = msgflags, - }; - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - } -out: - return ret; -} - /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf @@ -1167,8 +992,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, err = 0; out: - if (elem) - kfree(elem); + kfree(elem); if (ppages) kunmap(*ppages); return err; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c654e06b08..6dda3860351 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -10,12 +10,12 @@ * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into - * the request struct, and calls xprt_call(). - * - xprt_call transmits the message and installs the caller on the - * socket's wait list. At the same time, it installs a timer that + * the request struct, and calls xprt_transmit(). + * - xprt_transmit sends the message and installs the caller on the + * transport's wait list. At the same time, it installs a timer that * is run after the packet's timeout has expired. * - When a packet arrives, the data_ready handler walks the list of - * pending requests for that socket. If a matching XID is found, the + * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the @@ -33,36 +33,17 @@ * * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de> * - * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> - * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> - * TCP NFS related read + write fixes - * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> - * - * Rewrite of larges part of the code in order to stabilize TCP stuff. - * Fix behaviour when socket buffer is full. - * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> + * Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com> */ +#include <linux/module.h> + #include <linux/types.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/sched.h> -#include <linux/errno.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/net.h> -#include <linux/mm.h> -#include <linux/udp.h> -#include <linux/tcp.h> -#include <linux/sunrpc/clnt.h> -#include <linux/file.h> +#include <linux/interrupt.h> #include <linux/workqueue.h> #include <linux/random.h> -#include <net/sock.h> -#include <net/checksum.h> -#include <net/udp.h> -#include <net/tcp.h> +#include <linux/sunrpc/clnt.h> /* * Local variables @@ -73,81 +54,90 @@ # define RPCDBG_FACILITY RPCDBG_XPRT #endif -#define XPRT_MAX_BACKOFF (8) -#define XPRT_IDLE_TIMEOUT (5*60*HZ) -#define XPRT_MAX_RESVPORT (800) - /* * Local functions */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static inline void do_xprt_reserve(struct rpc_task *); -static void xprt_disconnect(struct rpc_xprt *); static void xprt_connect_status(struct rpc_task *task); -static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, - struct rpc_timeout *to); -static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); -static void xprt_bind_socket(struct rpc_xprt *, struct socket *); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static int xprt_clear_backlog(struct rpc_xprt *xprt); - -#ifdef RPC_DEBUG_DATA /* - * Print the buffer contents (first 128 bytes only--just enough for - * diropres return). + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. */ -static void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - u8 *buf = (u8 *) packet; - int j; - - dprintk("RPC: %s\n", msg); - for (j = 0; j < count && j < 128; j += 4) { - if (!(j & 31)) { - if (j) - dprintk("\n"); - dprintk("0x%04x ", j); - } - dprintk("%02x%02x%02x%02x ", - buf[j], buf[j+1], buf[j+2], buf[j+3]); - } - dprintk("\n"); -} -#else -static inline void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - /* NOP */ -} -#endif +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) -/* - * Look up RPC transport given an INET socket +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) + +/** + * xprt_reserve_xprt - serialize write access to transports + * @task: task that is requesting access to the transport + * + * This prevents mixing the payload of separate requests, and prevents + * transport connects from colliding with writes. No congestion control + * is provided. */ -static inline struct rpc_xprt * -xprt_from_sock(struct sock *sk) +int xprt_reserve_xprt(struct rpc_task *task) { - return (struct rpc_xprt *) sk->sk_user_data; + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + +out_sleep: + dprintk("RPC: %4d failed to lock transport %p\n", + task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; } /* - * Serialize write access to sockets, in order to prevent different - * requests from interfering with each other. - * Also prevents TCP socket connects from colliding with writes. + * xprt_reserve_xprt_cong - serialize write access to transports + * @task: task that is requesting access to the transport + * + * Same as xprt_reserve_xprt, but Van Jacobson congestion control is + * integrated into the decision of whether a request is allowed to be + * woken up and given access to the transport. */ -static int -__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +int xprt_reserve_xprt_cong(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; if (req) { req->rq_bytes_sent = 0; @@ -156,10 +146,10 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; } smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: - dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) @@ -169,26 +159,52 @@ out_sleep: return 0; } -static inline int -xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; - spin_lock_bh(&xprt->sock_lock); - retval = __xprt_lock_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); + retval = xprt->ops->reserve_xprt(task); + spin_unlock_bh(&xprt->transport_lock); return retval; } +static void __xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + struct rpc_rqst *req; -static void -__xprt_lock_write_next(struct rpc_xprt *xprt) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + + req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); +} + +static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { struct rpc_task *task; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; task = rpc_wake_up_next(&xprt->resend); if (!task) { @@ -196,7 +212,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) if (!task) goto out_unlock; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { struct rpc_rqst *req = task->tk_rqstp; xprt->snd_task = task; if (req) { @@ -207,87 +223,52 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) } out_unlock: smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); } -/* - * Releases the socket for use by other requests. +/** + * xprt_release_xprt - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. No congestion control is provided. */ -static void -__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt->snd_task = NULL; smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); __xprt_lock_write_next(xprt); } } -static inline void -xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) -{ - spin_lock_bh(&xprt->sock_lock); - __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); -} - -/* - * Write data to socket. +/** + * xprt_release_xprt_cong - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. Another task is awoken to use the + * transport if the transport's congestion window allows it. */ -static inline int -xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; - - if (!sock) - return -ENOTCONN; - - xprt_pktdump("packet data:", - req->rq_svec->iov_base, - req->rq_svec->iov_len); - - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + __xprt_lock_write_next_cong(xprt); } - /* Dont repeat bytes */ - skip = req->rq_bytes_sent; - - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); - - dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); - - if (result >= 0) - return result; +} - switch (result) { - case -ECONNREFUSED: - /* When the server has died, an ICMP port unreachable message - * prompts ECONNREFUSED. - */ - case -EAGAIN: - break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; - break; - default: - printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); - } - return result; +static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + spin_lock_bh(&xprt->transport_lock); + xprt->ops->release_xprt(xprt, task); + spin_unlock_bh(&xprt->transport_lock); } /* @@ -321,26 +302,40 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } -/* - * Adjust RPC congestion window +/** + * xprt_release_rqst_cong - housekeeping when request is complete + * @task: RPC request that recently completed + * + * Useful for transports that require congestion control. + */ +void xprt_release_rqst_cong(struct rpc_task *task) +{ + __xprt_put_cong(task->tk_xprt, task->tk_rqstp); +} + +/** + * xprt_adjust_cwnd - adjust transport congestion window + * @task: recently completed RPC request used to adjust window + * @result: result code of completed RPC request + * * We use a time-smoothed congestion estimator to avoid heavy oscillation. */ -static void -xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +void xprt_adjust_cwnd(struct rpc_task *task, int result) { - unsigned long cwnd; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + unsigned long cwnd = xprt->cwnd; - cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) @@ -349,11 +344,89 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; + __xprt_put_cong(xprt, req); +} + +/** + * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue + * @xprt: transport with waiting tasks + * @status: result code to plant in each task before waking it + * + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) +{ + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +} + +/** + * xprt_wait_for_buffer_space - wait for transport output buffer to clear + * @task: task to be put to sleep + * + */ +void xprt_wait_for_buffer_space(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); +} + +/** + * xprt_write_space - wake the task waiting for transport output buffer space + * @xprt: transport with waiting tasks + * + * Can be called in a soft IRQ context, so xprt_write_space never sleeps. + */ +void xprt_write_space(struct rpc_xprt *xprt) +{ + if (unlikely(xprt->shutdown)) + return; + + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task) { + dprintk("RPC: write space: waking waiting task on xprt %p\n", + xprt); + rpc_wake_up_task(xprt->snd_task); + } + spin_unlock_bh(&xprt->transport_lock); +} + +/** + * xprt_set_retrans_timeout_def - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation. + */ +void xprt_set_retrans_timeout_def(struct rpc_task *task) +{ + task->tk_timeout = task->tk_rqstp->rq_timeout; } /* - * Reset the major timeout value + * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout using the RTT estimator. */ +void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = req->rq_xprt->timeout.to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -368,8 +441,10 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req) req->rq_majortimeo += jiffies; } -/* - * Adjust timeout values etc for next retransmit +/** + * xprt_adjust_timeout - adjust timeout values for next retransmit + * @req: RPC request containing parameters to use for the adjustment + * */ int xprt_adjust_timeout(struct rpc_rqst *req) { @@ -391,9 +466,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req) req->rq_retries = 0; xprt_reset_majortimeo(req); /* Reset the RTT counters == "slow start" */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); pprintk("RPC: %lu timeout\n", jiffies); status = -ETIMEDOUT; } @@ -405,133 +480,52 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -/* - * Close down a transport socket - */ -static void -xprt_close(struct rpc_xprt *xprt) -{ - struct socket *sock = xprt->sock; - struct sock *sk = xprt->inet; - - if (!sk) - return; - - write_lock_bh(&sk->sk_callback_lock); - xprt->inet = NULL; - xprt->sock = NULL; - - sk->sk_user_data = NULL; - sk->sk_data_ready = xprt->old_data_ready; - sk->sk_state_change = xprt->old_state_change; - sk->sk_write_space = xprt->old_write_space; - write_unlock_bh(&sk->sk_callback_lock); - - sk->sk_no_check = 0; - - sock_release(sock); -} - -static void -xprt_socket_autoclose(void *args) +static void xprt_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; xprt_disconnect(xprt); - xprt_close(xprt); + xprt->ops->close(xprt); xprt_release_write(xprt, NULL); } -/* - * Mark a transport as disconnected +/** + * xprt_disconnect - mark a transport as disconnected + * @xprt: transport to flag for disconnect + * */ -static void -xprt_disconnect(struct rpc_xprt *xprt) +void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - rpc_wake_up_status(&xprt->pending, -ENOTCONN); - spin_unlock_bh(&xprt->sock_lock); + xprt_wake_pending_tasks(xprt, -ENOTCONN); + spin_unlock_bh(&xprt->transport_lock); } -/* - * Used to allow disconnection when we've been idle - */ static void xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; - spin_unlock(&xprt->sock_lock); - /* Let keventd close the socket */ - if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + spin_unlock(&xprt->transport_lock); + if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else schedule_work(&xprt->task_cleanup); return; out_abort: - spin_unlock(&xprt->sock_lock); -} - -static void xprt_socket_connect(void *args) -{ - struct rpc_xprt *xprt = (struct rpc_xprt *)args; - struct socket *sock = xprt->sock; - int status = -EIO; - - if (xprt->shutdown || xprt->addr.sin_port == 0) - goto out; - - /* - * Start by resetting any existing state - */ - xprt_close(xprt); - sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ - goto out; - } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); - - status = 0; - if (!xprt->stream) - goto out; - - /* - * Tell the socket layer to start connecting... - */ - status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, - sizeof(xprt->addr), O_NONBLOCK); - dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - } - } -out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); -out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); + spin_unlock(&xprt->transport_lock); } -/* - * Attempt to connect a TCP socket. +/** + * xprt_connect - schedule a transport connect operation + * @task: RPC task that is requesting the connect * */ void xprt_connect(struct rpc_task *task) @@ -552,37 +546,19 @@ void xprt_connect(struct rpc_task *task) if (!xprt_lock_write(xprt, task)) return; if (xprt_connected(xprt)) - goto out_write; + xprt_release_write(xprt, task); + else { + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; - if (task->tk_rqstp) - task->tk_rqstp->rq_bytes_sent = 0; - - task->tk_timeout = RPC_CONNECT_TIMEOUT; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { - /* Note: if we are here due to a dropped connection - * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ - * seconds - */ - if (xprt->sock != NULL) - schedule_delayed_work(&xprt->sock_connect, - RPC_REESTABLISH_TIMEOUT); - else { - schedule_work(&xprt->sock_connect); - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + task->tk_timeout = xprt->connect_timeout; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + xprt->ops->connect(task); } return; - out_write: - xprt_release_write(xprt, task); } -/* - * We arrive here when awoken from waiting on connection establishment. - */ -static void -xprt_connect_status(struct rpc_task *task) +static void xprt_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -592,31 +568,42 @@ xprt_connect_status(struct rpc_task *task) return; } - /* if soft mounted, just cause this RPC to fail */ - if (RPC_IS_SOFT(task)) - task->tk_status = -EIO; - switch (task->tk_status) { case -ECONNREFUSED: case -ECONNRESET: + dprintk("RPC: %4d xprt_connect_status: server %s refused connection\n", + task->tk_pid, task->tk_client->cl_server); + break; case -ENOTCONN: - return; + dprintk("RPC: %4d xprt_connect_status: connection broken\n", + task->tk_pid); + break; case -ETIMEDOUT: - dprintk("RPC: %4d xprt_connect_status: timed out\n", + dprintk("RPC: %4d xprt_connect_status: connect attempt timed out\n", task->tk_pid); break; default: - printk(KERN_ERR "RPC: error %d connecting to server %s\n", - -task->tk_status, task->tk_client->cl_server); + dprintk("RPC: %4d xprt_connect_status: error %d connecting to server %s\n", + task->tk_pid, -task->tk_status, task->tk_client->cl_server); + xprt_release_write(xprt, task); + task->tk_status = -EIO; + return; + } + + /* if soft mounted, just cause this RPC to fail */ + if (RPC_IS_SOFT(task)) { + xprt_release_write(xprt, task); + task->tk_status = -EIO; } - xprt_release_write(xprt, task); } -/* - * Look up the RPC request corresponding to a reply, and then lock it. +/** + * xprt_lookup_rqst - find an RPC request corresponding to an XID + * @xprt: transport on which the original request was transmitted + * @xid: RPC XID of incoming reply + * */ -static inline struct rpc_rqst * -xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) +struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { struct list_head *pos; struct rpc_rqst *req = NULL; @@ -631,556 +618,68 @@ xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) return req; } -/* - * Complete reply received. - * The TCP code relies on us to remove the request from xprt->pending. - */ -static void -xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) -{ - struct rpc_task *task = req->rq_task; - struct rpc_clnt *clnt = task->tk_client; - - /* Adjust congestion window */ - if (!xprt->nocong) { - unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(xprt, copied); - __xprt_put_cong(xprt, req); - if (timer) { - if (req->rq_ntrans == 1) - rpc_update_rtt(clnt->cl_rtt, timer, - (long)jiffies - req->rq_xtime); - rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); - } - } - -#ifdef RPC_PROFILE - /* Profile only reads for now */ - if (copied > 1024) { - static unsigned long nextstat; - static unsigned long pkt_rtt, pkt_len, pkt_cnt; - - pkt_cnt++; - pkt_len += req->rq_slen + copied; - pkt_rtt += jiffies - req->rq_xtime; - if (time_before(nextstat, jiffies)) { - printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); - printk("RPC: %ld %ld %ld %ld stat\n", - jiffies, pkt_cnt, pkt_len, pkt_rtt); - pkt_rtt = pkt_len = pkt_cnt = 0; - nextstat = jiffies + 5 * HZ; - } - } -#endif - - dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); - list_del_init(&req->rq_list); - req->rq_received = req->rq_private_buf.len = copied; - - /* ... and wake up the process. */ - rpc_wake_up_task(task); - return; -} - -static size_t -skb_read_bits(skb_reader_t *desc, void *to, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, to, len)) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} - -static size_t -skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) -{ - unsigned int csum2, pos; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); - desc->csum = csum_block_add(desc->csum, csum2, pos); - desc->count -= len; - desc->offset += len; - return len; -} - -/* - * We have set things up such that we perform the checksum of the UDP - * packet in parallel with the copies into the RPC client iovec. -DaveM - */ -int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - skb_reader_t desc; - - desc.skb = skb; - desc.offset = sizeof(struct udphdr); - desc.count = skb->len - desc.offset; - - if (skb->ip_summed == CHECKSUM_UNNECESSARY) - goto no_checksum; - - desc.csum = csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) - return -1; - if (desc.offset != skb->len) { - unsigned int csum2; - csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); - desc.csum = csum_block_add(desc.csum, csum2, desc.offset); - } - if (desc.count) - return -1; - if ((unsigned short)csum_fold(desc.csum)) - return -1; - return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - -/* - * Input handler for RPC replies. Called from a bottom half and hence - * atomic. - */ -static void -udp_data_ready(struct sock *sk, int len) -{ - struct rpc_task *task; - struct rpc_xprt *xprt; - struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; - u32 _xid, *xp; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: udp_data_ready request not found!\n"); - goto out; - } - - dprintk("RPC: udp_data_ready client %p\n", xprt); - - if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) - goto out; - - if (xprt->shutdown) - goto dropit; - - repsize = skb->len - sizeof(struct udphdr); - if (repsize < 4) { - printk("RPC: impossible RPC reply size %d!\n", repsize); - goto dropit; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(struct udphdr), - sizeof(_xid), &_xid); - if (xp == NULL) - goto dropit; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - task = rovr->rq_task; - - dprintk("RPC: %4d received reply\n", task->tk_pid); - - if ((copied = rovr->rq_private_buf.buflen) > repsize) - copied = repsize; - - /* Suck it into the iovec, verify checksum if not done by hw. */ - if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) - goto out_unlock; - - /* Something worked... */ - dst_confirm(skb->dst); - - xprt_complete_rqst(xprt, rovr, copied); - - out_unlock: - spin_unlock(&xprt->sock_lock); - dropit: - skb_free_datagram(sk, skb); - out: - read_unlock(&sk->sk_callback_lock); -} - -/* - * Copy from an skb into memory and shrink the skb. - */ -static inline size_t -tcp_copy_data(skb_reader_t *desc, void *p, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, p, len)) { - dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return 0; - } - desc->offset += len; - desc->count -= len; - dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return len; -} - -/* - * TCP read fragment marker - */ -static inline void -tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; - len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) - xprt->tcp_flags |= XPRT_LAST_FRAG; - else - xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; - xprt->tcp_flags &= ~XPRT_COPY_RECM; - xprt->tcp_offset = 0; - /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { - printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); - xprt_disconnect(xprt); - } - dprintk("RPC: reading TCP record fragment of length %d\n", - xprt->tcp_reclen); -} - -static void -tcp_check_recm(struct rpc_xprt *xprt) -{ - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); - if (xprt->tcp_offset == xprt->tcp_reclen) { - xprt->tcp_flags |= XPRT_COPY_RECM; - xprt->tcp_offset = 0; - if (xprt->tcp_flags & XPRT_LAST_FRAG) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - xprt->tcp_flags |= XPRT_COPY_XID; - xprt->tcp_copied = 0; - } - } -} - -/* - * TCP read xid - */ -static inline void -tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; - dprintk("RPC: reading XID (%Zu bytes)\n", len); - p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_flags &= ~XPRT_COPY_XID; - xprt->tcp_flags |= XPRT_COPY_DATA; - xprt->tcp_copied = 4; - dprintk("RPC: reading reply for XID %08x\n", - ntohl(xprt->tcp_xid)); - tcp_check_recm(xprt); -} - -/* - * TCP read and complete request - */ -static inline void -tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - struct rpc_rqst *req; - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->sock_lock); - req = xprt_lookup_rqst(xprt, xprt->tcp_xid); - if (!req) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x request not found!\n", - ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); - return; - } - - rcvbuf = &req->rq_private_buf; - len = desc->count; - if (len > xprt->tcp_reclen - xprt->tcp_offset) { - skb_reader_t my_desc; - - len = xprt->tcp_reclen - xprt->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - &my_desc, tcp_copy_data); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - desc, tcp_copy_data); - - if (r > 0) { - xprt->tcp_copied += r; - xprt->tcp_offset += r; - } - if (r != len) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off XPRT_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(xprt->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - goto out; - } - - dprintk("RPC: XID %08x read %Zd bytes\n", - ntohl(xprt->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - - if (xprt->tcp_copied == req->rq_private_buf.buflen) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - else if (xprt->tcp_offset == xprt->tcp_reclen) { - if (xprt->tcp_flags & XPRT_LAST_FRAG) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - } - -out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } - spin_unlock(&xprt->sock_lock); - tcp_check_recm(xprt); -} - -/* - * TCP discard extra bytes from a short read - */ -static inline void -tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len; - - len = xprt->tcp_reclen - xprt->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - xprt->tcp_offset += len; - dprintk("RPC: discarded %Zu bytes\n", len); - tcp_check_recm(xprt); -} - -/* - * TCP record receive routine - * We first have to grab the record marker, then the XID, then the data. +/** + * xprt_update_rtt - update an RPC client's RTT state after receiving a reply + * @task: RPC request that recently completed + * */ -static int -tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - skb_reader_t desc = { - .skb = skb, - .offset = offset, - .count = len, - .csum = 0 - }; - - dprintk("RPC: tcp_data_recv\n"); - do { - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (xprt->tcp_flags & XPRT_COPY_RECM) { - tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (xprt->tcp_flags & XPRT_COPY_XID) { - tcp_read_xid(xprt, &desc); - continue; - } - /* Read in the request data */ - if (xprt->tcp_flags & XPRT_COPY_DATA) { - tcp_read_request(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - tcp_read_discard(xprt, &desc); - } while (desc.count); - dprintk("RPC: tcp_data_recv done\n"); - return len - desc.count; -} - -static void tcp_data_ready(struct sock *sk, int bytes) +void xprt_update_rtt(struct rpc_task *task) { - struct rpc_xprt *xprt; - read_descriptor_t rd_desc; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: tcp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: tcp_data_ready socket info not found!\n"); - goto out; - } - if (xprt->shutdown) - goto out; - - /* We use rd_desc to pass struct xprt to tcp_data_recv */ - rd_desc.arg.data = xprt; - rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, tcp_data_recv); -out: - read_unlock(&sk->sk_callback_lock); -} - -static void -tcp_state_change(struct sock *sk) -{ - struct rpc_xprt *xprt; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - dprintk("RPC: tcp_state_change client %p...\n", xprt); - dprintk("RPC: state %x conn %d dead %d zapped %d\n", - sk->sk_state, xprt_connected(xprt), - sock_flag(sk, SOCK_DEAD), - sock_flag(sk, SOCK_ZAPPED)); - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); - if (!xprt_test_and_set_connected(xprt)) { - /* Reset TCP record info */ - xprt->tcp_offset = 0; - xprt->tcp_reclen = 0; - xprt->tcp_copied = 0; - xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); - } - spin_unlock_bh(&xprt->sock_lock); - break; - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - default: - xprt_disconnect(xprt); - break; + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } - out: - read_unlock(&sk->sk_callback_lock); } -/* - * Called when more output buffer space is available for this socket. - * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg - * with a bunch of small requests. +/** + * xprt_complete_rqst - called when reply processing is complete + * @task: RPC request that recently completed + * @copied: actual number of bytes received from the transport + * + * Caller holds transport lock. */ -static void -xprt_write_space(struct sock *sk) +void xprt_complete_rqst(struct rpc_task *task, int copied) { - struct rpc_xprt *xprt; - struct socket *sock; - - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) - goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) - goto out; - } + struct rpc_rqst *req = task->tk_rqstp; - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; + dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", + task->tk_pid, ntohl(req->rq_xid), copied); - spin_lock_bh(&xprt->sock_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); -out: - read_unlock(&sk->sk_callback_lock); + list_del_init(&req->rq_list); + req->rq_received = req->rq_private_buf.len = copied; + rpc_wake_up_task(task); } -/* - * RPC receive timeout handler. - */ -static void -xprt_timer(struct rpc_task *task) +static void xprt_timer(struct rpc_task *task) { - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->sock_lock); - if (req->rq_received) - goto out; - - xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); - __xprt_put_cong(xprt, req); + dprintk("RPC: %4d xprt_timer\n", task->tk_pid); - dprintk("RPC: %4d xprt_timer (%s request)\n", - task->tk_pid, req ? "pending" : "backlogged"); - - task->tk_status = -ETIMEDOUT; -out: + spin_lock(&xprt->transport_lock); + if (!req->rq_received) { + if (xprt->ops->timer) + xprt->ops->timer(task); + task->tk_status = -ETIMEDOUT; + } task->tk_timeout = 0; rpc_wake_up_task(task); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } -/* - * Place the actual RPC call. - * We have to copy the iovec because sendmsg fiddles with its contents. +/** + * xprt_prepare_transmit - reserve the transport before sending a request + * @task: RPC task about to send a request + * */ -int -xprt_prepare_transmit(struct rpc_task *task) +int xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; @@ -1191,12 +690,12 @@ xprt_prepare_transmit(struct rpc_task *task) if (xprt->shutdown) return -EIO; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (req->rq_received && !req->rq_bytes_sent) { err = req->rq_received; goto out_unlock; } - if (!__xprt_lock_write(xprt, task)) { + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; goto out_unlock; } @@ -1206,39 +705,42 @@ xprt_prepare_transmit(struct rpc_task *task) goto out_unlock; } out_unlock: - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return err; } void -xprt_transmit(struct rpc_task *task) +xprt_abort_transmit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + xprt_release_write(xprt, task); +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * We have to copy the iovec because sendmsg fiddles with its contents. + */ +void xprt_transmit(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int status, retry = 0; - + int status; dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - /* set up everything as needed. */ - /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; - - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } - smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -1246,40 +748,19 @@ xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; - /* Continue transmitting the packet/record. We must be careful - * to cope with writespace callbacks arriving _after_ we have - * called xprt_sendmsg(). - */ - while (1) { - req->rq_xtime = jiffies; - status = xprt_sendmsg(xprt, req); - - if (status < 0) - break; - - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - goto out_receive; - } - } else { - if (status >= req->rq_slen) - goto out_receive; - status = -EAGAIN; - break; - } - - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); - - status = -EAGAIN; - if (retry++ > 50) - break; + status = xprt->ops->send_request(task); + if (status == 0) { + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + xprt->ops->release_xprt(xprt, task); + spin_unlock_bh(&xprt->transport_lock); + return; } /* Note: at this point, task->tk_sleeping has not yet been set, @@ -1289,60 +770,19 @@ xprt_transmit(struct rpc_task *task) task->tk_status = status; switch (status) { - case -EAGAIN: - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xprt_write_space */ - spin_lock_bh(&xprt->sock_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } - spin_unlock_bh(&xprt->sock_lock); - return; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); - return; case -ECONNREFUSED: - task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); + case -EAGAIN: case -ENOTCONN: return; default: - if (xprt->stream) - xprt_disconnect(xprt); + break; } xprt_release_write(xprt, task); return; - out_receive: - dprintk("RPC: %4d xmit complete\n", task->tk_pid); - /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->sock_lock); - if (!xprt->nocong) { - int timer = task->tk_msg.rpc_proc->p_timer; - task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); - task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; - if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0) - task->tk_timeout = xprt->timeout.to_maxval; - } else - task->tk_timeout = req->rq_timeout; - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); } -/* - * Reserve an RPC call slot. - */ -static inline void -do_xprt_reserve(struct rpc_task *task) +static inline void do_xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -1362,22 +802,25 @@ do_xprt_reserve(struct rpc_task *task) rpc_sleep_on(&xprt->backlog, task, NULL, NULL); } -void -xprt_reserve(struct rpc_task *task) +/** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation + * + * If no more slots are available, place the task on the transport's + * backlog queue. + */ +void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = -EIO; if (!xprt->shutdown) { - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); do_xprt_reserve(task); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } } -/* - * Allocate a 'unique' XID - */ static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) { return xprt->xid++; @@ -1388,11 +831,7 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt) get_random_bytes(&xprt->xid, sizeof(xprt->xid)); } -/* - * Initialize RPC request - */ -static void -xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) { struct rpc_rqst *req = task->tk_rqstp; @@ -1400,128 +839,104 @@ xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_xid = xprt_alloc_xid(xprt); + req->rq_release_snd_buf = NULL; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } -/* - * Release an RPC call slot +/** + * xprt_release - release an RPC request slot + * @task: task which is finished with the slot + * */ -void -xprt_release(struct rpc_task *task) +void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; if (!(req = task->tk_rqstp)) return; - spin_lock_bh(&xprt->sock_lock); - __xprt_release_write(xprt, task); - __xprt_put_cong(xprt, req); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->release_xprt(xprt, task); + if (xprt->ops->release_request) + xprt->ops->release_request(task); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) - mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); - spin_unlock_bh(&xprt->sock_lock); + mod_timer(&xprt->timer, + xprt->last_used + xprt->idle_timeout); + spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); memset(req, 0, sizeof(*req)); /* mark unused */ dprintk("RPC: %4d release request %p\n", task->tk_pid, req); - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); - xprt_clear_backlog(xprt); - spin_unlock(&xprt->xprt_lock); -} - -/* - * Set default timeout parameters - */ -static void -xprt_default_timeout(struct rpc_timeout *to, int proto) -{ - if (proto == IPPROTO_UDP) - xprt_set_timeout(to, 5, 5 * HZ); - else - xprt_set_timeout(to, 5, 60 * HZ); + rpc_wake_up_next(&xprt->backlog); + spin_unlock(&xprt->reserve_lock); } -/* - * Set constant timeout +/** + * xprt_set_timeout - set constant RPC timeout + * @to: RPC timeout parameters to set up + * @retr: number of retries + * @incr: amount of increase after each retry + * */ -void -xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) +void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) { to->to_initval = to->to_increment = incr; - to->to_maxval = incr * retr; + to->to_maxval = to->to_initval + (incr * retr); to->to_retries = retr; to->to_exponential = 0; } -unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; -unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; - -/* - * Initialize an RPC client - */ -static struct rpc_xprt * -xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) +static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { + int result; struct rpc_xprt *xprt; - unsigned int entries; - size_t slot_table_size; struct rpc_rqst *req; - dprintk("RPC: setting up %s transport...\n", - proto == IPPROTO_UDP? "UDP" : "TCP"); - - entries = (proto == IPPROTO_TCP)? - xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; - if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) return ERR_PTR(-ENOMEM); memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ - xprt->max_reqs = entries; - slot_table_size = entries * sizeof(xprt->slot[0]); - xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); - if (xprt->slot == NULL) { - kfree(xprt); - return ERR_PTR(-ENOMEM); - } - memset(xprt->slot, 0, slot_table_size); xprt->addr = *ap; - xprt->prot = proto; - xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; - if (xprt->stream) { - xprt->cwnd = RPC_MAXCWND(xprt); - xprt->nocong = 1; - xprt->max_payload = (1U << 31) - 1; - } else { - xprt->cwnd = RPC_INITCWND; - xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + switch (proto) { + case IPPROTO_UDP: + result = xs_setup_udp(xprt, to); + break; + case IPPROTO_TCP: + result = xs_setup_tcp(xprt, to); + break; + default: + printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n", + proto); + result = -EIO; + break; + } + if (result) { + kfree(xprt); + return ERR_PTR(result); } - spin_lock_init(&xprt->sock_lock); - spin_lock_init(&xprt->xprt_lock); - init_waitqueue_head(&xprt->cong_wait); + + spin_lock_init(&xprt->transport_lock); + spin_lock_init(&xprt->reserve_lock); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); - INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; - xprt->port = XPRT_MAX_RESVPORT; - - /* Set timeout parameters */ - if (to) { - xprt->timeout = *to; - } else - xprt_default_timeout(&xprt->timeout, xprt->prot); + xprt->cwnd = RPC_INITCWND; rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -1529,139 +944,25 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); /* initialize free list */ - for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--) + for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--) list_add(&req->rq_list, &xprt->free); xprt_init_xid(xprt); - /* Check whether we want to use a reserved port */ - xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); return xprt; } -/* - * Bind to a reserved port - */ -static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sockaddr_in myaddr = { - .sin_family = AF_INET, - }; - int err, port; - - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; - do { - myaddr.sin_port = htons(port); - err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, - sizeof(myaddr)); - if (err == 0) { - xprt->port = port; - return 0; - } - if (--port == 0) - port = XPRT_MAX_RESVPORT; - } while (err == -EADDRINUSE && port != xprt->port); - - printk("RPC: Can't bind to reserved port (%d).\n", -err); - return err; -} - -static void -xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (xprt->inet) - return; - - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = udp_data_ready; - sk->sk_no_check = UDP_CSUM_NORCV; - xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = tcp_data_ready; - sk->sk_state_change = tcp_state_change; - xprt_clear_connected(xprt); - } - sk->sk_write_space = xprt_write_space; - - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); - - return; -} - -/* - * Set socket buffer length - */ -void -xprt_sock_setbufsize(struct rpc_xprt *xprt) -{ - struct sock *sk = xprt->inet; - - if (xprt->stream) - return; - if (xprt->rcvsize) { - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; - } - if (xprt->sndsize) { - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; - sk->sk_write_space(sk); - } -} - -/* - * Datastream sockets are created here, but xprt_connect will create - * and connect stream sockets. - */ -static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xprt_create_socket(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - printk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(xprt, sock) < 0) { - printk("RPC: can't bind to reserved port.\n"); - goto failed; - } - - return sock; - -failed: - sock_release(sock); - return NULL; -} - -/* - * Create an RPC client transport given the protocol and peer address. +/** + * xprt_create_proto - create an RPC client transport + * @proto: requested transport protocol + * @sap: remote peer's address + * @to: timeout parameters for new transport + * */ -struct rpc_xprt * -xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) +struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) { struct rpc_xprt *xprt; @@ -1673,46 +974,26 @@ xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) return xprt; } -/* - * Prepare for transport shutdown. - */ -static void -xprt_shutdown(struct rpc_xprt *xprt) +static void xprt_shutdown(struct rpc_xprt *xprt) { xprt->shutdown = 1; rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->resend); - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); - wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); - - /* synchronously wait for connect worker to finish */ - cancel_delayed_work(&xprt->sock_connect); - flush_scheduled_work(); } -/* - * Clear the xprt backlog queue - */ -static int -xprt_clear_backlog(struct rpc_xprt *xprt) { - rpc_wake_up_next(&xprt->backlog); - wake_up(&xprt->cong_wait); - return 1; -} - -/* - * Destroy an RPC transport, killing off all requests. +/** + * xprt_destroy - destroy an RPC transport, killing off all requests. + * @xprt: transport to destroy + * */ -int -xprt_destroy(struct rpc_xprt *xprt) +int xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); - xprt_disconnect(xprt); - xprt_close(xprt); - kfree(xprt->slot); + xprt->ops->destroy(xprt); kfree(xprt); return 0; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c new file mode 100644 index 00000000000..0a51fd46a84 --- /dev/null +++ b/net/sunrpc/xprtsock.c @@ -0,0 +1,1261 @@ +/* + * linux/net/sunrpc/xprtsock.c + * + * Client-side transport implementation for sockets. + * + * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. + * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> + * + * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com> + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/capability.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/sunrpc/clnt.h> +#include <linux/file.h> + +#include <net/sock.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <net/tcp.h> + +/* + * xprtsock tunables + */ +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; + +/* + * How many times to try sending a request on a socket before waiting + * for the socket buffer to clear. + */ +#define XS_SENDMSG_RETRY (10U) + +/* + * Time out for an RPC UDP socket connect. UDP socket connects are + * synchronous, but we set a timeout anyway in case of resource + * exhaustion on the local host. + */ +#define XS_UDP_CONN_TO (5U * HZ) + +/* + * Wait duration for an RPC TCP connection to be established. Solaris + * NFS over TCP uses 60 seconds, for example, which is in line with how + * long a server takes to reboot. + */ +#define XS_TCP_CONN_TO (60U * HZ) + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. + */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#ifdef RPC_DEBUG_DATA +static void xs_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = XS_SENDMSG_FLAGS, + }; + + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (likely(iov.iov_len)) + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + return kernel_sendmsg(sock, &msg, NULL, 0, 0); +} + +static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = XS_SENDMSG_FLAGS, + }; + + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); +} + +/** + * xs_sendpages - write pages directly to a socket + * @sock: socket to send on + * @addr: UDP only -- address of destination + * @addrlen: UDP only -- length of destination address + * @xdr: buffer containing this request + * @base: starting position in the buffer + * + */ +static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int err, ret = 0; + ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + + if (unlikely(!sock)) + return -ENOTCONN; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + + len = xdr->head[0].iov_len; + if (base < len || (addr != NULL && base == 0)) { + err = xs_send_head(sock, addr, addrlen, xdr, base, len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != (len - base)) + goto out; + base = 0; + } else + base -= len; + + if (unlikely(pglen == 0)) + goto copy_tail; + if (unlikely(base >= pglen)) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + + sendpage = sock->ops->sendpage ? : sock_no_sendpage; + do { + int flags = XS_SENDMSG_FLAGS; + + len = PAGE_CACHE_SIZE; + if (base) + len -= base; + if (pglen < len) + len = pglen; + + if (pglen != len || xdr->tail[0].iov_len != 0) + flags |= MSG_MORE; + + /* Hmm... We might be dealing with highmem pages */ + if (PageHighMem(*ppage)) + sendpage = sock_no_sendpage; + err = sendpage(sock, *ppage, base, len, flags); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != len) + goto out; + base = 0; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) { + err = xs_send_tail(sock, xdr, base, len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + } +out: + return ret; +} + +/** + * xs_nospace - place task on wait queue if transmit was incomplete + * @task: task to put to sleep + * + */ +static void xs_nospace(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + dprintk("RPC: %4d xmit incomplete (%u left of %u)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); + + spin_unlock_bh(&xprt->transport_lock); + } else + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); +} + +/** + * xs_udp_send_request - write an RPC request to a UDP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + */ +static int xs_udp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; + + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + req->rq_xtime = jiffies; + status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), xdr, req->rq_bytes_sent); + + dprintk("RPC: xs_udp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); + + if (likely(status >= (int) req->rq_slen)) + return 0; + + /* Still some bytes left; set up for a retry later. */ + if (status > 0) + status = -EAGAIN; + + switch (status) { + case -ENETUNREACH: + case -EPIPE: + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. */ + break; + case -EAGAIN: + xs_nospace(task); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + break; + } + + return status; +} + +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + +/** + * xs_tcp_send_request - write an RPC request to a TCP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + * + * XXX: In the case of soft timeouts, should we eventually give up + * if sendmsg is not able to make progress? + */ +static int xs_tcp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + int status, retry = 0; + + xs_encode_tcp_record_marker(&req->rq_snd_buf); + + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called sendmsg(). */ + while (1) { + req->rq_xtime = jiffies; + status = xs_sendpages(xprt->sock, NULL, 0, xdr, + req->rq_bytes_sent); + + dprintk("RPC: xs_tcp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); + + if (unlikely(status < 0)) + break; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } + + status = -EAGAIN; + if (retry++ > XS_SENDMSG_RETRY) + break; + } + + switch (status) { + case -EAGAIN: + xs_nospace(task); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + xprt_disconnect(xprt); + break; + } + + return status; +} + +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + */ +static void xs_close(struct rpc_xprt *xprt) +{ + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; + + if (!sk) + return; + + dprintk("RPC: xs_close xprt %p\n", xprt); + + write_lock_bh(&sk->sk_callback_lock); + xprt->inet = NULL; + xprt->sock = NULL; + + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; + sk->sk_state_change = xprt->old_state_change; + sk->sk_write_space = xprt->old_write_space; + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +/** + * xs_destroy - prepare to shutdown a transport + * @xprt: doomed transport + * + */ +static void xs_destroy(struct rpc_xprt *xprt) +{ + dprintk("RPC: xs_destroy xprt %p\n", xprt); + + cancel_delayed_work(&xprt->connect_worker); + flush_scheduled_work(); + + xprt_disconnect(xprt); + xs_close(xprt); + kfree(xprt->slot); +} + +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +/** + * xs_udp_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * @len: how much data to read + * + */ +static void xs_udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid, *xp; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: xs_udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) + goto out; + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + dprintk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + goto out_unlock; + + /* Something worked... */ + dst_confirm(skb->dst); + + xprt_adjust_cwnd(task, copied); + xprt_update_rtt(task); + xprt_complete_rqst(task, copied); + + out_unlock: + spin_unlock(&xprt->transport_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock(&sk->sk_callback_lock); +} + +static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, p, len)) { + dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return 0; + } + desc->offset += len; + desc->count -= len; + dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return len; +} + +static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; + len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; + used = xs_tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + + xprt->tcp_reclen = ntohl(xprt->tcp_recm); + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) + xprt->tcp_flags |= XPRT_LAST_FRAG; + else + xprt->tcp_flags &= ~XPRT_LAST_FRAG; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + + xprt->tcp_flags &= ~XPRT_COPY_RECM; + xprt->tcp_offset = 0; + + /* Sanity check of the record length */ + if (unlikely(xprt->tcp_reclen < 4)) { + dprintk("RPC: invalid TCP record fragment length\n"); + xprt_disconnect(xprt); + return; + } + dprintk("RPC: reading TCP record fragment of length %d\n", + xprt->tcp_reclen); +} + +static void xs_tcp_check_recm(struct rpc_xprt *xprt) +{ + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); + if (xprt->tcp_offset == xprt->tcp_reclen) { + xprt->tcp_flags |= XPRT_COPY_RECM; + xprt->tcp_offset = 0; + if (xprt->tcp_flags & XPRT_LAST_FRAG) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + xprt->tcp_flags |= XPRT_COPY_XID; + xprt->tcp_copied = 0; + } + } +} + +static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; + used = xs_tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); + xs_tcp_check_recm(xprt); +} + +static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + struct rpc_rqst *req; + struct xdr_buf *rcvbuf; + size_t len; + ssize_t r; + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xprt->tcp_xid); + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", + ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->transport_lock); + return; + } + + rcvbuf = &req->rq_private_buf; + len = desc->count; + if (len > xprt->tcp_reclen - xprt->tcp_offset) { + skb_reader_t my_desc; + + len = xprt->tcp_reclen - xprt->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + &my_desc, xs_tcp_copy_data); + desc->count -= r; + desc->offset += r; + } else + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + desc, xs_tcp_copy_data); + + if (r > 0) { + xprt->tcp_copied += r; + xprt->tcp_offset += r; + } + if (r != len) { + /* Error when copying to the receive buffer, + * usually because we weren't able to allocate + * additional buffer pages. All we can do now + * is turn off XPRT_COPY_DATA, so the request + * will not receive any additional updates, + * and time out. + * Any remaining data from this record will + * be discarded. + */ + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x truncated request\n", + ntohl(xprt->tcp_xid)); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + goto out; + } + + dprintk("RPC: XID %08x read %Zd bytes\n", + ntohl(xprt->tcp_xid), r); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + + if (xprt->tcp_copied == req->rq_private_buf.buflen) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + else if (xprt->tcp_offset == xprt->tcp_reclen) { + if (xprt->tcp_flags & XPRT_LAST_FRAG) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + } + +out: + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) + xprt_complete_rqst(req->rq_task, xprt->tcp_copied); + spin_unlock(&xprt->transport_lock); + xs_tcp_check_recm(xprt); +} + +static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len; + + len = xprt->tcp_reclen - xprt->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + xprt->tcp_offset += len; + dprintk("RPC: discarded %Zu bytes\n", len); + xs_tcp_check_recm(xprt); +} + +static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + skb_reader_t desc = { + .skb = skb, + .offset = offset, + .count = len, + .csum = 0 + }; + + dprintk("RPC: xs_tcp_data_recv started\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (xprt->tcp_flags & XPRT_COPY_RECM) { + xs_tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (xprt->tcp_flags & XPRT_COPY_XID) { + xs_tcp_read_xid(xprt, &desc); + continue; + } + /* Read in the request data */ + if (xprt->tcp_flags & XPRT_COPY_DATA) { + xs_tcp_read_request(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + xs_tcp_read_discard(xprt, &desc); + } while (desc.count); + dprintk("RPC: xs_tcp_data_recv done\n"); + return len - desc.count; +} + +/** + * xs_tcp_data_ready - "data ready" callback for TCP sockets + * @sk: socket with data to read + * @bytes: how much data to read + * + */ +static void xs_tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: xs_tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) + goto out; + if (xprt->shutdown) + goto out; + + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ + rd_desc.arg.data = xprt; + rd_desc.count = 65536; + tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); +out: + read_unlock(&sk->sk_callback_lock); +} + +/** + * xs_tcp_state_change - callback to handle TCP socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED)); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock_bh(&xprt->transport_lock); + if (!xprt_test_and_set_connected(xprt)) { + /* Reset TCP record info */ + xprt->tcp_offset = 0; + xprt->tcp_reclen = 0; + xprt->tcp_copied = 0; + xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt_wake_pending_tasks(xprt, 0); + } + spin_unlock_bh(&xprt->transport_lock); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + default: + xprt_disconnect(xprt); + break; + } + out: + read_unlock(&sk->sk_callback_lock); +} + +/** + * xs_udp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. + */ +static void xs_udp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* from net/core/sock.c:sock_def_write_space */ + if (sock_writeable(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + goto out; + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); + } + + out: + read_unlock(&sk->sk_callback_lock); +} + +/** + * xs_tcp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. + */ +static void xs_tcp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + goto out; + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); + } + + out: + read_unlock(&sk->sk_callback_lock); +} + +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) +{ + struct sock *sk = xprt->inet; + + if (xprt->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; + } + if (xprt->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/** + * xs_udp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes + * + * Set socket send and receive buffer size limits. + */ +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) +{ + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + 1024; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); +} + +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. + */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + +static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + int err; + unsigned short port = xprt->port; + + do { + myaddr.sin_port = htons(port); + err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (err == 0) { + xprt->port = port; + dprintk("RPC: xs_bindresvport bound to port %u\n", + port); + return 0; + } + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; + } while (err == -EADDRINUSE && port != xprt->port); + + dprintk("RPC: can't bind to reserved port (%d).\n", -err); + return err; +} + +/** + * xs_udp_connect_worker - set up a UDP socket + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_udp_connect_worker(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *) args; + struct socket *sock = xprt->sock; + int err, status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt); + + /* Start by resetting any existing state */ + xs_close(xprt); + + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); + goto out; + } + + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_no_check = UDP_CSUM_NORCV; + + xprt_set_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + xs_udp_do_set_buffer_size(xprt); + status = 0; +out: + xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); +} + +/* + * We need to preserve the port number so the reply cache on the server can + * find our cached RPC replies when we get around to reconnecting. + */ +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +{ + int result; + struct socket *sock = xprt->sock; + struct sockaddr any; + + dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); + + /* + * Disconnect the transport socket by doing a connect operation + * with AF_UNSPEC. This should return immediately... + */ + memset(&any, 0, sizeof(any)); + any.sa_family = AF_UNSPEC; + result = sock->ops->connect(sock, &any, sizeof(any), 0); + if (result) + dprintk("RPC: AF_UNSPEC connect return code %d\n", + result); +} + +/** + * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_tcp_connect_worker(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int err, status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); + + if (!xprt->sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } + + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); + + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_close(xprt); + break; + } + } +out: + xprt_wake_pending_tasks(xprt, status); +out_clear: + xprt_clear_connecting(xprt); +} + +/** + * xs_connect - connect a socket to a remote endpoint + * @task: address of RPC task that manages state of connect request + * + * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). + */ +static void xs_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (xprt_test_and_set_connecting(xprt)) + return; + + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n", + xprt, xprt->reestablish_timeout / HZ); + schedule_delayed_work(&xprt->connect_worker, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + schedule_work(&xprt->connect_worker); + + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); + } +} + +static struct rpc_xprt_ops xs_udp_ops = { + .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .connect = xs_connect, + .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, + .close = xs_close, + .destroy = xs_destroy, +}; + +static struct rpc_xprt_ops xs_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, + .connect = xs_connect, + .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xs_close, + .destroy = xs_destroy, +}; + +/** + * xs_setup_udp - Set up transport to use a UDP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ +int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up udp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_udp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_UDP; + xprt->port = xprt_max_resvport; + xprt->tsh_size = 0; + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + /* XXX: header size can vary due to auth type, IPv6, etc. */ + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_udp_ops; + + if (to) + xprt->timeout = *to; + else + xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); + + return 0; +} + +/** + * xs_setup_tcp - Set up transport to use a TCP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ +int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up tcp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_tcp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_TCP; + xprt->port = xprt_max_resvport; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_tcp_ops; + + if (to) + xprt->timeout = *to; + else + xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); + + return 0; +} diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41feca3bef8..acc73ba8bad 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); if (err) goto fail; - err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); + err = vfs_permission(&nd, MAY_WRITE); if (err) goto put_fail; diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 596cb96e5f4..59fec59b213 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -1099,7 +1099,7 @@ static void release_driver(struct sock *sk) sock_reset_flag(sk, SOCK_ZAPPED); wp = wp_sk(sk); - if (wp && wp->mbox) { + if (wp) { kfree(wp->mbox); wp->mbox = NULL; } @@ -1186,10 +1186,8 @@ static void wanpipe_kill_sock_timer (unsigned long data) return; } - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1219,10 +1217,8 @@ static void wanpipe_kill_sock_accept (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1243,10 +1239,8 @@ static void wanpipe_kill_sock_irq (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 13b650ad22e..bcf7b3faa76 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -714,10 +714,8 @@ static int wanrouter_device_new_if(struct wan_device *wandev, } /* This code has moved from del_if() function */ - if (dev->priv) { - kfree(dev->priv); - dev->priv = NULL; - } + kfree(dev->priv); + dev->priv = NULL; #ifdef CONFIG_WANPIPE_MULTPPP if (cnf->config_id == WANCONFIG_MPPP) @@ -851,10 +849,8 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name) /* Due to new interface linking method using dev->priv, * this code has moved from del_if() function.*/ - if (dev->priv){ - kfree(dev->priv); - dev->priv=NULL; - } + kfree(dev->priv); + dev->priv=NULL; unregister_netdev(dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cbb0ba34a60..0db9e57013f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1192,46 +1192,6 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) EXPORT_SYMBOL(xfrm_bundle_ok); -/* Well... that's _TASK_. We need to scan through transformation - * list and figure out what mss tcp should generate in order to - * final datagram fit to mtu. Mama mia... :-) - * - * Apparently, some easy way exists, but we used to choose the most - * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta. - * - * Consider this function as something like dark humour. :-) - */ -static int xfrm_get_mss(struct dst_entry *dst, u32 mtu) -{ - int res = mtu - dst->header_len; - - for (;;) { - struct dst_entry *d = dst; - int m = res; - - do { - struct xfrm_state *x = d->xfrm; - if (x) { - spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_VALID && - x->type && x->type->get_max_size) - m = x->type->get_max_size(d->xfrm, m); - else - m += x->props.header_len; - spin_unlock_bh(&x->lock); - } - } while ((d = d->child) != NULL); - - if (m <= mtu) - break; - res -= (m - mtu); - if (res < 88) - return mtu; - } - - return res + dst->header_len; -} - int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { int err = 0; @@ -1252,8 +1212,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; - if (likely(dst_ops->get_mss == NULL)) - dst_ops->get_mss = xfrm_get_mss; if (likely(afinfo->garbage_collect == NULL)) afinfo->garbage_collect = __xfrm_garbage_collect; xfrm_policy_afinfo[afinfo->family] = afinfo; @@ -1281,7 +1239,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; - dst_ops->get_mss = NULL; afinfo->garbage_collect = NULL; } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9d206c282cf..7cf48aa6c95 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -62,14 +62,10 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (del_timer(&x->timer)) BUG(); - if (x->aalg) - kfree(x->aalg); - if (x->ealg) - kfree(x->ealg); - if (x->calg) - kfree(x->calg); - if (x->encap) - kfree(x->encap); + kfree(x->aalg); + kfree(x->ealg); + kfree(x->calg); + kfree(x->encap); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -1026,6 +1022,12 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); +/* + * This function is NOT optimal. For example, with ESP it will give an + * MTU that's usually two bytes short of being optimal. However, it will + * usually give an answer that's a multiple of 4 provided the input is + * also a multiple of 4. + */ int xfrm_state_mtu(struct xfrm_state *x, int mtu) { int res = mtu; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c35336a0f71..0cdd9a07e04 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -18,7 +18,6 @@ #include <linux/string.h> #include <linux/net.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> @@ -26,6 +25,7 @@ #include <linux/security.h> #include <net/sock.h> #include <net/xfrm.h> +#include <net/netlink.h> #include <asm/uaccess.h> static struct sock *xfrm_nl; @@ -948,11 +948,6 @@ static struct xfrm_link { [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, }; -static int xfrm_done(struct netlink_callback *cb) -{ - return 0; -} - static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { struct rtattr *xfrma[XFRMA_MAX]; @@ -984,20 +979,15 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { - u32 rlen; - if (link->dump == NULL) goto err_einval; if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, - link->dump, - xfrm_done)) != 0) { + link->dump, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -1032,60 +1022,13 @@ err_einval: return -1; } -static int xfrm_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) { - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - static void xfrm_netlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - down(&xfrm_cfg_sem); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (xfrm_user_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg); up(&xfrm_cfg_sem); } while (qlen); |