From 38815b780285a4957852c5c9dbe94991c0b26c56 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Mar 2011 16:55:21 -0800 Subject: libceph: fix handling of short returns from get_user_pages get_user_pages() can return fewer pages than we ask for. We were returning a bogus pointer/error code in that case. Instead, loop until we get all the pages we want or get an error we can return to the caller. Signed-off-by: Sage Weil --- net/ceph/pagevec.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 1a040e64c69..cd9c21df87d 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data, int num_pages, bool write_page) { struct page **pages; - int rc; + int got = 0; + int rc = 0; pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); if (!pages) return ERR_PTR(-ENOMEM); down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, write_page, 0, pages, NULL); + while (got < num_pages) { + rc = get_user_pages(current, current->mm, + (unsigned long)data + ((unsigned long)got * PAGE_SIZE), + num_pages - got, write_page, 0, pages + got, NULL); + if (rc < 0) + break; + BUG_ON(rc == 0); + got += rc; + } up_read(¤t->mm->mmap_sem); - if (rc < num_pages) + if (rc < 0) goto fail; return pages; fail: - ceph_put_page_vector(pages, rc > 0 ? rc : 0, false); + ceph_put_page_vector(pages, got, false); return ERR_PTR(rc); } EXPORT_SYMBOL(ceph_get_direct_page_vector); -- cgit v1.2.3-70-g09d2 From 692d20f576fb26f62c83f80dbf3ea899998391b7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 12:14:53 -0800 Subject: libceph: retry after authorization failure If we mark the connection CLOSED we will give up trying to reconnect to this server instance. That is appropriate for things like a protocol version mismatch that won't change until the server is restarted, at which point we'll get a new addr and reconnect. An authorization failure like this is probably due to the server not properly rotating it's secret keys, however, and should be treated as transient so that the normal backoff and retry behavior kicks in. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 35b36b86d76..6bd5025f622 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1248,8 +1248,6 @@ static int process_connect(struct ceph_connection *con) con->auth_retry); if (con->auth_retry == 2) { con->error_msg = "connect authorization failure"; - reset_connection(con); - set_bit(CLOSED, &con->state); return -1; } con->auth_retry = 1; -- cgit v1.2.3-70-g09d2 From 60bf8bf8815e6adea4c1d0423578c3b8000e2ec8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:24:28 -0800 Subject: libceph: fix msgr backoff With commit f363e45f we replaced a bunch of hacky workqueue mutual exclusion logic with the WQ_NON_REENTRANT flag. One pieces of fallout is that the exponential backoff breaks in certain cases: * con_work attempts to connect. * we get an immediate failure, and the socket state change handler queues immediate work. * con_work calls con_fault, we decide to back off, but can't queue delayed work. In this case, we add a BACKOFF bit to make con_work reschedule delayed work next time it runs (which should be immediately). Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 1 + net/ceph/messenger.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c3011beac30..eb31e108a64 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -123,6 +123,7 @@ struct ceph_msg_pos { #define SOCK_CLOSED 11 /* socket state changed to closed */ #define OPENING 13 /* open connection w/ (possibly new) peer */ #define DEAD 14 /* dead, about to kfree */ +#define BACKOFF 15 /* * A single connection with another host. diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6bd5025f622..46fbc422ba7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work) work.work); mutex_lock(&con->mutex); + if (test_and_clear_bit(BACKOFF, &con->state)) { + dout("con_work %p backing off\n", con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("con_work %p backoff %lu\n", con, con->delay); + mutex_unlock(&con->mutex); + return; + } else { + con->ops->put(con); + dout("con_work %p FAILED to back off %lu\n", con, + con->delay); + } + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); @@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) + round_jiffies_relative(con->delay))) { + dout("fault queued %p delay %lu\n", con, con->delay); + } else { con->ops->put(con); + dout("fault failed to queue %p delay %lu, backoff\n", + con, con->delay); + /* + * In many cases we see a socket state change + * while con_work is running and end up + * queuing (non-delayed) work, such that we + * can't backoff with a delay. Set a flag so + * that when con_work restarts we schedule the + * delay then. + */ + set_bit(BACKOFF, &con->state); + } } out_unlock: -- cgit v1.2.3-70-g09d2 From e76661d0a59e53e5cc4dccbe4b755d1dc8a968ec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 10:10:15 -0800 Subject: libceph: fix msgr keepalive flag There was some broken keepalive code using a dead variable. Shift to using the proper bit flag. Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index eb31e108a64..31d91a64838 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -161,7 +161,6 @@ struct ceph_connection { struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued for send */ - bool out_keepalive_pending; u64 in_seq, in_seq_acked; /* last message received, acked */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 46fbc422ba7..3252ea974e8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -336,7 +336,6 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } - con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -2019,10 +2018,10 @@ static void ceph_fault(struct ceph_connection *con) /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); - /* If there are no messages in the queue, place the connection - * in a STANDBY state (i.e., don't try to reconnect just yet). */ - if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { - dout("fault setting STANDBY\n"); + /* If there are no messages queued or keepalive pending, place + * the connection in a STANDBY state */ + if (list_empty(&con->out_queue) && + !test_bit(KEEPALIVE_PENDING, &con->state)) { set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ -- cgit v1.2.3-70-g09d2 From e00de341fdb76c955703b4438100f9933c452b7f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:25:05 -0800 Subject: libceph: fix msgr standby handling The standby logic used to be pretty dependent on the work requeueing behavior that changed when we switched to WQ_NON_REENTRANT. It was also very fragile. Restructure things so that: - We clear WRITE_PENDING when we set STANDBY. This ensures we will requeue work when we wake up later. - con_work backs off if STANDBY is set. There is nothing to do if we are in standby. - clear_standby() helper is called by both con_send() and con_keepalive(), the two actions that can wake us up again. Move the connect_seq++ logic here. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3252ea974e8..05f357828a2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1712,14 +1712,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - /* - * if we were STANDBY and are reconnecting _this_ - * connection, bump connect_seq now. Always bump - * global_seq. - */ - if (test_and_clear_bit(STANDBY, &con->state)) - con->connect_seq++; - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); @@ -1962,6 +1954,10 @@ static void con_work(struct work_struct *work) } } + if (test_bit(STANDBY, &con->state)) { + dout("con_work %p STANDBY\n", con); + goto done; + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -2022,6 +2018,8 @@ static void ceph_fault(struct ceph_connection *con) * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !test_bit(KEEPALIVE_PENDING, &con->state)) { + dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); + clear_bit(WRITE_PENDING, &con->state); set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ @@ -2117,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) } EXPORT_SYMBOL(ceph_messenger_destroy); +static void clear_standby(struct ceph_connection *con) +{ + /* come back from STANDBY? */ + if (test_and_clear_bit(STANDBY, &con->state)) { + mutex_lock(&con->mutex); + dout("clear_standby %p and ++connect_seq\n", con); + con->connect_seq++; + WARN_ON(test_bit(WRITE_PENDING, &con->state)); + WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); + mutex_unlock(&con->mutex); + } +} + /* * Queue up an outgoing message on the given connection. */ @@ -2149,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ + clear_standby(con); if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } @@ -2214,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) */ void ceph_con_keepalive(struct ceph_connection *con) { + dout("con_keepalive %p\n", con); + clear_standby(con); if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); -- cgit v1.2.3-70-g09d2 From 2ea6d8c446752008df7f37867f0cf7483533b05e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 3 Mar 2011 23:35:07 +0000 Subject: net: Enter net/ipv6/ even if CONFIG_IPV6=n exthdrs_core.c and addrconf_core.c in net/ipv6/ contain bits which must be made available even if IPv6 is disabled. net/ipv6/Makefile already correctly includes them if CONFIG_IPV6=n but net/Makefile prevents entering the subdirectory. Signed-off-by: Thomas Graf Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- net/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/Makefile b/net/Makefile index a3330ebe2c5..a51d9465e62 100644 --- a/net/Makefile +++ b/net/Makefile @@ -19,9 +19,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ -ifneq ($(CONFIG_IPV6),) -obj-y += ipv6/ -endif +obj-$(CONFIG_NET) += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ -- cgit v1.2.3-70-g09d2 From b3ca9b02b00704053a38bfe4c31dbbb9c13595d0 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Mon, 28 Feb 2011 04:50:55 +0000 Subject: net: fix multithreaded signal handling in unix recv routines The unix_dgram_recvmsg and unix_stream_recvmsg routines in net/af_unix.c utilize mutex_lock(&u->readlock) calls in order to serialize read operations of multiple threads on a single socket. This implies that, if all n threads of a process block in an AF_UNIX recv call trying to read data from the same socket, one of these threads will be sleeping in state TASK_INTERRUPTIBLE and all others in state TASK_UNINTERRUPTIBLE. Provided that a particular signal is supposed to be handled by a signal handler defined by the process and that none of this threads is blocking the signal, the complete_signal routine in kernel/signal.c will select the 'first' such thread it happens to encounter when deciding which thread to notify that a signal is supposed to be handled and if this is one of the TASK_UNINTERRUPTIBLE threads, the signal won't be handled until the one thread not blocking on the u->readlock mutex is woken up because some data to process has arrived (if this ever happens). The included patch fixes this by changing mutex_lock to mutex_lock_interruptible and handling possible error returns in the same way interruptions are handled by the actual receive-code. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dd419d28620..437a99e560e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1724,7 +1724,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = 0; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(sock_rcvtimeo(sk, noblock)); + goto out; + } skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) { @@ -1864,7 +1868,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, memset(&tmp_scm, 0, sizeof(tmp_scm)); } - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(timeo); + goto out; + } do { int chunk; @@ -1895,11 +1903,12 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, timeo = unix_stream_data_wait(sk, timeo); - if (signal_pending(current)) { + if (signal_pending(current) + || mutex_lock_interruptible(&u->readlock)) { err = sock_intr_errno(timeo); goto out; } - mutex_lock(&u->readlock); + continue; unlock: unix_state_unlock(sk); -- cgit v1.2.3-70-g09d2 From 6094628bfd94323fc1cea05ec2c6affd98c18f7f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 2 Mar 2011 06:28:22 +0000 Subject: rds: prevent BUG_ON triggering on congestion map updates Recently had this bug halt reported to me: kernel BUG at net/rds/send.c:329! Oops: Exception in kernel mode, sig: 5 [#1] SMP NR_CPUS=1024 NUMA pSeries Modules linked in: rds sunrpc ipv6 dm_mirror dm_region_hash dm_log ibmveth sg ext4 jbd2 mbcache sd_mod crc_t10dif ibmvscsic scsi_transport_srp scsi_tgt dm_mod [last unloaded: scsi_wait_scan] NIP: d000000003ca68f4 LR: d000000003ca67fc CTR: d000000003ca8770 REGS: c000000175cab980 TRAP: 0700 Not tainted (2.6.32-118.el6.ppc64) MSR: 8000000000029032 CR: 44000022 XER: 00000000 TASK = c00000017586ec90[1896] 'krdsd' THREAD: c000000175ca8000 CPU: 0 GPR00: 0000000000000150 c000000175cabc00 d000000003cb7340 0000000000002030 GPR04: ffffffffffffffff 0000000000000030 0000000000000000 0000000000000030 GPR08: 0000000000000001 0000000000000001 c0000001756b1e30 0000000000010000 GPR12: d000000003caac90 c000000000fa2500 c0000001742b2858 c0000001742b2a00 GPR16: c0000001742b2a08 c0000001742b2820 0000000000000001 0000000000000001 GPR20: 0000000000000040 c0000001742b2814 c000000175cabc70 0800000000000000 GPR24: 0000000000000004 0200000000000000 0000000000000000 c0000001742b2860 GPR28: 0000000000000000 c0000001756b1c80 d000000003cb68e8 c0000001742b27b8 NIP [d000000003ca68f4] .rds_send_xmit+0x4c4/0x8a0 [rds] LR [d000000003ca67fc] .rds_send_xmit+0x3cc/0x8a0 [rds] Call Trace: [c000000175cabc00] [d000000003ca67fc] .rds_send_xmit+0x3cc/0x8a0 [rds] (unreliable) [c000000175cabd30] [d000000003ca7e64] .rds_send_worker+0x54/0x100 [rds] [c000000175cabdb0] [c0000000000b475c] .worker_thread+0x1dc/0x3c0 [c000000175cabed0] [c0000000000baa9c] .kthread+0xbc/0xd0 [c000000175cabf90] [c000000000032114] .kernel_thread+0x54/0x70 Instruction dump: 4bfffd50 60000000 60000000 39080001 935f004c f91f0040 41820024 813d017c 7d094a78 7d290074 7929d182 394a0020 <0b090000> 40e2ff68 4bffffa4 39200000 Kernel panic - not syncing: Fatal exception Call Trace: [c000000175cab560] [c000000000012e04] .show_stack+0x74/0x1c0 (unreliable) [c000000175cab610] [c0000000005a365c] .panic+0x80/0x1b4 [c000000175cab6a0] [c00000000002fbcc] .die+0x21c/0x2a0 [c000000175cab750] [c000000000030000] ._exception+0x110/0x220 [c000000175cab910] [c000000000004b9c] program_check_common+0x11c/0x180 Signed-off-by: David S. Miller --- net/rds/ib_send.c | 5 ++++- net/rds/loop.c | 11 ++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 71f373c421b..c47a511f203 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -551,7 +551,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (conn->c_loopback && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + scat = &rm->data.op_sg[sg]; + ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + ret = min_t(int, ret, scat->length - conn->c_xmit_data_off); + return ret; } /* FIXME we may overallocate here */ diff --git a/net/rds/loop.c b/net/rds/loop.c index aeec1d483b1..bca6761a3ca 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -61,10 +61,15 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + /* Do not send cong updates to loopback */ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; } BUG_ON(hdr_off || sg || off); @@ -80,8 +85,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, NULL); rds_inc_put(&rm->m_inc); - - return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +out: + return ret; } /* -- cgit v1.2.3-70-g09d2 From 6c91afe1a984d43f922a6a70c7c390b7fb8de45e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 9 Mar 2011 13:27:16 -0800 Subject: ipv4: Fix erroneous uses of ifa_address. In usual cases ifa_address == ifa_local, but in the case where SIOCSIFDSTADDR sets the destination address on a point-to-point link, ifa_address gets set to that destination address. Therefore we should use ifa_local when we want the local interface address. There were two cases where the selection was done incorrectly: 1) When devinet_ioctl() does matching, it checks ifa_address even though gifconf correct reported ifa_local to the user 2) IN_DEV_ARP_NOTIFY handling sends a gratuitous ARP using ifa_address instead of ifa_local. Reported-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index df4616fce92..036652c8166 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -670,7 +670,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifap = &ifa->ifa_next) { if (!strcmp(ifr.ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == - ifa->ifa_address) { + ifa->ifa_local) { break; /* found */ } } @@ -1040,8 +1040,8 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, return; arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_address, dev, - ifa->ifa_address, NULL, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, dev->dev_addr, NULL); } -- cgit v1.2.3-70-g09d2 From 03a14ab134f4811ab1475f07b1305ccaf38b690f Mon Sep 17 00:00:00 2001 From: Daniel Turull Date: Wed, 9 Mar 2011 14:11:00 -0800 Subject: pktgen: fix errata in show results The units in show_results in pktgen were not correct. The results are in usec but it was displayed nsec. Reported-by: Jong-won Lee Signed-off-by: Daniel Turull Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a9e7fc4c461..b5bada92f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3321,7 +3321,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pkt_dev->started_at); ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); - p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", (unsigned long long)ktime_to_us(elapsed), (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), (unsigned long long)ktime_to_us(idle), -- cgit v1.2.3-70-g09d2 From 8909c9ad8ff03611c9c96c9a92656213e4bb495b Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Wed, 2 Mar 2011 00:33:13 +0300 Subject: net: don't allow CAP_NET_ADMIN to load non-netdev kernel modules Since a8f80e8ff94ecba629542d9b4b5f5a8ee3eb565c any process with CAP_NET_ADMIN may load any module from /lib/modules/. This doesn't mean that CAP_NET_ADMIN is a superset of CAP_SYS_MODULE as modules are limited to /lib/modules/**. However, CAP_NET_ADMIN capability shouldn't allow anybody load any module not related to networking. This patch restricts an ability of autoloading modules to netdev modules with explicit aliases. This fixes CVE-2011-1019. Arnd Bergmann suggested to leave untouched the old pre-v2.6.32 behavior of loading netdev modules by name (without any prefix) for processes with CAP_SYS_MODULE to maintain the compatibility with network scripts that use autoloading netdev modules by aliases like "eth0", "wlan0". Currently there are only three users of the feature in the upstream kernel: ipip, ip_gre and sit. root@albatros:~# capsh --drop=$(seq -s, 0 11),$(seq -s, 13 34) -- root@albatros:~# grep Cap /proc/$$/status CapInh: 0000000000000000 CapPrm: fffffff800001000 CapEff: fffffff800001000 CapBnd: fffffff800001000 root@albatros:~# modprobe xfs FATAL: Error inserting xfs (/lib/modules/2.6.38-rc6-00001-g2bf4ca3/kernel/fs/xfs/xfs.ko): Operation not permitted root@albatros:~# lsmod | grep xfs root@albatros:~# ifconfig xfs xfs: error fetching interface information: Device not found root@albatros:~# lsmod | grep xfs root@albatros:~# lsmod | grep sit root@albatros:~# ifconfig sit sit: error fetching interface information: Device not found root@albatros:~# lsmod | grep sit root@albatros:~# ifconfig sit0 sit0 Link encap:IPv6-in-IPv4 NOARP MTU:1480 Metric:1 root@albatros:~# lsmod | grep sit sit 10457 0 tunnel4 2957 1 sit For CAP_SYS_MODULE module loading is still relaxed: root@albatros:~# grep Cap /proc/$$/status CapInh: 0000000000000000 CapPrm: ffffffffffffffff CapEff: ffffffffffffffff CapBnd: ffffffffffffffff root@albatros:~# ifconfig xfs xfs: error fetching interface information: Device not found root@albatros:~# lsmod | grep xfs xfs 745319 0 Reference: https://lkml.org/lkml/2011/2/24/203 Signed-off-by: Vasiliy Kulikov Signed-off-by: Michael Tokarev Acked-by: David S. Miller Acked-by: Kees Cook Signed-off-by: James Morris --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 12 ++++++++++-- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 5 files changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b034..71caf7a5e6c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2392,6 +2392,9 @@ extern int netdev_notice(const struct net_device *dev, const char *format, ...) extern int netdev_info(const struct net_device *dev, const char *format, ...) __attribute__ ((format (printf, 2, 3))); +#define MODULE_ALIAS_NETDEV(device) \ + MODULE_ALIAS("netdev-" device) + #if defined(DEBUG) #define netdev_dbg(__dev, format, args...) \ netdev_printk(KERN_DEBUG, __dev, format, ##args) diff --git a/net/core/dev.c b/net/core/dev.c index 8ae6631abcc..6561021d22d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,21 @@ EXPORT_SYMBOL(netdev_bonding_change); void dev_load(struct net *net, const char *name) { struct net_device *dev; + int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); - if (!dev && capable(CAP_NET_ADMIN)) - request_module("%s", name); + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } } EXPORT_SYMBOL(dev_load); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6613edfac28..d1d0e2c256f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1765,4 +1765,4 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52fba54..a5f58e7cbb2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -913,4 +913,4 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8ce38f10a54..d2c16e10f65 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1290,4 +1290,4 @@ static int __init sit_init(void) module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); -MODULE_ALIAS("sit0"); +MODULE_ALIAS_NETDEV("sit0"); -- cgit v1.2.3-70-g09d2 From 7343ff31ebf01691ea4515d3126467434b9d22d6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 9 Mar 2011 19:55:25 -0800 Subject: ipv6: Don't create clones of host routes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses https://bugzilla.kernel.org/show_bug.cgi?id=29252 Addresses https://bugzilla.kernel.org/show_bug.cgi?id=30462 In commit d80bc0fd262ef840ed4e82593ad6416fa1ba3fc4 ("ipv6: Always clone offlink routes.") we forced the kernel to always clone offlink routes. The reason we do that is to make sure we never bind an inetpeer to a prefixed route. The logic turned on here has existed in the tree for many years, but was always off due to a protecting CPP define. So perhaps it's no surprise that there is a logic bug here. The problem is that we canot clone a route that is already a host route (ie. has DST_HOST set). Because if we do, an identical entry already exists in the routing tree and therefore the ip6_rt_ins() call is going to fail. This sets off a series of failures and high cpu usage, because when ip6_rt_ins() fails we loop retrying this operation a few times in order to handle a race between two threads trying to clone and insert the same host route at the same time. Fix this by simply using the route as-is when DST_HOST is set. Reported-by: slash@ac.auone-net.jp Reported-by: Ernst Sjöstrand Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 904312e25a3..e7db7014e89 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -739,8 +739,10 @@ restart: if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src); - else + else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl->fl6_dst); + else + goto out2; dst_release(&rt->dst); rt = nrt ? : net->ipv6.ip6_null_entry; -- cgit v1.2.3-70-g09d2 From dcbcdf22f500ac6e4ec06485341024739b9dc241 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 10 Mar 2011 13:45:57 -0800 Subject: net: bridge builtin vs. ipv6 modular When configs BRIDGE=y and IPV6=m, this build error occurs: br_multicast.c:(.text+0xa3341): undefined reference to `ipv6_dev_get_saddr' BRIDGE_IGMP_SNOOPING is boolean; if it were tristate, then adding depends on IPV6 || IPV6=n to BRIDGE_IGMP_SNOOPING would be a good fix. As it is currently, making BRIDGE depend on the IPV6 config works. Reported-by: Patrick Schaaf Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/bridge/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 9190ae462cb..6dee7bf648a 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -6,6 +6,7 @@ config BRIDGE tristate "802.1d Ethernet Bridging" select LLC select STP + depends on IPV6 || IPV6=n ---help--- If you say Y here, then your Linux box will be able to act as an Ethernet bridge, which means that the different Ethernet segments it -- cgit v1.2.3-70-g09d2 From 6dfbd87a20a737641ef228230c77f4262434fa24 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 10 Mar 2011 11:43:19 +0000 Subject: ip6ip6: autoload ip6 tunnel Add necessary alias to autoload ip6ip6 tunnel module. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4f4483e697b..e528a42a52b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG #define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__) -- cgit v1.2.3-70-g09d2