From ecfd6b183780c6d9e85873693b3ce6c5f4d08b58 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 28 Apr 2007 21:20:32 -0700 Subject: [XFRM]: Export SPD info With this patch you can use iproute2 in user space to efficiently see how many policies exist in different directions. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/xfrm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8287081d77f..9561bf817b0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,6 +423,18 @@ struct xfrm_sadinfo u32 sadhmcnt; /* max allowed hash bkts */ u32 sadcnt; /* current running count */ }; + +struct xfrm_spdinfo +{ + u32 incnt; + u32 outcnt; + u32 fwdcnt; + u32 inscnt; + u32 outscnt; + u32 fwdscnt; + u32 spdhcnt; + u32 spdhmcnt; +}; #ifdef CONFIG_AUDITSYSCALL extern void xfrm_audit_log(uid_t auid, u32 secid, int type, int result, struct xfrm_policy *xp, struct xfrm_state *x); @@ -946,6 +958,7 @@ extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq); extern int xfrm_state_delete(struct xfrm_state *x); extern void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info); extern void xfrm_sad_getinfo(struct xfrm_sadinfo *si); +extern void xfrm_spd_getinfo(struct xfrm_spdinfo *si); extern int xfrm_replay_check(struct xfrm_state *x, __be32 seq); extern void xfrm_replay_advance(struct xfrm_state *x, __be32 seq); extern void xfrm_replay_notify(struct xfrm_state *x, int event); -- cgit v1.2.3-70-g09d2 From 04b090d50c88ac8e5ec9c2e985bb65bd153893aa Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sat, 28 Apr 2007 23:03:59 -0700 Subject: [AF_IUCV/IUCV]: smp_call_function deadlock Calling smp_call_function can lead to a deadlock if it is called from tasklet context. Fixing this deadlock requires to move the smp_call_function from the tasklet context to a work queue. To do that queue the path pending interrupts to a separate list and move the path cleanup out of iucv_path_sever to iucv_path_connect and iucv_path_pending. This creates a new requirement for iucv_path_connect: it may not be called from tasklet context anymore. Also fixed compile problem for CONFIG_HOTPLUG_CPU=n and another one when walking the cpu_online mask. When doing this, we must disable cpu hotplug. Signed-off-by: Frank Pavlic Signed-off-by: Martin Schwidefsky Signed-off-by: David S. Miller --- include/net/iucv/iucv.h | 2 +- net/iucv/iucv.c | 205 +++++++++++++++++++++++++++++++----------------- 2 files changed, 133 insertions(+), 74 deletions(-) (limited to 'include/net') diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index 746e7416261..fd70adbb356 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -16,7 +16,7 @@ * completed a register, it can exploit the other functions. 
* For furthur reference on all IUCV functionality, refer to the * CP Programming Services book, also available on the web thru - * www.ibm.com/s390/vm/pubs, manual # SC24-5760 + * www.vm.ibm.com/pubs, manual # SC24-6084 * * Definition of Return Codes * - All positive return codes including zero are reflected back diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 60f293842a3..903bdb6eaaa 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -90,20 +90,43 @@ struct iucv_irq_data { u32 res2[8]; }; -struct iucv_work { +struct iucv_irq_list { struct list_head list; struct iucv_irq_data data; }; -static LIST_HEAD(iucv_work_queue); -static DEFINE_SPINLOCK(iucv_work_lock); - static struct iucv_irq_data *iucv_irq_data; static cpumask_t iucv_buffer_cpumask = CPU_MASK_NONE; static cpumask_t iucv_irq_cpumask = CPU_MASK_NONE; -static void iucv_tasklet_handler(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_handler,0); +/* + * Queue of interrupt buffers lock for delivery via the tasklet + * (fast but can't call smp_call_function). + */ +static LIST_HEAD(iucv_task_queue); + +/* + * The tasklet for fast delivery of iucv interrupts. + */ +static void iucv_tasklet_fn(unsigned long); +static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); + +/* + * Queue of interrupt buffers for delivery via a work queue + * (slower but can call smp_call_function). + */ +static LIST_HEAD(iucv_work_queue); + +/* + * The work element to deliver path pending interrupts. + */ +static void iucv_work_fn(struct work_struct *work); +static DECLARE_WORK(iucv_work, iucv_work_fn); + +/* + * Spinlock protecting task and work queue. + */ +static DEFINE_SPINLOCK(iucv_queue_lock); enum iucv_command_codes { IUCV_QUERY = 0, @@ -147,10 +170,10 @@ static unsigned long iucv_max_pathid; static DEFINE_SPINLOCK(iucv_table_lock); /* - * iucv_tasklet_cpu: contains the number of the cpu executing the tasklet. - * Needed for iucv_path_sever called from tasklet. + * iucv_active_cpu: contains the number of the cpu executing the tasklet + * or the work handler. Needed for iucv_path_sever called from tasklet. */ -static int iucv_tasklet_cpu = -1; +static int iucv_active_cpu = -1; /* * Mutex and wait queue for iucv_register/iucv_unregister. @@ -449,17 +472,19 @@ static void iucv_setmask_mp(void) { int cpu; + preempt_disable(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_on(iucv_allow_cpu, NULL, 0, 1, cpu); + preempt_enable(); } /** * iucv_setmask_up * - * Allow iucv interrupts on a single cpus. + * Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -493,8 +518,10 @@ static int iucv_enable(void) goto out; /* Declare per cpu buffers. */ rc = -EIO; + preempt_disable(); for_each_online_cpu(cpu) smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu); + preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ goto out_path; @@ -584,48 +611,49 @@ static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) return iucv_call_b2f0(IUCV_SEVER, parm); } +#ifdef CONFIG_SMP /** - * __iucv_cleanup_pathid + * __iucv_cleanup_queue * @dummy: unused dummy argument * * Nop function called via smp_call_function to force work items from * pending external iucv interrupts to the work queue. 
*/ -static void __iucv_cleanup_pathid(void *dummy) +static void __iucv_cleanup_queue(void *dummy) { } +#endif /** - * iucv_cleanup_pathid - * @pathid: 16 bit pathid + * iucv_cleanup_queue * * Function called after a path has been severed to find all remaining * work items for the now stale pathid. The caller needs to hold the * iucv_table_lock. */ -static void iucv_cleanup_pathid(u16 pathid) +static void iucv_cleanup_queue(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; /* - * Path is severed, the pathid can be reused immediatly on - * a iucv connect or a connection pending interrupt. - * iucv_path_connect and connection pending interrupt will - * wait until the iucv_table_lock is released before the - * recycled pathid enters the system. - * Force remaining interrupts to the work queue, then - * scan the work queue for items of this path. + * When a path is severed, the pathid can be reused immediatly + * on a iucv connect or a connection pending interrupt. Remove + * all entries from the task queue that refer to a stale pathid + * (iucv_path_table[ix] == NULL). Only then do the iucv connect + * or deliver the connection pending interrupt. To get all the + * pending interrupts force them to the work queue by calling + * an empty function on all cpus. */ - smp_call_function(__iucv_cleanup_pathid, NULL, 0, 1); - spin_lock_irq(&iucv_work_lock); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - /* Remove work items for pathid except connection pending */ - if (p->data.ippathid == pathid && p->data.iptype != 0x01) { + smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) { + /* Remove stale work items from the task queue. */ + if (iucv_path_table[p->data.ippathid] == NULL) { list_del(&p->list); kfree(p); } } - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); } /** @@ -684,7 +712,6 @@ void iucv_unregister(struct iucv_handler *handler, int smp) iucv_sever_pathid(p->pathid, NULL); iucv_path_table[p->pathid] = NULL; list_del(&p->list); - iucv_cleanup_pathid(p->pathid); iucv_path_free(p); } spin_unlock_bh(&iucv_table_lock); @@ -757,9 +784,9 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, union iucv_param *parm; int rc; - preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) - spin_lock_bh(&iucv_table_lock); + BUG_ON(in_atomic()); + spin_lock_bh(&iucv_table_lock); + iucv_cleanup_queue(); parm = percpu_ptr(iucv_param, smp_processor_id()); memset(parm, 0, sizeof(union iucv_param)); parm->ctrl.ipmsglim = path->msglim; @@ -794,9 +821,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, rc = -EIO; } } - if (iucv_tasklet_cpu != smp_processor_id()) - spin_unlock_bh(&iucv_table_lock); - preempt_enable(); + spin_unlock_bh(&iucv_table_lock); return rc; } @@ -867,15 +892,14 @@ int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_lock_bh(&iucv_table_lock); rc = iucv_sever_pathid(path->pathid, userdata); if (!rc) { iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); } - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_unlock_bh(&iucv_table_lock); preempt_enable(); return rc; @@ -1244,8 +1268,7 @@ static void iucv_path_complete(struct iucv_irq_data *data) struct iucv_path_complete *ipc = 
(void *) data; struct iucv_path *path = iucv_path_table[ipc->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_complete) + if (path && path->handler && path->handler->path_complete) path->handler->path_complete(path, ipc->ipuser); } @@ -1273,14 +1296,14 @@ static void iucv_path_severed(struct iucv_irq_data *data) struct iucv_path_severed *ips = (void *) data; struct iucv_path *path = iucv_path_table[ips->ippathid]; - BUG_ON(!path || !path->handler); + if (!path || !path->handler) /* Already severed */ + return; if (path->handler->path_severed) path->handler->path_severed(path, ips->ipuser); else { iucv_sever_pathid(path->pathid, NULL); iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); iucv_path_free(path); } } @@ -1309,8 +1332,7 @@ static void iucv_path_quiesced(struct iucv_irq_data *data) struct iucv_path_quiesced *ipq = (void *) data; struct iucv_path *path = iucv_path_table[ipq->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_quiesced) + if (path && path->handler && path->handler->path_quiesced) path->handler->path_quiesced(path, ipq->ipuser); } @@ -1338,8 +1360,7 @@ static void iucv_path_resumed(struct iucv_irq_data *data) struct iucv_path_resumed *ipr = (void *) data; struct iucv_path *path = iucv_path_table[ipr->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_resumed) + if (path && path->handler && path->handler->path_resumed) path->handler->path_resumed(path, ipr->ipuser); } @@ -1371,8 +1392,7 @@ static void iucv_message_complete(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imc->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_complete) { + if (path && path->handler && path->handler->message_complete) { msg.flags = imc->ipflags1; msg.id = imc->ipmsgid; msg.audit = imc->ipaudit; @@ -1417,8 +1437,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imp->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_pending) { + if (path && path->handler && path->handler->message_pending) { msg.flags = imp->ipflags1; msg.id = imp->ipmsgid; msg.class = imp->iptrgcls; @@ -1433,17 +1452,16 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /** - * iucv_tasklet_handler: + * iucv_tasklet_fn: * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler * and then frees the buffer. */ -static void iucv_tasklet_handler(unsigned long ignored) +static void iucv_tasklet_fn(unsigned long ignored) { typedef void iucv_irq_fn(struct iucv_irq_data *); static iucv_irq_fn *irq_fn[] = { - [0x01] = iucv_path_pending, [0x02] = iucv_path_complete, [0x03] = iucv_path_severed, [0x04] = iucv_path_quiesced, @@ -1453,38 +1471,70 @@ static void iucv_tasklet_handler(unsigned long ignored) [0x08] = iucv_message_pending, [0x09] = iucv_message_pending, }; - struct iucv_work *p; + struct list_head task_queue = LIST_HEAD_INIT(task_queue); + struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. 
*/ spin_lock(&iucv_table_lock); - iucv_tasklet_cpu = smp_processor_id(); + iucv_active_cpu = smp_processor_id(); - spin_lock_irq(&iucv_work_lock); - while (!list_empty(&iucv_work_queue)) { - p = list_entry(iucv_work_queue.next, struct iucv_work, list); + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_task_queue, &task_queue); + spin_unlock_irq(&iucv_queue_lock); + + list_for_each_entry_safe(p, n, &task_queue, list) { list_del_init(&p->list); - spin_unlock_irq(&iucv_work_lock); irq_fn[p->data.iptype](&p->data); kfree(p); - spin_lock_irq(&iucv_work_lock); } - spin_unlock_irq(&iucv_work_lock); - iucv_tasklet_cpu = -1; + iucv_active_cpu = -1; spin_unlock(&iucv_table_lock); } +/** + * iucv_work_fn: + * + * This work function loops over the queue of path pending irq blocks + * created by iucv_external_interrupt, calls the appropriate action + * handler and then frees the buffer. + */ +static void iucv_work_fn(struct work_struct *work) +{ + typedef void iucv_irq_fn(struct iucv_irq_data *); + struct list_head work_queue = LIST_HEAD_INIT(work_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ + spin_lock_bh(&iucv_table_lock); + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_work_queue, &work_queue); + spin_unlock_irq(&iucv_queue_lock); + + iucv_cleanup_queue(); + list_for_each_entry_safe(p, n, &work_queue, list) { + list_del_init(&p->list); + iucv_path_pending(&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock_bh(&iucv_table_lock); +} + /** * iucv_external_interrupt * @code: irq code * * Handles external interrupts coming in from CP. - * Places the interrupt buffer on a queue and schedules iucv_tasklet_handler(). + * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ static void iucv_external_interrupt(u16 code) { struct iucv_irq_data *p; - struct iucv_work *work; + struct iucv_irq_list *work; p = percpu_ptr(iucv_irq_data, smp_processor_id()); if (p->ippathid >= iucv_max_pathid) { @@ -1498,16 +1548,23 @@ static void iucv_external_interrupt(u16 code) printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n"); return; } - work = kmalloc(sizeof(struct iucv_work), GFP_ATOMIC); + work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { printk(KERN_WARNING "iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); - spin_lock(&iucv_work_lock); - list_add_tail(&work->list, &iucv_work_queue); - spin_unlock(&iucv_work_lock); - tasklet_schedule(&iucv_tasklet); + spin_lock(&iucv_queue_lock); + if (p->iptype == 0x01) { + /* Path pending interrupt. */ + list_add_tail(&work->list, &iucv_work_queue); + schedule_work(&iucv_work); + } else { + /* The other interrupts. 
*/ + list_add_tail(&work->list, &iucv_task_queue); + tasklet_schedule(&iucv_tasklet); + } + spin_unlock(&iucv_queue_lock); } /** @@ -1577,12 +1634,14 @@ out: */ static void iucv_exit(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; - spin_lock_irq(&iucv_work_lock); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) + kfree(p); list_for_each_entry_safe(p, n, &iucv_work_queue, list) kfree(p); - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); unregister_hotcpu_notifier(&iucv_cpu_notifier); percpu_free(iucv_param); percpu_free(iucv_irq_data); -- cgit v1.2.3-70-g09d2 From 34588b4c046c34773e5a1a962da7b78b05c4d1bd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:57:33 -0700 Subject: [TCP]: Catch skb with S+L bugs earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SACKED_ACKED and LOST are mutually exclusive with SACK, thus having their sum larger than packets_out is bug with SACK. Eventually these bugs trigger traps in the tcp_clean_rtx_queue with SACK but it's much more informative to do this here. Non-SACK TCP, however, could get more than packets_out duplicate ACKs which each increment sacked_out, so it makes sense to do this kind of limitting for non-SACK TCP but not for SACK enabled one. Perhaps the author had the opposite in mind but did the logic accidently wrong way around? Anyway, the sacked_out incrementer code for non-SACK already deals this issue before calling sync_left_out so this trapping can be done unconditionally. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a385797f160..c6ecd455eda 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -736,9 +736,7 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) static inline void tcp_sync_left_out(struct tcp_sock *tp) { - if (tp->rx_opt.sack_ok && - (tp->sacked_out >= tp->packets_out - tp->lost_out)) - tp->sacked_out = tp->packets_out - tp->lost_out; + BUG_ON(tp->sacked_out + tp->lost_out > tp->packets_out); tp->left_out = tp->sacked_out + tp->lost_out; } -- cgit v1.2.3-70-g09d2 From 157bfc25020f7eb731f94140e099307ade47299e Mon Sep 17 00:00:00 2001 From: Masahide NAKAMURA Date: Mon, 30 Apr 2007 00:33:35 -0700 Subject: [XFRM]: Restrict upper layer information by bundle. On MIPv6 usage, XFRM sub policy is enabled. When main (IPsec) and sub (MIPv6) policy selectors have the same address set but different upper layer information (i.e. protocol number and its ports or type/code), multiple bundle should be created. However, currently we have issue to use the same bundle created for the first time with all flows covered by the case. It is useful for the bundle to have the upper layer information to be restructured correctly if it does not match with the flow. 1. Bundle was created by two policies Selector from another policy is added to xfrm_dst. If the flow does not match the selector, it goes to slow path to restructure new bundle by single policy. 2. Bundle was created by one policy Flow cache is added to xfrm_dst as originated one. If the flow does not match the cache, it goes to slow path to try searching another policy. Signed-off-by: Masahide NAKAMURA Signed-off-by: David S. 
Miller --- include/net/flow.h | 6 ++++++ include/net/xfrm.h | 10 +++++++++ net/xfrm/xfrm_policy.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index ce4b10d8b41..f3cc1f81261 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -97,4 +97,10 @@ extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; +static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2) +{ + return (fl1->proto == fl2->proto && + !memcmp(&fl1->uli_u, &fl2->uli_u, sizeof(fl1->uli_u))); +} + #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9561bf817b0..66c2d3eec03 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -603,6 +603,10 @@ struct xfrm_dst struct rt6_info rt6; } u; struct dst_entry *route; +#ifdef CONFIG_XFRM_SUB_POLICY + struct flowi *origin; + struct xfrm_selector *partner; +#endif u32 genid; u32 route_mtu_cached; u32 child_mtu_cached; @@ -615,6 +619,12 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); +#ifdef CONFIG_XFRM_SUB_POLICY + kfree(xdst->origin); + xdst->origin = NULL; + kfree(xdst->partner); + xdst->partner = NULL; +#endif } extern void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dbf9d96a2f0..263e34e4526 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1344,6 +1344,40 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } +static int inline +xfrm_dst_alloc_copy(void **target, void *src, int size) +{ + if (!*target) { + *target = kmalloc(size, GFP_ATOMIC); + if (!*target) + return -ENOMEM; + } + memcpy(*target, src, size); + return 0; +} + +static int inline +xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->partner), + sel, sizeof(*sel)); +#else + return 0; +#endif +} + +static int inline +xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); +#else + return 0; +#endif +} static int stale_bundle(struct dst_entry *dst); @@ -1532,6 +1566,18 @@ restart: err = -EHOSTUNREACH; goto error; } + + if (npols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + write_unlock_bh(&policy->lock); + if (dst) + dst_free(dst); + goto error; + } + dst->next = policy->bundles; policy->bundles = dst; dst_hold(dst); @@ -1947,6 +1993,15 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; +#ifdef CONFIG_XFRM_SUB_POLICY + if (fl) { + if (first->origin && !flow_cache_uli_match(first->origin, fl)) + return 0; + if (first->partner && + !xfrm_selector_match(first->partner, fl, family)) + return 0; + } +#endif last = NULL; -- cgit v1.2.3-70-g09d2 From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:42:20 -0700 Subject: [TCP] FRTO: RFC4138 
allows Nagle override when new data must be sent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a corner case where less than MSS sized new data thingie is awaiting in the send queue. For F-RTO to work correctly, a new data segment must be sent at certain point or F-RTO cannot be used at all. RFC4138 allows overriding of Nagle at that point. Implementation uses frto_counter states 2 and 3 to distinguish when Nagle override is needed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_input.c | 13 ++++++++----- net/ipv4/tcp_output.c | 6 ++++-- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index c6ecd455eda..ef8f9d4dae8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1199,9 +1199,14 @@ static inline struct sk_buff *tcp_send_head(struct sock *sk) static inline void tcp_advance_send_head(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) sk->sk_send_head = NULL; + /* Don't override Nagle indefinately with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b669898b19..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. @@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } @@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!tcp_send_head(sk) || after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), + flag); return 1; } tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b5fa3c19afe..0faacf9c419 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; -- cgit v1.2.3-70-g09d2
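
The Nagle override added by the last patch above boils down to one extra condition in the send test: while frto_counter == 2, a pending segment smaller than an MSS may be sent immediately so that F-RTO gets the new data segment it needs. The following standalone C sketch is illustrative only, not the kernel code; struct frto_sock and nagle_allows_send are made-up stand-ins for the relevant struct tcp_sock fields and tcp_nagle_test, and the real test also considers MSS, cork and window state that are omitted here.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the few struct tcp_sock fields the Nagle
 * test looks at; the names mirror the kernel fields but this is not
 * the kernel structure. */
struct frto_sock {
	int frto_counter;   /* 0: off, 1-3: F-RTO active; 2 means a new data segment is still owed */
	bool urg_mode;      /* urgent data pending */
	bool fin;           /* this segment carries FIN */
	bool nagle_push;    /* caller asked for TCP_NAGLE_PUSH semantics */
};

/* Returns true when Nagle must not delay the segment.  The only change
 * relative to the pre-patch logic is the frto_counter == 2 test: per the
 * commit message, RFC 4138 allows F-RTO to push out one new data segment
 * even if it is smaller than an MSS. */
static bool nagle_allows_send(const struct frto_sock *tp)
{
	if (tp->nagle_push)
		return true;
	return tp->urg_mode || tp->frto_counter == 2 || tp->fin;
}

int main(void)
{
	struct frto_sock tp = { .frto_counter = 2 };

	/* During F-RTO state 2 a sub-MSS segment goes out immediately. */
	printf("send now: %s\n", nagle_allows_send(&tp) ? "yes" : "no");
	return 0;
}

Once that segment leaves the queue, the tcp_advance_send_head() hunk above moves frto_counter from 2 to 3, so the override applies to exactly one new send and ordinary Nagle behaviour resumes afterwards.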