powerpc: Implement accurate task and CPU time accounting

This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Paul Mackerras <paulus@samba.org> 2006-02-24 10:06:59 +1100
committer: Paul Mackerras <paulus@samba.org> 2006-02-24 14:05:56 +1100
commit: c6622f63db86fcbd41bf6fe05ddf2e00c1e51ced (patch)
tree: 102f3ea0a891212603a3722fece337d6a74d450c /arch/powerpc/kernel/time.c
parent: a00428f5b149e36b8225b2a0812742a6dfb07b8c (diff)
1 files changed, 234 insertions, 2 deletions
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2a7ddc57937..0b34db28916 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -51,6 +51,7 @@
 #include <linux/percpu.h>
 #include <linux/rtc.h>
 #include <linux/jiffies.h>
+#include <linux/posix-timers.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -135,6 +136,220 @@ unsigned long tb_last_stamp;
  */
 DEFINE_PER_CPU(unsigned long, last_jiffy);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/*
+ * Factors for converting from cputime_t (timebase ticks) to
+ * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * These are all stored as 0.64 fixed-point binary fractions.
+ */
+u64 __cputime_jiffies_factor;
+u64 __cputime_msec_factor;
+u64 __cputime_sec_factor;
+u64 __cputime_clockt_factor;
+
+static void calc_cputime_factors(void)
+{
+	struct div_result res;
+
+	div128_by_32(HZ, 0, tb_ticks_per_sec, &res);
+	__cputime_jiffies_factor = res.result_low;
+	div128_by_32(1000, 0, tb_ticks_per_sec, &res);
+	__cputime_msec_factor = res.result_low;
+	div128_by_32(1, 0, tb_ticks_per_sec, &res);
+	__cputime_sec_factor = res.result_low;
+	div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res);
+	__cputime_clockt_factor = res.result_low;
+}
+
+/*
+ * Read the PURR on systems that have it, otherwise the timebase.
+ */
+static u64 read_purr(void)
+{
+	if (cpu_has_feature(CPU_FTR_PURR))
+		return mfspr(SPRN_PURR);
+	return mftb();
+}
+
+/*
+ * Account time for a transition between system, hard irq
+ * or soft irq state.
+ */
+void account_system_vtime(struct task_struct *tsk)
+{
+	u64 now, delta;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	now = read_purr();
+	delta = now - get_paca()->startpurr;
+	get_paca()->startpurr = now;
+	if (!in_interrupt()) {
+		delta += get_paca()->system_time;
+		get_paca()->system_time = 0;
+	}
+	account_system_time(tsk, 0, delta);
+	local_irq_restore(flags);
+}
+
+/*
+ * Transfer the user and system times accumulated in the paca
+ * by the exception entry and exit code to the generic process
+ * user and system time records.
+ * Must be called with interrupts disabled.
+ */
+void account_process_vtime(struct task_struct *tsk)
+{
+	cputime_t utime;
+
+	utime = get_paca()->user_time;
+	get_paca()->user_time = 0;
+	account_user_time(tsk, utime);
+}
+
+static void account_process_time(struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	account_process_vtime(current);
+	run_local_timers();
+	if (rcu_pending(cpu))
+		rcu_check_callbacks(cpu, user_mode(regs));
+	scheduler_tick();
+ 	run_posix_cpu_timers(current);
+}
+
+#ifdef CONFIG_PPC_SPLPAR
+/*
+ * Stuff for accounting stolen time.
+ */
+struct cpu_purr_data {
+	int	initialized;			/* thread is running */
+	u64	tb0;			/* timebase at origin time */
+	u64	purr0;			/* PURR at origin time */
+	u64	tb;			/* last TB value read */
+	u64	purr;			/* last PURR value read */
+	u64	stolen;			/* stolen time so far */
+	spinlock_t lock;
+};
+
+static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data);
+
+static void snapshot_tb_and_purr(void *data)
+{
+	struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data);
+
+	p->tb0 = mftb();
+	p->purr0 = mfspr(SPRN_PURR);
+	p->tb = p->tb0;
+	p->purr = 0;
+	wmb();
+	p->initialized = 1;
+}
+
+/*
+ * Called during boot when all cpus have come up.
+ */
+void snapshot_timebases(void)
+{
+	int cpu;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	for_each_cpu(cpu)
+		spin_lock_init(&per_cpu(cpu_purr_data, cpu).lock);
+	on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1);
+}
+
+void calculate_steal_time(void)
+{
+	u64 tb, purr, t0;
+	s64 stolen;
+	struct cpu_purr_data *p0, *pme, *phim;
+	int cpu;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	cpu = smp_processor_id();
+	pme = &per_cpu(cpu_purr_data, cpu);
+	if (!pme->initialized)
+		return;		/* this can happen in early boot */
+	p0 = &per_cpu(cpu_purr_data, cpu & ~1);
+	phim = &per_cpu(cpu_purr_data, cpu ^ 1);
+	spin_lock(&p0->lock);
+	tb = mftb();
+	purr = mfspr(SPRN_PURR) - pme->purr0;
+	if (!phim->initialized || !cpu_online(cpu ^ 1)) {
+		stolen = (tb - pme->tb) - (purr - pme->purr);
+	} else {
+		t0 = pme->tb0;
+		if (phim->tb0 < t0)
+			t0 = phim->tb0;
+		stolen = phim->tb - t0 - phim->purr - purr - p0->stolen;
+	}
+	if (stolen > 0) {
+		account_steal_time(current, stolen);
+		p0->stolen += stolen;
+	}
+	pme->tb = tb;
+	pme->purr = purr;
+	spin_unlock(&p0->lock);
+}
+
+/*
+ * Must be called before the cpu is added to the online map when
+ * a cpu is being brought up at runtime.
+ */
+static void snapshot_purr(void)
+{
+	int cpu;
+	u64 purr;
+	struct cpu_purr_data *p0, *pme, *phim;
+	unsigned long flags;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	cpu = smp_processor_id();
+	pme = &per_cpu(cpu_purr_data, cpu);
+	p0 = &per_cpu(cpu_purr_data, cpu & ~1);
+	phim = &per_cpu(cpu_purr_data, cpu ^ 1);
+	spin_lock_irqsave(&p0->lock, flags);
+	pme->tb = pme->tb0 = mftb();
+	purr = mfspr(SPRN_PURR);
+	if (!phim->initialized) {
+		pme->purr = 0;
+		pme->purr0 = purr;
+	} else {
+		/* set p->purr and p->purr0 for no change in p0->stolen */
+		pme->purr = phim->tb - phim->tb0 - phim->purr - p0->stolen;
+		pme->purr0 = purr - pme->purr;
+	}
+	pme->initialized = 1;
+	spin_unlock_irqrestore(&p0->lock, flags);
+}
+
+#endif /* CONFIG_PPC_SPLPAR */
+
+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
+#define calc_cputime_factors()
+#define account_process_time(regs)	update_process_times(user_mode(regs))
+#define calculate_steal_time()		do { } while (0)
+#endif
+
+#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR))
+#define snapshot_purr()			do { } while (0)
+#endif
+
+/*
+ * Called when a cpu comes up after the system has finished booting,
+ * i.e. as a result of a hotplug cpu action.
+ */
+void snapshot_timebase(void)
+{
+	__get_cpu_var(last_jiffy) = get_tb();
+	snapshot_purr();
+}
+
 void __delay(unsigned long loops)
 {
 	unsigned long start;
@@ -382,6 +597,7 @@ static void iSeries_tb_recal(void)
 						new_tb_ticks_per_jiffy, sign, tick_diff );
 				tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
 				tb_ticks_per_sec   = new_tb_ticks_per_sec;
+				calc_cputime_factors();
 				div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
 				do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
 				tb_to_xs = divres.result_low;
@@ -430,6 +646,7 @@ void timer_interrupt(struct pt_regs * regs)
 	irq_enter();
 
 	profile_tick(CPU_PROFILING, regs);
+	calculate_steal_time();
 
 #ifdef CONFIG_PPC_ISERIES
 	get_lppaca()->int_dword.fields.decr_int = 0;
@@ -451,7 +668,7 @@ void timer_interrupt(struct pt_regs * regs)
 		 * is the case.
 		 */
 		if (!cpu_is_offline(cpu))
-			update_process_times(user_mode(regs));
+			account_process_time(regs);
 
 		/*
 		 * No need to check whether cpu is offline here; boot_cpuid
@@ -508,13 +725,27 @@ void wakeup_decrementer(void)
 void __init smp_space_timers(unsigned int max_cpus)
 {
 	int i;
+	unsigned long half = tb_ticks_per_jiffy / 2;
 	unsigned long offset = tb_ticks_per_jiffy / max_cpus;
 	unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid);
 
 	/* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */
 	previous_tb -= tb_ticks_per_jiffy;
+	/*
+	 * The stolen time calculation for POWER5 shared-processor LPAR
+	 * systems works better if the two threads' timebase interrupts
+	 * are staggered by half a jiffy with respect to each other.
+	 */
 	for_each_cpu(i) {
-		if (i != boot_cpuid) {
+		if (i == boot_cpuid)
+			continue;
+		if (i == (boot_cpuid ^ 1))
+			per_cpu(last_jiffy, i) =
+				per_cpu(last_jiffy, boot_cpuid) - half;
+		else if (i & 1)
+			per_cpu(last_jiffy, i) =
+				per_cpu(last_jiffy, i ^ 1) + half;
+		else {
 			previous_tb += offset;
 			per_cpu(last_jiffy, i) = previous_tb;
 		}
@@ -706,6 +937,7 @@ void __init time_init(void)
 	tb_ticks_per_sec = ppc_tb_freq;
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
 	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
+	calc_cputime_factors();
 
 	/*
 	 * Calculate the length of each tick in ns.  It will not be
author	Paul Mackerras <paulus@samba.org>	2006-02-24 10:06:59 +1100
committer	Paul Mackerras <paulus@samba.org>	2006-02-24 14:05:56 +1100
commit	c6622f63db86fcbd41bf6fe05ddf2e00c1e51ced (patch)
tree	102f3ea0a891212603a3722fece337d6a74d450c /arch/powerpc/kernel/time.c
parent	a00428f5b149e36b8225b2a0812742a6dfb07b8c (diff)