/**
 * @file op_model_xscale.c
 * XScale Performance Monitor Driver
 *
 * @remark Copyright 2000-2004 Deepak Saxena <dsaxena@mvista.com>
 * @remark Copyright 2000-2004 MontaVista Software Inc
 * @remark Copyright 2004 Dave Jiang <dave.jiang@intel.com>
 * @remark Copyright 2004 Intel Corporation
 * @remark Copyright 2004 Zwane Mwaikambo <zwane@arm.linux.org.uk>
 * @remark Copyright 2004 OProfile Authors
 *
 * @remark Read the file COPYING
 *
 * @author Zwane Mwaikambo
 */

/* #define DEBUG */
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/interrupt.h>
#include <linux/irq.h>

#include <asm/system.h>

#include "op_counter.h"
#include "op_arm_model.h"

#define	PMU_ENABLE	0x001	/* Enable counters */
#define PMN_RESET	0x002	/* Reset event counters */
#define	CCNT_RESET	0x004	/* Reset clock counter */
#define	PMU_RESET	(CCNT_RESET | PMN_RESET)
#define PMU_CNT64	0x008	/* Make CCNT count every 64th cycle */

/* TODO do runtime detection */
#ifdef CONFIG_ARCH_IOP32X
#define XSCALE_PMU_IRQ  IRQ_IOP32X_CORE_PMU
#endif
#ifdef CONFIG_ARCH_IOP33X
#define XSCALE_PMU_IRQ  IRQ_IOP33X_CORE_PMU
#endif
#ifdef CONFIG_ARCH_PXA
#define XSCALE_PMU_IRQ  IRQ_PMU
#endif

/*
 * Different types of events that can be counted by the XScale PMU
 * as used by Oprofile userspace. Here primarily for documentation
 * purposes.
 */

#define EVT_ICACHE_MISS			0x00
#define	EVT_ICACHE_NO_DELIVER		0x01
#define	EVT_DATA_STALL			0x02
#define	EVT_ITLB_MISS			0x03
#define	EVT_DTLB_MISS			0x04
#define	EVT_BRANCH			0x05
#define	EVT_BRANCH_MISS			0x06
#define	EVT_INSTRUCTION			0x07
#define	EVT_DCACHE_FULL_STALL		0x08
#define	EVT_DCACHE_FULL_STALL_CONTIG	0x09
#define	EVT_DCACHE_ACCESS		0x0A
#define	EVT_DCACHE_MISS			0x0B
#define	EVT_DCACE_WRITE_BACK		0x0C
#define	EVT_PC_CHANGED			0x0D
#define	EVT_BCU_REQUEST			0x10
#define	EVT_BCU_FULL			0x11
#define	EVT_BCU_DRAIN			0x12
#define	EVT_BCU_ECC_NO_ELOG		0x14
#define	EVT_BCU_1_BIT_ERR		0x15
#define	EVT_RMW				0x16
/* EVT_CCNT is not hardware defined */
#define EVT_CCNT			0xFE
#define EVT_UNUSED			0xFF

struct pmu_counter {
	volatile unsigned long ovf;
	unsigned long reset_counter;
};

enum { CCNT, PMN0, PMN1, PMN2, PMN3, MAX_COUNTERS };

static struct pmu_counter results[MAX_COUNTERS];

/*
 * There are two versions of the PMU in current XScale processors
 * with differing register layouts and number of performance counters.
 * e.g. IOP32x is xsc1 whilst IOP33x is xsc2.
 * We detect which register layout to use in xscale_detect_pmu()
 */
enum { PMU_XSC1, PMU_XSC2 };

struct pmu_type {
	int id;
	char *name;
	int num_counters;
	unsigned int int_enable;
	unsigned int cnt_ovf[MAX_COUNTERS];
	unsigned int int_mask[MAX_COUNTERS];
};

static struct pmu_type pmu_parms[] = {
	{
		.id		= PMU_XSC1,
		.name		= "arm/xscale1",
		.num_counters	= 3,
		.int_mask	= { [PMN0] = 0x10, [PMN1] = 0x20,
				    [CCNT] = 0x40 },
		.cnt_ovf	= { [CCNT] = 0x400, [PMN0] = 0x100,
				    [PMN1] = 0x200},
	},
	{
		.id		= PMU_XSC2,
		.name		= "arm/xscale2",
		.num_counters	= 5,
		.int_mask	= { [CCNT] = 0x01, [PMN0] = 0x02,
				    [PMN1] = 0x04, [PMN2] = 0x08,
				    [PMN3] = 0x10 },
		.cnt_ovf	= { [CCNT] = 0x01, [PMN0] = 0x02,
				    [PMN1] = 0x04, [PMN2] = 0x08,
				    [PMN3] = 0x10 },
	},
};

static struct pmu_type *pmu;

static void write_pmnc(u32 val)
{
	if (pmu->id == PMU_XSC1) {
		/* upper 4bits and 7, 11 are write-as-0 */
		val &= 0xffff77f;
		__asm__ __volatile__ ("mcr p14, 0, %0, c0, c0, 0" : : "r" (val));
	} else {
		/* bits 4-23 are write-as-0, 24-31 are write ignored */
		val &= 0xf;
		__asm__ __volatile__ ("mcr p14, 0, %0, c0, c1, 0" : : "r" (val));
	}
}

static u32 read_pmnc(void)
{
	u32 val;

	if (pmu->id == PMU_XSC1)
		__asm__ __volatile__ ("mrc p14, 0, %0, c0, c0, 0" : "=r" (val));
	else {
		__asm__ __volatile__ ("mrc p14, 0, %0, c0, c1, 0" : "=r" (val));
		/* bits 1-2 and 4-23 are read-unpredictable */
		val &= 0xff000009;
	}

	return val;
}

static u32 __xsc1_read_counter(int counter)
{
	u32 val = 0;

	switch (counter) {
	case CCNT:
		__asm__ __volatile__ ("mrc p14, 0, %0, c1, c0, 0" : "=r" (val));
		break;
	case PMN0:
		__asm__ __volatile__ ("mrc p14, 0, %0, c2, c0, 0" : "=r" (val));
		break;
	case PMN1:
		__asm__ __volatile__ ("mrc p14, 0, %0, c3, c0, 0" : "=r" (val));
		break;
	}
	return val;
}

static u32 __xsc2_read_counter(int counter)
{
	u32 val = 0;

	switch (counter) {
	case CCNT:
		__asm__ __volatile__ ("mrc p14, 0, %0, c1, c1, 0" : "=r" (val));
		break;
	case PMN0:
		__asm__ __volatile__ ("mrc p14, 0, %0, c0, c2, 0" : "=r" (val));
		break;
	case PMN1:
		__asm__ __volatile__ ("mrc p14, 0, %0, c1, c2, 0" : "=r" (val));
		break;
	case PMN2:
		__asm__ __volatile__ ("mrc p14, 0, %0, c2, c2, 0" : "=r" (val));
		break;
	case PMN3:
		__asm__ __volatile__ ("mrc p14, 0, %0, c3, c2, 0" : "=r" (val));
		break;
	}
	return val;
}

static u32 read_counter(int counter)
{
	u32 val;

	if (pmu->id == PMU_XSC1)
		val = __xsc1_read_counter(counter);
	else
		val = __xsc2_read_counter(counter);

	return val;
}

static void __xsc1_write_counter(int counter, u32 val)
{
	switch (counter) {
	case CCNT:
		__asm__ __volatile__ ("mcr p14, 0, %0, c1, c0, 0" : : "r" (val));
		break;
	case PMN0:
		__asm__ __volatile__ ("mcr p14, 0, %0, c2, c0, 0" : : "r" (val));
		break;
	case PMN1:
		__asm__ __volatile__ ("mcr p14, 0, %0, c3, c0, 0" : : "r" (val));
		break;
	}
}

static void __xsc2_write_counter(int counter, u32 val)
{
	switch (counter) {
	case CCNT:
		__asm__ __volatile__ ("mcr p14, 0, %0, c1, c1, 0" : : "r" (val));
		break;
	case PMN0:
		__asm__ __volatile__ ("mcr p14, 0, %0, c0, c2, 0" : : "r" (val));
		break;
	case PMN1:
		__asm__ __volatile__ ("mcr p14, 0, %0, c1, c2, 0" : : "r" (val));
		break;
	case PMN2:
		__asm__ __volatile__ ("mcr p14, 0, %0, c2, c2, 0" : : "r" (val));
		break;
	case PMN3:
		__asm__ __volatile__ ("mcr p14, 0, %0, c3, c2, 0" : : "r" (val));
		break;
	}
}

static void write_counter(int counter, u32 val)
{
	if (pmu->id == PMU_XSC1)
		__xsc1_write_counter(counter, val);
	else
		__xsc2_write_counter(counter, val);
}

static int xscale_setup_ctrs(void)
{
	u32 evtsel, pmnc;
	int i;

	for (i = CCNT; i < MAX_COUNTERS; i++) {
		if (counter_config[i].enabled)
			continue;

		counter_config[i].event = EVT_UNUSED;
	}

	switch (pmu->id) {
	case PMU_XSC1:
		pmnc = (counter_config[PMN1].event << 20) | (counter_config[PMN0].event << 12);
		pr_debug("xscale_setup_ctrs: pmnc: %#08x\n", pmnc);
		write_pmnc(pmnc);
		break;

	case PMU_XSC2:
		evtsel = counter_config[PMN0].event | (counter_config[PMN1].event << 8) |
			(counter_config[PMN2].event << 16) | (counter_config[PMN3].event << 24);

		pr_debug("xscale_setup_ctrs: evtsel %#08x\n", evtsel);
		__asm__ __volatile__ ("mcr p14, 0, %0, c8, c1, 0" : : "r" (evtsel));
		break;
	}

	for (i = CCNT; i < MAX_COUNTERS; i++) {
		if (counter_config[i].event == EVT_UNUSED) {
			counter_config[i].event = 0;
			pmu->int_enable &= ~pmu->int_mask[i];
			continue;
		}

		results[i].reset_counter = counter_config[i].count;
		write_counter(i, -(u32)counter_config[i].count);
		pmu->int_enable |= pmu->int_mask[i];
		pr_debug("xscale_setup_ctrs: counter%d %#08x from %#08lx\n", i,
			read_counter(i), counter_config[i].count);
	}

	return 0;
}

static void inline __xsc1_check_ctrs(void)
{
	int i;
	u32 pmnc = read_pmnc();

	/* NOTE: there's an A stepping errata that states if an overflow */
	/*       bit already exists and another occurs, the previous     */
	/*       Overflow bit gets cleared. There's no workaround.	 */
	/*	 Fixed in B stepping or later			 	 */

	/* Write the value back to clear the overflow flags. Overflow */
	/* flags remain in pmnc for use below */
	write_pmnc(pmnc & ~PMU_ENABLE);

	for (i = CCNT; i <= PMN1; i++) {
		if (!(pmu->int_mask[i] & pmu->int_enable))
			continue;

		if (pmnc & pmu->cnt_ovf[i])
			results[i].ovf++;
	}
}

static void inline __xsc2_check_ctrs(void)
{
	int i;
	u32 flag = 0, pmnc = read_pmnc();

	pmnc &= ~PMU_ENABLE;
	write_pmnc(pmnc);

	/* read overflow flag register */
	__asm__ __volatile__ ("mrc p14, 0, %0, c5, c1, 0" : "=r" (flag));

	for (i = CCNT; i <= PMN3; i++) {
		if (!(pmu->int_mask[i] & pmu->int_enable))
			continue;

		if (flag & pmu->cnt_ovf[i])
			results[i].ovf++;
	}

	/* writeback clears overflow bits */
	__asm__ __volatile__ ("mcr p14, 0, %0, c5, c1, 0" : : "r" (flag));
}

static irqreturn_t xscale_pmu_interrupt(int irq, void *arg)
{
	int i;
	u32 pmnc;

	if (pmu->id == PMU_XSC1)
		__xsc1_check_ctrs();
	else
		__xsc2_check_ctrs();

	for (i = CCNT; i < MAX_COUNTERS; i++) {
		if (!results[i].ovf)
			continue;

		write_counter(i, -(u32)results[i].reset_counter);
		oprofile_add_sample(get_irq_regs(), i);
		results[i].ovf--;
	}

	pmnc = read_pmnc() | PMU_ENABLE;
	write_pmnc(pmnc);

	return IRQ_HANDLED;
}

static void xscale_pmu_stop(void)
{
	u32 pmnc = read_pmnc();

	pmnc &= ~PMU_ENABLE;
	write_pmnc(pmnc);

	free_irq(XSCALE_PMU_IRQ, results);
}

static int xscale_pmu_start(void)
{
	int ret;
	u32 pmnc = read_pmnc();

	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED,
			"XScale PMU", (void *)results);

	if (ret < 0) {
		printk(KERN_ERR "oprofile: unable to request IRQ%d for XScale PMU\n",
			XSCALE_PMU_IRQ);
		return ret;
	}

	if (pmu->id == PMU_XSC1)
		pmnc |= pmu->int_enable;
	else {
		__asm__ __volatile__ ("mcr p14, 0, %0, c4, c1, 0" : : "r" (pmu->int_enable));
		pmnc &= ~PMU_CNT64;
	}

	pmnc |= PMU_ENABLE;
	write_pmnc(pmnc);
	pr_debug("xscale_pmu_start: pmnc: %#08x mask: %08x\n", pmnc, pmu->int_enable);
	return 0;
}

static int xscale_detect_pmu(void)
{
	int ret = 0;
	u32 id;

	id = (read_cpuid(CPUID_ID) >> 13) & 0x7;

	switch (id) {
	case 1:
		pmu = &pmu_parms[PMU_XSC1];
		break;
	case 2:
		pmu = &pmu_parms[PMU_XSC2];
		break;
	default:
		ret = -ENODEV;
		break;
	}

	if (!ret) {
		op_xscale_spec.name = pmu->name;
		op_xscale_spec.num_counters = pmu->num_counters;
		pr_debug("xscale_detect_pmu: detected %s PMU\n", pmu->name);
	}

	return ret;
}

struct op_arm_model_spec op_xscale_spec = {
	.init		= xscale_detect_pmu,
	.setup_ctrs	= xscale_setup_ctrs,
	.start		= xscale_pmu_start,
	.stop		= xscale_pmu_stop,
};