summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/aslr.c9
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c2
-rw-r--r--arch/x86/include/asm/asm.h7
-rw-r--r--arch/x86/include/asm/checksum_64.h9
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/qrwlock.h17
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/spinlock_types.h4
-rw-r--r--arch/x86/include/asm/traps.h3
-rw-r--r--arch/x86/include/asm/uprobes.h10
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/alternative.c3
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c21
-rw-r--r--arch/x86/kernel/apic/io_apic.c28
-rw-r--r--arch/x86/kernel/cpu/common.c4
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event.c21
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c5
-rw-r--r--arch/x86/kernel/dumpstack.c9
-rw-r--r--arch/x86/kernel/early-quirks.c46
-rw-r--r--arch/x86/kernel/entry_32.S43
-rw-r--r--arch/x86/kernel/entry_64.S225
-rw-r--r--arch/x86/kernel/ftrace.c56
-rw-r--r--arch/x86/kernel/hw_breakpoint.c5
-rw-r--r--arch/x86/kernel/i8259.c20
-rw-r--r--arch/x86/kernel/irq.c13
-rw-r--r--arch/x86/kernel/kprobes/core.c128
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c17
-rw-r--r--arch/x86/kernel/kprobes/opt.c32
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kernel/mcount_64.S217
-rw-r--r--arch/x86/kernel/nmi.c18
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/process_64.c7
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/traps.c140
-rw-r--r--arch/x86/kernel/uprobes.c505
-rw-r--r--arch/x86/kvm/svm.c1
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/lib/thunk_32.S3
-rw-r--r--arch/x86/lib/thunk_64.S3
-rw-r--r--arch/x86/mm/fault.c29
-rw-r--r--arch/x86/net/bpf_jit.S77
-rw-r--r--arch/x86/net/bpf_jit_comp.c1399
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile1
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_wdt.c72
-rw-r--r--arch/x86/vdso/Makefile58
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso-fakesections.c21
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S64
-rw-r--r--arch/x86/vdso/vdso.lds.S2
-rw-r--r--arch/x86/vdso/vdso2c.c92
-rw-r--r--arch/x86/vdso/vdso2c.h197
-rw-r--r--arch/x86/vdso/vdso32/vdso-fakesections.c1
-rw-r--r--arch/x86/vdso/vdsox32.lds.S2
-rw-r--r--arch/x86/vdso/vma.c4
-rw-r--r--arch/x86/xen/enlighten.c5
-rw-r--r--arch/x86/xen/setup.c60
-rw-r--r--arch/x86/xen/xen-ops.h1
64 files changed, 2194 insertions, 1582 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b660088c220..a8f749ef0fd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,6 +121,7 @@ config X86
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
select OLD_SIGACTION if X86_32
select COMPAT_OLD_SIGACTION if IA32_EMULATION
@@ -1671,7 +1672,6 @@ config RELOCATABLE
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image"
depends on RELOCATABLE
- depends on !HIBERNATION
default n
---help---
Randomizes the physical and virtual address at which the
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 4dbf967da50..fc6091abedb 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -289,10 +289,17 @@ unsigned char *choose_kernel_location(unsigned char *input,
unsigned long choice = (unsigned long)output;
unsigned long random;
+#ifdef CONFIG_HIBERNATION
+ if (!cmdline_find_option_bool("kaslr")) {
+ debug_putstr("KASLR disabled by default...\n");
+ goto out;
+ }
+#else
if (cmdline_find_option_bool("nokaslr")) {
- debug_putstr("KASLR disabled...\n");
+ debug_putstr("KASLR disabled by cmdline...\n");
goto out;
}
+#endif
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10293f..8626b03e83b 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 4582e8e1cd1..7730c1c5c83 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -57,6 +57,12 @@
.long (from) - . ; \
.long (to) - . + 0x7ffffff0 ; \
.popsection
+
+# define _ASM_NOKPROBE(entry) \
+ .pushsection "_kprobe_blacklist","aw" ; \
+ _ASM_ALIGN ; \
+ _ASM_PTR (entry); \
+ .popsection
#else
# define _ASM_EXTABLE(from,to) \
" .pushsection \"__ex_table\",\"a\"\n" \
@@ -71,6 +77,7 @@
" .long (" #from ") - .\n" \
" .long (" #to ") - . + 0x7ffffff0\n" \
" .popsection\n"
+/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index e6fd8a026c7..cd00e177449 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -184,8 +184,15 @@ static inline unsigned add32_with_carry(unsigned a, unsigned b)
asm("addl %2,%0\n\t"
"adcl $0,%0"
: "=r" (a)
- : "0" (a), "r" (b));
+ : "0" (a), "rm" (b));
return a;
}
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+ return (__force __wsum)add32_with_carry((__force unsigned)csum,
+ (__force unsigned)addend);
+}
+
#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034c..a80cbb88ea9 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9454c167629..53cdfb2857a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -116,4 +116,6 @@ struct kprobe_ctlblk {
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
+extern int kprobe_int3_handler(struct pt_regs *regs);
+extern int kprobe_debug_handler(struct pt_regs *regs);
#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49314155b66..49205d01b9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 8
+#define KVM_NR_VAR_MTRR 10
#define ASYNC_PF_PER_VCPU 64
@@ -461,7 +461,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14fd6fd75a1..6205f0c434d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
new file mode 100644
index 00000000000..70f46f07f94
--- /dev/null
+++ b/arch/x86/include/asm/qrwlock.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_QRWLOCK_H
+#define _ASM_X86_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+
+#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+#define queue_write_unlock queue_write_unlock
+static inline void queue_write_unlock(struct qrwlock *lock)
+{
+ barrier();
+ ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
+}
+#endif
+
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 0f62f5482d9..54f1c8068c0 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -187,6 +187,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
cpu_relax();
}
+#ifndef CONFIG_QUEUE_RWLOCK
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
@@ -269,6 +270,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
: "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
}
+#else
+#include <asm/qrwlock.h>
+#endif /* CONFIG_QUEUE_RWLOCK */
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 4f1bea19945..73c4c007200 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -34,6 +34,10 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
+#ifdef CONFIG_QUEUE_RWLOCK
+#include <asm-generic/qrwlock_types.h>
+#else
#include <asm/rwlock.h>
+#endif
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 8ba18842c48..bc8352e7010 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -68,7 +68,7 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
+asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
@@ -103,7 +103,6 @@ static inline int get_si_code(unsigned long condition)
extern int panic_on_unrecovered_nmi;
-void math_error(struct pt_regs *, int, int);
void math_emulate(struct math_emu_info *);
#ifndef CONFIG_X86_32
asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 93bee7b9385..74f4c2ff642 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -41,18 +41,18 @@ struct arch_uprobe {
u8 ixol[MAX_UINSN_BYTES];
};
- u16 fixups;
const struct uprobe_xol_ops *ops;
union {
-#ifdef CONFIG_X86_64
- unsigned long rip_rela_target_address;
-#endif
struct {
s32 offs;
u8 ilen;
u8 opc1;
- } branch;
+ } branch;
+ struct {
+ u8 fixups;
+ u8 ilen;
+ } defparam;
};
};
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 491ef3e5985..047f9ff2e36 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df94598ad05..703130f469e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,7 +5,6 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
-#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
@@ -551,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index eab67047dec..6a1e71bde32 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,34 +33,44 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
+ put_cpu();
}
-static int __kprobes
+static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
int cpu;
@@ -80,6 +90,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE;
}
+NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
static int __init register_trigger_all_cpu_backtrace(void)
{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9d0a9795a0f..81e08eff05e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2297,7 +2297,7 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
int err;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
if (!cpumask_intersects(mask, cpu_online_mask))
return -EINVAL;
@@ -2328,7 +2328,7 @@ int native_ioapic_set_affinity(struct irq_data *data,
int ret;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
raw_spin_lock_irqsave(&ioapic_lock, flags);
ret = __ioapic_set_affinity(data, mask, &dest);
@@ -3001,9 +3001,11 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
__get_cached_msi_msg(data->msi_desc, &msg);
@@ -3100,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_cfg *cfg = data->chip_data;
unsigned int dest, irq = data->irq;
struct msi_msg msg;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
dmar_msi_read(irq, &msg);
@@ -3149,9 +3153,11 @@ static int hpet_msi_set_affinity(struct irq_data *data,
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
hpet_msi_read(data->handler_data, &msg);
@@ -3218,9 +3224,11 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
target_ht_irq(data->irq, dest, cfg->vector);
return IRQ_SET_MASK_OK_NOCOPY;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2cbbf88d8f2..ef1b93f18ed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
@@ -1193,6 +1194,7 @@ int is_debug_stack(unsigned long addr)
(addr <= __get_cpu_var(debug_stack_addr) &&
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+NOKPROBE_SYMBOL(is_debug_stack);
DEFINE_PER_CPU(u32, debug_idt_ctr);
@@ -1201,6 +1203,7 @@ void debug_stack_set_zero(void)
this_cpu_inc(debug_idt_ctr);
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
void debug_stack_reset(void)
{
@@ -1209,6 +1212,7 @@ void debug_stack_reset(void)
if (this_cpu_dec_return(debug_idt_ctr) == 0)
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_reset);
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 76f98fe5b35..a450373e8e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -132,15 +132,6 @@ static void __init ms_hyperv_init_platform(void)
lapic_timer_frequency = hv_lapic_frequency;
printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
-
- /*
- * On Hyper-V, when we are booting off an EFI firmware stack,
- * we do not have many legacy devices including PIC, PIT etc.
- */
- if (efi_enabled(EFI_BOOT)) {
- printk(KERN_INFO "HyperV: Using null_legacy_pic\n");
- legacy_pic = &null_legacy_pic;
- }
}
#endif
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 89f3b7c1af2..2bdfbff8a4f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -303,15 +303,6 @@ int x86_setup_perfctr(struct perf_event *event)
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
}
if (attr->type == PERF_TYPE_RAW)
@@ -1293,7 +1284,7 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
-static int __kprobes
+static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
u64 start_clock;
@@ -1311,6 +1302,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return ret;
}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
@@ -1366,6 +1358,15 @@ static void __init pmu_check_apic(void)
x86_pmu.apic = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
}
static struct attribute_group x86_pmu_format_group = {
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 4c36bbe3173..cbb1be3ed9e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -593,7 +593,7 @@ out:
return 1;
}
-static int __kprobes
+static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
@@ -606,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return handled;
}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d82d155aca8..9dd2459a4c7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -384,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -678,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
@@ -689,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
/* core */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d9c12d3022a..b74ebc7c440 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -200,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
@@ -223,8 +223,9 @@ unsigned __kprobes long oops_begin(void)
return flags;
}
EXPORT_SYMBOL_GPL(oops_begin);
+NOKPROBE_SYMBOL(oops_begin);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
@@ -247,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception");
do_exit(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+int __die(const char *str, struct pt_regs *regs, long err)
{
#ifdef CONFIG_X86_32
unsigned short ss;
@@ -291,6 +293,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#endif
return 0;
}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6cda0baeac9..2e1a6853e00 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -419,7 +419,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)
return gmch_ctrl << 25; /* 32 MB units */
}
-static size_t gen8_stolen_size(int num, int slot, int func)
+static size_t __init gen8_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
@@ -429,48 +429,73 @@ static size_t gen8_stolen_size(int num, int slot, int func)
return gmch_ctrl << 25; /* 32 MB units */
}
+static size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+ * 0x17 to 0x1d: 4MB increments start at 36MB
+ */
+ if (gmch_ctrl < 0x11)
+ return gmch_ctrl << 25;
+ else if (gmch_ctrl < 0x17)
+ return (gmch_ctrl - 0x11 + 2) << 22;
+ else
+ return (gmch_ctrl - 0x17 + 9) << 22;
+}
struct intel_stolen_funcs {
size_t (*size)(int num, int slot, int func);
u32 (*base)(int num, int slot, int func, size_t size);
};
-static const struct intel_stolen_funcs i830_stolen_funcs = {
+static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
.base = i830_stolen_base,
.size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i845_stolen_funcs = {
+static const struct intel_stolen_funcs i845_stolen_funcs __initconst = {
.base = i845_stolen_base,
.size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i85x_stolen_funcs = {
+static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = {
.base = i85x_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs i865_stolen_funcs = {
+static const struct intel_stolen_funcs i865_stolen_funcs __initconst = {
.base = i865_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen3_stolen_funcs = {
+static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen6_stolen_funcs = {
+static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen6_stolen_size,
};
-static const struct intel_stolen_funcs gen8_stolen_funcs = {
+static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = gen8_stolen_size,
};
-static struct pci_device_id intel_stolen_ids[] __initdata = {
+static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = chv_stolen_size,
+};
+
+static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_I830_IDS(&i830_stolen_funcs),
INTEL_I845G_IDS(&i845_stolen_funcs),
INTEL_I85X_IDS(&i85x_stolen_funcs),
@@ -496,7 +521,8 @@ static struct pci_device_id intel_stolen_ids[] __initdata = {
INTEL_HSW_D_IDS(&gen6_stolen_funcs),
INTEL_HSW_M_IDS(&gen6_stolen_funcs),
INTEL_BDW_M_IDS(&gen8_stolen_funcs),
- INTEL_BDW_D_IDS(&gen8_stolen_funcs)
+ INTEL_BDW_D_IDS(&gen8_stolen_funcs),
+ INTEL_CHV_IDS(&chv_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 98313ffaae6..dbaa23e78b3 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread)
ENDPROC(ret_from_kernel_thread)
/*
- * Interrupt exit functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
@@ -372,10 +368,6 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -431,9 +423,10 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
+sysenter_after_call:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
@@ -495,10 +488,6 @@ sysexit_audit:
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -687,13 +676,14 @@ END(syscall_fault)
syscall_badsys:
movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ jmp syscall_exit
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,PT_EAX(%esp)
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
.macro FIXUP_ESPFIX_STACK
/*
@@ -784,10 +774,6 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-/*
- * Irq entries should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
@@ -964,10 +950,6 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
- .popsection
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
@@ -1242,11 +1224,6 @@ return_to_handler:
jmp *%ecx
#endif
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
RING0_EC_FRAME
@@ -1460,7 +1437,3 @@ ENTRY(async_page_fault)
END(async_page_fault)
#endif
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 96987987c5d..b25ca969edd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,7 +53,6 @@
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
-#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
@@ -70,209 +69,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_FUNCTION_TRACER
-
-#ifdef CC_USING_FENTRY
-# define function_hook __fentry__
-#else
-# define function_hook mcount
-#endif
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(function_hook)
- retq
-END(function_hook)
-
-/* skip is set if stack has been adjusted */
-.macro ftrace_caller_setup skip=0
- MCOUNT_SAVE_FRAME \skip
-
- /* Load the ftrace_ops into the 3rd parameter */
- movq function_trace_op(%rip), %rdx
-
- /* Load ip into the first parameter */
- movq RIP(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
- /* Load the parent_ip into the second parameter */
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
-.endm
-
-ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- ftrace_caller_setup
- /* regs go into 4th parameter (but make it NULL) */
- movq $0, %rcx
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-ftrace_return:
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- /* Save the current flags before compare (in SS location)*/
- pushfq
-
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
- /* skip=8 to skip flags saved in SS */
- ftrace_caller_setup 8
-
- /* Save the rest of pt_regs */
- movq %r15, R15(%rsp)
- movq %r14, R14(%rsp)
- movq %r13, R13(%rsp)
- movq %r12, R12(%rsp)
- movq %r11, R11(%rsp)
- movq %r10, R10(%rsp)
- movq %rbp, RBP(%rsp)
- movq %rbx, RBX(%rsp)
- /* Copy saved flags */
- movq SS(%rsp), %rcx
- movq %rcx, EFLAGS(%rsp)
- /* Kernel segments */
- movq $__KERNEL_DS, %rcx
- movq %rcx, SS(%rsp)
- movq $__KERNEL_CS, %rcx
- movq %rcx, CS(%rsp)
- /* Stack - skipping return address */
- leaq SS+16(%rsp), %rcx
- movq %rcx, RSP(%rsp)
-
- /* regs go into 4th parameter */
- leaq (%rsp), %rcx
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- /* Copy flags back to SS, to restore them */
- movq EFLAGS(%rsp), %rax
- movq %rax, SS(%rsp)
-
- /* Handlers can change the RIP */
- movq RIP(%rsp), %rax
- movq %rax, SS+8(%rsp)
-
- /* restore the rest of pt_regs */
- movq R15(%rsp), %r15
- movq R14(%rsp), %r14
- movq R13(%rsp), %r13
- movq R12(%rsp), %r12
- movq R10(%rsp), %r10
- movq RBP(%rsp), %rbp
- movq RBX(%rsp), %rbx
-
- /* skip=8 to skip flags saved in SS */
- MCOUNT_RESTORE_FRAME 8
-
- /* Restore flags */
- popfq
-
- jmp ftrace_return
-ftrace_restore_flags:
- popfq
- jmp ftrace_stub
-
-END(ftrace_regs_caller)
-
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq RIP(%rsp), %rdi
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(function_hook)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- MCOUNT_SAVE_FRAME
-
-#ifdef CC_USING_FENTRY
- leaq SS+16(%rsp), %rdi
- movq $0, %rdx /* No framepointers needed */
-#else
- leaq 8(%rbp), %rdi
- movq (%rbp), %rdx
-#endif
- movq RIP(%rsp), %rsi
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
-
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
@@ -488,8 +284,6 @@ ENDPROC(native_usergs_sysret64)
TRACE_IRQS_OFF
.endm
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -518,7 +312,6 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
- .popsection
/*
* A newly forked process directly context switches into this address.
@@ -976,10 +769,6 @@ END(interrupt)
call \func
.endm
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -1187,11 +976,6 @@ END(__do_double_fault)
#endif
/*
- * End of kprobes section
- */
- .popsection
-
-/*
* APIC interrupts.
*/
.macro apicinterrupt3 num sym do_sym
@@ -1525,11 +1309,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
@@ -1946,7 +1725,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 52819e816f8..cbc4a91b131 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -297,16 +297,7 @@ int ftrace_int3_handler(struct pt_regs *regs)
static int ftrace_write(unsigned long ip, const char *val, int size)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
+ ip = text_ip_addr(ip);
if (probe_kernel_write((void *)ip, val, size))
return -EPERM;
@@ -349,40 +340,14 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
-/*
- * If the record has the FTRACE_FL_REGS set, that means that it
- * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
- * is not not set, then it wants to convert to the normal callback.
- */
-static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
-/*
- * The FTRACE_FL_REGS_EN is set when the record already points to
- * a function that saves all the regs. Basically the '_EN' version
- * represents the current state of the function.
- */
-static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS_EN)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
- ret = ftrace_test_record(rec, enable);
+ ftrace_addr = ftrace_get_addr_curr(rec);
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(rec, enable);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -392,10 +357,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
/* converting nop to call */
return add_brk_on_nop(rec);
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
- ftrace_addr = get_ftrace_old_addr(rec);
- /* fall through */
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
@@ -440,14 +402,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
goto update;
/* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = get_ftrace_old_addr(rec);
+ ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
@@ -491,13 +453,12 @@ static int add_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -538,13 +499,12 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -621,8 +581,8 @@ void ftrace_replace_code(int enable)
return;
remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
ftrace_bug(ret, rec ? rec->ip : 0);
- printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
/*
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a67b47c3131..5f9cf20cdb6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,7 +32,6 @@
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
-#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
@@ -424,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
* NOTIFY_STOP returned for all other cases
*
*/
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
@@ -511,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
/*
* Handle debug exception notifications.
*/
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 2e977b5d61d..8af817105e2 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -299,13 +299,31 @@ static void unmask_8259A(void)
static void init_8259A(int auto_eoi)
{
unsigned long flags;
+ unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned char new_val;
i8259A_auto_eoi = auto_eoi;
raw_spin_lock_irqsave(&i8259A_lock, flags);
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ /*
+ * Check to see if we have a PIC.
+ * Mask all except the cascade and read
+ * back the value we just wrote. If we don't
+ * have a PIC, we will read 0xff as opposed to the
+ * value we wrote.
+ */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+ }
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 11ccfb0a63e..922d2858102 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -365,6 +365,7 @@ void fixup_irqs(void)
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
+ int ret;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -403,10 +404,14 @@ void fixup_irqs(void)
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(data, affinity, true);
- else if (!(warned++))
- set_affinity = 0;
+ if (chip->irq_set_affinity) {
+ ret = chip->irq_set_affinity(data, affinity, true);
+ if (ret == -ENOSPC)
+ pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
+ } else {
+ if (!(warned++))
+ set_affinity = 0;
+ }
/*
* We unmask if the irq was not marked masked by the
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 61b17dc2c27..7596df66490 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+static nokprobe_inline void
+__synthesize_relative_insn(void *from, void *to, u8 op)
{
struct __arch_relative_insn {
u8 op;
@@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void __kprobes synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
-void __kprobes synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_relcall);
/*
* Skip the prefixes of the instruction.
*/
-static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
{
insn_attr_t attr;
@@ -154,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
#endif
return insn;
}
+NOKPROBE_SYMBOL(skip_prefixes);
/*
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -260,7 +264,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add
}
/* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
+static int can_probe(unsigned long paddr)
{
unsigned long addr, __addr, offset = 0;
struct insn insn;
@@ -299,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr)
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+static int is_IF_modifier(kprobe_opcode_t *insn)
{
/* Skip prefixes */
insn = skip_prefixes(insn);
@@ -322,7 +326,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
* If not, return null.
* Only applicable to 64-bit x86.
*/
-int __kprobes __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -365,7 +369,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
return insn.length;
}
-static int __kprobes arch_copy_kprobe(struct kprobe *p)
+static int arch_copy_kprobe(struct kprobe *p)
{
int ret;
@@ -392,7 +396,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p)
return 0;
}
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
{
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -407,17 +411,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return arch_copy_kprobe(p);
}
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
@@ -425,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
}
}
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
@@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
}
-static void __kprobes clear_btf(void)
+static nokprobe_inline void clear_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -461,7 +468,7 @@ static void __kprobes clear_btf(void)
}
}
-static void __kprobes restore_btf(void)
+static nokprobe_inline void restore_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -471,8 +478,7 @@ static void __kprobes restore_btf(void)
}
}
-void __kprobes
-arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
@@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-static void __kprobes
-setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
{
if (setup_detour_execution(p, regs, reenter))
return;
@@ -519,22 +526,24 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
else
regs->ip = (unsigned long)p->ainsn.insn;
}
+NOKPROBE_SYMBOL(setup_singlestep);
/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
*/
-static int __kprobes
-reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
kprobes_inc_nmissed_count(p);
setup_singlestep(p, regs, kcb, 1);
break;
- case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
/* A probe has been hit in the codepath leading up to, or just
* after, single-stepping of a probed instruction. This entire
* codepath should strictly reside in .kprobes.text section.
@@ -553,12 +562,13 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
return 1;
}
+NOKPROBE_SYMBOL(reenter_kprobe);
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes kprobe_handler(struct pt_regs *regs)
+int kprobe_int3_handler(struct pt_regs *regs)
{
kprobe_opcode_t *addr;
struct kprobe *p;
@@ -621,12 +631,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
preempt_enable_no_resched();
return 0;
}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
*/
-static void __used __kprobes kretprobe_trampoline_holder(void)
+static void __used kretprobe_trampoline_holder(void)
{
asm volatile (
".global kretprobe_trampoline\n"
@@ -657,11 +668,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
#endif
" ret\n");
}
+NOKPROBE_SYMBOL(kretprobe_trampoline_holder);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
/*
* Called from kretprobe_trampoline
*/
-__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
+__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
@@ -747,6 +760,7 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
}
return (void *)orig_ret_address;
}
+NOKPROBE_SYMBOL(trampoline_handler);
/*
* Called after single-stepping. p->addr is the address of the
@@ -775,8 +789,8 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
-static void __kprobes
-resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -851,12 +865,13 @@ resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
no_change:
restore_btf();
}
+NOKPROBE_SYMBOL(resume_execution);
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+int kprobe_debug_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -891,8 +906,9 @@ out:
return 1;
}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -949,12 +965,13 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
return 0;
}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
/*
* Wrapper routine for handling exceptions.
*/
-int __kprobes
-kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
{
struct die_args *args = data;
int ret = NOTIFY_DONE;
@@ -962,22 +979,7 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (args->regs && user_mode_vm(args->regs))
return ret;
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs)) {
- /*
- * Reset the BS bit in dr6 (pointed by args->err) to
- * denote completion of processing
- */
- (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
- ret = NOTIFY_STOP;
- }
- break;
- case DIE_GPF:
+ if (val == DIE_GPF) {
/*
* To be potentially processing a kprobe fault and to
* trust the result from kprobe_running(), we have
@@ -986,14 +988,12 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (!preemptible() && kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
ret = NOTIFY_STOP;
- break;
- default:
- break;
}
return ret;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
@@ -1017,8 +1017,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->ip = (unsigned long)(jp->entry);
return 1;
}
+NOKPROBE_SYMBOL(setjmp_pre_handler);
-void __kprobes jprobe_return(void)
+void jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -1034,8 +1035,10 @@ void __kprobes jprobe_return(void)
" nop \n"::"b"
(kcb->jprobe_saved_sp):"memory");
}
+NOKPROBE_SYMBOL(jprobe_return);
+NOKPROBE_SYMBOL(jprobe_return_end);
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
@@ -1063,13 +1066,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(longjmp_break_handler);
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end);
+}
int __init arch_init_kprobes(void)
{
return 0;
}
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
{
return 0;
}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 23ef5c556f0..717b02a22e6 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,8 +25,9 @@
#include "common.h"
-static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
/*
* Emulate singlestep (and also recover regs->ip)
@@ -41,18 +42,19 @@ static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
return 1;
}
-int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
if (kprobe_ftrace(p))
return __skip_singlestep(p, regs, kcb);
else
return 0;
}
+NOKPROBE_SYMBOL(skip_singlestep);
/* Ftrace callback handler for kprobes */
-void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
{
struct kprobe *p;
struct kprobe_ctlblk *kcb;
@@ -84,8 +86,9 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
end:
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
-int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
{
p->ainsn.insn = NULL;
p->ainsn.boostable = -1;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 898160b42e4..f304773285a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -77,7 +77,7 @@ found:
}
/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
*addr++ = 0x48;
@@ -138,7 +138,8 @@ asm (
#define INT3_SIZE sizeof(kprobe_opcode_t)
/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
unsigned long flags;
@@ -168,8 +169,9 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(optimized_callback);
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+static int copy_optimized_instructions(u8 *dest, u8 *src)
{
int len = 0, ret;
@@ -189,7 +191,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
+static int insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -224,7 +226,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
}
/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
+static int can_optimize(unsigned long paddr)
{
unsigned long addr, size = 0, offset = 0;
struct insn insn;
@@ -275,7 +277,7 @@ static int __kprobes can_optimize(unsigned long paddr)
}
/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
int i;
struct kprobe *p;
@@ -290,15 +292,15 @@ int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
}
/* Check the addr is within the optimized instructions. */
-int __kprobes
-arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
{
return ((unsigned long)op->kp.addr <= addr &&
(unsigned long)op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
-static __kprobes
+static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
if (op->optinsn.insn) {
@@ -308,7 +310,7 @@ void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
}
}
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
__arch_remove_optimized_kprobe(op, 1);
}
@@ -318,7 +320,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
* Target instructions MUST be relocatable (checked inside)
* This is called when new aggr(opt)probe is allocated or reused.
*/
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
{
u8 *buf;
int ret;
@@ -372,7 +374,7 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
* Replace breakpoints (int3) with relative jumps.
* Caller must call with locking kprobe_mutex and text_mutex.
*/
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -398,7 +400,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
}
/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -424,8 +426,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
}
}
-int __kprobes
-setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
struct optimized_kprobe *op;
@@ -441,3 +442,4 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
}
return 0;
}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7e97371387f..3dd8e2c4d74 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -251,8 +251,9 @@ u32 kvm_read_and_reset_pf_reason(void)
return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void __kprobes
+dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
enum ctx_state prev_state;
@@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
break;
}
}
+NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
new file mode 100644
index 00000000000..c050a015316
--- /dev/null
+++ b/arch/x86/kernel/mcount_64.S
@@ -0,0 +1,217 @@
+/*
+ * linux/arch/x86_64/mcount_64.S
+ *
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+
+
+ .code64
+ .section .entry.text, "ax"
+
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(function_hook)
+ retq
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
+
+ENTRY(ftrace_caller)
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ MCOUNT_RESTORE_FRAME
+ftrace_return:
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+ftrace_restore_flags:
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ MCOUNT_SAVE_FRAME
+
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ MCOUNT_SAVE_FRAME
+
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ leaq 8(%rbp), %rdi
+ movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ jmp *%rdi
+#endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index b4872b999a7..c3e985d1751 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
a->handler, whole_msecs, decimal_msecs);
}
-static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -146,6 +146,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
/* return total number of NMI events handled */
return handled;
}
+NOKPROBE_SYMBOL(nmi_handle);
int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
@@ -208,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static __kprobes void
+static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
@@ -238,8 +239,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(pci_serr_error);
-static __kprobes void
+static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -269,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(io_check_error);
-static __kprobes void
+static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -298,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Dazed and confused, but trying to continue\n");
}
+NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static __kprobes void default_do_nmi(struct pt_regs *regs)
+static void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -401,6 +405,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
else
unknown_nmi_error(reason, regs);
}
+NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can hit breakpoints which will cause it to lose its
@@ -520,7 +525,7 @@ static inline void nmi_nesting_postprocess(void)
}
#endif
-dotraplinkage notrace __kprobes void
+dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
@@ -537,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code)
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
+NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b10af835c3..548d25f00c9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -23,6 +23,7 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -389,6 +390,11 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.end_context_switch = paravirt_nop,
};
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 898d077617a..ca5b02d405c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -413,12 +413,11 @@ void set_personality_ia32(bool x32)
set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
- if (current->mm)
- current->mm->context.ia32_compat = 1;
-
if (x32) {
clear_thread_flag(TIF_IA32);
set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
/* is_compat_task() uses the presence of the x32
syscall bit flag to determine compat status */
@@ -426,6 +425,8 @@ void set_personality_ia32(bool x32)
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
current_thread_info()->status |= TS_COMPAT;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a0da58db43a..2851d63c120 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f73b5d435bd..0d0e922fafc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -106,7 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
-static int __kprobes
+static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
{
@@ -136,7 +137,38 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
return -1;
}
-static void __kprobes
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+ siginfo_t *info)
+{
+ unsigned long siaddr;
+ int sicode;
+
+ switch (trapnr) {
+ default:
+ return SEND_SIG_PRIV;
+
+ case X86_TRAP_DE:
+ sicode = FPE_INTDIV;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_UD:
+ sicode = ILL_ILLOPN;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_AC:
+ sicode = BUS_ADRALN;
+ siaddr = 0;
+ break;
+ }
+
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = sicode;
+ info->si_addr = (void __user *)siaddr;
+ return info;
+}
+
+static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
{
@@ -168,60 +200,43 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
}
#endif
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
}
+NOKPROBE_SYMBOL(do_trap);
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- enum ctx_state prev_state; \
- \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
- exception_exit(prev_state); \
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ enum ctx_state prev_state = exception_enter();
+ siginfo_t info;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ conditional_sti(regs);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
+ }
+
+ exception_exit(prev_state);
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- siginfo_t info; \
- enum ctx_state prev_state; \
- \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
- exception_exit(prev_state); \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
}
-DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
-DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
-DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
-DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
-DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
-DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
-DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
#ifdef CONFIG_X86_32
-DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
#endif
-DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
@@ -263,7 +278,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
-dotraplinkage void __kprobes
+dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
@@ -305,13 +320,14 @@ do_general_protection(struct pt_regs *regs, long error_code)
pr_cont("\n");
}
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_general_protection);
/* May run on IST stack. */
-dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -334,6 +350,11 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -350,6 +371,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -357,7 +379,7 @@ exit:
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -376,6 +398,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*regs = *eregs;
return regs;
}
+NOKPROBE_SYMBOL(sync_regs);
#endif
/*
@@ -402,7 +425,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
enum ctx_state prev_state;
@@ -440,6 +463,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -482,13 +510,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_debug);
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
siginfo_t info;
@@ -518,7 +547,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_addr = (void __user *)regs->ip;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
if (trapnr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
@@ -645,7 +674,7 @@ void math_state_restore(void)
*/
if (unlikely(restore_fpu_checking(tsk))) {
drop_init_fpu(tsk);
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
return;
}
@@ -653,7 +682,7 @@ void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);
-dotraplinkage void __kprobes
+dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -679,6 +708,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#endif
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ace22916ade..5d1cbfe4ae5 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -32,20 +32,20 @@
/* Post-execution fixups. */
-/* No fixup needed */
-#define UPROBE_FIX_NONE 0x0
-
/* Adjust IP back to vicinity of actual insn */
-#define UPROBE_FIX_IP 0x1
+#define UPROBE_FIX_IP 0x01
/* Adjust the return address of a call insn */
-#define UPROBE_FIX_CALL 0x2
+#define UPROBE_FIX_CALL 0x02
/* Instruction will modify TF, don't change it */
-#define UPROBE_FIX_SETF 0x4
+#define UPROBE_FIX_SETF 0x04
-#define UPROBE_FIX_RIP_AX 0x8000
-#define UPROBE_FIX_RIP_CX 0x4000
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
#define UPROBE_TRAP_NR UINT_MAX
@@ -67,6 +67,7 @@
* to keep gcc from statically optimizing it out, as variable_test_bit makes
* some versions of gcc to think only *(unsigned long*) is used.
*/
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_32 NULL
+#endif
-/* Using this for both 64-bit and 32-bit apps */
-static volatile u32 good_2byte_insns[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ---------------------------------------------- */
- W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
- W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
- /* ---------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-
-#ifdef CONFIG_X86_64
/* Good-instruction tables for 64-bit apps */
+#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_64 NULL
#endif
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
#undef W
/*
@@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn)
return false;
}
-static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
- insn_init(insn, auprobe->insn, false);
+ u32 volatile *good_insns;
+
+ insn_init(insn, auprobe->insn, x86_64);
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (WARN_ON_ONCE(!insn_complete(insn)))
+ return -ENOEXEC;
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
if (is_prefix_bad(insn))
return -ENOTSUPP;
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
return 0;
if (insn->opcode.nbytes == 2) {
@@ -230,14 +245,18 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
}
#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return !config_enabled(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
+}
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
* its memory operand indirectly through a scratch register. Set
- * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
- * accordingly. (The contents of the scratch register will be saved
- * before we single-step the modified instruction, and restored
- * afterward.)
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
*
* We do this because a rip-relative instruction can access only a
* relatively small area (+/- 2 GB from the instruction), and the XOL
@@ -248,164 +267,192 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
*
* Some useful facts about rip-relative instructions:
*
- * - There's always a modrm byte.
+ * - There's always a modrm byte with bit layout "00 reg 101".
* - There's never a SIB byte.
* - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
*/
-static void
-handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 *cursor;
u8 reg;
+ u8 reg2;
if (!insn_rip_relative(insn))
return;
/*
- * insn_rip_relative() would have decoded rex_prefix, modrm.
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
* Clear REX.b bit (extension of MODRM.rm field):
- * we want to encode rax/rcx, not r8/r9.
+ * we want to encode low numbered reg, not r8+.
*/
if (insn->rex_prefix.nbytes) {
cursor = auprobe->insn + insn_offset_rex_prefix(insn);
- *cursor &= 0xfe; /* Clearing REX.B bit */
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3 prefix.
+ * TODO: add XOP/EVEX treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes == 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * (evex will need setting of both b and x since
+ * in non-sib encoding evex.x is 4th bit of MODRM.rm)
+ * Setting VEX3.b (setting because it has inverted meaning):
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x20;
}
/*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes == 2)
+ reg2 = insn->vex_prefix.bytes[1];
+ else if (insn->vex_prefix.nbytes == 3)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP, EXEV vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
* Point cursor at the modrm byte. The next 4 bytes are the
* displacement. Beyond the displacement, for some instructions,
* is the immediate operand.
*/
cursor = auprobe->insn + insn_offset_modrm(insn);
- insn_get_length(insn);
-
/*
- * Convert from rip-relative addressing to indirect addressing
- * via a scratch register. Change the r/m field from 0x5 (%rip)
- * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
*/
- reg = MODRM_REG(insn);
- if (reg == 0) {
- /*
- * The register operand (if any) is either the A register
- * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
- * REX prefix) %r8. In any case, we know the C register
- * is NOT the register operand, so we use %rcx (register
- * #1) for the scratch register.
- */
- auprobe->fixups = UPROBE_FIX_RIP_CX;
- /* Change modrm from 00 000 101 to 00 000 001. */
- *cursor = 0x1;
- } else {
- /* Use %rax (register #0) for the scratch register. */
- auprobe->fixups = UPROBE_FIX_RIP_AX;
- /* Change modrm from 00 xxx 101 to 00 xxx 000 */
- *cursor = (reg << 3);
- }
-
- /* Target address = address of next instruction + (signed) offset */
- auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
- /* Displacement field is gone; slide immediate field (if any) over. */
- if (insn->immediate.nbytes) {
- cursor++;
- memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
- }
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
}
/*
* If we're emulating a rip-relative instruction, save the contents
* of the scratch register and store the target address in that register.
*/
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
-{
- if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
- autask->saved_scratch_register = regs->ax;
- regs->ax = current->utask->vaddr;
- regs->ax += auprobe->rip_rela_target_address;
- } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
- autask->saved_scratch_register = regs->cx;
- regs->cx = current->utask->vaddr;
- regs->cx += auprobe->rip_rela_target_address;
- }
-}
-
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) {
- struct arch_uprobe_task *autask;
-
- autask = &current->utask->autask;
- if (auprobe->fixups & UPROBE_FIX_RIP_AX)
- regs->ax = autask->saved_scratch_register;
- else
- regs->cx = autask->saved_scratch_register;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
- /*
- * The original instruction includes a displacement, and so
- * is 4 bytes longer than what we've just single-stepped.
- * Caller may need to apply other fixups to handle stuff
- * like "jmpq *...(%rip)" and "callq *...(%rip)".
- */
- if (correction)
- *correction += 4;
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
}
}
-static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- insn_init(insn, auprobe->insn, true);
-
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
- if (is_prefix_bad(insn))
- return -ENOTSUPP;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
- return 0;
-
- if (insn->opcode.nbytes == 2) {
- if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
- return 0;
+ *sr = utask->autask.saved_scratch_register;
}
- return -ENOTSUPP;
}
-
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+#else /* 32-bit: */
+static inline bool is_64bit_mm(struct mm_struct *mm)
{
- if (mm->context.ia32_compat)
- return validate_insn_32bits(auprobe, insn);
- return validate_insn_64bits(auprobe, insn);
+ return false;
}
-#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
*/
-static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
}
-static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
-static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs,
- long *correction)
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
-
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
-{
- return validate_insn_32bits(auprobe, insn);
-}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
};
static inline int sizeof_long(void)
@@ -415,50 +462,67 @@ static inline int sizeof_long(void)
static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- pre_xol_rip_insn(auprobe, regs, &current->utask->autask);
+ riprel_pre_xol(auprobe, regs);
return 0;
}
-/*
- * Adjust the return address pushed by a call insn executed out of line.
- */
-static int adjust_ret_addr(unsigned long sp, long correction)
+static int push_ret_address(struct pt_regs *regs, unsigned long ip)
{
- int rasize = sizeof_long();
- long ra;
-
- if (copy_from_user(&ra, (void __user *)sp, rasize))
- return -EFAULT;
+ unsigned long new_sp = regs->sp - sizeof_long();
- ra += correction;
- if (copy_to_user((void __user *)sp, &ra, rasize))
+ if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
return -EFAULT;
+ regs->sp = new_sp;
return 0;
}
+/*
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
+ */
static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
- long correction = (long)(utask->vaddr - utask->xol_vaddr);
- handle_riprel_post_xol(auprobe, regs, &correction);
- if (auprobe->fixups & UPROBE_FIX_IP)
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
regs->ip += correction;
-
- if (auprobe->fixups & UPROBE_FIX_CALL) {
- if (adjust_ret_addr(regs->sp, correction)) {
- regs->sp += sizeof_long();
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(); /* Pop incorrect return address */
+ if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
return -ERESTART;
- }
}
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
return 0;
}
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs);
+}
+
static struct uprobe_xol_ops default_xol_ops = {
.pre_xol = default_pre_xol_op,
.post_xol = default_post_xol_op,
+ .abort = default_abort_op,
};
static bool branch_is_call(struct arch_uprobe *auprobe)
@@ -520,7 +584,6 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long offs = (long)auprobe->branch.offs;
if (branch_is_call(auprobe)) {
- unsigned long new_sp = regs->sp - sizeof_long();
/*
* If it fails we execute this (mangled, see the comment in
* branch_clear_offset) insn out-of-line. In the likely case
@@ -530,9 +593,8 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
*
* But there is corner case, see the comment in ->post_xol().
*/
- if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long()))
+ if (push_ret_address(regs, new_ip))
return false;
- regs->sp = new_sp;
} else if (!check_jmp_cond(auprobe, regs)) {
offs = 0;
}
@@ -583,11 +645,7 @@ static struct uprobe_xol_ops branch_xol_ops = {
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
-
- /* has the side-effect of processing the entire instruction */
- insn_get_length(insn);
- if (WARN_ON_ONCE(!insn_complete(insn)))
- return -ENOEXEC;
+ int i;
switch (opc1) {
case 0xeb: /* jmp 8 */
@@ -612,6 +670,16 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOSYS;
}
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+ * No one uses these insns, reject any branch insns with such prefix.
+ */
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ if (insn->prefixes.bytes[i] == 0x66)
+ return -ENOTSUPP;
+ }
+
auprobe->branch.opc1 = opc1;
auprobe->branch.ilen = insn->length;
auprobe->branch.offs = insn->immediate.value;
@@ -630,10 +698,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
struct insn insn;
- bool fix_ip = true, fix_call = false;
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
int ret;
- ret = validate_insn_bits(auprobe, mm, &insn);
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
if (ret)
return ret;
@@ -642,44 +710,39 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
return ret;
/*
- * Figure out which fixups arch_uprobe_post_xol() will need to perform,
- * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups
- * is either zero or it reflects rip-related fixups.
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
*/
switch (OPCODE1(&insn)) {
case 0x9d: /* popf */
- auprobe->fixups |= UPROBE_FIX_SETF;
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
break;
case 0xc3: /* ret or lret -- ip is correct */
case 0xcb:
case 0xc2:
case 0xca:
- fix_ip = false;
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
break;
case 0x9a: /* call absolute - Fix return addr, not ip */
- fix_call = true;
- fix_ip = false;
- break;
- case 0xea: /* jmp absolute -- ip is correct */
- fix_ip = false;
+ fix_ip_or_call = UPROBE_FIX_CALL;
break;
case 0xff:
- insn_get_modrm(&insn);
switch (MODRM_REG(&insn)) {
case 2: case 3: /* call or lcall, indirect */
- fix_call = true;
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
case 4: case 5: /* jmp or ljmp, indirect */
- fix_ip = false;
+ fix_ip_or_call = 0;
+ break;
}
/* fall through */
default:
- handle_riprel_insn(auprobe, &insn);
+ riprel_analyze(auprobe, &insn);
}
- if (fix_ip)
- auprobe->fixups |= UPROBE_FIX_IP;
- if (fix_call)
- auprobe->fixups |= UPROBE_FIX_CALL;
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
auprobe->ops = &default_xol_ops;
return 0;
@@ -694,6 +757,12 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
regs->ip = utask->xol_vaddr;
utask->autask.saved_trap_nr = current->thread.trap_nr;
current->thread.trap_nr = UPROBE_TRAP_NR;
@@ -703,8 +772,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
set_task_blockstep(current, false);
- if (auprobe->ops->pre_xol)
- return auprobe->ops->pre_xol(auprobe, regs);
return 0;
}
@@ -732,56 +799,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
* single-step, we single-stepped a copy of the instruction.
*
* This function prepares to resume execution after the single-step.
- * We have to fix things up as follows:
- *
- * Typically, the new ip is relative to the copied instruction. We need
- * to make it relative to the original instruction (FIX_IP). Exceptions
- * are return instructions and absolute or indirect jump or call instructions.
- *
- * If the single-stepped instruction was a call, the return address that
- * is atop the stack is the address following the copied instruction. We
- * need to make it the address following the original instruction (FIX_CALL).
- *
- * If the original instruction was a rip-relative instruction such as
- * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
- * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
- * We need to restore the contents of the scratch register and adjust
- * the ip, keeping in mind that the instruction we executed is 4 bytes
- * shorter than the original instruction (since we squeezed out the offset
- * field). (FIX_RIP_AX or FIX_RIP_CX)
*/
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
if (auprobe->ops->post_xol) {
- int err = auprobe->ops->post_xol(auprobe, regs);
+ err = auprobe->ops->post_xol(auprobe, regs);
if (err) {
- arch_uprobe_abort_xol(auprobe, regs);
/*
- * Restart the probed insn. ->post_xol() must ensure
- * this is really possible if it returns -ERESTART.
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
*/
+ regs->ip = utask->vaddr;
if (err == -ERESTART)
- return 0;
- return err;
+ err = 0;
+ send_sigtrap = false;
}
}
-
- current->thread.trap_nr = utask->autask.saved_trap_nr;
/*
* arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
* so we can get an extra SIGTRAP if we do not clear TF. We need
* to examine the opcode to make it right.
*/
- if (utask->autask.saved_tf)
+ if (send_sigtrap)
send_sig(SIGTRAP, current, 0);
- else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+
+ if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
- return 0;
+ return err;
}
/* callback routine for handling exceptions. */
@@ -815,18 +868,18 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
/*
* This function gets called when XOL instruction either gets trapped or
- * the thread has a fatal signal, or if arch_uprobe_post_xol() failed.
- * Reset the instruction pointer to its probed address for the potential
- * restart or for post mortem analysis.
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
*/
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
- current->thread.trap_nr = utask->autask.saved_trap_nr;
- handle_riprel_post_xol(auprobe, regs, NULL);
- instruction_pointer_set(regs, utask->vaddr);
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
/* clear TF if it was set by us in arch_uprobe_pre_xol() */
if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec8366c5cfe..b5e994ad013 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1462,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f32a02578c0..f6449334ec4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1898,7 +1898,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, data,
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 2930ae05d77..28f85c91671 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -4,8 +4,8 @@
* (inspired by Andi Kleen's thunk_64.S)
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
-
#include <linux/linkage.h>
+ #include <asm/asm.h>
#ifdef CONFIG_TRACE_IRQFLAGS
/* put return address in eax (arg1) */
@@ -22,6 +22,7 @@
popl %ecx
popl %eax
ret
+ _ASM_NOKPROBE(\name)
.endm
thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index a63efd6bb6a..92d9feaff42 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
+#include <asm/asm.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
@@ -25,6 +26,7 @@
call \func
jmp restore
CFI_ENDPROC
+ _ASM_NOKPROBE(\name)
.endm
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -43,3 +45,4 @@ restore:
RESTORE_ARGS
ret
CFI_ENDPROC
+ _ASM_NOKPROBE(restore)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 858b47b5221..36642793e31 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,7 +8,7 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
-#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
@@ -46,7 +46,7 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int __kprobes
+static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
@@ -55,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes kprobes_fault(struct pt_regs *regs)
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -262,7 +262,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -292,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -359,7 +360,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -426,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
@@ -928,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline __kprobes int
+static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -976,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
+NOKPROBE_SYMBOL(spurious_fault);
int show_unhandled_signals = 1;
@@ -1031,7 +1034,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* {,trace_}do_page_fault() have notrace on. Having this an actual function
* guarantees there's a function trace entry.
*/
-static void __kprobes noinline
+static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
@@ -1254,8 +1257,9 @@ good_area:
up_read(&mm->mmap_sem);
}
+NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void __kprobes notrace
+dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
unsigned long address = read_cr2(); /* Get the faulting address */
@@ -1273,10 +1277,12 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_page_fault);
#ifdef CONFIG_TRACING
-static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
{
if (user_mode(regs))
trace_page_fault_user(address, regs, error_code);
@@ -1284,7 +1290,7 @@ static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs
trace_page_fault_kernel(address, regs, error_code);
}
-dotraplinkage void __kprobes notrace
+dotraplinkage void notrace
trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
/*
@@ -1301,4 +1307,5 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(trace_do_page_fault);
#endif /* CONFIG_TRACING */
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 01495755701..6440221ced0 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -12,13 +12,16 @@
/*
* Calling convention :
- * rdi : skb pointer
+ * rbx : skb pointer (callee saved)
* esi : offset of byte(s) to fetch in skb (can be scratched)
- * r8 : copy of skb->data
+ * r10 : copy of skb->data
* r9d : hlen = skb->len - skb->data_len
*/
-#define SKBDATA %r8
+#define SKBDATA %r10
#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
+#define MAX_BPF_STACK (512 /* from filter.h */ + \
+ 32 /* space for rbx,r13,r14,r15 */ + \
+ 8 /* space for skb_copy_bits */)
sk_load_word:
.globl sk_load_word
@@ -68,53 +71,31 @@ sk_load_byte_positive_offset:
movzbl (SKBDATA,%rsi),%eax
ret
-/**
- * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
- *
- * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
- * Must preserve A accumulator (%eax)
- * Inputs : %esi is the offset value
- */
-sk_load_byte_msh:
- .globl sk_load_byte_msh
- test %esi,%esi
- js bpf_slow_path_byte_msh_neg
-
-sk_load_byte_msh_positive_offset:
- .globl sk_load_byte_msh_positive_offset
- cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
- jle bpf_slow_path_byte_msh
- movzbl (SKBDATA,%rsi),%ebx
- and $15,%bl
- shl $2,%bl
- ret
-
/* rsi contains offset and can be scratched */
#define bpf_slow_path_common(LEN) \
- push %rdi; /* save skb */ \
+ mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
mov $LEN,%ecx; /* len */ \
- lea -12(%rbp),%rdx; \
+ lea - MAX_BPF_STACK + 32(%rbp),%rdx; \
call skb_copy_bits; \
test %eax,%eax; \
pop SKBDATA; \
- pop %r9; \
- pop %rdi
+ pop %r9;
bpf_slow_path_word:
bpf_slow_path_common(4)
js bpf_error
- mov -12(%rbp),%eax
+ mov - MAX_BPF_STACK + 32(%rbp),%eax
bswap %eax
ret
bpf_slow_path_half:
bpf_slow_path_common(2)
js bpf_error
- mov -12(%rbp),%ax
+ mov - MAX_BPF_STACK + 32(%rbp),%ax
rol $8,%ax
movzwl %ax,%eax
ret
@@ -122,21 +103,11 @@ bpf_slow_path_half:
bpf_slow_path_byte:
bpf_slow_path_common(1)
js bpf_error
- movzbl -12(%rbp),%eax
- ret
-
-bpf_slow_path_byte_msh:
- xchg %eax,%ebx /* dont lose A , X is about to be scratched */
- bpf_slow_path_common(1)
- js bpf_error
- movzbl -12(%rbp),%eax
- and $15,%al
- shl $2,%al
- xchg %eax,%ebx
+ movzbl - MAX_BPF_STACK + 32(%rbp),%eax
ret
#define sk_negative_common(SIZE) \
- push %rdi; /* save skb */ \
+ mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
@@ -145,10 +116,8 @@ bpf_slow_path_byte_msh:
test %rax,%rax; \
pop SKBDATA; \
pop %r9; \
- pop %rdi; \
jz bpf_error
-
bpf_slow_path_word_neg:
cmp SKF_MAX_NEG_OFF, %esi /* test range */
jl bpf_error /* offset lower -> error */
@@ -179,22 +148,12 @@ sk_load_byte_negative_offset:
movzbl (%rax), %eax
ret
-bpf_slow_path_byte_msh_neg:
- cmp SKF_MAX_NEG_OFF, %esi
- jl bpf_error
-sk_load_byte_msh_negative_offset:
- .globl sk_load_byte_msh_negative_offset
- xchg %eax,%ebx /* dont lose A , X is about to be scratched */
- sk_negative_common(1)
- movzbl (%rax),%eax
- and $15,%al
- shl $2,%al
- xchg %eax,%ebx
- ret
-
bpf_error:
# force a return 0 from jit handler
- xor %eax,%eax
- mov -8(%rbp),%rbx
+ xor %eax,%eax
+ mov - MAX_BPF_STACK(%rbp),%rbx
+ mov - MAX_BPF_STACK + 8(%rbp),%r13
+ mov - MAX_BPF_STACK + 16(%rbp),%r14
+ mov - MAX_BPF_STACK + 24(%rbp),%r15
leaveq
ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 6d5663a599a..99bef86ed6d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,6 +1,7 @@
/* bpf_jit_comp.c : BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
+ * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,28 +15,16 @@
#include <linux/if_vlan.h>
#include <linux/random.h>
-/*
- * Conventions :
- * EAX : BPF A accumulator
- * EBX : BPF X accumulator
- * RDI : pointer to skb (first argument given to JIT function)
- * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
- * ECX,EDX,ESI : scratch registers
- * r9d : skb->len - skb->data_len (headlen)
- * r8 : skb->data
- * -8(RBP) : saved RBX value
- * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
- */
int bpf_jit_enable __read_mostly;
/*
* assembly code in arch/x86/net/bpf_jit.S
*/
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[];
+extern u8 sk_load_byte_positive_offset[];
extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[];
+extern u8 sk_load_byte_negative_offset[];
static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -56,30 +45,44 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
-#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
-
-#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
-#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
+#define EMIT1_off32(b1, off) \
+ do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
static inline bool is_imm8(int value)
{
return value <= 127 && value >= -128;
}
-static inline bool is_near(int offset)
+static inline bool is_simm32(s64 value)
{
- return offset <= 127 && offset >= -128;
+ return value == (s64) (s32) value;
}
-#define EMIT_JMP(offset) \
-do { \
- if (offset) { \
- if (is_near(offset)) \
- EMIT2(0xeb, offset); /* jmp .+off8 */ \
- else \
- EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
- } \
-} while (0)
+/* mov dst, src */
+#define EMIT_mov(DST, SRC) \
+ do {if (DST != SRC) \
+ EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+ } while (0)
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
/* list of x86 cond jumps opcodes (. + s8)
* Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
@@ -90,27 +93,8 @@ do { \
#define X86_JNE 0x75
#define X86_JBE 0x76
#define X86_JA 0x77
-
-#define EMIT_COND_JMP(op, offset) \
-do { \
- if (is_near(offset)) \
- EMIT2(op, offset); /* jxx .+off8 */ \
- else { \
- EMIT2(0x0f, op + 0x10); \
- EMIT(offset, 4); /* jxx .+off32 */ \
- } \
-} while (0)
-
-#define COND_SEL(CODE, TOP, FOP) \
- case CODE: \
- t_op = TOP; \
- f_op = FOP; \
- goto cond_branch
-
-
-#define SEEN_DATAREF 1 /* might call external helpers */
-#define SEEN_XREG 2 /* ebx is used */
-#define SEEN_MEM 4 /* use mem[] for temporary storage */
+#define X86_JGE 0x7D
+#define X86_JG 0x7F
static inline void bpf_flush_icache(void *start, void *end)
{
@@ -125,26 +109,6 @@ static inline void bpf_flush_icache(void *start, void *end)
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-/* Helper to find the offset of pkt_type in sk_buff
- * We want to make sure its still a 3bit field starting at a byte boundary.
- */
-#define PKT_TYPE_MAX 7
-static int pkt_type_offset(void)
-{
- struct sk_buff skb_probe = {
- .pkt_type = ~0,
- };
- char *ct = (char *)&skb_probe;
- unsigned int off;
-
- for (off = 0; off < sizeof(struct sk_buff); off++) {
- if (ct[off] == PKT_TYPE_MAX)
- return off;
- }
- pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n");
- return -1;
-}
-
struct bpf_binary_header {
unsigned int pages;
/* Note : for security reasons, bpf code will follow a randomly
@@ -178,583 +142,771 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
return header;
}
-void bpf_jit_compile(struct sk_filter *fp)
+/* pick a register outside of BPF range for JIT internal work */
+#define AUX_REG (MAX_BPF_REG + 1)
+
+/* the following table maps BPF registers to x64 registers.
+ * x64 register r12 is unused, since if used as base address register
+ * in load/store instructions, it always needs an extra byte of encoding
+ */
+static const int reg2hex[] = {
+ [BPF_REG_0] = 0, /* rax */
+ [BPF_REG_1] = 7, /* rdi */
+ [BPF_REG_2] = 6, /* rsi */
+ [BPF_REG_3] = 2, /* rdx */
+ [BPF_REG_4] = 1, /* rcx */
+ [BPF_REG_5] = 0, /* r8 */
+ [BPF_REG_6] = 3, /* rbx callee saved */
+ [BPF_REG_7] = 5, /* r13 callee saved */
+ [BPF_REG_8] = 6, /* r14 callee saved */
+ [BPF_REG_9] = 7, /* r15 callee saved */
+ [BPF_REG_FP] = 5, /* rbp readonly */
+ [AUX_REG] = 3, /* r11 temp register */
+};
+
+/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+ * which need extra byte of encoding.
+ * rax,rcx,...,rbp have simpler encoding
+ */
+static inline bool is_ereg(u32 reg)
{
- u8 temp[64];
- u8 *prog;
- unsigned int proglen, oldproglen = 0;
- int ilen, i;
- int t_offset, f_offset;
- u8 t_op, f_op, seen = 0, pass;
- u8 *image = NULL;
- struct bpf_binary_header *header = NULL;
- u8 *func;
- int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
- unsigned int cleanup_addr; /* epilogue code offset */
- unsigned int *addrs;
- const struct sock_filter *filter = fp->insns;
- int flen = fp->len;
+ if (reg == BPF_REG_5 || reg == AUX_REG ||
+ (reg >= BPF_REG_7 && reg <= BPF_REG_9))
+ return true;
+ else
+ return false;
+}
- if (!bpf_jit_enable)
- return;
+/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+static inline u8 add_1mod(u8 byte, u32 reg)
+{
+ if (is_ereg(reg))
+ byte |= 1;
+ return byte;
+}
- addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
- if (addrs == NULL)
- return;
+static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+ if (is_ereg(r1))
+ byte |= 1;
+ if (is_ereg(r2))
+ byte |= 4;
+ return byte;
+}
- /* Before first pass, make a rough estimation of addrs[]
- * each bpf instruction is translated to less than 64 bytes
+/* encode 'dst_reg' register into x64 opcode 'byte' */
+static inline u8 add_1reg(u8 byte, u32 dst_reg)
+{
+ return byte + reg2hex[dst_reg];
+}
+
+/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+ return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
+}
+
+struct jit_context {
+ unsigned int cleanup_addr; /* epilogue code offset */
+ bool seen_ld_abs;
+};
+
+static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct sock_filter_int *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ u8 temp[64];
+ int i;
+ int proglen = 0;
+ u8 *prog = temp;
+ int stacksize = MAX_BPF_STACK +
+ 32 /* space for rbx, r13, r14, r15 */ +
+ 8 /* space for skb_copy_bits() buffer */;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+
+ /* sub rsp, stacksize */
+ EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+
+ /* all classic BPF filters use R6(rbx) save it */
+
+ /* mov qword ptr [rbp-X],rbx */
+ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+
+ /* sk_convert_filter() maps classic BPF register X to R7 and uses R8
+ * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
+ * R8(r14). R9(r15) spill could be made conditional, but there is only
+ * one 'bpf_error' return path out of helper functions inside bpf_jit.S
+ * The overhead of extra spill is negligible for any filter other
+ * than synthetic ones. Therefore not worth adding complexity.
*/
- for (proglen = 0, i = 0; i < flen; i++) {
- proglen += 64;
- addrs[i] = proglen;
+
+ /* mov qword ptr [rbp-X],r13 */
+ EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+ /* mov qword ptr [rbp-X],r14 */
+ EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+ /* mov qword ptr [rbp-X],r15 */
+ EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+
+ /* clear A and X registers */
+ EMIT2(0x31, 0xc0); /* xor eax, eax */
+ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+
+ if (ctx->seen_ld_abs) {
+ /* r9d : skb->len - skb->data_len (headlen)
+ * r10 : skb->data
+ */
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov %r9d, off8(%rdi) */
+ EMIT4(0x44, 0x8b, 0x4f,
+ offsetof(struct sk_buff, len));
+ else
+ /* mov %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x8b, 0x8f,
+ offsetof(struct sk_buff, len));
+
+ if (is_imm8(offsetof(struct sk_buff, data_len)))
+ /* sub %r9d, off8(%rdi) */
+ EMIT4(0x44, 0x2b, 0x4f,
+ offsetof(struct sk_buff, data_len));
+ else
+ EMIT3_off32(0x44, 0x2b, 0x8f,
+ offsetof(struct sk_buff, data_len));
+
+ if (is_imm8(offsetof(struct sk_buff, data)))
+ /* mov %r10, off8(%rdi) */
+ EMIT4(0x4c, 0x8b, 0x57,
+ offsetof(struct sk_buff, data));
+ else
+ /* mov %r10, off32(%rdi) */
+ EMIT3_off32(0x4c, 0x8b, 0x97,
+ offsetof(struct sk_buff, data));
}
- cleanup_addr = proglen; /* epilogue address */
- for (pass = 0; pass < 10; pass++) {
- u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;
- /* no prologue/epilogue for trivial filters (RET something) */
- proglen = 0;
- prog = temp;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ u32 dst_reg = insn->dst_reg;
+ u32 src_reg = insn->src_reg;
+ u8 b1 = 0, b2 = 0, b3 = 0;
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b2 = 0x01; break;
+ case BPF_SUB: b2 = 0x29; break;
+ case BPF_AND: b2 = 0x21; break;
+ case BPF_OR: b2 = 0x09; break;
+ case BPF_XOR: b2 = 0x31; break;
+ }
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
+ break;
- if (seen_or_pass0) {
- EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
- EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
- /* note : must save %rbx in case bpf_error is hit */
- if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF))
- EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
- if (seen_or_pass0 & SEEN_XREG)
- CLEAR_X(); /* make sure we dont leek kernel memory */
-
- /*
- * If this filter needs to access skb data,
- * loads r9 and r8 with :
- * r9 = skb->len - skb->data_len
- * r8 = skb->data
+ /* mov dst, src */
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ EMIT_mov(dst_reg, src_reg);
+ break;
+
+ /* mov32 dst, src */
+ case BPF_ALU | BPF_MOV | BPF_X:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+ break;
+
+ /* neg dst */
+ case BPF_ALU | BPF_NEG:
+ case BPF_ALU64 | BPF_NEG:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT2(0xF7, add_1reg(0xD8, dst_reg));
+ break;
+
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b3 = 0xC0; break;
+ case BPF_SUB: b3 = 0xE8; break;
+ case BPF_AND: b3 = 0xE0; break;
+ case BPF_OR: b3 = 0xC8; break;
+ case BPF_XOR: b3 = 0xF0; break;
+ }
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ /* optimization: if imm32 is positive,
+ * use 'mov eax, imm32' (which zero-extends imm32)
+ * to save 2 bytes
*/
- if (seen_or_pass0 & SEEN_DATAREF) {
- if (offsetof(struct sk_buff, len) <= 127)
- /* mov off8(%rdi),%r9d */
- EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
- else {
- /* mov off32(%rdi),%r9d */
- EMIT3(0x44, 0x8b, 0x8f);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- if (is_imm8(offsetof(struct sk_buff, data_len)))
- /* sub off8(%rdi),%r9d */
- EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
- else {
- EMIT3(0x44, 0x2b, 0x8f);
- EMIT(offsetof(struct sk_buff, data_len), 4);
- }
+ if (imm32 < 0) {
+ /* 'mov rax, imm32' sign extends imm32 */
+ b1 = add_1mod(0x48, dst_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
+ break;
+ }
- if (is_imm8(offsetof(struct sk_buff, data)))
- /* mov off8(%rdi),%r8 */
- EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
- else {
- /* mov off32(%rdi),%r8 */
- EMIT3(0x4c, 0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, data), 4);
- }
+ case BPF_ALU | BPF_MOV | BPF_K:
+ /* mov %eax, imm32 */
+ if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+ break;
+
+ /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* mov r11, src_reg */
+ EMIT_mov(AUX_REG, src_reg);
+ else
+ /* mov r11, imm32 */
+ EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
+
+ /* mov rax, dst_reg */
+ EMIT_mov(BPF_REG_0, dst_reg);
+
+ /* xor edx, edx
+ * equivalent to 'xor rdx, rdx', but one byte less
+ */
+ EMIT2(0x31, 0xd2);
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* if (src_reg == 0) return 0 */
+
+ /* cmp r11, 0 */
+ EMIT4(0x49, 0x83, 0xFB, 0x00);
+
+ /* jne .+9 (skip over pop, pop, xor and jmp) */
+ EMIT2(X86_JNE, 1 + 1 + 2 + 5);
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+ EMIT2(0x31, 0xc0); /* xor eax, eax */
+
+ /* jmp cleanup_addr
+ * addrs[i] - 11, because there are 11 bytes
+ * after this insn: div, mov, pop, pop, mov
+ */
+ jmp_offset = ctx->cleanup_addr - (addrs[i] - 11);
+ EMIT1_off32(0xE9, jmp_offset);
}
- }
- switch (filter[0].code) {
- case BPF_S_RET_K:
- case BPF_S_LD_W_LEN:
- case BPF_S_ANC_PROTOCOL:
- case BPF_S_ANC_IFINDEX:
- case BPF_S_ANC_MARK:
- case BPF_S_ANC_RXHASH:
- case BPF_S_ANC_CPU:
- case BPF_S_ANC_VLAN_TAG:
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- case BPF_S_ANC_QUEUE:
- case BPF_S_ANC_PKTTYPE:
- case BPF_S_LD_W_ABS:
- case BPF_S_LD_H_ABS:
- case BPF_S_LD_B_ABS:
- /* first instruction sets A register (or is RET 'constant') */
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /* div r11 */
+ EMIT3(0x49, 0xF7, 0xF3);
+ else
+ /* div r11d */
+ EMIT3(0x41, 0xF7, 0xF3);
+
+ if (BPF_OP(insn->code) == BPF_MOD)
+ /* mov r11, rdx */
+ EMIT3(0x49, 0x89, 0xD3);
+ else
+ /* mov r11, rax */
+ EMIT3(0x49, 0x89, 0xC3);
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov dst_reg, r11 */
+ EMIT_mov(dst_reg, AUX_REG);
break;
- default:
- /* make sure we dont leak kernel information to user */
- CLEAR_A(); /* A = 0 */
- }
- for (i = 0; i < flen; i++) {
- unsigned int K = filter[i].k;
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* mov rax, src_reg */
+ EMIT_mov(BPF_REG_0, src_reg);
+ else
+ /* mov rax, imm32 */
+ EMIT3_off32(0x48, 0xC7, 0xC0, imm32);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, AUX_REG));
+ else if (is_ereg(AUX_REG))
+ EMIT1(add_1mod(0x40, AUX_REG));
+ /* mul(q) r11 */
+ EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+
+ /* mov r11, rax */
+ EMIT_mov(AUX_REG, BPF_REG_0);
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov dst_reg, r11 */
+ EMIT_mov(dst_reg, AUX_REG);
+ break;
- switch (filter[i].code) {
- case BPF_S_ALU_ADD_X: /* A += X; */
- seen |= SEEN_XREG;
- EMIT2(0x01, 0xd8); /* add %ebx,%eax */
- break;
- case BPF_S_ALU_ADD_K: /* A += K; */
- if (!K)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
- else
- EMIT1_off32(0x05, K); /* add imm32,%eax */
- break;
- case BPF_S_ALU_SUB_X: /* A -= X; */
- seen |= SEEN_XREG;
- EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
- break;
- case BPF_S_ALU_SUB_K: /* A -= K */
- if (!K)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
- else
- EMIT1_off32(0x2d, K); /* sub imm32,%eax */
- break;
- case BPF_S_ALU_MUL_X: /* A *= X; */
- seen |= SEEN_XREG;
- EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
- break;
- case BPF_S_ALU_MUL_K: /* A *= K */
- if (is_imm8(K))
- EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
- else {
- EMIT2(0x69, 0xc0); /* imul imm32,%eax */
- EMIT(K, 4);
- }
- break;
- case BPF_S_ALU_DIV_X: /* A /= X; */
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
- if (pc_ret0 > 0) {
- /* addrs[pc_ret0 - 1] is start address of target
- * (addrs[i] - 4) is the address following this jmp
- * ("xor %edx,%edx; div %ebx" being 4 bytes long)
- */
- EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
- (addrs[i] - 4));
- } else {
- EMIT_COND_JMP(X86_JNE, 2 + 5);
- CLEAR_A();
- EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
- }
- EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
- break;
- case BPF_S_ALU_MOD_X: /* A %= X; */
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
- if (pc_ret0 > 0) {
- /* addrs[pc_ret0 - 1] is start address of target
- * (addrs[i] - 6) is the address following this jmp
- * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long)
- */
- EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
- (addrs[i] - 6));
- } else {
- EMIT_COND_JMP(X86_JNE, 2 + 5);
- CLEAR_A();
- EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */
- }
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT2(0xf7, 0xf3); /* div %ebx */
- EMIT2(0x89, 0xd0); /* mov %edx,%eax */
- break;
- case BPF_S_ALU_MOD_K: /* A %= K; */
- if (K == 1) {
- CLEAR_A();
- break;
- }
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
- EMIT2(0xf7, 0xf1); /* div %ecx */
- EMIT2(0x89, 0xd0); /* mov %edx,%eax */
- break;
- case BPF_S_ALU_DIV_K: /* A /= K */
- if (K == 1)
- break;
- EMIT2(0x31, 0xd2); /* xor %edx,%edx */
- EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */
- EMIT2(0xf7, 0xf1); /* div %ecx */
- break;
- case BPF_S_ALU_AND_X:
- seen |= SEEN_XREG;
- EMIT2(0x21, 0xd8); /* and %ebx,%eax */
- break;
- case BPF_S_ALU_AND_K:
- if (K >= 0xFFFFFF00) {
- EMIT2(0x24, K & 0xFF); /* and imm8,%al */
- } else if (K >= 0xFFFF0000) {
- EMIT2(0x66, 0x25); /* and imm16,%ax */
- EMIT(K, 2);
- } else {
- EMIT1_off32(0x25, K); /* and imm32,%eax */
- }
- break;
- case BPF_S_ALU_OR_X:
- seen |= SEEN_XREG;
- EMIT2(0x09, 0xd8); /* or %ebx,%eax */
- break;
- case BPF_S_ALU_OR_K:
- if (is_imm8(K))
- EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
- else
- EMIT1_off32(0x0d, K); /* or imm32,%eax */
- break;
- case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
- case BPF_S_ALU_XOR_X:
- seen |= SEEN_XREG;
- EMIT2(0x31, 0xd8); /* xor %ebx,%eax */
- break;
- case BPF_S_ALU_XOR_K: /* A ^= K; */
- if (K == 0)
- break;
- if (is_imm8(K))
- EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */
- else
- EMIT1_off32(0x35, K); /* xor imm32,%eax */
- break;
- case BPF_S_ALU_LSH_X: /* A <<= X; */
- seen |= SEEN_XREG;
- EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
- break;
- case BPF_S_ALU_LSH_K:
- if (K == 0)
- break;
- else if (K == 1)
- EMIT2(0xd1, 0xe0); /* shl %eax */
- else
- EMIT3(0xc1, 0xe0, K);
- break;
- case BPF_S_ALU_RSH_X: /* A >>= X; */
- seen |= SEEN_XREG;
- EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
- break;
- case BPF_S_ALU_RSH_K: /* A >>= K; */
- if (K == 0)
- break;
- else if (K == 1)
- EMIT2(0xd1, 0xe8); /* shr %eax */
- else
- EMIT3(0xc1, 0xe8, K);
- break;
- case BPF_S_ALU_NEG:
- EMIT2(0xf7, 0xd8); /* neg %eax */
- break;
- case BPF_S_RET_K:
- if (!K) {
- if (pc_ret0 == -1)
- pc_ret0 = i;
- CLEAR_A();
- } else {
- EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
- }
- /* fallinto */
- case BPF_S_RET_A:
- if (seen_or_pass0) {
- if (i != flen - 1) {
- EMIT_JMP(cleanup_addr - addrs[i]);
- break;
- }
- if (seen_or_pass0 & SEEN_XREG)
- EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
- EMIT1(0xc9); /* leaveq */
- }
- EMIT1(0xc3); /* ret */
- break;
- case BPF_S_MISC_TAX: /* X = A */
- seen |= SEEN_XREG;
- EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
- break;
- case BPF_S_MISC_TXA: /* A = X */
- seen |= SEEN_XREG;
- EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
- break;
- case BPF_S_LD_IMM: /* A = K */
- if (!K)
- CLEAR_A();
- else
- EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
- break;
- case BPF_S_LDX_IMM: /* X = K */
- seen |= SEEN_XREG;
- if (!K)
- CLEAR_X();
+ /* shifts */
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ switch (imm32) {
+ case 16:
+ /* emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ if (is_ereg(dst_reg))
+ EMIT1(0x41);
+ EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
+ break;
+ case 32:
+ /* emit 'bswap eax' to swap lower 4 bytes */
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0x0F);
else
- EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
- break;
- case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
- seen |= SEEN_MEM;
- EMIT3(0x8b, 0x45, 0xf0 - K*4);
- break;
- case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
- seen |= SEEN_XREG | SEEN_MEM;
- EMIT3(0x8b, 0x5d, 0xf0 - K*4);
- break;
- case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
- seen |= SEEN_MEM;
- EMIT3(0x89, 0x45, 0xf0 - K*4);
- break;
- case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
- seen |= SEEN_XREG | SEEN_MEM;
- EMIT3(0x89, 0x5d, 0xf0 - K*4);
- break;
- case BPF_S_LD_W_LEN: /* A = skb->len; */
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
- else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- break;
- case BPF_S_LDX_W_LEN: /* X = skb->len; */
- seen |= SEEN_XREG;
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov off8(%rdi),%ebx */
- EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
- else {
- EMIT2(0x8b, 0x9f);
- EMIT(offsetof(struct sk_buff, len), 4);
- }
- break;
- case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
- if (is_imm8(offsetof(struct sk_buff, protocol))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, protocol), 4);
- }
- EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
- break;
- case BPF_S_ANC_IFINDEX:
- if (is_imm8(offsetof(struct sk_buff, dev))) {
- /* movq off8(%rdi),%rax */
- EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
- } else {
- EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
- EMIT(offsetof(struct sk_buff, dev), 4);
- }
- EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
- EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
- EMIT(offsetof(struct net_device, ifindex), 4);
- break;
- case BPF_S_ANC_MARK:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
- if (is_imm8(offsetof(struct sk_buff, mark))) {
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
- } else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, mark), 4);
- }
- break;
- case BPF_S_ANC_RXHASH:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
- if (is_imm8(offsetof(struct sk_buff, hash))) {
- /* mov off8(%rdi),%eax */
- EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash));
- } else {
- EMIT2(0x8b, 0x87);
- EMIT(offsetof(struct sk_buff, hash), 4);
- }
- break;
- case BPF_S_ANC_QUEUE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
- if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, queue_mapping), 4);
- }
- break;
- case BPF_S_ANC_CPU:
-#ifdef CONFIG_SMP
- EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
- EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
-#else
- CLEAR_A();
-#endif
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dst_reg));
break;
- case BPF_S_ANC_VLAN_TAG:
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- if (is_imm8(offsetof(struct sk_buff, vlan_tci))) {
- /* movzwl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci));
- } else {
- EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
- EMIT(offsetof(struct sk_buff, vlan_tci), 4);
- }
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
- if (filter[i].code == BPF_S_ANC_VLAN_TAG) {
- EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */
- } else {
- EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */
- EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */
- }
- break;
- case BPF_S_ANC_PKTTYPE:
- {
- int off = pkt_type_offset();
-
- if (off < 0)
- goto out;
- if (is_imm8(off)) {
- /* movzbl off8(%rdi),%eax */
- EMIT4(0x0f, 0xb6, 0x47, off);
- } else {
- /* movbl off32(%rdi),%eax */
- EMIT3(0x0f, 0xb6, 0x87);
- EMIT(off, 4);
- }
- EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */
+ case 64:
+ /* emit 'bswap rax' to swap 8 bytes */
+ EMIT3(add_1mod(0x48, dst_reg), 0x0F,
+ add_1reg(0xC8, dst_reg));
break;
}
- case BPF_S_LD_W_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_word);
-common_load: seen |= SEEN_DATAREF;
- t_offset = func - (image + addrs[i]);
- EMIT1_off32(0xbe, K); /* mov imm32,%esi */
- EMIT1_off32(0xe8, t_offset); /* call */
- break;
- case BPF_S_LD_H_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_half);
- goto common_load;
- case BPF_S_LD_B_ABS:
- func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
- goto common_load;
- case BPF_S_LDX_B_MSH:
- func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
- seen |= SEEN_DATAREF | SEEN_XREG;
- t_offset = func - (image + addrs[i]);
- EMIT1_off32(0xbe, K); /* mov imm32,%esi */
- EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
- break;
- case BPF_S_LD_W_IND:
- func = sk_load_word;
-common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
- t_offset = func - (image + addrs[i]);
- if (K) {
- if (is_imm8(K)) {
- EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */
- } else {
- EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */
- EMIT(K, 4);
- }
- } else {
- EMIT2(0x89,0xde); /* mov %ebx,%esi */
- }
- EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
- break;
- case BPF_S_LD_H_IND:
- func = sk_load_half;
- goto common_load_ind;
- case BPF_S_LD_B_IND:
- func = sk_load_byte;
- goto common_load_ind;
- case BPF_S_JMP_JA:
- t_offset = addrs[i + K] - addrs[i];
- EMIT_JMP(t_offset);
- break;
- COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
- COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
- COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
- COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
- COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
- COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
- COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
- COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
-
-cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
- t_offset = addrs[i + filter[i].jt] - addrs[i];
-
- /* same targets, can avoid doing the test :) */
- if (filter[i].jt == filter[i].jf) {
- EMIT_JMP(t_offset);
- break;
- }
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ break;
+
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_B:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC6);
+ else
+ EMIT1(0xC6);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg))
+ EMIT3(0x66, 0x41, 0xC7);
+ else
+ EMIT2(0x66, 0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC7);
+ else
+ EMIT1(0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_DW:
+ EMIT2(add_1mod(0x48, dst_reg), 0xC7);
+
+st: if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
+
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ /* emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
+ else
+ EMIT1(0x88);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT1(0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_DW:
+ EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
+stx: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ /* emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_H:
+ /* emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_W:
+ /* emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ /* emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
+ldx: /* if insn->off == 0 we can save one extra byte, but
+ * special case of x86 r13 which always needs an offset
+ * is not worth the hassle
+ */
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
+ insn->off);
+ break;
+
+ /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* emit 'lock add dword ptr [rax + off], eax' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
+ else
+ EMIT2(0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
+xadd: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+ if (ctx->seen_ld_abs) {
+ EMIT2(0x41, 0x52); /* push %r10 */
+ EMIT2(0x41, 0x51); /* push %r9 */
+ /* need to adjust jmp offset, since
+ * pop %r9, pop %r10 take 4 bytes after call insn
+ */
+ jmp_offset += 4;
+ }
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, jmp_offset);
+ if (ctx->seen_ld_abs) {
+ EMIT2(0x41, 0x59); /* pop %r9 */
+ EMIT2(0x41, 0x5A); /* pop %r10 */
+ }
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ /* cmp dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* test dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* test dst_reg, imm32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+ EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ /* cmp dst_reg, imm8/32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
+
+emit_cond_jmp: /* convert BPF opcode to x86 */
+ switch (BPF_OP(insn->code)) {
+ case BPF_JEQ:
+ jmp_cond = X86_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = X86_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = X86_JA;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = X86_JAE;
+ break;
+ case BPF_JSGT:
+ /* signed '>', GT in x86 */
+ jmp_cond = X86_JG;
+ break;
+ case BPF_JSGE:
+ /* signed '>=', GE in x86 */
+ jmp_cond = X86_JGE;
+ break;
+ default: /* to silence gcc warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
- switch (filter[i].code) {
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JEQ_X:
- seen |= SEEN_XREG;
- EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
- break;
- case BPF_S_JMP_JSET_X:
- seen |= SEEN_XREG;
- EMIT2(0x85, 0xd8); /* test %ebx,%eax */
- break;
- case BPF_S_JMP_JEQ_K:
- if (K == 0) {
- EMIT2(0x85, 0xc0); /* test %eax,%eax */
- break;
- }
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGE_K:
- if (K <= 127)
- EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
+ case BPF_JMP | BPF_JA:
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (!jmp_offset)
+ /* optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+
+ case BPF_LD | BPF_IND | BPF_W:
+ func = sk_load_word;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_W:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
+common_load: ctx->seen_ld_abs = true;
+ jmp_offset = func - (image + addrs[i]);
+ if (!func || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ /* mov %esi, imm32 */
+ EMIT1_off32(0xBE, imm32);
+ } else {
+ /* mov %rsi, src_reg */
+ EMIT_mov(BPF_REG_2, src_reg);
+ if (imm32) {
+ if (is_imm8(imm32))
+ /* add %esi, imm8 */
+ EMIT3(0x83, 0xC6, imm32);
else
- EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
- break;
- case BPF_S_JMP_JSET_K:
- if (K <= 0xFF)
- EMIT2(0xa8, K); /* test imm8,%al */
- else if (!(K & 0xFFFF00FF))
- EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
- else if (K <= 0xFFFF) {
- EMIT2(0x66, 0xa9); /* test imm16,%ax */
- EMIT(K, 2);
- } else {
- EMIT1_off32(0xa9, K); /* test imm32,%eax */
- }
- break;
+ /* add %esi, imm32 */
+ EMIT2_off32(0x81, 0xC6, imm32);
}
- if (filter[i].jt != 0) {
- if (filter[i].jf && f_offset)
- t_offset += is_near(f_offset) ? 2 : 5;
- EMIT_COND_JMP(t_op, t_offset);
- if (filter[i].jf)
- EMIT_JMP(f_offset);
- break;
- }
- EMIT_COND_JMP(f_op, f_offset);
- break;
- default:
- /* hmm, too complex filter, give up with jit compiler */
- goto out;
}
- ilen = prog - temp;
- if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
- pr_err("bpb_jit_compile fatal error\n");
- kfree(addrs);
- module_free(NULL, header);
- return;
- }
- memcpy(image + proglen, temp, ilen);
+ /* skb pointer is in R6 (%rbx), it will be copied into
+ * %rdi if skb_copy_bits() call is necessary.
+ * sk_load_* helpers also use %r10 and %r9d.
+ * See bpf_jit.S
+ */
+ EMIT1_off32(0xE8, jmp_offset); /* call */
+ break;
+
+ case BPF_LD | BPF_IND | BPF_H:
+ func = sk_load_half;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_H:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
+ goto common_load;
+ case BPF_LD | BPF_IND | BPF_B:
+ func = sk_load_byte;
+ goto common_load;
+ case BPF_LD | BPF_ABS | BPF_B:
+ func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
+ goto common_load;
+
+ case BPF_JMP | BPF_EXIT:
+ if (i != insn_cnt - 1) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
}
- proglen += ilen;
- addrs[i] = proglen;
- prog = temp;
+ /* update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ /* mov rbx, qword ptr [rbp-X] */
+ EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+ /* mov r13, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+ /* mov r14, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+ /* mov r15, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ break;
+
+ default:
+ /* By design x64 JIT should support all BPF instructions
+ * This error will be seen if new instruction was added
+ * to interpreter, but not to JIT
+ * or if there is junk in sk_filter
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+ return -EINVAL;
}
- /* last bpf instruction is always a RET :
- * use it to give the cleanup instruction(s) addr
- */
- cleanup_addr = proglen - 1; /* ret */
- if (seen_or_pass0)
- cleanup_addr -= 1; /* leaveq */
- if (seen_or_pass0 & SEEN_XREG)
- cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
+ ilen = prog - temp;
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpf_jit_compile fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+void bpf_jit_compile(struct sk_filter *prog)
+{
+}
+
+void bpf_int_jit_compile(struct sk_filter *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ int proglen, oldproglen = 0;
+ struct jit_context ctx = {};
+ u8 *image = NULL;
+ int *addrs;
+ int pass;
+ int i;
+
+ if (!bpf_jit_enable)
+ return;
+
+ if (!prog || !prog->len)
+ return;
+
+ addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return;
+
+ /* Before first pass, make a rough estimation of addrs[]
+ * each bpf instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < prog->len; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ ctx.cleanup_addr = proglen;
+
+ for (pass = 0; pass < 10; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ if (proglen <= 0) {
+ image = NULL;
+ if (header)
+ module_free(NULL, header);
+ goto out;
+ }
if (image) {
if (proglen != oldproglen)
- pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen);
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
break;
}
if (proglen == oldproglen) {
@@ -766,17 +918,16 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
}
if (bpf_jit_enable > 1)
- bpf_jit_dump(flen, proglen, pass, image);
+ bpf_jit_dump(prog->len, proglen, 0, image);
if (image) {
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
- fp->bpf_func = (void *)image;
- fp->jited = 1;
+ prog->bpf_func = (void *)image;
+ prog->jited = 1;
}
out:
kfree(addrs);
- return;
}
static void bpf_jit_free_deferred(struct work_struct *work)
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index 097e7a7940d..af9307f2cc2 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -20,3 +20,4 @@ obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o
# MISC Devices
obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
new file mode 100644
index 00000000000..973cf3bfa9f
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -0,0 +1,72 @@
+/*
+ * platform_wdt.c: Watchdog platform library file
+ *
+ * (C) Copyright 2014 Intel Corporation
+ * Author: David Cohen <david.a.cohen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/intel-mid_wdt.h>
+#include <asm/intel-mid.h>
+#include <asm/io_apic.h>
+
+#define TANGIER_EXT_TIMER0_MSI 15
+
+static struct platform_device wdt_dev = {
+ .name = "intel_mid_wdt",
+ .id = -1,
+};
+
+static int tangier_probe(struct platform_device *pdev)
+{
+ int ioapic;
+ int irq;
+ struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
+ struct io_apic_irq_attr irq_attr = { 0 };
+
+ if (!pdata)
+ return -EINVAL;
+
+ irq = pdata->irq;
+ ioapic = mp_find_ioapic(irq);
+ if (ioapic >= 0) {
+ int ret;
+ irq_attr.ioapic = ioapic;
+ irq_attr.ioapic_pin = irq;
+ irq_attr.trigger = 1;
+ /* irq_attr.polarity = 0; -> Active high */
+ ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
+ if (ret)
+ return ret;
+ } else {
+ dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
+ irq);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct intel_mid_wdt_pdata tangier_pdata = {
+ .irq = TANGIER_EXT_TIMER0_MSI,
+ .probe = tangier_probe,
+};
+
+static int __init register_mid_wdt(void)
+{
+ if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) {
+ wdt_dev.dev.platform_data = &tangier_pdata;
+ return platform_device_register(&wdt_dev);
+ }
+
+ return -ENODEV;
+}
+
+rootfs_initcall(register_mid_wdt);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 9769df09403..61b04fe36e6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -9,18 +9,8 @@ VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
-vdso-install-$(VDSO64-y) += vdso.so
-vdso-install-$(VDSOX32-y) += vdsox32.so
-vdso-install-$(VDSO32-y) += $(vdso32-images)
-
-
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
-
-vobjs-$(VDSOX32-y) += $(vobjx32s-compat)
-
-# Filter out x32 objects.
-vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y))
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o
# files to link into kernel
obj-y += vma.o
@@ -34,7 +24,7 @@ vdso_img-$(VDSO32-y) += 32-sysenter
obj-$(VDSO32-y) += vdso32-setup.o
-vobjs := $(foreach F,$(vobj64s),$(obj)/$F)
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
$(obj)/vdso.o: $(obj)/vdso.so
@@ -76,7 +66,8 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
- -fno-omit-frame-pointer -foptimize-sibling-calls
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -104,7 +95,13 @@ VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
-Wl,-z,max-page-size=4096 \
-Wl,-z,common-page-size=4096
-vobjx32s-y := $(vobj64s:.o=-x32.o)
+# 64-bit objects to re-brand as x32
+vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
+
+# same thing, but in the output directory
vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
# Convert 64bit object file to x32 for x32 vDSO.
@@ -137,7 +134,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
@@ -153,11 +150,13 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/vdso-fakesections.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
@@ -172,19 +171,34 @@ quiet_cmd_vdso = VDSO $@
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
- -Wl,-Bsymbolic $(LTO_CFLAGS)
+ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
-# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
#
-quiet_cmd_vdso_install = INSTALL $@
- cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
+quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
+
+$(MODLIB)/vdso: FORCE
@mkdir -p $(MODLIB)/vdso
+
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
$(call cmd,vdso_install)
-PHONY += vdso_install $(vdso-install-y)
-vdso_install: $(vdso-install-y)
+PHONY += vdso_install $(vdso_img_insttargets)
+vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index b2e4f493e5b..9793322751e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -11,9 +11,6 @@
* Check with readelf after changing.
*/
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c
new file mode 100644
index 00000000000..aa5fbfab20a
--- /dev/null
+++ b/arch/x86/vdso/vdso-fakesections.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2014 Andy Lutomirski
+ * Subject to the GNU Public License, v.2
+ *
+ * String table for loadable section headers. See vdso2c.h for why
+ * this exists.
+ */
+
+const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) =
+ ".hash\0"
+ ".dynsym\0"
+ ".dynstr\0"
+ ".gnu.version\0"
+ ".gnu.version_d\0"
+ ".dynamic\0"
+ ".rodata\0"
+ ".fake_shstrtab\0" /* Yay, self-referential code. */
+ ".note\0"
+ ".eh_frame_hdr\0"
+ ".eh_frame\0"
+ ".text";
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 2ec72f651eb..9197544eea9 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -6,6 +6,16 @@
* This script controls its layout.
*/
+#if defined(BUILD_VDSO64)
+# define SHDR_SIZE 64
+#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
+# define SHDR_SIZE 40
+#else
+# error unknown VDSO target
+#endif
+
+#define NUM_FAKE_SHDRS 13
+
SECTIONS
{
. = SIZEOF_HEADERS;
@@ -18,36 +28,53 @@ SECTIONS
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+
+ /*
+ * Ideally this would live in a C file, but that won't
+ * work cleanly for x32 until we start building the x32
+ * C code using an x32 toolchain.
+ */
+ VDSO_FAKE_SECTION_TABLE_START = .;
+ . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
+ VDSO_FAKE_SECTION_TABLE_END = .;
+ } :text
+
+ .fake_shstrtab : { *(.fake_shstrtab) } :text
+
+
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
- .dynamic : { *(.dynamic) } :text :dynamic
-
- .rodata : { *(.rodata*) } :text
- .data : {
- *(.data*)
- *(.sdata*)
- *(.got.plt) *(.got)
- *(.gnu.linkonce.d.*)
- *(.bss*)
- *(.dynbss*)
- *(.gnu.linkonce.b.*)
- }
-
- .altinstructions : { *(.altinstructions) }
- .altinstr_replacement : { *(.altinstr_replacement) }
/*
- * Align the actual code well away from the non-instruction data.
- * This is the best thing for the I-cache.
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
*/
- . = ALIGN(0x100);
.text : { *(.text*) } :text =0x90909090,
/*
+ * At the end so that eu-elflint stays happy when vdso2c strips
+ * these. A better implementation would avoid allocating space
+ * for these.
+ */
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ /*
* The remainder of the vDSO consists of special pages that are
* shared between the kernel and userspace. It needs to be at the
* end so that it doesn't overlap the mapping of the actual
@@ -75,6 +102,7 @@ SECTIONS
/DISCARD/ : {
*(.discard)
*(.discard.*)
+ *(__bug_table)
}
}
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 75e3404c83b..6807932643c 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSO64
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 450ac6eaf61..238dbe82776 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -23,6 +23,8 @@ enum {
sym_vvar_page,
sym_hpet_page,
sym_end_mapping,
+ sym_VDSO_FAKE_SECTION_TABLE_START,
+ sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
@@ -30,15 +32,26 @@ const int special_pages[] = {
sym_hpet_page,
};
-char const * const required_syms[] = {
- [sym_vvar_page] = "vvar_page",
- [sym_hpet_page] = "hpet_page",
- [sym_end_mapping] = "end_mapping",
- "VDSO32_NOTE_MASK",
- "VDSO32_SYSENTER_RETURN",
- "__kernel_vsyscall",
- "__kernel_sigreturn",
- "__kernel_rt_sigreturn",
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_hpet_page] = {"hpet_page", true},
+ [sym_end_mapping] = {"end_mapping", true},
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+ "VDSO_FAKE_SECTION_TABLE_START", false
+ },
+ [sym_VDSO_FAKE_SECTION_TABLE_END] = {
+ "VDSO_FAKE_SECTION_TABLE_END", false
+ },
+ {"VDSO32_NOTE_MASK", true},
+ {"VDSO32_SYSENTER_RETURN", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
@@ -54,7 +67,7 @@ static void fail(const char *format, ...)
}
/*
- * Evil macros to do a little-endian read.
+ * Evil macros for little-endian reads and writes
*/
#define GLE(x, bits, ifnot) \
__builtin_choose_expr( \
@@ -62,45 +75,42 @@ static void fail(const char *format, ...)
(__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
extern void bad_get_le(void);
-#define LAST_LE(x) \
+#define LAST_GLE(x) \
__builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
#define GET_LE(x) \
- GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_LE(x))))
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
-#define BITS 64
-#define GOFUNC go64
-#define Elf_Ehdr Elf64_Ehdr
-#define Elf_Shdr Elf64_Shdr
-#define Elf_Phdr Elf64_Phdr
-#define Elf_Sym Elf64_Sym
-#define Elf_Dyn Elf64_Dyn
+#define BITSFUNC3(name, bits) name##bits
+#define BITSFUNC2(name, bits) BITSFUNC3(name, bits)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
-
-#define BITS 32
-#define GOFUNC go32
-#define Elf_Ehdr Elf32_Ehdr
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Phdr Elf32_Phdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Dyn Elf32_Dyn
+#undef ELF_BITS
+
+#define ELF_BITS 32
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
+#undef ELF_BITS
static void go(void *addr, size_t len, FILE *outfile, const char *name)
{
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index 8a074637a57..11b65d4f941 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -4,21 +4,139 @@
* are built for 32-bit userspace.
*/
-static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
+/*
+ * We're writing a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+struct BITSFUNC(fake_sections)
+{
+ ELF(Shdr) *table;
+ unsigned long table_offset;
+ int count, max_count;
+
+ int in_shstrndx;
+ unsigned long shstr_offset;
+ const char *shstrtab;
+ size_t shstrtab_len;
+
+ int out_shstrndx;
+};
+
+static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out,
+ const char *name)
+{
+ const char *outname = out->shstrtab;
+ while (outname - out->shstrtab < out->shstrtab_len) {
+ if (!strcmp(name, outname))
+ return (outname - out->shstrtab) + out->shstr_offset;
+ outname += strlen(outname) + 1;
+ }
+
+ if (*name)
+ printf("Warning: could not find output name \"%s\"\n", name);
+ return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */
+}
+
+static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out)
+{
+ if (!out->in_shstrndx)
+ fail("didn't find the fake shstrndx\n");
+
+ memset(out->table, 0, out->max_count * sizeof(ELF(Shdr)));
+
+ if (out->max_count < 1)
+ fail("we need at least two fake output sections\n");
+
+ PUT_LE(&out->table[0].sh_type, SHT_NULL);
+ PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, ""));
+
+ out->count = 1;
+}
+
+static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out,
+ int in_idx, const ELF(Shdr) *in,
+ const char *name)
+{
+ uint64_t flags = GET_LE(&in->sh_flags);
+
+ bool copy = flags & SHF_ALLOC &&
+ (GET_LE(&in->sh_size) ||
+ (GET_LE(&in->sh_type) != SHT_RELA &&
+ GET_LE(&in->sh_type) != SHT_REL)) &&
+ strcmp(name, ".altinstructions") &&
+ strcmp(name, ".altinstr_replacement");
+
+ if (!copy)
+ return;
+
+ if (out->count >= out->max_count)
+ fail("too many copied sections (max = %d)\n", out->max_count);
+
+ if (in_idx == out->in_shstrndx)
+ out->out_shstrndx = out->count;
+
+ out->table[out->count] = *in;
+ PUT_LE(&out->table[out->count].sh_name,
+ BITSFUNC(find_shname)(out, name));
+
+ /* elfutils requires that a strtab have the correct type. */
+ if (!strcmp(name, ".fake_shstrtab"))
+ PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB);
+
+ out->count++;
+}
+
+static void BITSFUNC(go)(void *addr, size_t len,
+ FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long data_size;
- Elf_Ehdr *hdr = (Elf_Ehdr *)addr;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr;
int i;
unsigned long j;
- Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
- Elf_Dyn *dyn = 0, *dyn_end = 0;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
uint64_t syms[NSYMS] = {};
- Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(&hdr->e_phoff));
+ struct BITSFUNC(fake_sections) fake_sections = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
@@ -49,7 +167,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
- if (tag == DT_REL || tag == DT_RELSZ ||
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
@@ -59,7 +177,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
secstrings = addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
- Elf_Shdr *sh = addr + GET_LE(&hdr->e_shoff) +
+ ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
@@ -80,20 +198,62 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
- Elf_Sym *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
+ ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *name = addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
+
for (k = 0; k < NSYMS; k++) {
- if (!strcmp(name, required_syms[k])) {
+ if (!strcmp(name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
- required_syms[k]);
+ required_syms[k].name);
}
syms[k] = GET_LE(&sym->st_value);
}
}
+
+ if (!strcmp(name, "fake_shstrtab")) {
+ ELF(Shdr) *sh;
+
+ fake_sections.in_shstrndx = GET_LE(&sym->st_shndx);
+ fake_sections.shstrtab = addr + GET_LE(&sym->st_value);
+ fake_sections.shstrtab_len = GET_LE(&sym->st_size);
+ sh = addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) *
+ fake_sections.in_shstrndx;
+ fake_sections.shstr_offset = GET_LE(&sym->st_value) -
+ GET_LE(&sh->sh_addr);
+ }
+ }
+
+ /* Build the output section table. */
+ if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] ||
+ !syms[sym_VDSO_FAKE_SECTION_TABLE_END])
+ fail("couldn't find fake section table\n");
+ if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
+ syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr)))
+ fail("fake section table size isn't a multiple of sizeof(Shdr)\n");
+ fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START];
+ fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START];
+ fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] -
+ syms[sym_VDSO_FAKE_SECTION_TABLE_START]) /
+ sizeof(ELF(Shdr));
+
+ BITSFUNC(init_sections)(&fake_sections);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ BITSFUNC(copy_section)(&fake_sections, i, sh,
+ secstrings + GET_LE(&sh->sh_name));
}
+ if (!fake_sections.out_shstrndx)
+ fail("didn't generate shstrndx?!?\n");
+
+ PUT_LE(&hdr->e_shoff, fake_sections.table_offset);
+ PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr)));
+ PUT_LE(&hdr->e_shnum, fake_sections.count);
+ PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx);
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
@@ -102,22 +262,17 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
if (syms[i] % 4096)
fail("%s must be a multiple of 4096\n",
- required_syms[i]);
+ required_syms[i].name);
if (syms[i] < data_size)
fail("%s must be after the text mapping\n",
- required_syms[i]);
+ required_syms[i].name);
if (syms[sym_end_mapping] < syms[i] + 4096)
- fail("%s overruns end_mapping\n", required_syms[i]);
+ fail("%s overruns end_mapping\n",
+ required_syms[i].name);
}
if (syms[sym_end_mapping] % 4096)
fail("end_mapping must be a multiple of 4096\n");
- /* Remove sections. */
- hdr->e_shoff = 0;
- hdr->e_shentsize = 0;
- hdr->e_shnum = 0;
- hdr->e_shstrndx = htole16(SHN_UNDEF);
-
if (!name) {
fwrite(addr, load_size, 1, outfile);
return;
@@ -155,9 +310,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
(unsigned long)GET_LE(&alt_sec->sh_size));
}
for (i = 0; i < NSYMS; i++) {
- if (syms[i])
+ if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n",
- required_syms[i], syms[i]);
+ required_syms[i].name, syms[i]);
}
fprintf(outfile, "};\n");
}
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c
new file mode 100644
index 00000000000..541468e2526
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso-fakesections.c
@@ -0,0 +1 @@
+#include "../vdso-fakesections.c"
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
index 46b991b578a..697c11ece90 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSOX32
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index e1513c47872..5a5176de8d0 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -62,6 +62,9 @@ struct linux_binprm;
Only used for the 64-bit and x32 vdsos. */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
unsigned long addr, end;
unsigned offset;
end = (start + PMD_SIZE - 1) & PMD_MASK;
@@ -83,6 +86,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = align_vdso_addr(addr);
return addr;
+#endif
}
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f17b29210ac..ffb101e4573 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1537,7 +1537,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (!xen_pvh_domain())
pv_cpu_ops = xen_cpu_ops;
- x86_init.resources.memory_setup = xen_memory_setup;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 821a11ada59..2e555163c2f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,7 +27,6 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
-#include "mmu.h"
#include "xen-ops.h"
#include "vdso.h"
@@ -82,9 +81,6 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
@@ -107,7 +103,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
.domid = DOMID_SELF
};
unsigned long len = 0;
- int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
unsigned long pfn;
int ret;
@@ -121,7 +116,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
continue;
frame = mfn;
} else {
- if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
+ if (mfn != INVALID_P2M_ENTRY)
continue;
frame = pfn;
}
@@ -159,13 +154,6 @@ static unsigned long __init xen_do_chunk(unsigned long start,
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
- /*
- * Xen already ballooned out the E820 non RAM regions for us
- * and set them up properly in EPT.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return end - start;
-
return xen_do_chunk(start, end, true);
}
@@ -234,13 +222,7 @@ static void __init xen_set_identity_and_release_chunk(
* (except for the ISA region which must be 1:1 mapped) to
* release the refcounts (in Xen) on the original frames.
*/
-
- /*
- * PVH E820 matches the hypervisor's P2M which means we need to
- * account for the proper values of *release and *identity.
- */
- for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
- pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
pte_t pte = __pte_ma(0);
if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -518,6 +500,35 @@ char * __init xen_memory_setup(void)
}
/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
+{
+ static struct e820entry map[E820MAX] __initdata;
+
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
+}
+
+/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
@@ -590,13 +601,7 @@ void xen_enable_syscall(void)
}
#endif /* CONFIG_X86_64 */
}
-void xen_enable_nmi(void)
-{
-#ifdef CONFIG_X86_64
- if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
- BUG();
-#endif
-}
+
void __init xen_pvmmu_arch_setup(void)
{
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -611,7 +616,6 @@ void __init xen_pvmmu_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
- xen_enable_nmi();
}
/* This function is not called for HVM domains */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c834d4b231f..97d87659f77 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -36,6 +36,7 @@ void xen_mm_unpin_all(void);
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);