diff options
Diffstat (limited to 'arch/x86')
48 files changed, 1139 insertions, 846 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 602f57e590b..33f71b01fd2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -79,11 +79,14 @@ else UTS_MACHINE := x86_64 CHECKFLAGS += -D__x86_64__ -m64 + biarch := -m64 KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 # Don't autogenerate traditional x87, MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387 + KBUILD_CFLAGS += -mno-mmx -mno-sse + KBUILD_CFLAGS += $(call cc-option,-mno-80387) + KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) @@ -250,8 +253,8 @@ archclean: PHONY += kvmconfig kvmconfig: $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) oldconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config + $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index abb9eba61b5..dbe8dd2fe24 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -80,7 +80,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 17684615374..57ab74df7ee 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -354,7 +354,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index e6fd8a026c7..cd00e177449 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -184,8 +184,15 @@ static inline unsigned add32_with_carry(unsigned a, unsigned b) asm("addl %2,%0\n\t" "adcl $0,%0" : "=r" (a) - : "0" (a), "r" (b)); + : "0" (a), "rm" (b)); return a; } +#define HAVE_ARCH_CSUM_ADD +static inline __wsum csum_add(__wsum csum, __wsum addend) +{ + return (__force __wsum)add32_with_carry((__force unsigned)csum, + (__force unsigned)addend); +} + #endif /* _ASM_X86_CHECKSUM_64_H */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index b18df579c0e..36f7125945e 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,6 +63,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern int boot_hpet_disable; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a8091216963..68c05398bba 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c827ace3121..fcf2b3ae1bf 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -384,7 +384,7 @@ #define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 #define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 -#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT); +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) #define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3a2ae4c8894..31368207837 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -31,7 +31,7 @@ static char temp_stack[4096]; * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. */ -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state) +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ad4658de70..992060e0989 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2189,7 +2189,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -asmlinkage void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -3425,6 +3425,11 @@ int get_nr_irqs_gsi(void) return nr_irqs_gsi; } +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < nr_irqs_gsi ? nr_irqs_gsi : from; +} + int __init arch_probe_nr_irqs(void) { int nr; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index eeee23ff75e..68317c80de7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -598,7 +598,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; - unsigned long *v; this_cpu_inc(mce_poll_count); @@ -618,8 +617,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; - v = &get_cpu_var(mce_polled_error); - set_bit(0, v); + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 3bdb95ae8c4..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void) int bank; u64 val; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); owned = __get_cpu_var(mce_banks_owned); for_each_set_bit(bank, owned, MAX_NR_BANKS) { rdmsrl(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static bool cmci_storm_detect(void) @@ -211,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -266,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -316,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -360,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d921b7ee659..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -429,14 +429,14 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index aa333d96688..adb02aa62af 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 059218ed520..619f7699487 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -59,7 +59,7 @@ #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ /* Clients have PP0, PKG */ @@ -72,6 +72,12 @@ 1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -425,6 +431,24 @@ static struct attribute *rapl_events_cln_attr[] = { NULL, }; +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -511,6 +535,7 @@ static int rapl_cpu_prepare(int cpu) struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; + u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -518,6 +543,10 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; @@ -531,8 +560,7 @@ static int rapl_cpu_prepare(int cpu) * * we cache in local PMU instance */ - rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); - pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -631,11 +659,14 @@ static int __init rapl_pmu_init(void) switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; case 45: /* Sandy Bridge-EP */ case 62: /* IvyTown */ rapl_cntr_mask = RAPL_IDX_SRV; @@ -650,7 +681,9 @@ static int __init rapl_pmu_init(void) cpu_notifier_register_begin(); for_each_online_cpu(cpu) { - rapl_cpu_prepare(cpu); + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; rapl_cpu_init(cpu); } @@ -673,6 +706,7 @@ static int __init rapl_pmu_init(void) hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); +out: cpu_notifier_register_done(); return 0; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 384df5105fb..136ac74dee8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,6 +27,7 @@ static int __init x86_rdrand_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); return 1; } __setup("nordrand", x86_rdrand_setup); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b0cc3809723..6cda0baeac9 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/hpet.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/irq_remapping.h> @@ -240,7 +241,7 @@ static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_s return base; } -#define KB(x) ((x) * 1024) +#define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) #define GB(x) (MB (KB (x))) @@ -530,6 +531,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func) } } +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = 1; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -567,6 +577,12 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, + /* + * HPET on current version of Baytrail platform has accuracy + * problems, disable it for now: + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c61a14a4a31..d6c1b983699 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -asmlinkage void __init i386_start_kernel(void) +asmlinkage __visible void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 85126ccbdf6..068054f4bf2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -asmlinkage void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8d80ae01160..4177bfbc80b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -88,7 +88,7 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -static int boot_hpet_disable; +int boot_hpet_disable; int hpet_force_user; static int hpet_verbose; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f968287..61b17dc2c27 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -897,9 +897,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - switch (kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); /* * We are here because the instruction being single * stepped caused a page fault. We reset the current @@ -914,9 +915,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) else reset_current_kprobe(); preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* * We increment the nmissed count for accounting, * we can also use npre/npostfault count for accounting @@ -945,10 +945,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * fixup routine could not handle it, * Let do_page_fault() fix it. */ - break; - default: - break; } + return 0; } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebd..dcbbaa165bd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,6 +20,8 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +int sysctl_ldt16 = 0; + #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -234,7 +236,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * IRET leaking the high bits of the kernel stack address. */ #ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !sysctl_ldt16) { error = -EINVAL; goto out_unlock; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f93d0..898d077617a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 654b46574b9..52b1157c53e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -114,8 +114,8 @@ EXPORT_SYMBOL(machine_real_restart); */ static int __init set_pci_reboot(const struct dmi_system_id *d) { - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; pr_info("%s series board detected. Selecting %s-method for reboots.\n", d->ident, "PCI"); } @@ -191,6 +191,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + /* Dell */ { /* Handle problems with rebooting on Dell DXP061 */ .callback = set_bios_reboot, @@ -458,20 +468,23 @@ void __attribute__((weak)) mach_reboot_fixups(void) } /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: * * 1) If the FADT has the ACPI reboot register flag set, try it * 2) If still alive, write to the keyboard controller * 3) If still alive, write to the ACPI reboot register again * 4) If still alive, write to the keyboard controller again * 5) If still alive, call the EFI runtime service to reboot - * 6) If still alive, write to the PCI IO port 0xCF9 to reboot - * 7) If still alive, inform BIOS to do a proper reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (7) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. */ static void native_machine_emergency_restart(void) { @@ -492,6 +505,11 @@ static void native_machine_emergency_restart(void) for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + case BOOT_KBD: mach_reboot_fixups(); /* For board specific fixups */ @@ -509,43 +527,29 @@ static void native_machine_emergency_restart(void) } break; - case BOOT_TRIPLE: - load_idt(&no_idt); - __asm__ __volatile__("int3"); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_KBD; - break; - - case BOOT_BIOS: - machine_real_restart(MRR_BIOS); - - /* We're probably dead after this, but... */ - reboot_type = BOOT_TRIPLE; - break; - - case BOOT_ACPI: - acpi_reboot(); - reboot_type = BOOT_KBD; - break; - case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode == REBOOT_WARM ? EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; break; - case BOOT_CF9: + case BOOT_CF9_FORCE: port_cf9_safe = true; /* Fall through */ - case BOOT_CF9_COND: + case BOOT_CF9_SAFE: if (port_cf9_safe) { - u8 reboot_code = reboot_mode == REBOOT_WARM ? - 0x06 : 0x0E; + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; u8 cf9 = inb(0xcf9) & ~reboot_code; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); @@ -553,7 +557,15 @@ static void native_machine_emergency_restart(void) outb(cf9|reboot_code, 0xcf9); udelay(50); } - reboot_type = BOOT_BIOS; + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + load_idt(&no_idt); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; break; } } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7c3a5a61f2e..be8e1bde07a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) * this function calls the 'stop' function on all other CPUs in the system. */ -asmlinkage void smp_reboot_interrupt(void) +asmlinkage __visible void smp_reboot_interrupt(void) { ack_APIC_irq(); irq_enter(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c6..f73b5d435bd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -357,7 +357,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -601,11 +601,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index f6584a90aba..b99b9ad8540 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,6 +26,9 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 +/* Flag below is initialized once during vSMP PCI initialization. */ +static int irq_routing_comply = 1; + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -33,7 +36,7 @@ * and vice versa. */ -asmlinkage unsigned long vsmp_save_fl(void) +asmlinkage __visible unsigned long vsmp_save_fl(void) { unsigned long flags = native_save_fl(); @@ -53,7 +56,7 @@ __visible void vsmp_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); -asmlinkage void vsmp_irq_disable(void) +asmlinkage __visible void vsmp_irq_disable(void) { unsigned long flags = native_save_fl(); @@ -61,7 +64,7 @@ asmlinkage void vsmp_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); -asmlinkage void vsmp_irq_enable(void) +asmlinkage __visible void vsmp_irq_enable(void) { unsigned long flags = native_save_fl(); @@ -101,6 +104,10 @@ static void __init set_vsmp_pv_ops(void) #ifdef CONFIG_SMP if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); + + /* Interrupt routing set to ignore */ + irq_routing_comply = 0; + #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -218,7 +225,9 @@ static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - apic->vector_allocation_domain = fill_vector_allocation_domain; + + if (!irq_routing_comply) + apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index f9c6e56e14b..9531fbb123b 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -43,7 +43,7 @@ void update_vsyscall(struct timekeeper *tk) vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec << tk->shift); while (vdata->monotonic_time_snsec >= (((u64)NSEC_PER_SEC) << tk->shift)) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f68c583192..33e8c028842 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -503,7 +503,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static const unsigned long shadow_read_only_fields[] = { +static unsigned long shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -526,10 +526,10 @@ static const unsigned long shadow_read_only_fields[] = { GUEST_LINEAR_ADDRESS, GUEST_PHYSICAL_ADDRESS }; -static const int max_shadow_read_only_fields = +static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static const unsigned long shadow_read_write_fields[] = { +static unsigned long shadow_read_write_fields[] = { GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -558,7 +558,7 @@ static const unsigned long shadow_read_write_fields[] = { HOST_FS_SELECTOR, HOST_GS_SELECTOR }; -static const int max_shadow_read_write_fields = +static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static const unsigned short vmcs_field_to_offset_table[] = { @@ -3009,6 +3009,41 @@ static void free_kvm_area(void) } } +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + /* No checks for read only fields yet */ + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + switch (shadow_read_write_fields[i]) { + case GUEST_BNDCFGS: + if (!vmx_mpx_supported()) + continue; + break; + default: + break; + } + + if (j < i) + shadow_read_write_fields[j] = + shadow_read_write_fields[i]; + j++; + } + max_shadow_read_write_fields = j; + + /* shadowed fields guest access without vmexit */ + for (i = 0; i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], + vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], + vmx_vmread_bitmap); + } + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], + vmx_vmread_bitmap); +} + static __init int alloc_kvm_area(void) { int cpu; @@ -3039,6 +3074,8 @@ static __init int hardware_setup(void) enable_vpid = 0; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -8803,14 +8840,6 @@ static int __init vmx_init(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* shadowed read/write fields */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); - } - /* shadowed read only fields */ - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b8fc0b792b..b6c0bacca9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -280,7 +280,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ad1fb5f5392..aae94132bc2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next) * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ -asmlinkage unsigned long lguest_save_fl(void) +asmlinkage __visible unsigned long lguest_save_fl(void) { return lguest_data.irq_enabled; } /* Interrupts go off... */ -asmlinkage void lguest_irq_disable(void) +asmlinkage __visible void lguest_irq_disable(void) { lguest_data.irq_enabled = 0; } diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index db9db446b71..43623739c7c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -76,7 +76,7 @@ static inline int __flip_bit(u32 msr, u8 bit, bool set) if (m1.q == m.q) return 0; - err = msr_write(msr, &m); + err = msr_write(msr, &m1); if (err) return err; diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a5449089cd9..9e6545f269e 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -302,7 +302,7 @@ static struct { 0x242 in div_Xsig.S */ -asmlinkage void FPU_exception(int n) +asmlinkage __visible void FPU_exception(int n) { int i, int_type; @@ -492,7 +492,7 @@ int real_2op_NaN(FPU_REG const *b, u_char tagb, /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ -asmlinkage int arith_invalid(int deststnr) +asmlinkage __visible int arith_invalid(int deststnr) { EXCEPTION(EX_Invalid); @@ -507,7 +507,7 @@ asmlinkage int arith_invalid(int deststnr) } /* Divide a finite number by zero */ -asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) +asmlinkage __visible int FPU_divide_by_zero(int deststnr, u_char sign) { FPU_REG *dest = &st(deststnr); int tag = TAG_Valid; @@ -539,7 +539,7 @@ int set_precision_flag(int flags) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_up(void) +asmlinkage __visible void set_precision_flag_up(void) { if (control_word & CW_Precision) partial_status |= (SW_Precision | SW_C1); /* The masked response */ @@ -548,7 +548,7 @@ asmlinkage void set_precision_flag_up(void) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_down(void) +asmlinkage __visible void set_precision_flag_down(void) { if (control_word & CW_Precision) { /* The masked response */ partial_status &= ~SW_C1; @@ -557,7 +557,7 @@ asmlinkage void set_precision_flag_down(void) EXCEPTION(EX_Precision); } -asmlinkage int denormal_operand(void) +asmlinkage __visible int denormal_operand(void) { if (control_word & CW_Denormal) { /* The masked response */ partial_status |= SW_Denorm_Op; @@ -568,7 +568,7 @@ asmlinkage int denormal_operand(void) } } -asmlinkage int arith_overflow(FPU_REG *dest) +asmlinkage __visible int arith_overflow(FPU_REG *dest) { int tag = TAG_Valid; @@ -596,7 +596,7 @@ asmlinkage int arith_overflow(FPU_REG *dest) } -asmlinkage int arith_underflow(FPU_REG *dest) +asmlinkage __visible int arith_underflow(FPU_REG *dest) { int tag = TAG_Valid; diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 01495755701..6440221ced0 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -12,13 +12,16 @@ /* * Calling convention : - * rdi : skb pointer + * rbx : skb pointer (callee saved) * esi : offset of byte(s) to fetch in skb (can be scratched) - * r8 : copy of skb->data + * r10 : copy of skb->data * r9d : hlen = skb->len - skb->data_len */ -#define SKBDATA %r8 +#define SKBDATA %r10 #define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ +#define MAX_BPF_STACK (512 /* from filter.h */ + \ + 32 /* space for rbx,r13,r14,r15 */ + \ + 8 /* space for skb_copy_bits */) sk_load_word: .globl sk_load_word @@ -68,53 +71,31 @@ sk_load_byte_positive_offset: movzbl (SKBDATA,%rsi),%eax ret -/** - * sk_load_byte_msh - BPF_S_LDX_B_MSH helper - * - * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) - * Must preserve A accumulator (%eax) - * Inputs : %esi is the offset value - */ -sk_load_byte_msh: - .globl sk_load_byte_msh - test %esi,%esi - js bpf_slow_path_byte_msh_neg - -sk_load_byte_msh_positive_offset: - .globl sk_load_byte_msh_positive_offset - cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ - jle bpf_slow_path_byte_msh - movzbl (SKBDATA,%rsi),%ebx - and $15,%bl - shl $2,%bl - ret - /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ - push %rdi; /* save skb */ \ + mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ mov $LEN,%ecx; /* len */ \ - lea -12(%rbp),%rdx; \ + lea - MAX_BPF_STACK + 32(%rbp),%rdx; \ call skb_copy_bits; \ test %eax,%eax; \ pop SKBDATA; \ - pop %r9; \ - pop %rdi + pop %r9; bpf_slow_path_word: bpf_slow_path_common(4) js bpf_error - mov -12(%rbp),%eax + mov - MAX_BPF_STACK + 32(%rbp),%eax bswap %eax ret bpf_slow_path_half: bpf_slow_path_common(2) js bpf_error - mov -12(%rbp),%ax + mov - MAX_BPF_STACK + 32(%rbp),%ax rol $8,%ax movzwl %ax,%eax ret @@ -122,21 +103,11 @@ bpf_slow_path_half: bpf_slow_path_byte: bpf_slow_path_common(1) js bpf_error - movzbl -12(%rbp),%eax - ret - -bpf_slow_path_byte_msh: - xchg %eax,%ebx /* dont lose A , X is about to be scratched */ - bpf_slow_path_common(1) - js bpf_error - movzbl -12(%rbp),%eax - and $15,%al - shl $2,%al - xchg %eax,%ebx + movzbl - MAX_BPF_STACK + 32(%rbp),%eax ret #define sk_negative_common(SIZE) \ - push %rdi; /* save skb */ \ + mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ @@ -145,10 +116,8 @@ bpf_slow_path_byte_msh: test %rax,%rax; \ pop SKBDATA; \ pop %r9; \ - pop %rdi; \ jz bpf_error - bpf_slow_path_word_neg: cmp SKF_MAX_NEG_OFF, %esi /* test range */ jl bpf_error /* offset lower -> error */ @@ -179,22 +148,12 @@ sk_load_byte_negative_offset: movzbl (%rax), %eax ret -bpf_slow_path_byte_msh_neg: - cmp SKF_MAX_NEG_OFF, %esi - jl bpf_error -sk_load_byte_msh_negative_offset: - .globl sk_load_byte_msh_negative_offset - xchg %eax,%ebx /* dont lose A , X is about to be scratched */ - sk_negative_common(1) - movzbl (%rax),%eax - and $15,%al - shl $2,%al - xchg %eax,%ebx - ret - bpf_error: # force a return 0 from jit handler - xor %eax,%eax - mov -8(%rbp),%rbx + xor %eax,%eax + mov - MAX_BPF_STACK(%rbp),%rbx + mov - MAX_BPF_STACK + 8(%rbp),%r13 + mov - MAX_BPF_STACK + 16(%rbp),%r14 + mov - MAX_BPF_STACK + 24(%rbp),%r15 leaveq ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index dc017735bb9..080f3f071bb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,6 +1,7 @@ /* bpf_jit_comp.c : BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) + * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,28 +15,16 @@ #include <linux/if_vlan.h> #include <linux/random.h> -/* - * Conventions : - * EAX : BPF A accumulator - * EBX : BPF X accumulator - * RDI : pointer to skb (first argument given to JIT function) - * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) - * ECX,EDX,ESI : scratch registers - * r9d : skb->len - skb->data_len (headlen) - * r8 : skb->data - * -8(RBP) : saved RBX value - * -16(RBP)..-80(RBP) : BPF_MEMWORDS values - */ int bpf_jit_enable __read_mostly; /* * assembly code in arch/x86/net/bpf_jit.S */ -extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; -extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[]; +extern u8 sk_load_byte_positive_offset[]; extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; -extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[]; +extern u8 sk_load_byte_negative_offset[]; static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -56,30 +45,44 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) -#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0) - -#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */ -#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */ +#define EMIT1_off32(b1, off) \ + do {EMIT1(b1); EMIT(off, 4); } while (0) +#define EMIT2_off32(b1, b2, off) \ + do {EMIT2(b1, b2); EMIT(off, 4); } while (0) +#define EMIT3_off32(b1, b2, b3, off) \ + do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) +#define EMIT4_off32(b1, b2, b3, b4, off) \ + do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) static inline bool is_imm8(int value) { return value <= 127 && value >= -128; } -static inline bool is_near(int offset) +static inline bool is_simm32(s64 value) { - return offset <= 127 && offset >= -128; + return value == (s64) (s32) value; } -#define EMIT_JMP(offset) \ -do { \ - if (offset) { \ - if (is_near(offset)) \ - EMIT2(0xeb, offset); /* jmp .+off8 */ \ - else \ - EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \ - } \ -} while (0) +/* mov A, X */ +#define EMIT_mov(A, X) \ + do {if (A != X) \ + EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X)); \ + } while (0) + +static int bpf_size_to_x86_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 4; /* imm32 */ + else + return 0; +} /* list of x86 cond jumps opcodes (. + s8) * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) @@ -90,27 +93,8 @@ do { \ #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 - -#define EMIT_COND_JMP(op, offset) \ -do { \ - if (is_near(offset)) \ - EMIT2(op, offset); /* jxx .+off8 */ \ - else { \ - EMIT2(0x0f, op + 0x10); \ - EMIT(offset, 4); /* jxx .+off32 */ \ - } \ -} while (0) - -#define COND_SEL(CODE, TOP, FOP) \ - case CODE: \ - t_op = TOP; \ - f_op = FOP; \ - goto cond_branch - - -#define SEEN_DATAREF 1 /* might call external helpers */ -#define SEEN_XREG 2 /* ebx is used */ -#define SEEN_MEM 4 /* use mem[] for temporary storage */ +#define X86_JGE 0x7D +#define X86_JG 0x7F static inline void bpf_flush_icache(void *start, void *end) { @@ -125,26 +109,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -/* Helper to find the offset of pkt_type in sk_buff - * We want to make sure its still a 3bit field starting at a byte boundary. - */ -#define PKT_TYPE_MAX 7 -static int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - char *ct = (char *)&skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); - return -1; -} - struct bpf_binary_header { unsigned int pages; /* Note : for security reasons, bpf code will follow a randomly @@ -171,590 +135,778 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ header->pages = sz / PAGE_SIZE; - hole = sz - (proglen + sizeof(*header)); + hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); /* insert a random number of int3 instructions before BPF code */ *image_ptr = &header->image[prandom_u32() % hole]; return header; } -void bpf_jit_compile(struct sk_filter *fp) +/* pick a register outside of BPF range for JIT internal work */ +#define AUX_REG (MAX_BPF_REG + 1) + +/* the following table maps BPF registers to x64 registers. + * x64 register r12 is unused, since if used as base address register + * in load/store instructions, it always needs an extra byte of encoding + */ +static const int reg2hex[] = { + [BPF_REG_0] = 0, /* rax */ + [BPF_REG_1] = 7, /* rdi */ + [BPF_REG_2] = 6, /* rsi */ + [BPF_REG_3] = 2, /* rdx */ + [BPF_REG_4] = 1, /* rcx */ + [BPF_REG_5] = 0, /* r8 */ + [BPF_REG_6] = 3, /* rbx callee saved */ + [BPF_REG_7] = 5, /* r13 callee saved */ + [BPF_REG_8] = 6, /* r14 callee saved */ + [BPF_REG_9] = 7, /* r15 callee saved */ + [BPF_REG_FP] = 5, /* rbp readonly */ + [AUX_REG] = 3, /* r11 temp register */ +}; + +/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15 + * which need extra byte of encoding. + * rax,rcx,...,rbp have simpler encoding + */ +static inline bool is_ereg(u32 reg) { - u8 temp[64]; - u8 *prog; - unsigned int proglen, oldproglen = 0; - int ilen, i; - int t_offset, f_offset; - u8 t_op, f_op, seen = 0, pass; - u8 *image = NULL; - struct bpf_binary_header *header = NULL; - u8 *func; - int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ - unsigned int cleanup_addr; /* epilogue code offset */ - unsigned int *addrs; - const struct sock_filter *filter = fp->insns; - int flen = fp->len; + if (reg == BPF_REG_5 || reg == AUX_REG || + (reg >= BPF_REG_7 && reg <= BPF_REG_9)) + return true; + else + return false; +} - if (!bpf_jit_enable) - return; +/* add modifiers if 'reg' maps to x64 registers r8..r15 */ +static inline u8 add_1mod(u8 byte, u32 reg) +{ + if (is_ereg(reg)) + byte |= 1; + return byte; +} - addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL); - if (addrs == NULL) - return; +static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) +{ + if (is_ereg(r1)) + byte |= 1; + if (is_ereg(r2)) + byte |= 4; + return byte; +} - /* Before first pass, make a rough estimation of addrs[] - * each bpf instruction is translated to less than 64 bytes +/* encode dest register 'a_reg' into x64 opcode 'byte' */ +static inline u8 add_1reg(u8 byte, u32 a_reg) +{ + return byte + reg2hex[a_reg]; +} + +/* encode dest 'a_reg' and src 'x_reg' registers into x64 opcode 'byte' */ +static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg) +{ + return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3); +} + +struct jit_context { + unsigned int cleanup_addr; /* epilogue code offset */ + bool seen_ld_abs; +}; + +static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct sock_filter_int *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + u8 temp[64]; + int i; + int proglen = 0; + u8 *prog = temp; + int stacksize = MAX_BPF_STACK + + 32 /* space for rbx, r13, r14, r15 */ + + 8 /* space for skb_copy_bits() buffer */; + + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ + + /* sub rsp, stacksize */ + EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + + /* all classic BPF filters use R6(rbx) save it */ + + /* mov qword ptr [rbp-X],rbx */ + EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + + /* sk_convert_filter() maps classic BPF register X to R7 and uses R8 + * as temporary, so all tcpdump filters need to spill/fill R7(r13) and + * R8(r14). R9(r15) spill could be made conditional, but there is only + * one 'bpf_error' return path out of helper functions inside bpf_jit.S + * The overhead of extra spill is negligible for any filter other + * than synthetic ones. Therefore not worth adding complexity. */ - for (proglen = 0, i = 0; i < flen; i++) { - proglen += 64; - addrs[i] = proglen; + + /* mov qword ptr [rbp-X],r13 */ + EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + /* mov qword ptr [rbp-X],r14 */ + EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + /* mov qword ptr [rbp-X],r15 */ + EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + + /* clear A and X registers */ + EMIT2(0x31, 0xc0); /* xor eax, eax */ + EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + + if (ctx->seen_ld_abs) { + /* r9d : skb->len - skb->data_len (headlen) + * r10 : skb->data + */ + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov %r9d, off8(%rdi) */ + EMIT4(0x44, 0x8b, 0x4f, + offsetof(struct sk_buff, len)); + else + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, + offsetof(struct sk_buff, len)); + + if (is_imm8(offsetof(struct sk_buff, data_len))) + /* sub %r9d, off8(%rdi) */ + EMIT4(0x44, 0x2b, 0x4f, + offsetof(struct sk_buff, data_len)); + else + EMIT3_off32(0x44, 0x2b, 0x8f, + offsetof(struct sk_buff, data_len)); + + if (is_imm8(offsetof(struct sk_buff, data))) + /* mov %r10, off8(%rdi) */ + EMIT4(0x4c, 0x8b, 0x57, + offsetof(struct sk_buff, data)); + else + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, + offsetof(struct sk_buff, data)); } - cleanup_addr = proglen; /* epilogue address */ - for (pass = 0; pass < 10; pass++) { - u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen; - /* no prologue/epilogue for trivial filters (RET something) */ - proglen = 0; - prog = temp; + for (i = 0; i < insn_cnt; i++, insn++) { + const s32 K = insn->imm; + u32 a_reg = insn->a_reg; + u32 x_reg = insn->x_reg; + u8 b1 = 0, b2 = 0, b3 = 0; + s64 jmp_offset; + u8 jmp_cond; + int ilen; + u8 *func; + + switch (insn->code) { + /* ALU */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_X: + switch (BPF_OP(insn->code)) { + case BPF_ADD: b2 = 0x01; break; + case BPF_SUB: b2 = 0x29; break; + case BPF_AND: b2 = 0x21; break; + case BPF_OR: b2 = 0x09; break; + case BPF_XOR: b2 = 0x31; break; + } + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_2mod(0x48, a_reg, x_reg)); + else if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT1(add_2mod(0x40, a_reg, x_reg)); + EMIT2(b2, add_2reg(0xC0, a_reg, x_reg)); + break; - if (seen_or_pass0) { - EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ - EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ - /* note : must save %rbx in case bpf_error is hit */ - if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF)) - EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ - if (seen_or_pass0 & SEEN_XREG) - CLEAR_X(); /* make sure we dont leek kernel memory */ - - /* - * If this filter needs to access skb data, - * loads r9 and r8 with : - * r9 = skb->len - skb->data_len - * r8 = skb->data + /* mov A, X */ + case BPF_ALU64 | BPF_MOV | BPF_X: + EMIT_mov(a_reg, x_reg); + break; + + /* mov32 A, X */ + case BPF_ALU | BPF_MOV | BPF_X: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT1(add_2mod(0x40, a_reg, x_reg)); + EMIT2(0x89, add_2reg(0xC0, a_reg, x_reg)); + break; + + /* neg A */ + case BPF_ALU | BPF_NEG: + case BPF_ALU64 | BPF_NEG: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + EMIT2(0xF7, add_1reg(0xD8, a_reg)); + break; + + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_ADD: b3 = 0xC0; break; + case BPF_SUB: b3 = 0xE8; break; + case BPF_AND: b3 = 0xE0; break; + case BPF_OR: b3 = 0xC8; break; + case BPF_XOR: b3 = 0xF0; break; + } + + if (is_imm8(K)) + EMIT3(0x83, add_1reg(b3, a_reg), K); + else + EMIT2_off32(0x81, add_1reg(b3, a_reg), K); + break; + + case BPF_ALU64 | BPF_MOV | BPF_K: + /* optimization: if imm32 is positive, + * use 'mov eax, imm32' (which zero-extends imm32) + * to save 2 bytes */ - if (seen_or_pass0 & SEEN_DATAREF) { - if (offsetof(struct sk_buff, len) <= 127) - /* mov off8(%rdi),%r9d */ - EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); - else { - /* mov off32(%rdi),%r9d */ - EMIT3(0x44, 0x8b, 0x8f); - EMIT(offsetof(struct sk_buff, len), 4); - } - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub off8(%rdi),%r9d */ - EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); - else { - EMIT3(0x44, 0x2b, 0x8f); - EMIT(offsetof(struct sk_buff, data_len), 4); - } + if (K < 0) { + /* 'mov rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, a_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K); + break; + } - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov off8(%rdi),%r8 */ - EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); - else { - /* mov off32(%rdi),%r8 */ - EMIT3(0x4c, 0x8b, 0x87); - EMIT(offsetof(struct sk_buff, data), 4); - } + case BPF_ALU | BPF_MOV | BPF_K: + /* mov %eax, imm32 */ + if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + EMIT1_off32(add_1reg(0xB8, a_reg), K); + break; + + /* A %= X, A /= X, A %= K, A /= K */ + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_X: + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + EMIT1(0x50); /* push rax */ + EMIT1(0x52); /* push rdx */ + + if (BPF_SRC(insn->code) == BPF_X) + /* mov r11, X */ + EMIT_mov(AUX_REG, x_reg); + else + /* mov r11, K */ + EMIT3_off32(0x49, 0xC7, 0xC3, K); + + /* mov rax, A */ + EMIT_mov(BPF_REG_0, a_reg); + + /* xor edx, edx + * equivalent to 'xor rdx, rdx', but one byte less + */ + EMIT2(0x31, 0xd2); + + if (BPF_SRC(insn->code) == BPF_X) { + /* if (X == 0) return 0 */ + + /* cmp r11, 0 */ + EMIT4(0x49, 0x83, 0xFB, 0x00); + + /* jne .+9 (skip over pop, pop, xor and jmp) */ + EMIT2(X86_JNE, 1 + 1 + 2 + 5); + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + EMIT2(0x31, 0xc0); /* xor eax, eax */ + + /* jmp cleanup_addr + * addrs[i] - 11, because there are 11 bytes + * after this insn: div, mov, pop, pop, mov + */ + jmp_offset = ctx->cleanup_addr - (addrs[i] - 11); + EMIT1_off32(0xE9, jmp_offset); } - } - switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_CPU: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_QUEUE: - case BPF_S_ANC_PKTTYPE: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - /* first instruction sets A register (or is RET 'constant') */ + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* div r11 */ + EMIT3(0x49, 0xF7, 0xF3); + else + /* div r11d */ + EMIT3(0x41, 0xF7, 0xF3); + + if (BPF_OP(insn->code) == BPF_MOD) + /* mov r11, rdx */ + EMIT3(0x49, 0x89, 0xD3); + else + /* mov r11, rax */ + EMIT3(0x49, 0x89, 0xC3); + + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + + /* mov A, r11 */ + EMIT_mov(a_reg, AUX_REG); break; - default: - /* make sure we dont leak kernel information to user */ - CLEAR_A(); /* A = 0 */ - } - for (i = 0; i < flen; i++) { - unsigned int K = filter[i].k; + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_X: + EMIT1(0x50); /* push rax */ + EMIT1(0x52); /* push rdx */ + + /* mov r11, A */ + EMIT_mov(AUX_REG, a_reg); + + if (BPF_SRC(insn->code) == BPF_X) + /* mov rax, X */ + EMIT_mov(BPF_REG_0, x_reg); + else + /* mov rax, K */ + EMIT3_off32(0x48, 0xC7, 0xC0, K); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, AUX_REG)); + else if (is_ereg(AUX_REG)) + EMIT1(add_1mod(0x40, AUX_REG)); + /* mul(q) r11 */ + EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); + + /* mov r11, rax */ + EMIT_mov(AUX_REG, BPF_REG_0); + + EMIT1(0x5A); /* pop rdx */ + EMIT1(0x58); /* pop rax */ + + /* mov A, r11 */ + EMIT_mov(a_reg, AUX_REG); + break; - switch (filter[i].code) { - case BPF_S_ALU_ADD_X: /* A += X; */ - seen |= SEEN_XREG; - EMIT2(0x01, 0xd8); /* add %ebx,%eax */ - break; - case BPF_S_ALU_ADD_K: /* A += K; */ - if (!K) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xc0, K); /* add imm8,%eax */ - else - EMIT1_off32(0x05, K); /* add imm32,%eax */ - break; - case BPF_S_ALU_SUB_X: /* A -= X; */ - seen |= SEEN_XREG; - EMIT2(0x29, 0xd8); /* sub %ebx,%eax */ - break; - case BPF_S_ALU_SUB_K: /* A -= K */ - if (!K) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */ - else - EMIT1_off32(0x2d, K); /* sub imm32,%eax */ - break; - case BPF_S_ALU_MUL_X: /* A *= X; */ - seen |= SEEN_XREG; - EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */ - break; - case BPF_S_ALU_MUL_K: /* A *= K */ - if (is_imm8(K)) - EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */ - else { - EMIT2(0x69, 0xc0); /* imul imm32,%eax */ - EMIT(K, 4); - } - break; - case BPF_S_ALU_DIV_X: /* A /= X; */ - seen |= SEEN_XREG; - EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (pc_ret0 > 0) { - /* addrs[pc_ret0 - 1] is start address of target - * (addrs[i] - 4) is the address following this jmp - * ("xor %edx,%edx; div %ebx" being 4 bytes long) - */ - EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - - (addrs[i] - 4)); - } else { - EMIT_COND_JMP(X86_JNE, 2 + 5); - CLEAR_A(); - EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ - } - EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ - break; - case BPF_S_ALU_MOD_X: /* A %= X; */ - seen |= SEEN_XREG; - EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (pc_ret0 > 0) { - /* addrs[pc_ret0 - 1] is start address of target - * (addrs[i] - 6) is the address following this jmp - * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long) - */ - EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - - (addrs[i] - 6)); - } else { - EMIT_COND_JMP(X86_JNE, 2 + 5); - CLEAR_A(); - EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */ - } - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT2(0xf7, 0xf3); /* div %ebx */ - EMIT2(0x89, 0xd0); /* mov %edx,%eax */ - break; - case BPF_S_ALU_MOD_K: /* A %= K; */ - if (K == 1) { - CLEAR_A(); - break; - } - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ - EMIT2(0xf7, 0xf1); /* div %ecx */ - EMIT2(0x89, 0xd0); /* mov %edx,%eax */ - break; - case BPF_S_ALU_DIV_K: /* A /= K */ - if (K == 1) - break; - EMIT2(0x31, 0xd2); /* xor %edx,%edx */ - EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ - EMIT2(0xf7, 0xf1); /* div %ecx */ - break; - case BPF_S_ALU_AND_X: - seen |= SEEN_XREG; - EMIT2(0x21, 0xd8); /* and %ebx,%eax */ - break; - case BPF_S_ALU_AND_K: - if (K >= 0xFFFFFF00) { - EMIT2(0x24, K & 0xFF); /* and imm8,%al */ - } else if (K >= 0xFFFF0000) { - EMIT2(0x66, 0x25); /* and imm16,%ax */ - EMIT(K, 2); - } else { - EMIT1_off32(0x25, K); /* and imm32,%eax */ - } - break; - case BPF_S_ALU_OR_X: - seen |= SEEN_XREG; - EMIT2(0x09, 0xd8); /* or %ebx,%eax */ - break; - case BPF_S_ALU_OR_K: - if (is_imm8(K)) - EMIT3(0x83, 0xc8, K); /* or imm8,%eax */ - else - EMIT1_off32(0x0d, K); /* or imm32,%eax */ - break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: - seen |= SEEN_XREG; - EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ - break; - case BPF_S_ALU_XOR_K: /* A ^= K; */ - if (K == 0) - break; - if (is_imm8(K)) - EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */ - else - EMIT1_off32(0x35, K); /* xor imm32,%eax */ - break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ - seen |= SEEN_XREG; - EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ - break; - case BPF_S_ALU_LSH_K: - if (K == 0) - break; - else if (K == 1) - EMIT2(0xd1, 0xe0); /* shl %eax */ - else - EMIT3(0xc1, 0xe0, K); - break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ - seen |= SEEN_XREG; - EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */ - break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ - if (K == 0) - break; - else if (K == 1) - EMIT2(0xd1, 0xe8); /* shr %eax */ - else - EMIT3(0xc1, 0xe8, K); - break; - case BPF_S_ALU_NEG: - EMIT2(0xf7, 0xd8); /* neg %eax */ - break; - case BPF_S_RET_K: - if (!K) { - if (pc_ret0 == -1) - pc_ret0 = i; - CLEAR_A(); - } else { - EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ - } - /* fallinto */ - case BPF_S_RET_A: - if (seen_or_pass0) { - if (i != flen - 1) { - EMIT_JMP(cleanup_addr - addrs[i]); - break; - } - if (seen_or_pass0 & SEEN_XREG) - EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ - EMIT1(0xc9); /* leaveq */ - } - EMIT1(0xc3); /* ret */ - break; - case BPF_S_MISC_TAX: /* X = A */ - seen |= SEEN_XREG; - EMIT2(0x89, 0xc3); /* mov %eax,%ebx */ - break; - case BPF_S_MISC_TXA: /* A = X */ - seen |= SEEN_XREG; - EMIT2(0x89, 0xd8); /* mov %ebx,%eax */ - break; - case BPF_S_LD_IMM: /* A = K */ - if (!K) - CLEAR_A(); - else - EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ - break; - case BPF_S_LDX_IMM: /* X = K */ - seen |= SEEN_XREG; - if (!K) - CLEAR_X(); + /* shifts */ + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_ARSH | BPF_K: + case BPF_ALU64 | BPF_LSH | BPF_K: + case BPF_ALU64 | BPF_RSH | BPF_K: + case BPF_ALU64 | BPF_ARSH | BPF_K: + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, a_reg)); + else if (is_ereg(a_reg)) + EMIT1(add_1mod(0x40, a_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_LSH: b3 = 0xE0; break; + case BPF_RSH: b3 = 0xE8; break; + case BPF_ARSH: b3 = 0xF8; break; + } + EMIT3(0xC1, add_1reg(b3, a_reg), K); + break; + + case BPF_ALU | BPF_END | BPF_FROM_BE: + switch (K) { + case 16: + /* emit 'ror %ax, 8' to swap lower 2 bytes */ + EMIT1(0x66); + if (is_ereg(a_reg)) + EMIT1(0x41); + EMIT3(0xC1, add_1reg(0xC8, a_reg), 8); + break; + case 32: + /* emit 'bswap eax' to swap lower 4 bytes */ + if (is_ereg(a_reg)) + EMIT2(0x41, 0x0F); else - EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */ - break; - case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */ - seen |= SEEN_MEM; - EMIT3(0x8b, 0x45, 0xf0 - K*4); - break; - case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */ - seen |= SEEN_XREG | SEEN_MEM; - EMIT3(0x8b, 0x5d, 0xf0 - K*4); - break; - case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */ - seen |= SEEN_MEM; - EMIT3(0x89, 0x45, 0xf0 - K*4); - break; - case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ - seen |= SEEN_XREG | SEEN_MEM; - EMIT3(0x89, 0x5d, 0xf0 - K*4); - break; - case BPF_S_LD_W_LEN: /* A = skb->len; */ - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, len), 4); - } - break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ - seen |= SEEN_XREG; - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%ebx */ - EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x9f); - EMIT(offsetof(struct sk_buff, len), 4); - } - break; - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - if (is_imm8(offsetof(struct sk_buff, protocol))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, protocol), 4); - } - EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ - break; - case BPF_S_ANC_IFINDEX: - if (is_imm8(offsetof(struct sk_buff, dev))) { - /* movq off8(%rdi),%rax */ - EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); - } else { - EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ - EMIT(offsetof(struct sk_buff, dev), 4); - } - EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ - EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */ - EMIT(offsetof(struct net_device, ifindex), 4); + EMIT1(0x0F); + EMIT1(add_1reg(0xC8, a_reg)); break; - case BPF_S_ANC_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (is_imm8(offsetof(struct sk_buff, mark))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, mark), 4); - } - break; - case BPF_S_ANC_RXHASH: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - if (is_imm8(offsetof(struct sk_buff, hash))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, hash)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, hash), 4); - } - break; - case BPF_S_ANC_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, queue_mapping), 4); - } - break; - case BPF_S_ANC_CPU: -#ifdef CONFIG_SMP - EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */ - EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */ -#else - CLEAR_A(); -#endif - break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - if (is_imm8(offsetof(struct sk_buff, vlan_tci))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, vlan_tci), 4); - } - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) { - EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */ - } else { - EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */ - EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */ - } - break; - case BPF_S_ANC_PKTTYPE: - { - int off = pkt_type_offset(); - - if (off < 0) - goto out; - if (is_imm8(off)) { - /* movzbl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb6, 0x47, off); - } else { - /* movbl off32(%rdi),%eax */ - EMIT3(0x0f, 0xb6, 0x87); - EMIT(off, 4); - } - EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */ + case 64: + /* emit 'bswap rax' to swap 8 bytes */ + EMIT3(add_1mod(0x48, a_reg), 0x0F, + add_1reg(0xC8, a_reg)); break; } - case BPF_S_LD_W_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_word); -common_load: seen |= SEEN_DATAREF; - t_offset = func - (image + addrs[i]); - EMIT1_off32(0xbe, K); /* mov imm32,%esi */ - EMIT1_off32(0xe8, t_offset); /* call */ - break; - case BPF_S_LD_H_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_half); - goto common_load; - case BPF_S_LD_B_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte); - goto common_load; - case BPF_S_LDX_B_MSH: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); - seen |= SEEN_DATAREF | SEEN_XREG; - t_offset = func - (image + addrs[i]); - EMIT1_off32(0xbe, K); /* mov imm32,%esi */ - EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ - break; - case BPF_S_LD_W_IND: - func = sk_load_word; -common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; - t_offset = func - (image + addrs[i]); - if (K) { - if (is_imm8(K)) { - EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */ - } else { - EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */ - EMIT(K, 4); - } - } else { - EMIT2(0x89,0xde); /* mov %ebx,%esi */ - } - EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ - break; - case BPF_S_LD_H_IND: - func = sk_load_half; - goto common_load_ind; - case BPF_S_LD_B_IND: - func = sk_load_byte; - goto common_load_ind; - case BPF_S_JMP_JA: - t_offset = addrs[i + K] - addrs[i]; - EMIT_JMP(t_offset); - break; - COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE); - COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB); - COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE); - COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE); - COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE); - COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB); - COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE); - COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE); - -cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; - t_offset = addrs[i + filter[i].jt] - addrs[i]; - - /* same targets, can avoid doing the test :) */ - if (filter[i].jt == filter[i].jf) { - EMIT_JMP(t_offset); - break; - } + break; + + case BPF_ALU | BPF_END | BPF_FROM_LE: + break; - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: - seen |= SEEN_XREG; - EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */ - break; - case BPF_S_JMP_JSET_X: - seen |= SEEN_XREG; - EMIT2(0x85, 0xd8); /* test %ebx,%eax */ - break; - case BPF_S_JMP_JEQ_K: - if (K == 0) { - EMIT2(0x85, 0xc0); /* test %eax,%eax */ - break; - } - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: - if (K <= 127) - EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */ + /* ST: *(u8*)(a_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + if (is_ereg(a_reg)) + EMIT2(0x41, 0xC6); + else + EMIT1(0xC6); + goto st; + case BPF_ST | BPF_MEM | BPF_H: + if (is_ereg(a_reg)) + EMIT3(0x66, 0x41, 0xC7); + else + EMIT2(0x66, 0xC7); + goto st; + case BPF_ST | BPF_MEM | BPF_W: + if (is_ereg(a_reg)) + EMIT2(0x41, 0xC7); + else + EMIT1(0xC7); + goto st; + case BPF_ST | BPF_MEM | BPF_DW: + EMIT2(add_1mod(0x48, a_reg), 0xC7); + +st: if (is_imm8(insn->off)) + EMIT2(add_1reg(0x40, a_reg), insn->off); + else + EMIT1_off32(add_1reg(0x80, a_reg), insn->off); + + EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); + break; + + /* STX: *(u8*)(a_reg + off) = x_reg */ + case BPF_STX | BPF_MEM | BPF_B: + /* emit 'mov byte ptr [rax + off], al' */ + if (is_ereg(a_reg) || is_ereg(x_reg) || + /* have to add extra byte for x86 SIL, DIL regs */ + x_reg == BPF_REG_1 || x_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88); + else + EMIT1(0x88); + goto stx; + case BPF_STX | BPF_MEM | BPF_H: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89); + else + EMIT2(0x66, 0x89); + goto stx; + case BPF_STX | BPF_MEM | BPF_W: + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89); + else + EMIT1(0x89); + goto stx; + case BPF_STX | BPF_MEM | BPF_DW: + EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89); +stx: if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + insn->off); + break; + + /* LDX: a_reg = *(u8*)(x_reg + off) */ + case BPF_LDX | BPF_MEM | BPF_B: + /* emit 'movzx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_H: + /* emit 'movzx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_W: + /* emit 'mov eax, dword ptr [rax+0x14]' */ + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B); + else + EMIT1(0x8B); + goto ldx; + case BPF_LDX | BPF_MEM | BPF_DW: + /* emit 'mov rax, qword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B); +ldx: /* if insn->off == 0 we can save one extra byte, but + * special case of x86 r13 which always needs an offset + * is not worth the hassle + */ + if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, x_reg, a_reg), + insn->off); + break; + + /* STX XADD: lock *(u32*)(a_reg + off) += x_reg */ + case BPF_STX | BPF_XADD | BPF_W: + /* emit 'lock add dword ptr [rax + off], eax' */ + if (is_ereg(a_reg) || is_ereg(x_reg)) + EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01); + else + EMIT2(0xF0, 0x01); + goto xadd; + case BPF_STX | BPF_XADD | BPF_DW: + EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01); +xadd: if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off); + else + EMIT1_off32(add_2reg(0x80, a_reg, x_reg), + insn->off); + break; + + /* call */ + case BPF_JMP | BPF_CALL: + func = (u8 *) __bpf_call_base + K; + jmp_offset = func - (image + addrs[i]); + if (ctx->seen_ld_abs) { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } + if (!K || !is_simm32(jmp_offset)) { + pr_err("unsupported bpf func %d addr %p image %p\n", + K, func, image); + return -EINVAL; + } + EMIT1_off32(0xE8, jmp_offset); + if (ctx->seen_ld_abs) { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } + break; + + /* cond jump */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + /* cmp a_reg, x_reg */ + EMIT3(add_2mod(0x48, a_reg, x_reg), 0x39, + add_2reg(0xC0, a_reg, x_reg)); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JSET | BPF_X: + /* test a_reg, x_reg */ + EMIT3(add_2mod(0x48, a_reg, x_reg), 0x85, + add_2reg(0xC0, a_reg, x_reg)); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JSET | BPF_K: + /* test a_reg, imm32 */ + EMIT1(add_1mod(0x48, a_reg)); + EMIT2_off32(0xF7, add_1reg(0xC0, a_reg), K); + goto emit_cond_jmp; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + /* cmp a_reg, imm8/32 */ + EMIT1(add_1mod(0x48, a_reg)); + + if (is_imm8(K)) + EMIT3(0x83, add_1reg(0xF8, a_reg), K); + else + EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K); + +emit_cond_jmp: /* convert BPF opcode to x86 */ + switch (BPF_OP(insn->code)) { + case BPF_JEQ: + jmp_cond = X86_JE; + break; + case BPF_JSET: + case BPF_JNE: + jmp_cond = X86_JNE; + break; + case BPF_JGT: + /* GT is unsigned '>', JA in x86 */ + jmp_cond = X86_JA; + break; + case BPF_JGE: + /* GE is unsigned '>=', JAE in x86 */ + jmp_cond = X86_JAE; + break; + case BPF_JSGT: + /* signed '>', GT in x86 */ + jmp_cond = X86_JG; + break; + case BPF_JSGE: + /* signed '>=', GE in x86 */ + jmp_cond = X86_JGE; + break; + default: /* to silence gcc warning */ + return -EFAULT; + } + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (is_imm8(jmp_offset)) { + EMIT2(jmp_cond, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); + } else { + pr_err("cond_jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + + break; + + case BPF_JMP | BPF_JA: + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (!jmp_offset) + /* optimize out nop jumps */ + break; +emit_jmp: + if (is_imm8(jmp_offset)) { + EMIT2(0xEB, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT1_off32(0xE9, jmp_offset); + } else { + pr_err("jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + break; + + case BPF_LD | BPF_IND | BPF_W: + func = sk_load_word; + goto common_load; + case BPF_LD | BPF_ABS | BPF_W: + func = CHOOSE_LOAD_FUNC(K, sk_load_word); +common_load: ctx->seen_ld_abs = true; + jmp_offset = func - (image + addrs[i]); + if (!func || !is_simm32(jmp_offset)) { + pr_err("unsupported bpf func %d addr %p image %p\n", + K, func, image); + return -EINVAL; + } + if (BPF_MODE(insn->code) == BPF_ABS) { + /* mov %esi, imm32 */ + EMIT1_off32(0xBE, K); + } else { + /* mov %rsi, x_reg */ + EMIT_mov(BPF_REG_2, x_reg); + if (K) { + if (is_imm8(K)) + /* add %esi, imm8 */ + EMIT3(0x83, 0xC6, K); else - EMIT1_off32(0x3d, K); /* cmp imm32,%eax */ - break; - case BPF_S_JMP_JSET_K: - if (K <= 0xFF) - EMIT2(0xa8, K); /* test imm8,%al */ - else if (!(K & 0xFFFF00FF)) - EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */ - else if (K <= 0xFFFF) { - EMIT2(0x66, 0xa9); /* test imm16,%ax */ - EMIT(K, 2); - } else { - EMIT1_off32(0xa9, K); /* test imm32,%eax */ - } - break; + /* add %esi, imm32 */ + EMIT2_off32(0x81, 0xC6, K); } - if (filter[i].jt != 0) { - if (filter[i].jf && f_offset) - t_offset += is_near(f_offset) ? 2 : 5; - EMIT_COND_JMP(t_op, t_offset); - if (filter[i].jf) - EMIT_JMP(f_offset); - break; - } - EMIT_COND_JMP(f_op, f_offset); - break; - default: - /* hmm, too complex filter, give up with jit compiler */ - goto out; } - ilen = prog - temp; - if (image) { - if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpb_jit_compile fatal error\n"); - kfree(addrs); - module_free(NULL, header); - return; - } - memcpy(image + proglen, temp, ilen); + /* skb pointer is in R6 (%rbx), it will be copied into + * %rdi if skb_copy_bits() call is necessary. + * sk_load_* helpers also use %r10 and %r9d. + * See bpf_jit.S + */ + EMIT1_off32(0xE8, jmp_offset); /* call */ + break; + + case BPF_LD | BPF_IND | BPF_H: + func = sk_load_half; + goto common_load; + case BPF_LD | BPF_ABS | BPF_H: + func = CHOOSE_LOAD_FUNC(K, sk_load_half); + goto common_load; + case BPF_LD | BPF_IND | BPF_B: + func = sk_load_byte; + goto common_load; + case BPF_LD | BPF_ABS | BPF_B: + func = CHOOSE_LOAD_FUNC(K, sk_load_byte); + goto common_load; + + case BPF_JMP | BPF_EXIT: + if (i != insn_cnt - 1) { + jmp_offset = ctx->cleanup_addr - addrs[i]; + goto emit_jmp; } - proglen += ilen; - addrs[i] = proglen; - prog = temp; + /* update cleanup_addr */ + ctx->cleanup_addr = proglen; + /* mov rbx, qword ptr [rbp-X] */ + EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + /* mov r13, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + /* mov r14, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + /* mov r15, qword ptr [rbp-X] */ + EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + + EMIT1(0xC9); /* leave */ + EMIT1(0xC3); /* ret */ + break; + + default: + /* By design x64 JIT should support all BPF instructions + * This error will be seen if new instruction was added + * to interpreter, but not to JIT + * or if there is junk in sk_filter + */ + pr_err("bpf_jit: unknown opcode %02x\n", insn->code); + return -EINVAL; } - /* last bpf instruction is always a RET : - * use it to give the cleanup instruction(s) addr - */ - cleanup_addr = proglen - 1; /* ret */ - if (seen_or_pass0) - cleanup_addr -= 1; /* leaveq */ - if (seen_or_pass0 & SEEN_XREG) - cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ + ilen = prog - temp; + if (image) { + if (unlikely(proglen + ilen > oldproglen)) { + pr_err("bpf_jit_compile fatal error\n"); + return -EFAULT; + } + memcpy(image + proglen, temp, ilen); + } + proglen += ilen; + addrs[i] = proglen; + prog = temp; + } + return proglen; +} + +void bpf_jit_compile(struct sk_filter *prog) +{ +} + +void bpf_int_jit_compile(struct sk_filter *prog) +{ + struct bpf_binary_header *header = NULL; + int proglen, oldproglen = 0; + struct jit_context ctx = {}; + u8 *image = NULL; + int *addrs; + int pass; + int i; + + if (!bpf_jit_enable) + return; + + if (!prog || !prog->len) + return; + + addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return; + + /* Before first pass, make a rough estimation of addrs[] + * each bpf instruction is translated to less than 64 bytes + */ + for (proglen = 0, i = 0; i < prog->len; i++) { + proglen += 64; + addrs[i] = proglen; + } + ctx.cleanup_addr = proglen; + + for (pass = 0; pass < 10; pass++) { + proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + if (proglen <= 0) { + image = NULL; + if (header) + module_free(NULL, header); + goto out; + } if (image) { if (proglen != oldproglen) - pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen); + pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", + proglen, oldproglen); break; } if (proglen == oldproglen) { @@ -766,17 +918,16 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; } if (bpf_jit_enable > 1) - bpf_jit_dump(flen, proglen, pass, image); + bpf_jit_dump(prog->len, proglen, 0, image); if (image) { bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); - fp->bpf_func = (void *)image; - fp->jited = 1; + prog->bpf_func = (void *)image; + prog->jited = 1; } out: kfree(addrs); - return; } static void bpf_jit_free_deferred(struct work_struct *work) diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 81b506d5bef..52414211729 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -14,48 +14,92 @@ static const struct font_desc *font; static u32 efi_x, efi_y; +static void *efi_fb; +static bool early_efi_keep; -static __init void early_efi_clear_scanline(unsigned int y) +/* + * efi earlyprintk need use early_ioremap to map the framebuffer. + * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should + * be used instead. ioremap will be available after paging_init() which is + * earlier than initcall callbacks. Thus adding this early initcall function + * early_efi_map_fb to map the whole efi framebuffer. + */ +static __init int early_efi_map_fb(void) { - unsigned long base, *dst; - u16 len; + unsigned long base, size; + + if (!early_efi_keep) + return 0; base = boot_params.screen_info.lfb_base; - len = boot_params.screen_info.lfb_linelength; + size = boot_params.screen_info.lfb_size; + efi_fb = ioremap(base, size); + + return efi_fb ? 0 : -ENOMEM; +} +early_initcall(early_efi_map_fb); + +/* + * early_efi_map maps efi framebuffer region [start, start + len -1] + * In case earlyprintk=efi,keep we have the whole framebuffer mapped already + * so just return the offset efi_fb + start. + */ +static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +{ + unsigned long base; + + base = boot_params.screen_info.lfb_base; + + if (efi_fb) + return (efi_fb + start); + else + return early_ioremap(base + start, len); +} - dst = early_ioremap(base + y*len, len); +static __init_refok void early_efi_unmap(void *addr, unsigned long len) +{ + if (!efi_fb) + early_iounmap(addr, len); +} + +static void early_efi_clear_scanline(unsigned int y) +{ + unsigned long *dst; + u16 len; + + len = boot_params.screen_info.lfb_linelength; + dst = early_efi_map(y*len, len); if (!dst) return; memset(dst, 0, len); - early_iounmap(dst, len); + early_efi_unmap(dst, len); } -static __init void early_efi_scroll_up(void) +static void early_efi_scroll_up(void) { - unsigned long base, *dst, *src; + unsigned long *dst, *src; u16 len; u32 i, height; - base = boot_params.screen_info.lfb_base; len = boot_params.screen_info.lfb_linelength; height = boot_params.screen_info.lfb_height; for (i = 0; i < height - font->height; i++) { - dst = early_ioremap(base + i*len, len); + dst = early_efi_map(i*len, len); if (!dst) return; - src = early_ioremap(base + (i + font->height) * len, len); + src = early_efi_map((i + font->height) * len, len); if (!src) { - early_iounmap(dst, len); + early_efi_unmap(dst, len); return; } memmove(dst, src, len); - early_iounmap(src, len); - early_iounmap(dst, len); + early_efi_unmap(src, len); + early_efi_unmap(dst, len); } } @@ -79,16 +123,14 @@ static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) } } -static __init void +static void early_efi_write(struct console *con, const char *str, unsigned int num) { struct screen_info *si; - unsigned long base; unsigned int len; const char *s; void *dst; - base = boot_params.screen_info.lfb_base; si = &boot_params.screen_info; len = si->lfb_linelength; @@ -109,7 +151,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) for (h = 0; h < font->height; h++) { unsigned int n, x; - dst = early_ioremap(base + (efi_y + h) * len, len); + dst = early_efi_map((efi_y + h) * len, len); if (!dst) return; @@ -123,7 +165,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) s++; } - early_iounmap(dst, len); + early_efi_unmap(dst, len); } num -= count; @@ -179,6 +221,9 @@ static __init int early_efi_setup(struct console *con, char *options) for (i = 0; i < (yres - efi_y) / font->height; i++) early_efi_scroll_up(); + /* early_console_register will unset CON_BOOT in case ,keep */ + if (!(con->flags & CON_BOOT)) + early_efi_keep = true; return 0; } diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index ff0174dda81..a9acde72d4e 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -75,7 +75,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) return 0; } -asmlinkage int xo1_do_sleep(u8 sleep_state) +asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { void *pgd_addr = __va(read_cr3()); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 304fca20d96..35e2bb6c0f3 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -23,7 +23,7 @@ extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern asmlinkage int restore_image(void); +extern asmlinkage __visible int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index f325af26107..3323c274524 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -54,5 +54,7 @@ syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(uapisyshdr-y) $(syshdr-y) +PHONY += all all: $(addprefix $(uapi)/,$(uapisyshdr-y)) all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6d..d6b86792161 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index e8120346903..604a37efd4d 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -40,4 +40,6 @@ $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/ina HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-y += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o +PHONY += relocs relocs: $(obj)/relocs + @: diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 2e263f367b1..9df017ab228 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -9,12 +9,9 @@ SECTIONS #ifdef BUILD_VDSO32 #include <asm/vdso32.h> - .hpet_sect : { - hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - } :text :hpet_sect + hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - .vvar_sect : { - vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); + vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; @@ -22,7 +19,6 @@ SECTIONS #include <asm/vvar.h> #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - } :text :vvar_sect #endif . = SIZEOF_HEADERS; @@ -61,7 +57,12 @@ SECTIONS */ . = ALIGN(0x100); - .text : { *(.text*) } :text =0x90909090 + .text : { *(.text*) } :text =0x90909090, + + /* + * The comma above works around a bug in gold: + * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + */ /DISCARD/ : { *(.discard) @@ -84,8 +85,4 @@ PHDRS dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; -#ifdef BUILD_VDSO32 - vvar_sect PT_NULL FLAGS(4); /* PF_R */ - hpet_sect PT_NULL FLAGS(4); /* PF_R */ -#endif } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a..e1f220e3ca6 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -39,6 +39,7 @@ #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages +extern int sysctl_ldt16; #endif /* @@ -249,6 +250,13 @@ static struct ctl_table abi_table2[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ldt16", + .data = &sysctl_ldt16, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, {} }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 201d09a7c46..c34bfc4bbe7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1515,7 +1515,7 @@ static void __init xen_pvh_early_guest_init(void) } /* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 08f763de26f..a1207cb6472 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -23,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage unsigned long xen_save_fl(void) +asmlinkage __visible unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -63,7 +63,7 @@ __visible void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -asmlinkage void xen_irq_disable(void) +asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -74,7 +74,7 @@ asmlinkage void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -asmlinkage void xen_irq_enable(void) +asmlinkage __visible void xen_irq_enable(void) { struct vcpu_info *vcpu; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a18eadd8bb4..7005974c3ff 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -441,10 +441,11 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) irq_ctx_init(cpu); #else clear_tsk_thread_flag(idle, TIF_FORK); +#endif per_cpu(kernel_stack, cpu) = (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4d3acc34a99..0ba5f3b967f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -274,7 +274,7 @@ void __init xen_init_spinlocks(void) printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } @@ -290,6 +290,9 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_pvspin) return 0; + if (!xen_domain()) + return 0; + static_key_slow_inc(¶virt_ticketlocks_enabled); return 0; } diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 33ca6e42a4c..fd92a64d748 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -75,6 +75,17 @@ ENDPROC(xen_sysexit) * stack state in whatever form its in, we keep things simple by only * using a single register which is pushed/popped on the stack. */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + ENTRY(xen_iret) /* test eflags for special cases */ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -83,15 +94,13 @@ ENTRY(xen_iret) push %eax ESP_OFFSET=4 # bytes pushed onto stack - /* - * Store vcpu_info pointer for easy access. Do it this way to - * avoid having to reload %fs - */ + /* Store vcpu_info pointer for easy access */ #ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl %ss:TI_cpu(%eax), %eax - movl %ss:__per_cpu_offset(,%eax,4), %eax - mov %ss:xen_vcpu(%eax), %eax + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS #else movl %ss:xen_vcpu, %eax #endif |