From f5deb79679af6eb41b61112fadcda28b2a4cfb0d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 3 Feb 2009 14:22:48 +0800 Subject: x86: kexec: Use one page table in x86_64 machine_kexec Impact: reduce kernel BSS size by 7 pages, improve code readability Two page tables are used in current x86_64 kexec implementation. One is used to jump from kernel virtual address to identity map address, the other is used to map all physical memory. In fact, on x86_64, there is no conflict between kernel virtual address space and physical memory space, so just one page table is sufficient. The page table pages used to map control page are dynamically allocated to save memory if kexec image is not loaded. ASM code used to map control page is replaced by C code too. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/kexec.h | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'arch/x86/include/asm/kexec.h') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index c61d8b2ab8b..0ceb6d19ed3 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -9,23 +9,8 @@ # define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 -# define VA_CONTROL_PAGE 1 -# define PA_PGD 2 -# define VA_PGD 3 -# define PA_PUD_0 4 -# define VA_PUD_0 5 -# define PA_PMD_0 6 -# define VA_PMD_0 7 -# define PA_PTE_0 8 -# define VA_PTE_0 9 -# define PA_PUD_1 10 -# define VA_PUD_1 11 -# define PA_PMD_1 12 -# define VA_PMD_1 13 -# define PA_PTE_1 14 -# define VA_PTE_1 15 -# define PA_TABLE_PAGE 16 -# define PAGES_NR 17 +# define PA_TABLE_PAGE 1 +# define PAGES_NR 2 #endif #ifdef CONFIG_X86_32 @@ -157,9 +142,9 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif -#ifdef CONFIG_X86_32 #define ARCH_HAS_KIMAGE_ARCH +#ifdef CONFIG_X86_32 struct kimage_arch { pgd_t *pgd; #ifdef CONFIG_X86_PAE @@ -169,6 +154,12 @@ struct kimage_arch { pte_t *pte0; pte_t *pte1; }; +#else +struct kimage_arch { + pud_t *pud; + pmd_t *pmd; + pte_t *pte; +}; #endif #endif /* __ASSEMBLY__ */ -- cgit v1.2.3-70-g09d2 From fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:57:16 +0800 Subject: x86, kexec: x86_64: add kexec jump support for x86_64 Impact: New major feature This patch add kexec jump support for x86_64. More information about kexec jump can be found in corresponding x86_32 support patch. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/kexec.h | 13 +-- arch/x86/kernel/machine_kexec_64.c | 42 ++++++++- arch/x86/kernel/relocate_kernel_64.S | 177 ++++++++++++++++++++++++++++------- arch/x86/kernel/vmlinux_64.lds.S | 7 ++ 5 files changed, 197 insertions(+), 44 deletions(-) (limited to 'arch/x86/include/asm/kexec.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 31758378bcd..87717f3687d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1431,7 +1431,7 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" depends on EXPERIMENTAL - depends on KEXEC && HIBERNATION && X86_32 + depends on KEXEC && HIBERNATION ---help--- Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0ceb6d19ed3..317ff1703d0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -9,13 +9,13 @@ # define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 -# define PA_TABLE_PAGE 1 -# define PAGES_NR 2 +# define VA_CONTROL_PAGE 1 +# define PA_TABLE_PAGE 2 +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #endif -#ifdef CONFIG_X86_32 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 -#endif #ifndef __ASSEMBLY__ @@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page, unsigned int has_pae, unsigned int preserve_context); #else -NORET_TYPE void +unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, - unsigned long start_address) ATTRIB_NORET; + unsigned long start_address, + unsigned int preserve_context); #endif #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7cc5d3d0148..89cea4d4467 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0), 0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index cfc0d24003d..4de8f5b3d47 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -19,6 +19,24 @@ #define PTR(x) (x << 3) #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + .text .align PAGE_SIZE .code64 @@ -28,8 +46,27 @@ relocate_kernel: * %rdi indirection_page * %rsi page_list * %rdx start address + * %rcx preserve_context */ + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + /* zero out flags, and disable interrupts */ pushq $0 popfq @@ -41,10 +78,18 @@ relocate_kernel: movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* Switch to the identity mapped page tables */ - movq %rcx, %cr3 + movq %r9, %cr3 /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -83,9 +128,87 @@ identity_mapped: 1: /* Flush the TLB (needed?) */ - movq %rcx, %cr3 + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret /* Do the copies */ +swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ xorq %rdi, %rdi xorq %rsi, %rsi @@ -117,39 +240,27 @@ identity_mapped: movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi movq $512, %rcx rep ; movsq - jmp 0b -3: - /* - * To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - - /* - * set all of the registers to known values - * leave %rsp alone - */ + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f680..5bf54e40c6e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ASSERT((per_cpu__irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif -- cgit v1.2.3-70-g09d2