diff options
Diffstat (limited to 'arch')
167 files changed, 4803 insertions, 8491 deletions
diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c index 1d65adf5691..c00646b25f6 100644 --- a/arch/alpha/boot/misc.c +++ b/arch/alpha/boot/misc.c @@ -98,7 +98,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../lib/inflate.c" diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 4cc44bd33d3..cf1e6fc6c68 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ SECTIONS . = ALIGN(8); SECURITY_INIT - . = ALIGN(64); + . = ALIGN(8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 283891c736c..9b444022cb9 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -239,7 +239,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index ddbdad48f5b..d1a6a597ed9 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -59,7 +59,7 @@ SECTIONS usr/built-in.o(.init.ramfs) __initramfs_end = .; #endif - . = ALIGN(64); + . = ALIGN(4096); __per_cpu_start = .; *(.data.percpu) __per_cpu_end = .; diff --git a/arch/arm26/boot/compressed/misc.c b/arch/arm26/boot/compressed/misc.c index f17f50e5516..0714d19c577 100644 --- a/arch/arm26/boot/compressed/misc.c +++ b/arch/arm26/boot/compressed/misc.c @@ -182,7 +182,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S index e124fcd766d..dfa25e1542b 100644 --- a/arch/cris/arch-v32/vmlinux.lds.S +++ b/arch/cris/arch-v32/vmlinux.lds.S @@ -91,6 +91,7 @@ SECTIONS } SECURITY_INIT + . = ALIGN (8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 97910e01682..28eae9735ad 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -57,6 +57,7 @@ SECTIONS __alt_instructions_end = .; .altinstr_replacement : { *(.altinstr_replacement) } + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index bcf2fc408a1..a9af760c7e5 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -220,7 +220,7 @@ config PARAVIRT config VMI bool "VMI Paravirt-ops support" - depends on PARAVIRT && !COMPAT_VDSO + depends on PARAVIRT help VMI provides a paravirtualized interface to the VMware ESX server (it could be used by other hypervisors in theory too, but is not @@ -571,6 +571,9 @@ choice bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" + config VMSPLIT_2G_OPT + depends on !HIGHMEM + bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" endchoice @@ -578,7 +581,8 @@ endchoice config PAGE_OFFSET hex default 0xB0000000 if VMSPLIT_3G_OPT - default 0x78000000 if VMSPLIT_2G + default 0x80000000 if VMSPLIT_2G + default 0x78000000 if VMSPLIT_2G_OPT default 0x40000000 if VMSPLIT_1G default 0xC0000000 @@ -915,12 +919,9 @@ source kernel/power/Kconfig source "drivers/acpi/Kconfig" -menu "APM (Advanced Power Management) BIOS Support" -depends on PM && !X86_VISWS - -config APM +menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on PM + depends on PM && !X86_VISWS ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -977,9 +978,10 @@ config APM To compile this driver as a module, choose M here: the module will be called apm. +if APM + config APM_IGNORE_USER_SUSPEND bool "Ignore USER SUSPEND" - depends on APM help This option will ignore USER SUSPEND requests. On machines with a compliant APM BIOS, you want to say N. However, on the NEC Versa M @@ -987,7 +989,6 @@ config APM_IGNORE_USER_SUSPEND config APM_DO_ENABLE bool "Enable PM at boot time" - depends on APM ---help--- Enable APM features at boot time. From page 36 of the APM BIOS specification: "When disabled, the APM BIOS does not automatically @@ -1005,7 +1006,6 @@ config APM_DO_ENABLE config APM_CPU_IDLE bool "Make CPU Idle calls when idle" - depends on APM help Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. On some machines, this can activate improved power savings, such as @@ -1017,7 +1017,6 @@ config APM_CPU_IDLE config APM_DISPLAY_BLANK bool "Enable console blanking using APM" - depends on APM help Enable console blanking using the APM. Some laptops can use this to turn off the LCD backlight when the screen blanker of the Linux @@ -1029,22 +1028,8 @@ config APM_DISPLAY_BLANK backlight at all, or it might print a lot of errors to the console, especially if you are using gpm. -config APM_RTC_IS_GMT - bool "RTC stores time in GMT" - depends on APM - help - Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) - stores the time in GMT (Greenwich Mean Time). Say N if your RTC - stores localtime. - - It is in fact recommended to store GMT in your RTC, because then you - don't have to worry about daylight savings time changes. The only - reason not to use GMT in your RTC is if you also run a broken OS - that doesn't understand GMT. - config APM_ALLOW_INTS bool "Allow interrupts during APM BIOS calls" - depends on APM help Normally we disable external interrupts while we are making calls to the APM BIOS as a measure to lessen the effects of a badly behaving @@ -1055,13 +1040,12 @@ config APM_ALLOW_INTS config APM_REAL_MODE_POWER_OFF bool "Use real mode APM BIOS call to power off" - depends on APM help Use real mode APM BIOS calls to switch off the computer. This is a work-around for a number of buggy BIOSes. Switch this option on if your computer crashes instead of powering off properly. -endmenu +endif # APM source "arch/i386/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index b99c0e2a4e6..dce6124cb84 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -43,6 +43,7 @@ config M386 - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). + - "VIA C7" for VIA C7. If you don't know what to do, choose "386". @@ -203,6 +204,12 @@ config MVIAC3_2 of SSE and tells gcc to treat the CPU as a 686. Note, this kernel will not boot on older (pre model 9) C3s. +config MVIAC7 + bool "VIA C7" + help + Select this for a VIA C7. Selecting this uses the correct cache + shift and tells gcc to treat the CPU as a 686. + endchoice config X86_GENERIC @@ -231,16 +238,21 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || X86_GENERIC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 + +config X86_XADD + bool + depends on !M386 + default y config RWSEM_GENERIC_SPINLOCK bool - depends on M386 + depends on !X86_XADD default y config RWSEM_XCHGADD_ALGORITHM bool - depends on !M386 + depends on X86_XADD default y config ARCH_HAS_ILOG2_U32 @@ -297,7 +309,7 @@ config X86_ALIGNMENT_16 config X86_GOOD_APIC bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 default y config X86_INTEL_USERCOPY @@ -322,5 +334,18 @@ config X86_OOSTORE config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ default y + +# this should be set for all -march=.. options where the compiler +# generates cmov. +config X86_CMOV + bool + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) + default y + +config X86_MINIMUM_CPU_MODEL + int + default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP + default "0" + diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index 458bc161193..b31c0802e1c 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -85,14 +85,4 @@ config DOUBLEFAULT option saves about 4k and might cause you much additional grey hair. -config DEBUG_PARAVIRT - bool "Enable some paravirtualization debugging" - default n - depends on PARAVIRT && DEBUG_KERNEL - help - Currently deliberately clobbers regs which are allowed to be - clobbered in inlined paravirt hooks, even in native mode. - If turning this off solves a problem, then DISABLE_INTERRUPTS() or - ENABLE_INTERRUPTS() is lying about what registers can be clobbered. - endmenu diff --git a/arch/i386/Makefile b/arch/i386/Makefile index bd28f9f9b4b..6dc5e5d90fe 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -34,7 +34,7 @@ CHECKFLAGS += -D__i386__ CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return # prevent gcc from keeping the stack 16 byte aligned -CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) +CFLAGS += -mpreferred-stack-boundary=4 # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index a32c031c90d..e372b584e91 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -4,9 +4,9 @@ #-mtune exists since gcc 3.4 HAS_MTUNE := $(call cc-option-yn, -mtune=i386) ifeq ($(HAS_MTUNE),y) -tune = $(call cc-option,-mtune=$(1),) +tune = $(call cc-option,-mtune=$(1),$(2)) else -tune = $(call cc-option,-mcpu=$(1),) +tune = $(call cc-option,-mcpu=$(1),$(2)) endif align := $(cc-option-align) @@ -32,7 +32,8 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) -cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) +cflags-$(CONFIG_MVIAC7) += -march=i686 +cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 @@ -42,5 +43,5 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # add at the end to overwrite eventual tuning options from earlier # cpu entries -cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic) +cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index e9794662606..bfbc32098a4 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -36,9 +36,9 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE) # --------------------------------------------------------------------------- $(obj)/zImage: IMAGE_OFFSET := 0x1000 -$(obj)/zImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) +$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 1ce7017fd62..b28505c544c 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -189,7 +189,7 @@ static void putstr(const char *); static unsigned long free_mem_ptr; static unsigned long free_mem_end_ptr; -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static char *vidmem = (char *)0xb8000; static int vidport; diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 06edf1c6624..f8b3b9cda2b 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -52,6 +52,7 @@ #include <asm/boot.h> #include <asm/e820.h> #include <asm/page.h> +#include <asm/setup.h> /* Signature words to ensure LILO loaded us right */ #define SIG1 0xAA55 @@ -81,7 +82,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0205 # header version number (>= 0x0105) + .word 0x0206 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -171,6 +172,10 @@ relocatable_kernel: .byte 0 pad2: .byte 0 pad3: .word 0 +cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, + #added with boot protocol + #version 2.06 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 @@ -297,7 +302,24 @@ good_sig: loader_panic_mess: .string "Wrong loader, giving up..." +# check minimum cpuid +# we do this here because it is the last place we can actually +# show a user visible error message. Later the video modus +# might be already messed up. loader_ok: + call verify_cpu + testl %eax,%eax + jz cpu_ok + lea cpu_panic_mess,%si + call prtstr +1: jmp 1b + +cpu_panic_mess: + .asciz "PANIC: CPU too old for this kernel." + +#include "../kernel/verify_cpu.S" + +cpu_ok: # Get memory size (extended mem, kB) xorl %eax, %eax diff --git a/arch/i386/defconfig b/arch/i386/defconfig index c96911c37ae..9da84412a83 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.21-rc3 -# Wed Mar 7 15:29:47 2007 +# Linux kernel version: 2.6.21-git3 +# Tue May 1 07:30:51 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -108,9 +108,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features # -# CONFIG_TICK_ONESHOT is not set -# CONFIG_NO_HZ is not set -# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set @@ -146,9 +146,11 @@ CONFIG_MPENTIUMIII=y # CONFIG_MGEODE_LX is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set CONFIG_X86_GENERIC=y CONFIG_X86_CMPXCHG=y CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_XADD=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set @@ -162,6 +164,8 @@ CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_TSC=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_MODEL=4 CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_NR_CPUS=32 @@ -248,7 +252,6 @@ CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set -# CONFIG_ACPI_IBM is not set # CONFIG_ACPI_TOSHIBA is not set CONFIG_ACPI_BLACKLIST_YEAR=2001 CONFIG_ACPI_DEBUG=y @@ -257,10 +260,7 @@ CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y # CONFIG_ACPI_CONTAINER is not set - -# -# APM (Advanced Power Management) BIOS Support -# +# CONFIG_ACPI_SBS is not set # CONFIG_APM is not set # @@ -277,7 +277,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -349,7 +349,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NETDEBUG is not set CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -388,6 +387,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_IPV6_OPTIMISTIC_DAD is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set @@ -443,6 +443,13 @@ CONFIG_IPV6_SIT=y # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set # CONFIG_IEEE80211 is not set # @@ -463,10 +470,6 @@ CONFIG_FW_LOADER=y # Connector - unified userspace <-> kernelspace linker # # CONFIG_CONNECTOR is not set - -# -# Memory Technology Devices (MTD) -# # CONFIG_MTD is not set # @@ -513,6 +516,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set +# CONFIG_THINKPAD_ACPI is not set # # ATA/ATAPI/MFM/RLL support @@ -548,7 +552,6 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -580,7 +583,6 @@ CONFIG_BLK_DEV_PIIX=y # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y # CONFIG_BLK_DEV_HD is not set # @@ -669,6 +671,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set # @@ -697,6 +700,7 @@ CONFIG_SATA_ACPI=y # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set # CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set # CONFIG_PATA_CMD64X is not set # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set @@ -762,10 +766,9 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set # -# Device Drivers +# Controllers # # @@ -774,10 +777,11 @@ CONFIG_IEEE1394=y CONFIG_IEEE1394_OHCI1394=y # -# Protocol Drivers +# Protocols # # CONFIG_IEEE1394_VIDEO1394 is not set # CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y @@ -820,7 +824,9 @@ CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set # # Tulip family network device support @@ -901,9 +907,10 @@ CONFIG_BNX2=y # CONFIG_TR is not set # -# Wireless LAN (non-hamradio) +# Wireless LAN # -# CONFIG_NET_RADIO is not set +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set # # Wan interfaces @@ -917,7 +924,6 @@ CONFIG_BNX2=y # CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -1050,7 +1056,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y +# CONFIG_HANGCHECK_TIMER is not set # # TPM devices @@ -1142,6 +1148,14 @@ CONFIG_HID=y # CONFIG_HID_DEBUG is not set # +# USB Input Devices +# +CONFIG_USB_HID=y +# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set + +# # USB support # CONFIG_USB_ARCH_HAS_HCD=y @@ -1154,6 +1168,7 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y +# CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set @@ -1204,10 +1219,6 @@ CONFIG_USB_STORAGE=y # # USB Input Devices # -CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set # CONFIG_USB_ACECAD is not set @@ -1528,7 +1539,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4ae3dcf1d2f..4f98516b9f9 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_VMI) += vmi.o vmitime.o +obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += pcspeaker.o -EXTRA_AFLAGS := -traditional - obj-$(CONFIG_SCx200) += scx200.o # vsyscall.o contains the vsyscall DSO images as __initdata. diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 9ea5b8ecc7e..280898b045b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -874,7 +874,7 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - clustered_apic_check(); + setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 8f7efd38254..23f78efc577 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,7 +10,6 @@ #include <asm/pci-direct.h> #include <asm/acpi.h> #include <asm/apic.h> -#include <asm/irq.h> #ifdef CONFIG_ACPI @@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device) return 0; } -static void check_intel(void) -{ - u16 vendor, device; - - vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); - - if (vendor != PCI_VENDOR_ID_INTEL) - return; - - device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - void __init check_acpi_pci(void) { int num, slot, func; @@ -77,8 +58,6 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; - check_intel(); - /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 426f59b0106..e5cec6685cc 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -5,6 +5,7 @@ #include <asm/alternative.h> #include <asm/sections.h> +static int noreplace_smp = 0; static int smp_alt_once = 0; static int debug_alternative = 0; @@ -13,15 +14,33 @@ static int __init bootonly(char *str) smp_alt_once = 1; return 1; } +__setup("smp-alt-boot", bootonly); + static int __init debug_alt(char *str) { debug_alternative = 1; return 1; } - -__setup("smp-alt-boot", bootonly); __setup("debug-alternative", debug_alt); +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#ifdef CONFIG_PARAVIRT +static int noreplace_paravirt = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); +#endif + #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) @@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; -extern u8 __smp_alt_begin[], __smp_alt_end[]; - /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with self modifying code. This implies that assymetric systems where @@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #ifdef CONFIG_SMP -static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end); - for (a = start; a < end; a++) { - memcpy(a->replacement + a->replacementlen, - a->instr, - a->instrlen); - } -} - -static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) { - memcpy(a->instr, - a->replacement + a->replacementlen, - a->instrlen); - } -} - static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; @@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end { u8 **ptr; + if (noreplace_smp) + return; + for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name, struct smp_alt_module *smp; unsigned long flags; + if (noreplace_smp) + return; + if (smp_alt_once) { if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(locks, locks_end, @@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod) struct smp_alt_module *item; unsigned long flags; - if (smp_alt_once) + if (smp_alt_once || noreplace_smp) return; spin_lock_irqsave(&smp_alt, flags); @@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp) return; #endif - if (smp_alt_once) + if (noreplace_smp || smp_alt_once) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - alternatives_smp_apply(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { - struct paravirt_patch *p; + struct paravirt_patch_site *p; + + if (noreplace_paravirt) + return; for (p = start; p < end; p++) { unsigned int used; used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, p->len); -#ifdef CONFIG_DEBUG_PARAVIRT - { - int i; - /* Deliberately clobber regs using "not %reg" to find bugs. */ - for (i = 0; i < 3; i++) { - if (p->len - used >= 2 && (p->clobbers & (1 << i))) { - memcpy(p->instr + used, "\xf7\xd0", 2); - p->instr[used+1] |= i; - used += 2; - } - } - } -#endif + + BUG_ON(used > p->len); + /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - /* Sync to be conservative, in case we patched following instructions */ + /* Sync to be conservative, in case we patched following + * instructions */ sync_core(); } -extern struct paravirt_patch __start_parainstructions[], +extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ @@ -396,23 +386,19 @@ void __init alternative_instructions(void) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } free_init_pages("SMP alternatives", - (unsigned long)__smp_alt_begin, - (unsigned long)__smp_alt_end); + __pa_symbol(&__smp_locks), + __pa_symbol(&__smp_locks_end)); } else { - alternatives_smp_save(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); alternatives_smp_switch(0); } #endif - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 93aa911646a..aca054cc055 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -129,6 +129,28 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned long safe_apic_wait_icr_idle(void) +{ + unsigned long send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 064bbf2861f..367ff1d930c 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -233,11 +233,10 @@ #include <asm/desc.h> #include <asm/i8253.h> #include <asm/paravirt.h> +#include <asm/reboot.h> #include "io_ports.h" -extern void machine_real_restart(unsigned char *, int); - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif @@ -384,13 +383,6 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -#ifdef CONFIG_APM_RTC_IS_GMT -# define clock_cmos_diff 0 -# define got_clock_diff 1 -#else -static long clock_cmos_diff; -static int got_clock_diff; -#endif static int debug __read_mostly; static int smp __read_mostly; static int apm_disabled = -1; diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index c37535163bf..27a776c9044 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -11,11 +11,11 @@ #include <linux/suspend.h> #include <asm/ucontext.h> #include "sigframe.h" +#include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/elf.h> -#include <asm/pda.h> #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -25,6 +25,9 @@ #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)); +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + void foo(void) { OFFSET(SIGCONTEXT_eax, sigcontext, eax); @@ -90,17 +93,18 @@ void foo(void) OFFSET(pbe_next, pbe, next); /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - + DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VDSO_PRELINK, VDSO_PRELINK); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); - BLANK(); - OFFSET(PDA_cpu, i386_pda, cpu_number); - OFFSET(PDA_pcurrent, i386_pda, pcurrent); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); #ifdef CONFIG_PARAVIRT BLANK(); diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 010aecfffbc..74f27a463db 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -2,7 +2,7 @@ # Makefile for x86-compatible CPU details and quirks # -obj-y := common.o proc.o +obj-y := common.o proc.o bugs.o obj-y += amd.o obj-y += cyrix.o @@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 2d47db48297..4fec702afd7 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +int force_mwait __cpuinitdata; + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); + + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) @@ -314,13 +319,3 @@ int __init amd_init_cpu(void) cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; return 0; } - -//early_arch_initcall(amd_init_cpu); - -static int __init amd_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_AMD] = NULL; - return 0; -} - -late_initcall(amd_exit_cpu); diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c new file mode 100644 index 00000000000..54428a2500f --- /dev/null +++ b/arch/i386/kernel/cpu/bugs.c @@ -0,0 +1,191 @@ +/* + * arch/i386/cpu/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * <rreilova@ececs.uc.edu> + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). + */ +#include <linux/init.h> +#include <linux/utsname.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/paravirt.h> +#include <asm/alternative.h> + +static int __init no_halt(char *s) +{ + boot_cpu_data.hlt_works_ok = 0; + return 1; +} + +__setup("no-hlt", no_halt); + +static int __init mca_pentium(char *s) +{ + mca_pentium_flag = 1; + return 1; +} + +__setup("mca-pentium", mca_pentium); + +static int __init no_387(char *s) +{ + boot_cpu_data.hard_math = 0; + write_cr0(0xE | read_cr0()); + return 1; +} + +__setup("no387", no_387); + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + if (!boot_cpu_data.hard_math) { +#ifndef CONFIG_MATH_EMULATION + printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); + printk(KERN_EMERG "Giving up.\n"); + for (;;) ; +#endif + return; + } + +/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ + /* Test for the divl bug.. */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (boot_cpu_data.fdiv_bug) + printk("Hmm, FPU with FDIV bug.\n"); +} + +static void __init check_hlt(void) +{ + if (paravirt_enabled()) + return; + + printk(KERN_INFO "Checking 'hlt' instruction... "); + if (!boot_cpu_data.hlt_works_ok) { + printk("disabled\n"); + return; + } + halt(); + halt(); + halt(); + halt(); + printk("OK.\n"); +} + +/* + * Most 386 processors have a bug where a POPAD can lock the + * machine even from user space. + */ + +static void __init check_popad(void) +{ +#ifndef CONFIG_X86_POPAD_OK + int res, inp = (int) &res; + + printk(KERN_INFO "Checking for popad bug... "); + __asm__ __volatile__( + "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " + : "=&a" (res) + : "d" (inp) + : "ecx", "edi" ); + /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ + if (res != 12345678) printk( "Buggy.\n" ); + else printk( "OK.\n" ); +#endif +} + +/* + * Check whether we are able to run this kernel safely on SMP. + * + * - In order to run on a i386, we need to be compiled for i386 + * (for due to lack of "invlpg" and working WP on a i386) + * - In order to run on anything without a TSC, we need to be + * compiled for a i486. + * - In order to support the local APIC on a buggy Pentium machine, + * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, + * which happens implicitly if compiled for a Pentium or lower + * (unless an advanced selection of CPU features is used) as an + * otherwise config implies a properly working local APIC without + * the need to do extra reads from the APIC. +*/ + +static void __init check_config(void) +{ +/* + * We'd better not be a i386 if we're configured to use some + * i486+ only features! (WP works in supervisor mode and the + * new "invlpg" and "bswap" instructions) + */ +#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) + if (boot_cpu_data.x86 == 3) + panic("Kernel requires i486+ for 'invlpg' and other features"); +#endif + +/* + * If we configured ourselves for a TSC, we'd better have one! + */ +#ifdef CONFIG_X86_TSC + if (!cpu_has_tsc && !tsc_disable) + panic("Kernel compiled for Pentium+, requires TSC feature!"); +#endif + +/* + * If we were told we had a good local APIC, check for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL + && cpu_has_apic + && boot_cpu_data.x86 == 5 + && boot_cpu_data.x86_model == 2 + && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) + panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); +#endif +} + + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#ifndef CONFIG_SMP + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + check_config(); + check_fpu(); + check_hlt(); + check_popad(); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + alternative_instructions(); +} diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 8c25047975c..473eac883c7 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -469,13 +469,3 @@ int __init centaur_init_cpu(void) cpu_devs[X86_VENDOR_CENTAUR] = ¢aur_cpu_dev; return 0; } - -//early_arch_initcall(centaur_init_cpu); - -static int __init centaur_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CENTAUR] = NULL; - return 0; -} - -late_initcall(centaur_exit_cpu); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index dcbbd0a8bfc..794d593c47e 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,15 +18,37 @@ #include <asm/apic.h> #include <mach_apic.h> #endif -#include <asm/pda.h> #include "cpu.h" -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_init(c); +} - if (c == &boot_cpu_data) - sysenter_setup(); +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + sysenter_setup(); enable_sep_cpu(); + mtrr_bp_init(); +} - if (c == &boot_cpu_data) - mtrr_bp_init(); - else - mtrr_ap_init(); +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + enable_sep_cpu(); + mtrr_ap_init(); } #ifdef CONFIG_X86_HT @@ -601,129 +630,36 @@ void __init early_cpu_init(void) #endif } -/* Make sure %gs is initialized properly in idle threads */ +/* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PDA; + regs->xfs = __KERNEL_PERCPU; return regs; } -static __cpuinit int alloc_gdt(int cpu) +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) { - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - /* - * This is a horrible hack to allocate the GDT. The problem - * is that cpu_init() is called really early for the boot CPU - * (and hence needs bootmem) but much later for the secondary - * CPUs, when bootmem will have gone away - */ - if (NODE_DATA(0)->bdata->node_bootmem_map) { - BUG_ON(gdt != NULL || pda != NULL); - - gdt = alloc_bootmem_pages(PAGE_SIZE); - pda = alloc_bootmem(sizeof(*pda)); - /* alloc_bootmem(_pages) panics on failure, so no check */ - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ - if (gdt == NULL) - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - - if (pda == NULL) - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); - - if (unlikely(!gdt || !pda)) { - free_pages((unsigned long)gdt, 0); - kfree(pda); - return 0; - } - } - - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_pda(cpu) = pda; - - return 1; -} + struct Xgt_desc_struct gdt_descr; -/* Initial PDA used by boot CPU */ -struct i386_pda boot_pda = { - ._pda = &boot_pda, - .cpu_number = 0, - .pcurrent = &init_task, -}; - -static inline void set_kernel_fs(void) -{ - /* Set %fs for this CPU's PDA. Memory clobber is to create a - barrier with respect to any PDA operations, so the compiler - doesn't move any before here. */ - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } -/* Initialize the CPU's GDT and PDA. The boot CPU does this for - itself, but secondaries find this done for them. */ -__cpuinit int init_gdt(int cpu, struct task_struct *idle) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - /* For non-boot CPUs, the GDT and PDA should already have been - allocated. */ - if (!alloc_gdt(cpu)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); - return 0; - } - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - BUG_ON(gdt == NULL || pda == NULL); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->size = GDT_SIZE - 1; - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; - - return 1; -} - -void __cpuinit cpu_set_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - - /* Reinit these anyway, even if they've already been done (on - the boot CPU, this will transition from the boot gdt+pda to - the real ones). */ - load_gdt(cpu_gdt_descr); - set_kernel_fs(); -} - -/* Common CPU init for both boot and secondary CPUs */ -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) { + int cpu = smp_processor_id(); + struct task_struct *curr = current; struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; @@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) } load_idt(&idt_descr); + switch_to_new_gdt(); /* * Set up and load the per-CPU TSS and LDT @@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) mxcsr_feature_mask_init(); } -/* Entrypoint to initialize secondary CPU */ -void __cpuinit secondary_cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - _cpu_init(cpu, curr); -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - /* Set up the real GDT and PDA, so we can transition from the - boot versions. */ - if (!init_gdt(cpu, curr)) { - /* failed to allocate something; not much we can do... */ - for (;;) - local_irq_enable(); - } - - cpu_set_gdt(cpu); - _cpu_init(cpu, curr); -} - #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index de27bd07bc9..0b8411a864f 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if (vendor == PCI_VENDOR_ID_CYRIX && (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) - pit_latch_buggy = 1; + mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ @@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void) return 0; } -//early_arch_initcall(cyrix_init_cpu); - -static int __init cyrix_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CYRIX] = NULL; - return 0; -} - -late_initcall(cyrix_exit_cpu); - static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, @@ -470,12 +460,3 @@ int __init nsc_init_cpu(void) return 0; } -//early_arch_initcall(nsc_init_cpu); - -static int __init nsc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NSC] = NULL; - return 0; -} - -late_initcall(nsc_exit_cpu); diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 56fe2658495..dc4e08147b1 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - if (c->x86 == 15) + if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); + set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); + } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); if ((c->x86 == 0xf && c->x86_model >= 0x03) || diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index b0862af595a..f9fa4142551 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = k7_machine_check; wmb(); + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + printk (KERN_INFO "Intel machine check architecture supported.\n"); rdmsr (MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ @@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) nr_mce_banks = l & 0xff; /* Clear status for MC index 0 separately, we don't touch CTL, - * as some Athlons cause spurious MCEs when its enabled. */ - wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); - for (i=1; i<nr_mce_banks; i++) { + * as some K7 Athlons cause spurious MCEs when its enabled. */ + if (boot_cpu_data.x86 == 6) { + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + i = 1; + } else + i = 0; + for (; i<nr_mce_banks; i++) { wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); } diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 4f10c62d180..56cd485b127 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_AMD: - if (c->x86==6 || c->x86==15) - amd_mcheck_init(c); + amd_mcheck_init(c); break; case X86_VENDOR_INTEL: diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a4601..1509edfb231 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - if (mce_num_extended_msrs == 0) - goto done; - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); @@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_ESP, r->esp, h); rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; } static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) @@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - struct intel_mce_extended_msrs dbg; rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ @@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - if (intel_get_extended_msrs(&dbg)) { + if (mce_num_extended_msrs > 0) { + struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", smp_processor_id(), dbg.eip, dbg.eflags); printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f77fc53db65..5367e32e040 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -20,13 +20,25 @@ struct mtrr_state { mtrr_type def_type; }; +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + {} +}; + static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." -static __initdata int mtrr_show; +static int mtrr_show; module_param_named(show, mtrr_show, bool, 0); /* Get the MSR pair relating to a var range */ @@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -static void __init +static void get_fixed_ranges(mtrr_type * frs) { unsigned int *p = (unsigned int *) frs; @@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } -static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) +void mtrr_save_fixed_ranges(void *info) +{ + get_fixed_ranges(mtrr_state.fixed_ranges); +} + +static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types) { unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); + printk(KERN_INFO "MTRR %05X-%05X %s\n", + base, base + step - 1, mtrr_attrib_to_str(*types)); } /* Grab all of the MTRR state for this CPU into *state */ @@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } +/** + * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs + * see AMD publication no. 24593, chapter 3.2.1 for more information + */ +static inline void k8_enable_fixed_iorrs(void) +{ + unsigned lo, hi; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_K8_SYSCFG, lo + | K8_MTRRFIXRANGE_DRAM_ENABLE + | K8_MTRRFIXRANGE_DRAM_MODIFY, hi); +} + +/** + * Checks and updates an fixed-range MTRR if it differs from the value it + * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also. + * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information + * \param msr MSR address of the MTTR which should be checked and updated + * \param changed pointer which indicates whether the MTRR needed to be changed + * \param msrwords pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 15 && + ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) + k8_enable_fixed_iorrs(); + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = TRUE; + } +} + int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. <base> The starting (base) address of the region. @@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *type = base_lo & 0xff; } +/** + * Checks and updates the fixed-range MTRRs if they differ from the saved set + * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ static int set_fixed_ranges(mtrr_type * frs) { - unsigned int *p = (unsigned int *) frs; + unsigned long long *saved = (unsigned long long *) frs; int changed = FALSE; - int i; - unsigned int lo, hi; + int block=-1, range; - rdmsr(MTRRfix64K_00000_MSR, lo, hi); - if (p[0] != lo || p[1] != hi) { - mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); - changed = TRUE; - } + while (fixed_range_blocks[++block].ranges) + for (range=0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *) saved++); - for (i = 0; i < 2; i++) { - rdmsr(MTRRfix16K_80000_MSR + i, lo, hi); - if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], - p[3 + i * 2]); - changed = TRUE; - } - } - - for (i = 0; i < 8; i++) { - rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi); - if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], - p[7 + i * 2]); - changed = TRUE; - } - } return changed; } @@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base + size < 0x100) { + if (base < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 0acfb6a5a22..02a2f39e5e0 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -729,6 +729,17 @@ void mtrr_ap_init(void) local_irq_restore(flags); } +/** + * Save current fixed-range MTRR state of the BSP + */ +void mtrr_save_state(void) +{ + if (smp_processor_id() == 0) + mtrr_save_fixed_ranges(NULL); + else + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index 8bf23cc80c6..961fbe1a748 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void) cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; return 0; } - -//early_arch_initcall(nexgen_init_cpu); - -static int __init nexgen_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NEXGEN] = NULL; - return 0; -} - -late_initcall(nexgen_exit_cpu); diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c new file mode 100644 index 00000000000..2b04c8f1db6 --- /dev/null +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -0,0 +1,658 @@ +/* local apic based NMI watchdog for various CPUs. + This file also handles reservation of performance counters for coordination + with other users (like oprofile). + + Note that these events normally don't tick when the CPU idles. This means + the frequency varies with CPU load. + + Original code for K7/P6 written by Keith Owens */ + +#include <linux/percpu.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/apic.h> +#include <asm/intel_arch_perfmon.h> + +struct nmi_watchdog_ctlblk { + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; + +/* Interface defining a CPU specific perfctr watchdog */ +struct wd_ops { + int (*reserve)(void); + void (*unreserve)(void); + int (*setup)(unsigned nmi_hz); + void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); + void (*stop)(void *); + unsigned perfctr; + unsigned evntsel; + u64 checkbit; +}; + +static struct wd_ops *wd_ops; + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); + +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + return wd_ops ? msr - wd_ops->perfctr : 0; +} + +/* converts an msr to an appropriate reservation bit */ +/* returns the bit offset of the event selection register */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + return wd_ops ? msr - wd_ops->evntsel : 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, perfctr_nmi_owner)) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, perfctr_nmi_owner); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, evntsel_nmi_owner)) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, evntsel_nmi_owner); +} + +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); + +void disable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) + return; + + on_each_cpu(wd_ops->stop, NULL, 0, 1); + wd_ops->unreserve(); + + BUG_ON(atomic_read(&nmi_active) != 0); +} + +void enable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (!wd_ops) + return; + if (!wd_ops->reserve()) { + printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); + return; + } + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); +} + +/* + * Activate the NMI watchdog via the local APIC. + */ + +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + +static void +write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsrl(perfctr_msr, 0 - count); +} + +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + +/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface + nicely stable so there is not much variety */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +static int setup_k7_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void single_msr_stop_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int single_msr_reserve(void) +{ + if (!reserve_perfctr_nmi(wd_ops->perfctr)) + return 0; + + if (!reserve_evntsel_nmi(wd_ops->evntsel)) { + release_perfctr_nmi(wd_ops->perfctr); + return 0; + } + return 1; +} + +static void single_msr_unreserve(void) +{ + release_evntsel_nmi(wd_ops->perfctr); + release_perfctr_nmi(wd_ops->evntsel); +} + +static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops k7_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_k7_watchdog, + .rearm = single_msr_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_K7_PERFCTR0, + .evntsel = MSR_K7_EVNTSEL0, + .checkbit = 1ULL<<63, +}; + +/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +static int setup_p6_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_P6_PERFCTR0; + evntsel_msr = MSR_P6_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = P6_EVNTSEL_INT + | P6_EVNTSEL_OS + | P6_EVNTSEL_USR + | P6_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant. + * ArchPerfom/Core Duo also needs this */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); +} + +static struct wd_ops p6_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_p6_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_P6_PERFCTR0, + .evntsel = MSR_P6_EVNTSEL0, + .checkbit = 1ULL<<39, +}; + +/* Intel P4 performance counters. By far the most complicated of all. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) + +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ + +static int setup_p4_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; + unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + +#ifdef CONFIG_SMP + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else +#endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } + + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + return 1; +} + +static void stop_p4_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int p4_reserve(void) +{ + if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) + return 0; +#ifdef CONFIG_SMP + if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) + goto fail1; +#endif + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail2; + /* RED-PEN why is ESCR1 not reserved here? */ + return 1; + fail2: +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); + fail1: +#endif + release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); + return 0; +} + +static void p4_unreserve(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_evntsel_nmi(MSR_P4_IQ_PERFCTR1); +#endif + release_evntsel_nmi(MSR_P4_IQ_PERFCTR0); + release_perfctr_nmi(MSR_P4_CRU_ESCR0); +} + +static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + unsigned dummy; + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops p4_wd_ops = { + .reserve = p4_reserve, + .unreserve = p4_unreserve, + .setup = setup_p4_watchdog, + .rearm = p4_rearm, + .stop = stop_p4_watchdog, + /* RED-PEN this is wrong for the other sibling */ + .perfctr = MSR_P4_BPU_PERFCTR0, + .evntsel = MSR_P4_BSU_ESCR0, + .checkbit = 1ULL<<39, +}; + +/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully + all future Intel CPUs. */ + +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(unsigned nmi_hz) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1); + return 1; +} + +static struct wd_ops intel_arch_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_intel_arch_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, +}; + +static void probe_nmi_watchdog(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) + return; + wd_ops = &k7_wd_ops; + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + wd_ops = &intel_arch_wd_ops; + break; + } + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + wd_ops = &p6_wd_ops; + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + return; + + wd_ops = &p4_wd_ops; + break; + default: + return; + } + break; + } +} + +/* Interface to nmi.c */ + +int lapic_watchdog_init(unsigned nmi_hz) +{ + if (!wd_ops) { + probe_nmi_watchdog(); + if (!wd_ops) + return -1; + } + + if (!(wd_ops->setup(nmi_hz))) { + printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", + raw_smp_processor_id()); + return -1; + } + + return 0; +} + +void lapic_watchdog_stop(void) +{ + if (wd_ops) + wd_ops->stop(NULL); +} + +unsigned lapic_adjust_nmi_hz(unsigned hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) + hz = adjust_for_32bit_ctr(hz); + return hz; +} + +int lapic_wd_event(unsigned nmi_hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 ctr; + rdmsrl(wd->perfctr_msr, ctr); + if (ctr & wd_ops->checkbit) { /* perfctr still running? */ + return 0; + } + wd_ops->rearm(wd, nmi_hz); + return 1; +} + +int lapic_watchdog_ok(void) +{ + return wd_ops != NULL; +} diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 47e3ebbfb28..89d91e6cc97 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, - NULL, /* constant_tsc - moved to flags */ + "", /* constant_tsc - moved to flags */ /* nothing */ }; struct cpuinfo_x86 *c = v; diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index 9317f741498..50076f22e90 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -50,12 +50,3 @@ int __init rise_init_cpu(void) return 0; } -//early_arch_initcall(rise_init_cpu); - -static int __init rise_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = NULL; - return 0; -} - -late_initcall(rise_exit_cpu); diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 5678d46863c..6471a5a1320 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void) cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; return 0; } - -//early_arch_initcall(transmeta_init_cpu); - -static int __init transmeta_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_TRANSMETA] = NULL; - return 0; -} - -late_initcall(transmeta_exit_cpu); diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 1bf3f87e9c5..a7a4e75bdcd 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -24,13 +24,3 @@ int __init umc_init_cpu(void) cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; return 0; } - -//early_arch_initcall(umc_init_cpu); - -static int __init umc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_UMC] = NULL; - return 0; -} - -late_initcall(umc_exit_cpu); diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index b4d14c2eb34..265c5597efb 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -33,7 +33,7 @@ static void doublefault_fn(void) printk("double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct tss_struct *t = (struct tss_struct *)tss; + struct i386_hw_tss *t = (struct i386_hw_tss *)tss; printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp); @@ -49,18 +49,21 @@ static void doublefault_fn(void) } struct tss_struct doublefault_tss __cacheline_aligned = { - .esp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .x86_tss = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, - .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ - .esp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, + .eip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .eflags = X86_EFLAGS_SF | 0x2, + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __pa(swapper_pg_dir) + } }; diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 70f39560846..9645bb51f76 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { { static int __init romsignature(const unsigned char *rom) { + const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address((const unsigned short *)rom, sig) == 0 && - sig == ROMSIGNATURE; + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; } -static int __init romchecksum(unsigned char *rom, unsigned long length) +static int __init romchecksum(const unsigned char *rom, unsigned long length) { - unsigned char sum; + unsigned char sum, c; - for (sum = 0; length; length--) - sum += *rom++; - return sum == 0; + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; } static void __init probe_roms(void) { + const unsigned char *rom; unsigned long start, length, upper; - unsigned char *rom; - int i; + unsigned char c; + int i; /* video rom */ upper = adapter_rom_resources[0].start; @@ -191,8 +192,11 @@ static void __init probe_roms(void) video_rom_resource.start = start; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* if checksum okay, trust length byte */ if (length && romchecksum(rom, length)) @@ -226,8 +230,11 @@ static void __init probe_roms(void) if (!romsignature(rom)) continue; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* but accept any length that fits if checksum okay */ if (!length || start + length > upper || !romchecksum(rom, length)) @@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) ____________________33__ ______________________4_ */ - printk("sanitize start\n"); /* if there's only one memory region, don't bother */ if (*pnr_map < 2) { - printk("sanitize bail 0\n"); return -1; } @@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) /* bail out if we find any unreasonable addresses in bios map */ for (i=0; i<old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { - printk("sanitize bail 1\n"); return -1; } @@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); *pnr_map = new_nr; - printk("sanitize end\n"); return 0; } @@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) unsigned long long size = biosmap->size; unsigned long long end = start + size; unsigned long type = biosmap->type; - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) @@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) * Not right. Fix it up. */ if (type == E820_RAM) { - printk("copy_e820_map() type is E820_RAM\n"); if (start < 0x100000ULL && end > 0xA0000ULL) { - printk("copy_e820_map() lies in range...\n"); - if (start < 0xA0000ULL) { - printk("copy_e820_map() start < 0xA0000ULL\n"); + if (start < 0xA0000ULL) add_memory_region(start, 0xA0000ULL-start, type); - } - if (end <= 0x100000ULL) { - printk("copy_e820_map() end <= 0x100000ULL\n"); + if (end <= 0x100000ULL) continue; - } start = 0x100000ULL; size = end - start; } @@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size) print_memory_map("limit_regions endfunc"); } +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + /* * This function checks if the entire range <start,end> is mapped with type. * diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 8f9c624ace6..dd9e7faafa7 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct *cpu_gdt_descr; + struct Xgt_desc_struct gdt_descr; spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); - cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - /* * If I don't have PSE, I should just duplicate two entries in page * directory. If I have PSE, I just need to duplicate one entry in @@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) */ local_flush_tlb(); - cpu_gdt_descr->address = __pa(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); } static void efi_call_phys_epilog(void) __releases(efi_rt_lock) { unsigned long cr4; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); + struct Xgt_desc_struct gdt_descr; - cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 18bddcb8e9e..b1f16ee65e4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -15,7 +15,7 @@ * I changed all the .align's to 4 (16 byte alignment), as that's faster * on a 486. * - * Stack layout in 'ret_from_system_call': + * Stack layout in 'syscall_exit': * ptrace needs to have all regs on the stack. * if the order here is changed, it needs to be * updated in fork.c:copy_process, signal.c:do_signal, @@ -132,7 +132,7 @@ VM_MASK = 0x00020000 movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %fs #define RESTORE_INT_REGS \ @@ -305,16 +305,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ -#ifndef CONFIG_COMPAT_VDSO /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -#else - pushl $SYSENTER_RETURN -#endif CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 @@ -342,7 +338,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -560,9 +556,7 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt_descr, %ebx); \ - movl GDS_address(%ebx), %ebx; \ + PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ @@ -635,7 +629,7 @@ ENTRY(name) \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ - call smp_/**/name; \ + call smp_##name; \ jmp ret_from_intr; \ CFI_ENDPROC; \ ENDPROC(name) @@ -643,11 +637,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -/* This alternate entry is needed because we hijack the apic LVTT */ -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -#endif - KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault @@ -686,7 +675,7 @@ error_code: pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 3fa7f9389af..9b10af65faa 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -34,17 +34,32 @@ /* * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. We need one bit for - * each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) + * and including _end* we need mapped initially. + * We need: + * - one bit for each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * - enough space to map all low memory, which means + * (2^32/4096) / 1024 pages (worst case, non PAE) + * (2^32/4096) / 512 + 4 pages (worst case for PAE) + * - a few pages for allocator use before the kernel pagetable has + * been set up * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. */ -#define INIT_MAP_BEYOND_END (128*1024) +LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +#if PTRS_PER_PMD > 1 +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#else +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#endif +BOOTBITMAP_SIZE = LOW_PAGES / 8 +ALLOCATOR_SLOP = 4 + +INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, @@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but - * we know the trampoline has already loaded the boot_gdt_table GDT - * for us. + * we know the trampoline has already loaded the boot_gdt for us. * * If cpu hotplug is not supported then this code can go in init section * which will be freed later @@ -318,12 +332,12 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - call setup_pda lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. + movl %eax,%fs # gets reset once there's real percpu movl $(__USER_DS),%eax # DS/ES contains default USER segment movl %eax,%ds @@ -333,16 +347,17 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax - movl $(__KERNEL_PDA),%eax - mov %eax,%fs - cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP movb ready, %cl movb $1, ready cmpb $0,%cl # the first CPU calls start_kernel - jne initialize_secondary # all other CPUs call initialize_secondary + je 1f + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + jmp initialize_secondary # all other CPUs call initialize_secondary +1: #endif /* CONFIG_SMP */ jmp start_kernel @@ -366,23 +381,6 @@ check_x87: ret /* - * Point the GDT at this CPU's PDA. On boot this will be - * cpu_gdt_table and boot_pda; for secondary CPUs, these will be - * that CPU's GDT and PDA. - */ -ENTRY(setup_pda) - /* get the PDA pointer */ - movl start_pda, %eax - - /* slot the PDA address into the GDT */ - mov early_gdt_descr+2, %ecx - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ - shr $16, %eax - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ - ret - -/* * setup_idt * * sets up a idt with 256 entries pointing to @@ -554,9 +552,6 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data -ENTRY(start_pda) - .long boot_pda - ENTRY(stack_start) .long init_thread_union+THREAD_SIZE .long __BOOT_DS @@ -588,7 +583,7 @@ fault_msg: .word 0 # 32 bit align gdt_desc.address boot_gdt_descr: .word __BOOT_DS+7 - .long boot_gdt_table - __PAGE_OFFSET + .long boot_gdt - __PAGE_OFFSET .word 0 # 32-bit align idt_desc.address idt_descr: @@ -599,67 +594,14 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long cpu_gdt_table + .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ /* - * The boot_gdt_table must mirror the equivalent in setup.S and is + * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt_table) +ENTRY(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ - -/* - * The Global Descriptor Table contains 28 quadwords, per-CPU. - */ - .align L1_CACHE_BYTES -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* - * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - .quad 0x00409a000000ffff /* 0x90 32-bit code */ - .quad 0x00009a000000ffff /* 0x98 16-bit code */ - .quad 0x000092000000ffff /* 0xa0 16-bit data */ - .quad 0x0000920000000000 /* 0xa8 16-bit data */ - .quad 0x0000920000000000 /* 0xb0 16-bit data */ - - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. - */ - .quad 0x00409a000000ffff /* 0xb8 APM CS code */ - .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x004092000000ffff /* 0xc8 APM DS data */ - - .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x00cf92000000ffff /* 0xd8 - PDA */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 4afe26e8626..e3d4b73bfdb 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed); #endif EXPORT_SYMBOL(csum_partial); - -EXPORT_SYMBOL(_proxy_pda); diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 89d85d24492..1b623cda3a6 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -35,6 +35,7 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> +#include <linux/kthread.h> #include <asm/io.h> #include <asm/smp.h> @@ -661,8 +662,6 @@ static int balanced_irq(void *unused) unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; - daemonize("kirqd"); - /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < NR_IRQS ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); @@ -722,10 +721,9 @@ static int __init balanced_irq_init(void) } printk(KERN_INFO "Starting balanced_irq\n"); - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) return 0; - else - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for_each_possible_cpu(i) { kfree(irq_cpu_data[i].irq_delta); @@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -static inline void UNEXPECTED_IO_APIC(void) -{ -} - void __init print_IO_APIC(void) { int apic, i; @@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void) if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } /* @@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void) reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 498e8bc197d..d1e42e0dbe6 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -16,6 +16,7 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/thread_info.h> +#include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) @@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * Reset the owner so that a process switch will not set * tss->io_bitmap_base to IO_BITMAP_OFFSET. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; put_cpu(); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 8db8d514c9c..d2daf672f4a 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -24,6 +24,9 @@ DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 4f5983c9866..0952eccd8f2 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 84c3497efb6..33cf2f3c444 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -20,7 +20,6 @@ #include <linux/sysdev.h> #include <linux/sysctl.h> #include <linux/percpu.h> -#include <linux/dmi.h> #include <linux/kprobes.h> #include <linux/cpumask.h> #include <linux/kernel_stat.h> @@ -28,30 +27,14 @@ #include <asm/smp.h> #include <asm/nmi.h> #include <asm/kdebug.h> -#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" int unknown_nmi_panic; int nmi_watchdog_enabled; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -extern void show_registers(struct pt_regs *regs); -extern int unknown_nmi_panic; - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_PERFCTR0); - case 15: - return (msr - MSR_P4_BPU_PERFCTR0); - } - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_EVNTSEL0); - case 15: - return (msr - MSR_P4_BSU_ESCR0); - } - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]); -} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_perfctr_nmi(cpu, msr); - } -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) - || (boot_cpu_data.x86 == 16)); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); - } - return 0; -} - static int endflag __initdata = 0; #ifdef CONFIG_SMP @@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void) if (!cpu_isset(cpu, cpu_callin_map)) continue; #endif - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], nmi_count(cpu)); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(prev_nmi_count); return 0; @@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} +/* Suspend/resume support */ #ifdef CONFIG_PM @@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_P6_PERFCTR0; - evntsel_msr = MSR_P6_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p6_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) +static void __acpi_nmi_enable(void *__unused) { - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; + apic_write_around(APIC_LVT0, APIC_DM_NMI); } -static void stop_p4_watchdog(void) +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); } -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) +static void __acpi_nmi_disable(void *__unused) { - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } -static void stop_intel_arch_watchdog(void) +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) { - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } void setup_apic_nmi_watchdog (void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) - return; + if (__get_cpu_var(wd_enabled)) + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - return; - - if (!setup_p6_watchdog()) - return; - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - - if (!setup_p4_watchdog()) - return; - break; - default: - return; - } - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - break; - stop_p6_watchdog(); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - break; - stop_p4_watchdog(); - break; - } - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; int rc=0; /* check for other users first */ @@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) alert_counter[cpu] = 0; } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog timer interrupt */ - goto done; - } - - /* only Intel P4 uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL); - } else { - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, } if (nmi_watchdog == NMI_DEFAULT) { - if (nmi_known_cpu() > 0) + if (lapic_watchdog_ok()) nmi_watchdog = NMI_LOCAL_APIC; else nmi_watchdog = NMI_IO_APIC; @@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 2ec331e03fa..5c10f376bce 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -20,6 +20,7 @@ #include <linux/efi.h> #include <linux/bcd.h> #include <linux/start_kernel.h> +#include <linux/highmem.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -35,7 +36,7 @@ #include <asm/timer.h> /* nop stub */ -static void native_nop(void) +void _paravirt_nop(void) { } @@ -54,331 +55,148 @@ char *memory_setup(void) #define DEF_NATIVE(name, code) \ extern const char start_##name[], end_##name[]; \ asm("start_" #name ": " code "; end_" #name ":") -DEF_NATIVE(cli, "cli"); -DEF_NATIVE(sti, "sti"); -DEF_NATIVE(popf, "push %eax; popf"); -DEF_NATIVE(pushf, "pushf; pop %eax"); -DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); + +DEF_NATIVE(irq_disable, "cli"); +DEF_NATIVE(irq_enable, "sti"); +DEF_NATIVE(restore_fl, "push %eax; popf"); +DEF_NATIVE(save_fl, "pushf; pop %eax"); DEF_NATIVE(iret, "iret"); -DEF_NATIVE(sti_sysexit, "sti; sysexit"); +DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(clts, "clts"); +DEF_NATIVE(read_tsc, "rdtsc"); -static const struct native_insns -{ - const char *start, *end; -} native_insns[] = { - [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, - [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, - [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, - [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, - [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, - [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, - [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, -}; +DEF_NATIVE(ud2a, "ud2a"); static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) { - unsigned int insn_len; - - /* Don't touch it if we don't have a replacement */ - if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) - return len; - - insn_len = native_insns[type].end - native_insns[type].start; - - /* Similarly if we can't fit replacement. */ - if (len < insn_len) - return len; + const unsigned char *start, *end; + unsigned ret; + + switch(type) { +#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site + SITE(irq_disable); + SITE(irq_enable); + SITE(restore_fl); + SITE(save_fl); + SITE(iret); + SITE(irq_enable_sysexit); + SITE(read_cr2); + SITE(read_cr3); + SITE(write_cr3); + SITE(clts); + SITE(read_tsc); +#undef SITE + + patch_site: + ret = paravirt_patch_insns(insns, len, start, end); + break; - memcpy(insns, native_insns[type].start, insn_len); - return insn_len; -} + case PARAVIRT_PATCH(make_pgd): + case PARAVIRT_PATCH(make_pte): + case PARAVIRT_PATCH(pgd_val): + case PARAVIRT_PATCH(pte_val): +#ifdef CONFIG_X86_PAE + case PARAVIRT_PATCH(make_pmd): + case PARAVIRT_PATCH(pmd_val): +#endif + /* These functions end up returning exactly what + they're passed, in the same registers. */ + ret = paravirt_patch_nop(); + break; -static unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("movl %%db0, %0" :"=r" (val)); break; - case 1: - asm("movl %%db1, %0" :"=r" (val)); break; - case 2: - asm("movl %%db2, %0" :"=r" (val)); break; - case 3: - asm("movl %%db3, %0" :"=r" (val)); break; - case 6: - asm("movl %%db6, %0" :"=r" (val)); break; - case 7: - asm("movl %%db7, %0" :"=r" (val)); break; default: - BUG(); - } - return val; -} - -static void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("movl %0,%%db0" : /* no output */ :"r" (value)); - break; - case 1: - asm("movl %0,%%db1" : /* no output */ :"r" (value)); - break; - case 2: - asm("movl %0,%%db2" : /* no output */ :"r" (value)); + ret = paravirt_patch_default(type, clobbers, insns, len); break; - case 3: - asm("movl %0,%%db3" : /* no output */ :"r" (value)); - break; - case 6: - asm("movl %0,%%db6" : /* no output */ :"r" (value)); - break; - case 7: - asm("movl %0,%%db7" : /* no output */ :"r" (value)); - break; - default: - BUG(); } -} - -void init_IRQ(void) -{ - paravirt_ops.init_IRQ(); -} - -static void native_clts(void) -{ - asm volatile ("clts"); -} - -static unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr0(unsigned long val) -{ - asm volatile("movl %0,%%cr0": :"r" (val)); -} - -static unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr2(unsigned long val) -{ - asm volatile("movl %0,%%cr2": :"r" (val)); -} - -static unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr3(unsigned long val) -{ - asm volatile("movl %0,%%cr3": :"r" (val)); -} - -static unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); - return val; -} - -static unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist */ - asm("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (val): "0" (0)); - return val; -} - -static void native_write_cr4(unsigned long val) -{ - asm volatile("movl %0,%%cr4": :"r" (val)); -} - -static unsigned long native_save_fl(void) -{ - unsigned long f; - asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); - return f; -} - -static void native_restore_fl(unsigned long f) -{ - asm volatile("pushl %0 ; popfl": /* no output */ - :"g" (f) - :"memory", "cc"); -} - -static void native_irq_disable(void) -{ - asm volatile("cli": : :"memory"); -} - -static void native_irq_enable(void) -{ - asm volatile("sti": : :"memory"); -} - -static void native_safe_halt(void) -{ - asm volatile("sti; hlt": : :"memory"); -} -static void native_halt(void) -{ - asm volatile("hlt": : :"memory"); + return ret; } -static void native_wbinvd(void) +unsigned paravirt_patch_nop(void) { - asm volatile("wbinvd": : :"memory"); + return 0; } -static unsigned long long native_read_msr(unsigned int msr, int *err) +unsigned paravirt_patch_ignore(unsigned len) { - unsigned long long val; - - asm volatile("2: rdmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %3,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=r" (*err), "=A" (val) - : "c" (msr), "i" (-EFAULT)); - - return val; + return len; } -static int native_write_msr(unsigned int msr, unsigned long long val) +unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, + void *site, u16 site_clobbers, + unsigned len) { - int err; - asm volatile("2: wrmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %4,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=a" (err) - : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), - "i" (-EFAULT)); - return err; -} + unsigned char *call = site; + unsigned long delta = (unsigned long)target - (unsigned long)(call+5); -static unsigned long long native_read_tsc(void) -{ - unsigned long long val; - asm volatile("rdtsc" : "=A" (val)); - return val; -} + if (tgt_clobbers & ~site_clobbers) + return len; /* target would clobber too much for this site */ + if (len < 5) + return len; /* call too long for patch site */ -static unsigned long long native_read_pmc(void) -{ - unsigned long long val; - asm volatile("rdpmc" : "=A" (val)); - return val; -} + *call++ = 0xe8; /* call */ + *(unsigned long *)call = delta; -static void native_load_tr_desc(void) -{ - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); + return 5; } -static void native_load_gdt(const struct Xgt_desc_struct *dtr) +unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { - asm volatile("lgdt %0"::"m" (*dtr)); -} + unsigned char *jmp = site; + unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); -static void native_load_idt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} + if (len < 5) + return len; /* call too long for patch site */ -static void native_store_gdt(struct Xgt_desc_struct *dtr) -{ - asm ("sgdt %0":"=m" (*dtr)); -} + *jmp++ = 0xe9; /* jmp */ + *(unsigned long *)jmp = delta; -static void native_store_idt(struct Xgt_desc_struct *dtr) -{ - asm ("sidt %0":"=m" (*dtr)); + return 5; } -static unsigned long native_store_tr(void) +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len) { - unsigned long tr; - asm ("str %0":"=r" (tr)); - return tr; -} + void *opfunc = *((void **)¶virt_ops + type); + unsigned ret; -static void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C -} + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a); + else if (opfunc == paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); + else if (type == PARAVIRT_PATCH(iret) || + type == PARAVIRT_PATCH(irq_enable_sysexit)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(opfunc, site, len); + else + /* Otherwise call the function; assume target could + clobber any caller-save reg */ + ret = paravirt_patch_call(opfunc, CLBR_ANY, + site, clobbers, len); -static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) -{ - u32 *lp = (u32 *)((char *)dt + entry*8); - lp[0] = entry_low; - lp[1] = entry_high; + return ret; } -static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +unsigned paravirt_patch_insns(void *site, unsigned len, + const char *start, const char *end) { - native_write_dt_entry(dt, entrynum, low, high); -} + unsigned insn_len = end - start; -static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} - -static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} + if (insn_len > len || start == NULL) + insn_len = len; + else + memcpy(site, start, insn_len); -static void native_load_esp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } + return insn_len; } -static void native_io_delay(void) +void init_IRQ(void) { - asm volatile("outb %al,$0x80"); + paravirt_ops.init_IRQ(); } static void native_flush_tlb(void) @@ -395,83 +213,11 @@ static void native_flush_tlb_global(void) __native_flush_tlb_global(); } -static void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(unsigned long addr) { __native_flush_tlb_single(addr); } -#ifndef CONFIG_X86_PAE -static void native_set_pte(pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - *pmdp = pmdval; -} - -#else /* CONFIG_X86_PAE */ - -static void native_set_pte(pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - set_64bit((unsigned long long *)ptep,pte_val(pteval)); -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); -} - -static void native_set_pud(pud_t *pudp, pud_t pudval) -{ - *pudp = pudval; -} - -static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = 0; -} - -static void native_pmd_clear(pmd_t *pmd) -{ - u32 *tmp = (u32 *)pmd; - *tmp = 0; - smp_wmb(); - *(tmp + 1) = 0; -} -#endif /* CONFIG_X86_PAE */ - /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ .patch = native_patch, .banner = default_banner, - .arch_setup = native_nop, + .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, @@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = { .safe_halt = native_safe_halt, .halt = native_halt, .wbinvd = native_wbinvd, - .read_msr = native_read_msr, - .write_msr = native_write_msr, + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .get_scheduled_cycles = native_read_tsc, @@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - .write_ldt_entry = native_write_ldt_entry, - .write_gdt_entry = native_write_gdt_entry, - .write_idt_entry = native_write_idt_entry, + .write_ldt_entry = write_dt_entry, + .write_gdt_entry = write_dt_entry, + .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, .set_iopl_mask = native_set_iopl_mask, @@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, + .startup_ipi_hook = paravirt_nop, #endif - .set_lazy_mode = (void *)native_nop, + .set_lazy_mode = paravirt_nop, + + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_others = native_flush_tlb_others, - .map_pt_hook = (void *)native_nop, - - .alloc_pt = (void *)native_nop, - .alloc_pd = (void *)native_nop, - .alloc_pd_clone = (void *)native_nop, - .release_pt = (void *)native_nop, - .release_pd = (void *)native_nop, + .alloc_pt = paravirt_nop, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pt = paravirt_nop, + .release_pd = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, - .pte_update = (void *)native_nop, - .pte_update_defer = (void *)native_nop, + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = kmap_atomic, +#endif + #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, + + .pmd_val = native_pmd_val, + .make_pmd = native_make_pmd, #endif + .pte_val = native_pte_val, + .pgd_val = native_pgd_val, + + .make_pte = native_make_pte, + .make_pgd = native_make_pgd, + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, - .startup_ipi_hook = (void *)native_nop, + .dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, + .activate_mm = paravirt_nop, }; -/* - * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops - * semantics are subject to change. Hence we only do this - * internal-only export of this, until it gets sorted out and - * all lowlevel CPU ops used by modules are separately exported. - */ -EXPORT_SYMBOL_GPL(paravirt_ops); +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 393a67d5d94..61999479b7a 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> +#include <linux/percpu.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -57,7 +58,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/pda.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -66,6 +66,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * Return saved PC of a blocked thread. */ @@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c) } } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; #ifdef CONFIG_X86_SMP if (smp_num_siblings > 1) printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); #endif - } else if (!strncmp(str, "halt", 4)) { - printk("using halt in idle threads.\n"); - pm_idle = default_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { @@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xfs = __KERNEL_PDA; + regs.xfs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -376,7 +381,7 @@ void exit_thread(void) t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; tss->io_bitmap_max = 0; - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } } @@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * Disable the bitmap via an invalid offset. We still cache * the previous bitmap owner and the IO bitmap contents: */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; return; } @@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * matches the next task, we dont have to do anything but * to set a valid offset in the TSS: */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; return; } /* @@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * redundant copies when the currently switched task does not * perform any I/O during its timeslice. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } /* @@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (prev->gs | next->gs) loadsegment(gs, next->gs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); return prev_p; } diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 34874c398b4..9f6ab1789bb 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,12 +3,10 @@ */ #include <linux/pci.h> #include <linux/irq.h> -#include <asm/pci-direct.h> -#include <asm/genapic.h> -#include <asm/cpu.h> #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) * based platforms. - * For those platforms, make sure that the genapic is set to 'flat' + * Disable SW irqbalance/affinity on those platforms. */ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) return; + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + /* enable access to config space*/ pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); @@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif - } - - /* put back the original value for config space*/ - if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); -} - -void __init quirk_intel_irqbalance(void) -{ - u8 config, rev; - u32 word; - - /* BIOS may enable hardware IRQ balancing for - * E7520/E7320/E7525(revision ID 0x9 and below) - * based platforms. - * Disable SW irqbalance/affinity on those platforms. - */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); - if (rev > 0x9) - return; - - printk(KERN_INFO "Intel E7520/7320/7525 detected."); - - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); - - /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); - - if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); #ifdef CONFIG_IRQBALANCE irqbalance_disable(""); @@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void) #ifdef CONFIG_PROC_FS no_irq_affinity = 1; #endif -#ifdef CONFIG_HOTPLUG_CPU - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; -#endif -#ifdef CONFIG_X86_64 - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. - */ - genapic_force = &apic_flat; -#endif } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 3514b4153f7..50dfc65319c 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -17,7 +17,8 @@ #include <asm/apic.h> #include <asm/desc.h> #include "mach_reboot.h" -#include <linux/reboot_fixups.h> +#include <asm/reboot_fixups.h> +#include <asm/reboot.h> /* * Power off function, if any @@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] = */ void machine_real_restart(unsigned char *code, int length) { - unsigned long flags; - local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length) safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */ - spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); - spin_unlock_irqrestore(&rtc_lock, flags); + spin_unlock(&rtc_lock); /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at @@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length) EXPORT_SYMBOL(machine_real_restart); #endif -void machine_shutdown(void) +static void native_machine_shutdown(void) { #ifdef CONFIG_SMP int reboot_cpu_id; @@ -316,7 +315,11 @@ void machine_shutdown(void) #endif } -void machine_emergency_restart(void) +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + +static void native_machine_emergency_restart(void) { if (!reboot_thru_bios) { if (efi_enabled) { @@ -340,17 +343,17 @@ void machine_emergency_restart(void) machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); } -void machine_restart(char * __unused) +static void native_machine_restart(char * __unused) { machine_shutdown(); machine_emergency_restart(); } -void machine_halt(void) +static void native_machine_halt(void) { } -void machine_power_off(void) +static void native_machine_power_off(void) { if (pm_power_off) { machine_shutdown(); @@ -359,3 +362,35 @@ void machine_power_off(void) } +struct machine_ops machine_ops = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c index 99aab41a05b..2d78d918340 100644 --- a/arch/i386/kernel/reboot_fixups.c +++ b/arch/i386/kernel/reboot_fixups.c @@ -10,7 +10,7 @@ #include <asm/delay.h> #include <linux/pci.h> -#include <linux/reboot_fixups.h> +#include <asm/reboot_fixups.h> static void cs5530a_warm_reset(struct pci_dev *dev) { diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 0e8977871b1..89a45a9ddcd 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector) } /* - * This is only used on smaller machines. + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +static inline void __send_IPI_dest_field(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long cfg; - unsigned long flags; - local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. */ - apic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + apic_wait_icr_idle(); /* * prepare target chip field @@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); +} + +/* + * This is only used on smaller machines. + */ +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long flags; + local_irq_save(flags); + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; + unsigned long flags; unsigned int query_cpu; /* @@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu)); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), + vector); } } local_irq_restore(flags); @@ -256,7 +248,6 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* * We cannot call mmdrop() because we are in interrupt context, @@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); @@ -353,9 +344,11 @@ out: put_cpu_no_resched(); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { + cpumask_t cpumask = *cpumaskp; + /* * A couple of (to be removed) sanity checks: * @@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); +#ifdef CONFIG_HOTPLUG_CPU /* If a CPU which we ran on has gone down, OK. */ cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) + if (unlikely(cpus_empty(cpumask))) return; +#endif /* * i'm not happy about this global shared spinlock in the @@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, flush_mm = mm; flush_va = va; -#if NR_CPUS <= BITS_PER_LONG - atomic_set_mask(cpumask, &flush_cpumask); -#else - { - int k; - unsigned long *flush_mask = (unsigned long *)&flush_cpumask; - unsigned long *cpu_mask = (unsigned long *)&cpumask; - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) - atomic_set_mask(cpu_mask[k], &flush_mask[k]); - } -#endif + cpus_or(flush_cpumask, cpumask, flush_cpumask); /* * We have to send the IPI only to * CPUs affected. @@ -417,7 +402,7 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -483,7 +468,7 @@ void flush_tlb_all(void) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void smp_send_reschedule(int cpu) +void native_smp_send_reschedule(int cpu) { WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); @@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void) static struct call_data_struct *call_data; +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus() - 1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + + /** - * smp_call_function(): Run a function on all other CPUs. + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +int native_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; + cpumask_t allbutself; int cpus; + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); - cpus = num_online_cpus() - 1; + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + if (!cpus) { spin_unlock(&call_lock); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, call_data = &data; mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); /* Wait for response */ while (atomic_read(&data.started) != cpus) @@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, return 0; } + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + return smp_call_function_mask(cpu_online_map, func, info, wait); +} EXPORT_SYMBOL(smp_call_function); +/** + * smp_call_function_single - Run a function on another CPU + * @cpu: The target CPU. Cannot be the calling CPU. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int ret; + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); + static void stop_this_cpu (void * dummy) { + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy) * this function calls the 'stop' function on all other CPUs in the system. */ -void smp_send_stop(void) +void native_smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + /* Don't deadlock on the call lock in panic */ + int nolock = !spin_trylock(&call_lock); + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* @@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) } } -/* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. - */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = 1; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function_single - Run a function on another CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute <func> - * or is or has executed. - */ - -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - if (cpu == me) { - WARN_ON(1); - put_cpu(); - return -EBUSY; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - static int convert_apicid_to_cpu(int apic_id) { int i; @@ -730,3 +747,14 @@ int safe_smp_processor_id(void) return cpuid >= 0 ? cpuid : 0; } + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .cpu_up = native_cpu_up, + .smp_cpus_done = native_smp_cpus_done, + + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, + .smp_call_function_mask = native_smp_call_function_mask, +}; diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4ff55e67557..a4b7ad283f4 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -53,13 +53,12 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/nmi.h> -#include <asm/pda.h> -#include <asm/genapic.h> #include <mach_apic.h> #include <mach_wakecpu.h> #include <smpboot_hooks.h> #include <asm/vmi.h> +#include <asm/mtrr.h> /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + /* * Trampoline 80x86 program as an array. */ @@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id) *c = boot_cpu_data; if (id!=0) - identify_cpu(c); + identify_secondary_cpu(c); /* * Mask B, Pentium, but not Pentium MMX */ @@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu) static void __cpuinit start_secondary(void *unused) { /* - * Don't put *anything* before secondary_cpu_init(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. + * Don't put *anything* before cpu_init(), SMP booting is too + * fragile that we want to limit the things done here to the + * most necessary things. */ #ifdef CONFIG_VMI vmi_bringup(); #endif - secondary_cpu_init(); + cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) @@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused) void __devinit initialize_secondary(void) { /* - * switch to the per CPU GDT we already set up - * in do_boot_cpu() - */ - cpu_set_gdt(current_thread_info()->cpu); - - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. */ @@ -463,7 +459,6 @@ extern struct { void * esp; unsigned short ss; } stack_start; -extern struct i386_pda *start_pda; #ifdef CONFIG_NUMA @@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu) unmap_cpu_to_node(cpu); } -#if APIC_DEBUG static inline void __inquire_remote_apic(int apicid) { int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned long status; printk("Inquiring remote APIC #%d...\n", apicid); @@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk("%lx\n", status); break; default: printk("failed\n"); } } } -#endif #ifdef WAKE_SECONDARY_VIA_NMI /* @@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid) static int __devinit wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int timeout, maxlvt; + unsigned long send_status, accept_status = 0; + int maxlvt; /* Target chip */ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); @@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. @@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) static int __devinit wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; /* * Be paranoid about clearing APIC errors. @@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); atomic_set(&init_deasserted, 1); @@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. @@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu) #define alloc_idle_task(cpu) fork_idle(cpu) #endif +/* Initialize the CPU's GDT. This is either the boot CPU doing itself + (still using the master per-cpu area), or a CPU doing it for a + secondary which will soon come up. */ +static __cpuinit void init_gdt(int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, + (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + __per_cpu_offset[cpu], 0xFFFFF, + 0x80 | DESCTYPE_S | 0x2, 0x8); + + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + per_cpu(cpu_number, cpu) = cpu; +} + +/* Defined in head.S */ +extern struct Xgt_desc_struct early_gdt_descr; + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) unsigned short nmi_high = 0, nmi_low = 0; /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ @@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - /* Pre-allocate and initialize the CPU's GDT and PDA so it - doesn't have to do any memory allocation during the - delicate CPU-bringup phase. */ - if (!init_gdt(cpu, idle)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - return -1; /* ? */ - } + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ @@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; int apicid, ret; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) goto exit; } - /* - * the CPU isn't initialized at boot time, allocate gdt table here. - * cpu_init will initialize it - */ - if (!cpu_gdt_descr->address) { - cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); - if (!cpu_gdt_descr->address) - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - ret = -ENOMEM; - goto exit; - } - info.complete = &done; info.apicid = apicid; info.cpu = cpu; @@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ -void __init smp_prepare_cpus(unsigned int max_cpus) +void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_commenced_mask = cpumask_of_cpu(0); cpu_callin_map = cpumask_of_cpu(0); @@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus) smp_boot_cpus(max_cpus); } -void __devinit smp_prepare_boot_cpu(void) +void __init native_smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); - cpu_set(smp_processor_id(), cpu_present_map); - cpu_set(smp_processor_id(), cpu_possible_map); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unsigned int cpu = smp_processor_id(); + + init_gdt(cpu); + switch_to_new_gdt(); + + cpu_set(cpu, cpu_online_map); + cpu_set(cpu, cpu_callout_map); + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); + __get_cpu_var(cpu_state) = CPU_ONLINE; } #ifdef CONFIG_HOTPLUG_CPU @@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu) { unsigned long flags; #ifdef CONFIG_HOTPLUG_CPU @@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu) touch_nmi_watchdog(); } -#ifdef CONFIG_X86_GENERICARCH - if (num_online_cpus() > 8 && genapic == &apic_default) - panic("Default flat APIC routing can't be used with > 8 cpus\n"); -#endif - return 0; } -void __init smp_cpus_done(unsigned int max_cpus) +void __init native_smp_cpus_done(unsigned int max_cpus) { #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a85a1..ff4ee6f3326 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -22,16 +22,26 @@ #include <asm/msr.h> #include <asm/pgtable.h> #include <asm/unistd.h> +#include <asm/elf.h> +#include <asm/tlbflush.h> + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -#ifdef CONFIG_PARAVIRT -unsigned int __read_mostly vdso_enabled = 0; -#else -unsigned int __read_mostly vdso_enabled = 1; -#endif +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; EXPORT_SYMBOL_GPL(vdso_enabled); @@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_HIGH_BASE; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + !elf_check_arch(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_HIGH_BASE; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_HIGH_BASE; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_HIGH_BASE; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -56,14 +183,33 @@ void enable_sep_cpu(void) return; } - tss->ss1 = __KERNEL_CS; - tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); put_cpu(); } +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + /* * These symbols are defined by vsyscall.o to mark the bounds * of the ELF DSO images included therein. @@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + map ? PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + syscall_pages[0] = virt_to_page(syscall_page); -#ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); + gate_vma_init(); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); -#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(syscall_page, - &vsyscall_int80_start, - &vsyscall_int80_end - &vsyscall_int80_start); - return 0; + vsyscall = &vsyscall_int80_start; + vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + } else { + vsyscall = &vsyscall_sysenter_start; + vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; } - memcpy(syscall_page, - &vsyscall_sysenter_start, - &vsyscall_sysenter_end - &vsyscall_sysenter_start); + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); return 0; } -#ifndef CONFIG_COMPAT_VDSO /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; @@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = 0; + bool compat; down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall_pages); - if (ret) - goto up_fail; + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. + */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + syscall_pages); + + if (ret) + goto up_fail; + } current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); -up_fail: + (void *)VDSO_SYM(&SYSENTER_RETURN); + + up_fail: up_write(&mm->mmap_sem); + return ret; } @@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; return NULL; } @@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#endif diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 94e5cb09110..a665df61f08 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -70,8 +70,6 @@ #include <asm/i8259.h> -int pit_latch_buggy; /* extern */ - #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S index 2f1814c5cfd..f62815f8d06 100644 --- a/arch/i386/kernel/trampoline.S +++ b/arch/i386/kernel/trampoline.S @@ -29,7 +29,7 @@ * * TYPE VALUE * R_386_32 startup_32_smp - * R_386_32 boot_gdt_table + * R_386_32 boot_gdt */ #include <linux/linkage.h> @@ -62,8 +62,8 @@ r_base = . * to 32 bit. */ - lidtl boot_idt - r_base # load idt with 0, 0 - lgdtl boot_gdt - r_base # load gdt with whatever is appropriate + lidtl boot_idt_descr - r_base # load idt with 0, 0 + lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit @@ -73,11 +73,11 @@ r_base = . # These need to be in the same 64K segment as the above; # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt: +boot_gdt_descr: .word __BOOT_DS + 7 # gdt limit - .long boot_gdt_table-__PAGE_OFFSET # gdt base + .long boot_gdt - __PAGE_OFFSET # gdt base -boot_idt: +boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index af0d3f70a81..f21b41e7770 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, siginfo_t *info) { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (regs->eflags & VM_MASK) { if (vm86) @@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, goto kernel_trap; trap_signal: { + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) force_sig_info(signr, info, tsk); else @@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } kernel_trap: { - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } @@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, * and we set the offset field correctly. Then we let the CPU to * restart the faulting instruction. */ - if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && thread->io_bitmap_ptr) { memcpy(tss->io_bitmap, thread->io_bitmap_ptr, thread->io_bitmap_max); @@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, thread->io_bitmap_max, 0xff, tss->io_bitmap_max - thread->io_bitmap_max); tss->io_bitmap_max = thread->io_bitmap_max; - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; tss->io_bitmap_owner = thread; put_cpu(); return; } put_cpu(); - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -624,6 +634,8 @@ gp_in_vm86: gp_in_kernel: if (!fixup_exception(regs)) { + current->thread.error_code = error_code; + current->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; @@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - int cpu = smp_processor_id(); - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 6cb8f533673..f64b81f3033 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { if (!freq->old){ ref_freq = freq->new; - goto end; + return 0; } ref_freq = freq->old; loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; @@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) * TSC based sched_clock turns * to junk w/ cpufreq */ - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } } } -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); return 0; } @@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; + printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S new file mode 100644 index 00000000000..e51a8695d54 --- /dev/null +++ b/arch/i386/kernel/verify_cpu.S @@ -0,0 +1,65 @@ +/* Check if CPU has some minimum CPUID bits + This runs in 16bit mode so that the caller can still use the BIOS + to output errors on the screen */ +#include <asm/cpufeature.h> + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + +#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4 + pushfl + orl $(1<<18),(%esp) # try setting AC + popfl + pushfl + popl %eax + testl $(1<<18),%eax + jz bad +#endif +#if REQUIRED_MASK1 != 0 + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz bad # REQUIRED_MASK1 != 0 requires CPUID + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb bad # no cpuid 1 + + movl $0x1,%eax # Does the cpu have what it takes + cpuid + +#if CONFIG_X86_MINIMUM_CPU_MODEL > 4 +#error add proper model checking here +#endif + + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz bad +#endif /* REQUIRED_MASK1 */ + + popfl + xor %eax,%eax + ret + +bad: + popfl + movl $1,%eax + ret diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 697a70e8c0c..c8726c424b3 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -26,6 +26,7 @@ #include <linux/cpu.h> #include <linux/bootmem.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> @@ -56,7 +57,7 @@ static int disable_noidle; static int disable_vmi_timer; /* Cached VMI operations */ -struct { +static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); @@ -65,16 +66,15 @@ struct { void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, u32, u32, u32); - void (*flush_tlb)(int); + void (*set_linear_mapping)(int, void *, u32, u32); + void (*_flush_tlb)(int); void (*set_initial_ap_state)(int, int); void (*halt)(void); void (*set_lazy_mode)(int mode); } vmi_ops; -/* XXX move this to alternative.h */ -extern struct paravirt_patch __start_parainstructions[], - __stop_parainstructions[]; +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; /* * VMI patching routines. @@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[], #define MNEM_JMP 0xe9 #define MNEM_RET 0xc3 -static char irq_save_disable_callout[] = { - MNEM_CALL, 0, 0, 0, 0, - MNEM_CALL, 0, 0, 0, 0, - MNEM_RET -}; #define IRQ_PATCH_INT_MASK 0 #define IRQ_PATCH_DISABLE 5 @@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns) static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) { switch (type) { - case PARAVIRT_IRQ_DISABLE: + case PARAVIRT_PATCH(irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns); - case PARAVIRT_IRQ_ENABLE: + case PARAVIRT_PATCH(irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns); - case PARAVIRT_RESTORE_FLAGS: + case PARAVIRT_PATCH(restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS: + case PARAVIRT_PATCH(save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: - if (len >= 10) { - patch_internal(VMI_CALL_GetInterruptMask, len, insns); - patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); - return 10; - } else { - /* - * You bastards didn't leave enough room to - * patch save_flags_irq_disable inline. Patch - * to a helper - */ - BUG_ON(len < 5); - *(char *)insns = MNEM_CALL; - patch_offset(insns, irq_save_disable_callout); - return 5; - } - case PARAVIRT_INTERRUPT_RETURN: + case PARAVIRT_PATCH(iret): return patch_internal(VMI_CALL_IRET, len, insns); - case PARAVIRT_STI_SYSEXIT: + case PARAVIRT_PATCH(irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns); default: break; @@ -230,24 +209,24 @@ static void vmi_set_tr(void) static void vmi_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->esp0 = thread->esp0; + tss->x86_tss.esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); } static void vmi_flush_tlb_user(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB); + vmi_ops._flush_tlb(VMI_FLUSH_TLB); } static void vmi_flush_tlb_kernel(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); + vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); } /* Stub to do nothing at all; used for delays and unimplemented calls */ @@ -255,18 +234,6 @@ static void vmi_nop(void) { } -/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ -static fastcall void vmi_safe_halt(void) -{ - int idle = vmi_stop_hz_timer(); - vmi_ops.halt(); - if (idle) { - local_irq_disable(); - vmi_account_time_restart_hz_timer(); - local_irq_enable(); - } -} - #ifdef CONFIG_DEBUG_PAGE_TYPE #ifdef CONFIG_X86_PAE @@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type) #define vmi_check_page_type(p,t) do { } while (0) #endif -static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) +#ifdef CONFIG_HIGHPTE +static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { + void *va = kmap_atomic(page, type); + /* * Internally, the VMI ROM must map virtual addresses to physical * addresses for processing MMU updates. By the time MMU updates @@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) * args: SLOT VA COUNT PFN */ BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); + vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + return va; } +#endif static void vmi_allocate_pt(u32 pfn) { @@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn) ((level) | (is_current_as(mm, user) ? \ (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) -static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); @@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte) vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } -static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); @@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -void vmi_pmd_clear(pmd_t *pmd) +static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); @@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd) #endif #ifdef CONFIG_SMP -extern void setup_pda(void); - static void __devinit vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; - ap.fs = __KERNEL_PDA; + ap.fs = __KERNEL_PERCPU; ap.gs = 0; ap.eflags = 0; - setup_pda(); - #ifdef CONFIG_X86_PAE /* efer should match BSP efer. */ if (cpu_has_nx) { @@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(int mode) +static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) { - static DEFINE_PER_CPU(int, lazy_mode); + static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); if (!vmi_ops.set_lazy_mode) return; @@ -685,7 +654,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); } /* @@ -740,7 +709,6 @@ do { \ } \ } while (0) - /* * Activate the VMI interface and switch into paravirtualized mode */ @@ -796,12 +764,6 @@ static inline int __init activate_vmi(void) para_fill(irq_disable, DisableInterrupts); para_fill(irq_enable, EnableInterrupts); - /* irq_save_disable !!! sheer pain */ - patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], - (char *)paravirt_ops.save_fl); - patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], - (char *)paravirt_ops.irq_disable); - para_fill(wbinvd, WBINVD); para_fill(read_tsc, RDTSC); @@ -831,8 +793,8 @@ static inline int __init activate_vmi(void) para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); + para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); para_fill(flush_tlb_single, InvalPage); /* @@ -878,8 +840,13 @@ static inline int __init activate_vmi(void) paravirt_ops.release_pt = vmi_release_pt; paravirt_ops.release_pd = vmi_release_pd; } - para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, - SetLinearMapping); + + /* Set linear is needed in all cases */ + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); +#ifdef CONFIG_HIGHPTE + if (vmi_ops.set_linear_mapping) + paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; +#endif /* * These MUST always be patched. Don't support indirect jumps @@ -920,8 +887,8 @@ static inline int __init activate_vmi(void) paravirt_ops.get_wallclock = vmi_get_wallclock; paravirt_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; - paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; + paravirt_ops.setup_boot_clock = vmi_time_bsp_init; + paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; paravirt_ops.get_cpu_khz = vmi_cpu_khz; @@ -933,11 +900,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - /* No idle HZ mode only works if VMI timer and no idle is enabled */ - if (disable_noidle || disable_vmi_timer) - para_fill(safe_halt, Halt); - else - para_wrap(safe_halt, vmi_safe_halt, halt, Halt); + para_fill(safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough @@ -945,7 +908,7 @@ static inline int __init activate_vmi(void) * to do this before IRQs get reenabled. Fortunately, it is * idempotent. */ - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); vmi_bringup(); diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c new file mode 100644 index 00000000000..26a37f8a876 --- /dev/null +++ b/arch/i386/kernel/vmiclock.c @@ -0,0 +1,318 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2007, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/cpumask.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> + +#include <asm/vmi.h> +#include <asm/vmi_time.h> +#include <asm/arch_hooks.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/timer.h> + +#include <irq_vectors.h> +#include "io_ports.h" + +#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) +#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) + +static DEFINE_PER_CPU(struct clock_event_device, local_events); + +static inline u32 vmi_counter(u32 flags) +{ + /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding + * cycle counter. */ + return flags & VMI_ALARM_COUNTER_MASK; +} + +/* paravirt_ops.get_wallclock = vmi_get_wallclock */ +unsigned long vmi_get_wallclock(void) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec + (void)do_div(wallclock, 1000000000); // sec + + return wallclock; +} + +/* paravirt_ops.set_wallclock = vmi_set_wallclock */ +int vmi_set_wallclock(unsigned long now) +{ + return 0; +} + +/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ +unsigned long long vmi_get_sched_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ +unsigned long vmi_cpu_khz(void) +{ + unsigned long long khz; + khz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(khz, 1000); + return khz; +} + +static inline unsigned int vmi_get_timer_vector(void) +{ +#ifdef CONFIG_X86_IO_APIC + return FIRST_DEVICE_VECTOR; +#else + return FIRST_EXTERNAL_VECTOR; +#endif +} + +/** vmi clockchip */ +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int startup_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, vmi_get_timer_vector()); + + return (val & APIC_SEND_PENDING); +} + +static void mask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val | APIC_LVT_MASKED); +} + +static void unmask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); +} + +static void ack_timer_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip vmi_chip __read_mostly = { + .name = "VMI-LOCAL", + .startup = startup_timer_irq, + .mask = mask_timer_irq, + .unmask = unmask_timer_irq, + .ack = ack_timer_irq +}; +#endif + +/** vmi clockevent */ +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 +static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; + +static inline int vmi_get_alarm_wiring(void) +{ + return vmi_wiring; +} + +static void vmi_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + cycle_t now, cycles_per_hz; + BUG_ON(!irqs_disabled()); + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + break; + case CLOCK_EVT_MODE_PERIODIC: + cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_hz, HZ); + now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); + vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + switch (evt->mode) { + case CLOCK_EVT_MODE_ONESHOT: + vmi_timer_ops.cancel_alarm(VMI_ONESHOT); + break; + case CLOCK_EVT_MODE_PERIODIC: + vmi_timer_ops.cancel_alarm(VMI_PERIODIC); + break; + default: + break; + } + break; + default: + break; + } +} + +static int vmi_timer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + /* Unfortunately, set_next_event interface only passes relative + * expiry, but we want absolute expiry. It'd be better if were + * were passed an aboslute expiry, since a bunch of time may + * have been stolen between the time the delta is computed and + * when we set the alarm below. */ + cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); + + BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); + return 0; +} + +static struct clock_event_device vmi_clockevent = { + .name = "vmi-timer", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 22, + .set_mode = vmi_timer_set_mode, + .set_next_event = vmi_timer_next_event, + .rating = 1000, + .irq = 0, +}; + +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(local_events); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction vmi_clock_action = { + .name = "vmi-timer", + .handler = vmi_timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_ALL, +}; + +static void __devinit vmi_time_init_clockevent(void) +{ + cycle_t cycles_per_msec; + struct clock_event_device *evt; + + int cpu = smp_processor_id(); + evt = &__get_cpu_var(local_events); + + /* Use cycles_per_msec since div_sc params are 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + memcpy(evt, &vmi_clockevent, sizeof(*evt)); + /* Must pick .shift such that .mult fits in 32-bits. Choosing + * .shift to be 22 allows 2^(32-22) cycles per nano-seconds + * before overflow. */ + evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); + /* Upper bound is clockevent's use of ulong for cycle deltas. */ + evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); + evt->min_delta_ns = clockevent_delta2ns(1, evt); + evt->cpumask = cpumask_of_cpu(cpu); + + printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + evt->name, evt->mult, evt->shift); + clockevents_register_device(evt); +} + +void __init vmi_time_init(void) +{ + /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + vmi_time_init_clockevent(); + setup_irq(0, &vmi_clock_action); +} + +#ifdef CONFIG_X86_LOCAL_APIC +void __devinit vmi_time_bsp_init(void) +{ + /* + * On APIC systems, we want local timers to fire on each cpu. We do + * this by programming LVTT to deliver timer events to the IRQ handler + * for IRQ-0, since we can't re-use the APIC local timer handler + * without interfering with that code. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + local_irq_disable(); +#ifdef CONFIG_X86_SMP + /* + * XXX handle_percpu_irq only defined for SMP; we need to switch over + * to using it, since this is a local interrupt, which each CPU must + * handle individually without locking out or dropping simultaneous + * local timers on other CPUs. We also don't want to trigger the + * quirk workaround code for interrupts which gets invoked from + * handle_percpu_irq via eoi, so we use our own IRQ chip. + */ + set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); +#else + set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); +#endif + vmi_wiring = VMI_ALARM_WIRED_LVTT; + apic_write(APIC_LVTT, vmi_get_timer_vector()); + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); +} + +void __devinit vmi_time_ap_init(void) +{ + vmi_time_init_clockevent(); + apic_write(APIC_LVTT, vmi_get_timer_vector()); +} +#endif + +/** vmi clocksource */ + +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init init_vmi_clocksource(void) +{ + cycle_t cycles_per_msec; + + if (!vmi_timer_ops.get_cycle_frequency) + return 0; + /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + /* Note that clocksource.{mult, shift} converts in the opposite direction + * as clockevents. */ + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + + printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); + return clocksource_register(&clocksource_vmi); + +} +module_init(init_vmi_clocksource); diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c deleted file mode 100644 index 9dfb17739b6..00000000000 --- a/arch/i386/kernel/vmitime.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to dhecht@vmware.com - * - */ - -/* - * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. - * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> -#include <linux/rcupdate.h> -#include <linux/clocksource.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/div64.h> -#include <asm/timer.h> -#include <asm/desc.h> - -#include <asm/vmi.h> -#include <asm/vmi_time.h> - -#include <mach_timer.h> -#include <io_ports.h> - -#ifdef CONFIG_X86_LOCAL_APIC -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT -#else -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 -#endif - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -#ifdef CONFIG_NO_IDLE_HZ - -/* /proc/sys/kernel/hz_timer state. */ -int sysctl_hz_timer; - -/* Some stats */ -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); -static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ -static int alarm_hz = CONFIG_VMI_ALARM_HZ; - -/* Cache of the value get_cycle_frequency / HZ. */ -static signed long long cycles_per_jiffy; - -/* Cache of the value get_cycle_frequency / alarm_hz. */ -static signed long long cycles_per_alarm; - -/* The number of cycles accounted for by the 'jiffies'/'xtime' count. - * Protected by xtime_lock. */ -static unsigned long long real_cycles_accounted_system; - -/* The number of cycles accounted for by update_process_times(), per cpu. */ -static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); - -/* The number of stolen cycles accounted, per cpu. */ -static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); - -/* Clock source. */ -static cycle_t read_real_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); -} - -static cycle_t read_available_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); -} - -#if 0 -static cycle_t read_stolen_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); -} -#endif /* 0 */ - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - - -/* Timer interrupt handler. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); - -static struct irqaction vmi_timer_irq = { - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED, - .mask = CPU_MASK_NONE, - .name = "VMI-alarm", -}; - -/* Alarm rate */ -static int __init vmi_timer_alarm_rate_setup(char* str) -{ - int alarm_rate; - if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { - alarm_hz = alarm_rate; - printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); - } - return 1; -} -__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); - - -/* Initialization */ -static void vmi_get_wallclock_ts(struct timespec *ts) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec units - ts->tv_nsec = do_div(wallclock, 1000000000); - ts->tv_sec = wallclock; -} - -unsigned long vmi_get_wallclock(void) -{ - struct timespec ts; - vmi_get_wallclock_ts(&ts); - return ts.tv_sec; -} - -int vmi_set_wallclock(unsigned long now) -{ - return -1; -} - -unsigned long long vmi_get_sched_cycles(void) -{ - return read_available_cycles(); -} - -unsigned long vmi_cpu_khz(void) -{ - unsigned long long khz; - - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -void __init vmi_time_init(void) -{ - unsigned long long cycles_per_sec, cycles_per_msec; - unsigned long flags; - - local_irq_save(flags); - setup_irq(0, &vmi_timer_irq); -#ifdef CONFIG_X86_LOCAL_APIC - set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); -#endif - - real_cycles_accounted_system = read_real_cycles(); - per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); - - cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); - cycles_per_jiffy = cycles_per_sec; - (void)do_div(cycles_per_jiffy, HZ); - cycles_per_alarm = cycles_per_sec; - (void)do_div(cycles_per_alarm, alarm_hz); - cycles_per_msec = cycles_per_sec; - (void)do_div(cycles_per_msec, 1000); - - printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" - "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, - cycles_per_alarm); - - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - if (clocksource_register(&clocksource_vmi)) - printk(KERN_WARNING "Error registering VMITIME clocksource."); - - /* Disable PIT. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu - * reduce the latency calling update_process_times. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - - local_irq_restore(flags); -} - -#ifdef CONFIG_X86_LOCAL_APIC - -void __init vmi_timer_setup_boot_alarm(void) -{ - local_irq_disable(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - local_irq_enable(); -} - -/* Initialize the time accounting variables for an AP on an SMP system. - * Also, set the local alarm for the AP. */ -void __devinit vmi_timer_setup_secondary_alarm(void) -{ - int cpu = smp_processor_id(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); - - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); -} - -#endif - -/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ -static void vmi_account_real_cycles(unsigned long long cur_real_cycles) -{ - long long cycles_not_accounted; - - write_seqlock(&xtime_lock); - - cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; - while (cycles_not_accounted >= cycles_per_jiffy) { - /* systems wide jiffies. */ - do_timer(1); - - cycles_not_accounted -= cycles_per_jiffy; - real_cycles_accounted_system += cycles_per_jiffy; - } - - write_sequnlock(&xtime_lock); -} - -/* Update per-cpu process times. */ -static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - /* Account time to the current process. This includes - * calling into the scheduler to decrement the timeslice - * and possibly reschedule.*/ - update_process_times(user_mode(regs)); - /* XXX handle /proc/profile multiplier. */ - profile_tick(CPU_PROFILING); - - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } -} - -#ifdef CONFIG_NO_IDLE_HZ -/* Update per-cpu idle times. Used when a no-hz halt is ended. */ -static void vmi_account_no_hz_idle_cycles(int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - unsigned long no_idle_hz_jiffies = 0; - - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - no_idle_hz_jiffies++; - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* Account time to the idle process. */ - account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); -} -#endif - -/* Update per-cpu stolen time. */ -static void vmi_account_stolen_cycles(int cpu, - unsigned long long cur_real_cycles, - unsigned long long cur_avail_cycles) -{ - long long stolen_cycles_not_accounted; - unsigned long stolen_jiffies = 0; - - if (cur_real_cycles < cur_avail_cycles) - return; - - stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - - per_cpu(stolen_cycles_accounted_cpu, cpu); - - while (stolen_cycles_not_accounted >= cycles_per_jiffy) { - stolen_jiffies++; - stolen_cycles_not_accounted -= cycles_per_jiffy; - per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* HACK: pass NULL to force time onto cpustat->steal. */ - account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); -} - -/* Body of either IRQ0 interrupt handler (UP no local-APIC) or - * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ -static void vmi_local_timer_interrupt(int cpu) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu process times. */ - vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); -} - -#ifdef CONFIG_NO_IDLE_HZ - -/* Must be called only from idle loop, with interrupts disabled. */ -int vmi_stop_hz_timer(void) -{ - /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ - - unsigned long seq, next; - unsigned long long real_cycles_expiry; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - if (sysctl_hz_timer != 0) - return 0; - - cpu_set(cpu, nohz_cpu_mask); - smp_mb(); - - if (rcu_needs_cpu(cpu) || local_softirq_pending() || - (next = next_timer_interrupt(), - time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) { - cpu_clear(cpu, nohz_cpu_mask); - return 0; - } - - /* Convert jiffies to the real cycle counter. */ - do { - seq = read_seqbegin(&xtime_lock); - real_cycles_expiry = real_cycles_accounted_system + - (long)(next - jiffies) * cycles_per_jiffy; - } while (read_seqretry(&xtime_lock, seq)); - - /* This cpu is going idle. Disable the periodic alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - per_cpu(idle_start_jiffies, cpu) = jiffies; - /* Set the real time alarm to expire at the next event. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, - real_cycles_expiry, 0); - return 1; -} - -static void vmi_reenable_hz_timer(int cpu) -{ - /* For /proc/vmi/info idle_hz stat. */ - per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); - per_cpu(vmi_idle_no_hz_irqs, cpu)++; - - /* Don't bother explicitly cancelling the one-shot alarm -- at - * worse we will receive a spurious timer interrupt. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); - /* Indicate this cpu is no longer nohz idle. */ - cpu_clear(cpu, nohz_cpu_mask); -} - -/* Called from interrupt handlers when (local) HZ timer is disabled. */ -void vmi_account_time_restart_hz_timer(void) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - /* Account the time during which the HZ timer was disabled. */ - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu idle times. */ - vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); - /* Reenable the hz timer. */ - vmi_reenable_hz_timer(cpu); -} - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* UP (and no local-APIC) VMI-timer alarm interrupt handler. - * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after - * APIC setup and setup_boot_vmi_alarm() is called. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - vmi_local_timer_interrupt(smp_processor_id()); - return IRQ_HANDLED; -} - -#ifdef CONFIG_X86_LOCAL_APIC - -/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. - * Also used in UP when CONFIG_X86_LOCAL_APIC. - * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ -void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat,cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - vmi_local_timer_interrupt(cpu); - irq_exit(); - set_irq_regs(old_regs); -} - -#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6f38f818380..23e8614edee 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; -_proxy_pda = 1; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { @@ -61,8 +60,6 @@ SECTIONS __stop___ex_table = .; } - RODATA - BUG_TABLE . = ALIGN(4); @@ -72,6 +69,8 @@ SECTIONS __tracedata_end = .; } + RODATA + /* writeable */ . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ @@ -117,22 +116,11 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - __smp_alt_begin = .; - __smp_alt_instructions = .; - *(.smp_altinstructions) - __smp_alt_instructions_end = .; - } - . = ALIGN(4); .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) __smp_locks_end = .; } - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - __smp_alt_end = .; - } /* will be freed after init * Following ALIGN() is required to make sure no other data falls on the * same page where __smp_alt_end is pointing as that page might be freed @@ -178,9 +166,9 @@ SECTIONS } . = ALIGN(4); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __start_parainstructions = .; + __parainstructions = .; *(.parainstructions) - __stop_parainstructions = .; + __parainstructions_end = .; } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ @@ -194,7 +182,7 @@ SECTIONS __initramfs_end = .; } #endif - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(4096); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index f66cd11adb7..4a8b0ed9b8f 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VDSO_PRELINK + SIZEOF_HEADERS; + . = VDSO_PRELINK_asm + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -21,7 +21,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + 0x400; + . = VDSO_PRELINK_asm + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note diff --git a/arch/i386/lib/bitops.c b/arch/i386/lib/bitops.c index 97db3853dc8..afd0045595d 100644 --- a/arch/i386/lib/bitops.c +++ b/arch/i386/lib/bitops.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL(find_next_bit); */ int find_next_zero_bit(const unsigned long *addr, int size, int offset) { - unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + const unsigned long *p = addr + (offset >> 5); int set = 0, bit = offset & 31, res; if (bit) { @@ -64,7 +64,7 @@ int find_next_zero_bit(const unsigned long *addr, int size, int offset) /* * No zero yet, search remaining full bytes for a zero */ - res = find_first_zero_bit (p, size - 32 * (p - (unsigned long *) addr)); + res = find_first_zero_bit(p, size - 32 * (p - addr)); return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S index 75ffd02654f..adbccd0bbb7 100644 --- a/arch/i386/lib/checksum.S +++ b/arch/i386/lib/checksum.S @@ -25,6 +25,8 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/errno.h> /* @@ -36,8 +38,6 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ .text -.align 4 -.globl csum_partial #ifndef CONFIG_X86_USE_PPRO_CHECKSUM @@ -48,9 +48,14 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * Fortunately, it is easy to convert 2-byte alignment to 4-byte * alignment for the unrolled loop. */ -csum_partial: +ENTRY(csum_partial) + CFI_STARTPROC pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -128,16 +133,27 @@ csum_partial: roll $8, %eax 8: popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi ret + CFI_ENDPROC +ENDPROC(csum_partial) #else /* Version for PentiumII/PPro */ -csum_partial: +ENTRY(csum_partial) + CFI_STARTPROC pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -245,8 +261,14 @@ csum_partial: roll $8, %eax 90: popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi ret + CFI_ENDPROC +ENDPROC(csum_partial) #endif @@ -278,19 +300,24 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, .long 9999b, 6002f ; \ .previous -.align 4 -.globl csum_partial_copy_generic - #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC subl $4,%esp + CFI_ADJUST_CFA_OFFSET 4 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -400,10 +427,19 @@ DST( movb %cl, (%edi) ) .previous popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi popl %ecx # equivalent to addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) #else @@ -421,10 +457,17 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -485,9 +528,17 @@ DST( movb %dl, (%edi) ) .previous popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) #undef ROUND #undef ROUND1 diff --git a/arch/i386/lib/getuser.S b/arch/i386/lib/getuser.S index 62d7f178a32..6d84b53f12a 100644 --- a/arch/i386/lib/getuser.S +++ b/arch/i386/lib/getuser.S @@ -8,6 +8,8 @@ * return an error value in addition to the "real" * return value. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/thread_info.h> @@ -24,19 +26,19 @@ */ .text -.align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%edx) cmpl TI_addr_limit(%edx),%eax jae bad_get_user 1: movzbl (%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) -.align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC addl $1,%eax jc bad_get_user GET_THREAD_INFO(%edx) @@ -45,10 +47,11 @@ __get_user_2: 2: movzwl -1(%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_2) -.align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC addl $3,%eax jc bad_get_user GET_THREAD_INFO(%edx) @@ -57,11 +60,16 @@ __get_user_4: 3: movl -3(%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_4) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movl $-14,%eax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .long 1b,bad_get_user diff --git a/arch/i386/lib/putuser.S b/arch/i386/lib/putuser.S index a32d9f570f4..f58fba109d1 100644 --- a/arch/i386/lib/putuser.S +++ b/arch/i386/lib/putuser.S @@ -8,6 +8,8 @@ * return an error value in addition to the "real" * return value. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/thread_info.h> @@ -23,23 +25,28 @@ * as they get called from within inline assembly. */ -#define ENTER pushl %ebx ; GET_THREAD_INFO(%ebx) -#define EXIT popl %ebx ; ret +#define ENTER CFI_STARTPROC ; \ + pushl %ebx ; \ + CFI_ADJUST_CFA_OFFSET 4 ; \ + CFI_REL_OFFSET ebx, 0 ; \ + GET_THREAD_INFO(%ebx) +#define EXIT popl %ebx ; \ + CFI_ADJUST_CFA_OFFSET -4 ; \ + CFI_RESTORE ebx ; \ + ret ; \ + CFI_ENDPROC .text -.align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) ENTER cmpl TI_addr_limit(%ebx),%ecx jae bad_put_user 1: movb %al,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_1) -.align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) ENTER movl TI_addr_limit(%ebx),%ebx subl $1,%ebx @@ -48,10 +55,9 @@ __put_user_2: 2: movw %ax,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_2) -.align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) ENTER movl TI_addr_limit(%ebx),%ebx subl $3,%ebx @@ -60,10 +66,9 @@ __put_user_4: 3: movl %eax,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_4) -.align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) ENTER movl TI_addr_limit(%ebx),%ebx subl $7,%ebx @@ -73,10 +78,16 @@ __put_user_8: 5: movl %edx,4(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC simple + CFI_DEF_CFA esp, 2*4 + CFI_OFFSET eip, -1*4 + CFI_OFFSET ebx, -2*4 movl $-14,%eax EXIT +END(bad_put_user) .section __ex_table,"a" .long 1b,bad_put_user diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c index 086b3726862..9f38b12b4af 100644 --- a/arch/i386/lib/usercopy.c +++ b/arch/i386/lib/usercopy.c @@ -716,7 +716,6 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { - BUG_ON((long) n < 0); #ifndef CONFIG_X86_WP_WORKS_OK if (unlikely(boot_cpu_data.wp_works_ok == 0) && ((unsigned long )to) < TASK_SIZE) { @@ -785,7 +784,6 @@ EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); if (movsl_is_ok(to, from, n)) __copy_user_zeroing(to, from, n); else @@ -797,7 +795,6 @@ EXPORT_SYMBOL(__copy_from_user_ll); unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else @@ -810,7 +807,6 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero); unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); #ifdef CONFIG_X86_INTEL_USERCOPY if ( n > 64 && cpu_has_xmm2) n = __copy_user_zeroing_intel_nocache(to, from, n); @@ -825,7 +821,6 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); #ifdef CONFIG_X86_INTEL_USERCOPY if ( n > 64 && cpu_has_xmm2) n = __copy_user_intel_nocache(to, from, n); @@ -853,7 +848,6 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) { - BUG_ON((long) n < 0); if (access_ok(VERIFY_WRITE, to, n)) n = __copy_to_user(to, from, n); return n; @@ -879,7 +873,6 @@ EXPORT_SYMBOL(copy_to_user); unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { - BUG_ON((long) n < 0); if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); else diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c index 8a210fa915b..e932d3485ae 100644 --- a/arch/i386/mach-generic/bigsmp.c +++ b/arch/i386/mach-generic/bigsmp.c @@ -45,7 +45,7 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = { }; -static int probe_bigsmp(void) +static int __init probe_bigsmp(void) { if (def_to_bigsmp) dmi_bigsmp = 1; diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index b8963a5a3b2..b47f951c0ec 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -25,4 +25,45 @@ static int probe_es7000(void) return 0; } +extern void es7000_sw_apic(void); +static void __init enable_apic_mode(void) +{ + es7000_sw_apic(); + return; +} + +static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (mpc->mpc_oemptr) { + struct mp_config_oemtable *oem_table = + (struct mp_config_oemtable *)mpc->mpc_oemptr; + if (!strncmp(oem, "UNISYS", 6)) + return parse_unisys_oem((char *)oem_table); + } + return 0; +} + +#ifdef CONFIG_ACPI +/* Hook from generic ACPI tables.c */ +static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + unsigned long oem_addr; + if (!find_unisys_acpi_oem_table(&oem_addr)) { + if (es7000_check_dsdt()) + return parse_unisys_oem((char *)oem_addr); + else { + setup_unisys(); + return 1; + } + } + return 0; +} +#else +static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 0; +} +#endif + struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index fe0ed393294..1a5e448a29c 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -573,15 +573,7 @@ do_boot_cpu(__u8 cpu) /* init_tasks (in sched.c) is indexed logically */ stack_start.esp = (void *) idle->thread.esp; - /* Pre-allocate and initialize the CPU's GDT and PDA so it - doesn't have to do any memory allocation during the - delicate CPU-bringup phase. */ - if (!init_gdt(cpu, idle)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - cpucount--; - return; - } - + init_gdt(cpu, idle); irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ @@ -749,12 +741,6 @@ initialize_secondary(void) #endif /* - * switch to the per CPU GDT we already set up - * in do_boot_cpu() - */ - cpu_set_gdt(current_thread_info()->cpu); - - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. */ diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index b8c4e259fc8..f534c29e80b 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -20,6 +20,7 @@ #include <linux/tty.h> #include <linux/vt_kern.h> /* For unblank_screen() */ #include <linux/highmem.h> +#include <linux/bootmem.h> /* for max_low_pfn */ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write, si_code; /* get the address */ @@ -510,7 +510,9 @@ no_context: bust_spinlocks(1); if (oops_may_print()) { - #ifdef CONFIG_X86_PAE + __typeof__(pte_val(__pte(0))) page; + +#ifdef CONFIG_X86_PAE if (error_code & 16) { pte_t *pte = lookup_address(address); @@ -519,7 +521,7 @@ no_context: "NX-protected page - exploit attempt? " "(uid: %d)\n", current->uid); } - #endif +#endif if (address < PAGE_SIZE) printk(KERN_ALERT "BUG: unable to handle kernel NULL " "pointer dereference"); @@ -529,25 +531,38 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - } - page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> 22]; - if (oops_may_print()) + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk(KERN_ALERT "*pdpt = %016Lx\n", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_ALERT "*pde = %016Lx\n", page); + page &= ~_PAGE_NX; + } +#else printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifndef CONFIG_HIGHPTE - if ((page & 1) && oops_may_print()) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); - } #endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page); + } + } + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; @@ -588,7 +603,6 @@ do_sigbus: force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -#ifndef CONFIG_X86_PAE void vmalloc_sync_all(void) { /* @@ -601,6 +615,9 @@ void vmalloc_sync_all(void) static unsigned long start = TASK_SIZE; unsigned long address; + if (SHARED_KERNEL_PMD) + return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { @@ -623,4 +640,3 @@ void vmalloc_sync_all(void) start = address + PGDIR_SIZE; } } -#endif diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index ac70d09df7e..ad8d86cc683 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -26,7 +26,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type) return page_address(page); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void*) vaddr; } +void *kmap_atomic(struct page *page, enum km_type type) +{ + return kmap_atomic_prot(page, type, kmap_prot); +} + void kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -67,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } + arch_flush_lazy_mmu_mode(); pagefault_enable(); } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index ae436882af7..dbe16f63a56 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> #include <linux/slab.h> @@ -42,6 +43,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/paravirt.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -61,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); + } +#endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); -#endif - return pmd_table; } @@ -81,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) */ static pte_t * __init one_page_table_init(pmd_t *pmd) { - if (pmd_none(*pmd)) { + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); @@ -108,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; - pud_t *pud; pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; @@ -119,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); + one_page_table_init(pmd); vaddr += PMD_SIZE; } @@ -167,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); else set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } } } @@ -337,24 +336,78 @@ extern void __init remap_numa_kva(void); #define remap_numa_kva() do {} while (0) #endif -static void __init pagetable_init (void) +void __init native_pagetable_setup_start(pgd_t *base) { - unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; - #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* + * Init entries of the first-level page table to the + * zero page, if they haven't already been set up. + * + * In a normal native boot, we'll be running on a + * pagetable rooted in swapper_pg_dir, but not in PAE + * mode, so this will end up clobbering the mappings + * for the lower 24Mbytes of the address space, + * without affecting the kernel address space. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) + set_pgd(&base[i], + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* Make sure kernel address space is empty so that a pagetable + will be allocated for it. */ + memset(&base[USER_PTRS_PER_PGD], 0, + KERNEL_PGD_PTRS * sizeof(pgd_t)); #else paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); #endif +} + +void __init native_pagetable_setup_done(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&base[0], base[USER_PTRS_PER_PGD]); +#endif +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/i386/kernel/head.S, and not running in PAE mode + * (even if we'll end up running in PAE). The root of the pagetable + * will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base = swapper_pg_dir; + + paravirt_pagetable_setup_start(pgd_base); /* Enable PSE if available */ - if (cpu_has_pse) { + if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); - } /* Enable PGE if available */ if (cpu_has_pge) { @@ -371,20 +424,12 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); permanent_kmaps_init(pgd_base); -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); -#endif + paravirt_pagetable_setup_done(pgd_base); } #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) @@ -700,6 +745,8 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), @@ -709,13 +756,23 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (!SHARED_KERNEL_PMD) { + /* If we're in PAE mode and have a non-shared + kernel pmd, then the pgd size must be a + page size. This is because the pgd_list + links through the page structure, so there + can only be one pgd per page for this to + work. */ + pgd_size = PAGE_SIZE; + } } pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), + pgd_size, + pgd_size, 0, pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } @@ -751,13 +808,25 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -774,26 +843,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) { free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 412ebbd8adb..47bd477c8ec 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) unsigned long flags; set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) + if (SHARED_KERNEL_PMD) return; spin_lock_irqsave(&pgd_lock, flags); @@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot) return -EINVAL; kpte_page = virt_to_page(kpte); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); } else { pgprot_t ref_prot; @@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot) kpte_page = split; } page_private(kpte_page)++; - } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index fa0cfbd551e..9a96c164742 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) } static int fixmaps; -#ifndef CONFIG_COMPAT_VDSO unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -#endif void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { @@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve) BUG_ON(fixmaps > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); -#ifdef CONFIG_COMPAT_VDSO - BUG_ON(reserve != 0); -#else __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; -#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -238,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd) set_page_private(next, (unsigned long)pprev); } +#if (PTRS_PER_PMD == 1) +/* Non-PAE pgd constructor */ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) { - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); - } + /* !PAE, no pagetable sharing */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + /* must happen under lock */ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - - if (PTRS_PER_PMD > 1) - return; - - /* must happen under lock */ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); - + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#else /* PTRS_PER_PMD > 1 */ +/* PAE pgd constructor */ +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +{ + /* PAE, kernel PMD may be shared */ + + if (SHARED_KERNEL_PMD) { + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } else { + unsigned long flags; + + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } +} +#endif /* PTRS_PER_PMD */ -/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + BUG_ON(SHARED_KERNEL_PMD); + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +/* If we allocate a pmd for part of the kernel address space, then + make sure its initialized with the appropriate kernel mappings. + Otherwise use a cached zeroed pmd. */ +static pmd_t *pmd_cache_alloc(int idx) +{ + pmd_t *pmd; + + if (idx >= USER_PTRS_PER_PGD) { + pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + + if (pmd) + memcpy(pmd, + (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + sizeof(pmd_t) * PTRS_PER_PMD); + } else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + + return pmd; +} + +static void pmd_cache_free(pmd_t *pmd, int idx) +{ + if (idx >= USER_PTRS_PER_PGD) + free_page((unsigned long)pmd); + else + kmem_cache_free(pmd_cache, pmd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; @@ -282,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { + pmd_t *pmd = pmd_cache_alloc(i); + if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } @@ -296,7 +342,7 @@ out_oom: pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } kmem_cache_free(pgd_cache, pgd); return NULL; @@ -308,11 +354,11 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 8fda7be9dd4..695f737516a 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -414,6 +414,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) user space an consistent name. */ cpu_type = "x86-64/hammer"; break; + case 0x10: + model = &op_athlon_spec; + cpu_type = "x86-64/family10"; + break; } break; diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c index b21b6da8ab1..1cf11af96de 100644 --- a/arch/i386/pci/init.c +++ b/arch/i386/pci/init.c @@ -6,7 +6,7 @@ in the right sequence from here. */ static __init int pci_access_init(void) { - int type = 0; + int type __attribute__((unused)) = 0; #ifdef CONFIG_PCI_DIRECT type = pci_direct_probe(); diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 747d8c63b0c..c7cabeed4d7 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -60,14 +60,19 @@ static const char __init *pci_mmcfg_e7520(void) u32 win; pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); - pci_mmcfg_config_num = 1; - pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); - if (!pci_mmcfg_config) - return NULL; - pci_mmcfg_config[0].address = (win & 0xf000) << 16; - pci_mmcfg_config[0].pci_segment = 0; - pci_mmcfg_config[0].start_bus_number = 0; - pci_mmcfg_config[0].end_bus_number = 255; + win = win & 0xf000; + if(win == 0x0000 || win == 0xf000) + pci_mmcfg_config_num = 0; + else { + pci_mmcfg_config_num = 1; + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = win << 16; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = 255; + } return "Intel Corporation E7520 Memory Controller Hub"; } @@ -108,6 +113,10 @@ static const char __init *pci_mmcfg_intel_945(void) if ((pciexbar & mask) & 0x0fffffffU) pci_mmcfg_config_num = 0; + /* Don't hit the APIC registers and their friends */ + if ((pciexbar & mask) >= 0xf0000000U) + pci_mmcfg_config_num = 0; + if (pci_mmcfg_config_num) { pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); if (!pci_mmcfg_config) diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 2c15500f871..998fd3ec0d6 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -21,6 +21,7 @@ unsigned long saved_context_eflags; void __save_processor_state(struct saved_context *ctxt) { + mtrr_save_fixed_ranges(NULL); kernel_fpu_begin(); /* diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c index db5e98d2eb7..a0020b913f3 100644 --- a/arch/i386/power/suspend.c +++ b/arch/i386/power/suspend.c @@ -16,6 +16,9 @@ /* Defined in arch/i386/power/swsusp.S */ extern int restore_image(void); +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -156,3 +159,14 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 439cc257cd1..6c73bca3f47 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -110,7 +110,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index c76b793310c..043f637e3d1 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -119,7 +119,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 2a8253358c6..c7458599059 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -181,7 +181,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(ASM_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e0fa80eca36..aa693d0f151 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj32-$(CONFIG_MODULES) += module_32.o ifeq ($(CONFIG_PPC_MERGE),y) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 22083ce3cc3..6018178708a 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -582,14 +582,14 @@ void __init setup_per_cpu_areas(void) char *ptr; /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); + size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); #ifdef CONFIG_MODULES if (size < PERCPU_ENOUGH_ROOM) size = PERCPU_ENOUGH_ROOM; #endif for_each_possible_cpu(i) { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c new file mode 100644 index 00000000000..8cee5710754 --- /dev/null +++ b/arch/powerpc/kernel/suspend.c @@ -0,0 +1,24 @@ +/* + * Suspend support specific for power. + * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ + +#include <asm/page.h> + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7eefeb4a30e..13206731314 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -139,11 +139,7 @@ SECTIONS __initramfs_end = .; } #endif -#ifdef CONFIG_PPC32 - . = ALIGN(32); -#else - . = ALIGN(128); -#endif + . = ALIGN(PAGE_SIZE); .data.percpu : { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index a0625562a44..44cd128fb71 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -130,7 +130,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 418f6426a94..e9d3432aba6 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -107,7 +107,7 @@ SECTIONS . = ALIGN(2); __initramfs_end = .; #endif - . = ALIGN(256); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 78a6c09875b..2f606d0ce1f 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -54,7 +54,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S index a59c5e99813..4f9616f3983 100644 --- a/arch/sh64/kernel/vmlinux.lds.S +++ b/arch/sh64/kernel/vmlinux.lds.S @@ -85,7 +85,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) } __per_cpu_end = . ; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index e5c24e0521d..f0bb6e60e62 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index d4f0a70f484..1fac215252e 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1343,11 +1343,11 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ goal = PERCPU_ENOUGH_ROOM; - __per_cpu_shift = 0; - for (size = 1UL; size < goal; size <<= 1UL) + __per_cpu_shift = PAGE_SHIFT; + for (size = PAGE_SIZE; size < goal; size <<= 1UL) __per_cpu_shift++; - ptr = alloc_bootmem(size * NR_CPUS); + ptr = alloc_bootmem_pages(size * NR_CPUS); __per_cpu_base = ptr - __per_cpu_start; diff --git a/arch/um/defconfig b/arch/um/defconfig index 780cc0a4a12..f938fa82214 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -41,6 +41,7 @@ CONFIG_M686=y # CONFIG_MGEODE_LX is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set # CONFIG_X86_GENERIC is not set CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index e9b4f058a49..145bb824b2a 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -415,13 +415,13 @@ config OUT_OF_LINE_PFN_TO_PAGE depends on DISCONTIGMEM config NR_CPUS - int "Maximum number of CPUs (2-256)" + int "Maximum number of CPUs (2-255)" range 2 255 depends on SMP default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. Current maximum is 256 CPUs due to + kernel will support. Current maximum is 255 CPUs due to APIC addressing limits. Less depending on the hardware. This is purely to save memory - each supported CPU requires @@ -565,23 +565,56 @@ config CRASH_DUMP PHYSICAL_START. For more details see Documentation/kdump/kdump.txt +config RELOCATABLE + bool "Build a relocatable kernel(EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Builds a relocatable kernel. This enables loading and running + a kernel binary from a different physical address than it has + been compiled for. + + One use is for the kexec on panic case where the recovery kernel + must live at a different physical address than the primary + kernel. + + Note: If CONFIG_RELOCATABLE=y, then kernel run from the address + it has been loaded at and compile time physical address + (CONFIG_PHYSICAL_START) is ignored. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if CRASH_DUMP default "0x200000" help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x200000 (2MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as + This gives the physical address where the kernel is loaded. It + should be aligned to 2MB boundary. + + If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then + bzImage will decompress itself to above physical address and + run from there. Otherwise, bzImage will run from the address where + it has been loaded by the boot loader and will ignore above physical + address. + + In normal kdump cases one does not have to set/change this option + as now bzImage can be compiled as a completely relocatable image + (CONFIG_RELOCATABLE=y) and be used to load and run from a different + address. This option is mainly useful for the folks who don't want + to use a bzImage for capturing the crash dump and want to use a + vmlinux instead. + + So if you are using bzImage for capturing the crash dump, leave + the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y. + Otherwise if you plan to use vmlinux for capturing the crash dump + change this value to start of the reserved region (Typically 16MB + 0x1000000). In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed kernel. Typically this parameter is set as crashkernel=64M@16M. Please take a look at Documentation/kdump/kdump.txt for more details about crash dumps. + Usage of bzImage for capturing the crash dump is advantageous as + one does not have to build two kernels. Same kernel can be used + as production kernel and capture kernel. + Don't change this unless you know what you are doing. config SECCOMP @@ -627,14 +660,6 @@ config CC_STACKPROTECTOR_ALL source kernel/Kconfig.hz -config REORDER - bool "Function reordering" - default n - help - This option enables the toolchain to reorder functions for a more - optimal TLB usage. If you have pretty much any version of binutils, - this can increase your kernel build time by roughly one minute. - config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 2941a915d4e..29617ae3926 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -40,10 +40,6 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections -# this makes reading assembly source easier, but produces worse code -# actually it makes the kernel smaller too. -cflags-y += -fno-reorder-blocks cflags-y += -Wno-sign-compare cflags-y += -fno-asynchronous-unwind-tables ifneq ($(CONFIG_DEBUG_INFO),y) diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index deb063e7762..ee6f6505f95 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -36,7 +36,7 @@ subdir- := compressed/ #Let make clean descend in compressed/ # --------------------------------------------------------------------------- $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index e70fa6e1da0..705a3e33d7e 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -8,16 +8,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional -AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -LDFLAGS := -m elf_i386 +CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin +LDFLAGS := -m elf_x86_64 -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 - -$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE +LDFLAGS_vmlinux := -T +$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 6f55565e4d4..f9d5692a010 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -26,116 +26,279 @@ #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/msr.h> +.section ".text.head" .code32 .globl startup_32 - + startup_32: cld cli - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - - lss stack_start,%esp - xorl %eax,%eax -1: incl %eax # check that A20 really IS enabled - movl %eax,0x000000 # loop forever if it isn't - cmpl %eax,0x100000 - je 1b + movl $(__KERNEL_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + +/* Calculate the delta between where we were compiled to run + * at and where we were actually loaded at. This can only be done + * with a short local call on x86. Nothing else will tell us what + * address we are running at. The reserved chunk of the real-mode + * data at 0x34-0x3f are used as the stack for this calculation. + * Only 4 bytes are needed. + */ + leal 0x40(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp + +/* setup a stack and make sure cpu supports long mode. */ + movl $user_stack_end, %eax + addl %ebp, %eax + movl %eax, %esp + + call verify_cpu + testl %eax, %eax + jnz no_longmode + +/* Compute the delta between where we were compiled to run at + * and where the code will actually run at. + */ +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. + */ + +#ifdef CONFIG_RELOCATABLE + movl %ebp, %ebx + addl $(LARGE_PAGE_SIZE -1), %ebx + andl $LARGE_PAGE_MASK, %ebx +#else + movl $CONFIG_PHYSICAL_START, %ebx +#endif + + /* Replace the compressed data size with the uncompressed size */ + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx + /* Add 8 bytes for every 32K input block */ + shrl $12, %eax + addl %eax, %ebx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addl $(32768 + 18 + 4095), %ebx + andl $~4095, %ebx /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. + * Prepare for entering 64 bit mode */ - pushl $0 - popfl + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal gdt(%ebp), %eax + movl %eax, gdt+2(%ebp) + lgdt gdt(%ebp) + + /* Enable PAE mode */ + xorl %eax, %eax + orl $(1 << 5), %eax + movl %eax, %cr4 + + /* + * Build early 4G boot pagetable + */ + /* Initialize Page tables to 0*/ + leal pgtable(%ebx), %edi + xorl %eax, %eax + movl $((4096*6)/4), %ecx + rep stosl + + /* Build Level 4 */ + leal pgtable + 0(%ebx), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + + /* Build Level 3 */ + leal pgtable + 0x1000(%ebx), %edi + leal 0x1007(%edi), %eax + movl $4, %ecx +1: movl %eax, 0x00(%edi) + addl $0x00001000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Build Level 2 */ + leal pgtable + 0x2000(%ebx), %edi + movl $0x00000183, %eax + movl $2048, %ecx +1: movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Enable the boot page tables */ + leal pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable Long mode in EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Setup for the jump to 64bit mode + * + * When the jump is performend we will be in long mode but + * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 + * (and in turn EFER.LMA = 1). To jump into 64bit mode we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + * We place all of the values on our mini stack so lret can + * used to perform that far jump. + */ + pushl $__KERNEL_CS + leal startup_64(%ebp), %eax + pushl %eax + + /* Enter paged protected Mode, activating Long Mode */ + movl $0x80000001, %eax /* Enable Paging and Protected mode */ + movl %eax, %cr0 + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret + +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + + /* Be careful here startup_64 needs to be at a predictable + * address so I can export it in an ELF header. Bootloaders + * should look at the ELF header to find this address, as + * it may change in the future. + */ + .code64 + .org 0x200 +ENTRY(startup_64) + /* We come here either from startup_32 or directly from a + * 64bit bootloader. If we come here from a bootloader we depend on + * an identity mapped page table being provied that maps our + * entire text+data+bss and hopefully all of memory. + */ + + /* Setup data segments. */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Compute the decompressed kernel start address. It is where + * we were loaded at aligned to a 2M boundary. %rbp contains the + * decompressed kernel start address. + * + * If it is a relocatable kernel then decompress and run the kernel + * from load address aligned to 2MB addr, otherwise decompress and + * run the kernel from CONFIG_PHYSICAL_START + */ + + /* Start with the delta to where the kernel will run at. */ +#ifdef CONFIG_RELOCATABLE + leaq startup_32(%rip) /* - $startup_32 */, %rbp + addq $(LARGE_PAGE_SIZE - 1), %rbp + andq $LARGE_PAGE_MASK, %rbp + movq %rbp, %rbx +#else + movq $CONFIG_PHYSICAL_START, %rbp + movq %rbp, %rbx +#endif + + /* Replace the compressed data size with the uncompressed size */ + movl input_len(%rip), %eax + subq %rax, %rbx + movl output_len(%rip), %eax + addq %rax, %rbx + /* Add 8 bytes for every 32K input block */ + shrq $12, %rax + addq %rax, %rbx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addq $(32768 + 18 + 4095), %rbx + andq $~4095, %rbx + +/* Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. + */ + leaq _end(%rip), %r8 + leaq _end(%rbx), %r9 + movq $_end /* - $startup_32 */, %rcx +1: subq $8, %r8 + subq $8, %r9 + movq 0(%r8), %rax + movq %rax, 0(%r9) + subq $8, %rcx + jnz 1b + +/* + * Jump to the relocated address. + */ + leaq relocated(%rbx), %rax + jmp *%rax + +.section ".text" +relocated: + /* * Clear BSS */ - xorl %eax,%eax - movl $_edata,%edi - movl $_end,%ecx - subl %edi,%ecx + xorq %rax, %rax + leaq _edata(%rbx), %rdi + leaq _end(%rbx), %rcx + subq %rdi, %rcx cld rep stosb + + /* Setup the stack */ + leaq user_stack_end(%rip), %rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* * Do the decompression, and jump to the new kernel.. */ - subl $16,%esp # place for structure on the stack - movl %esp,%eax - pushl %esi # real mode pointer as second arg - pushl %eax # address of structure as first arg - call decompress_kernel - orl %eax,%eax - jnz 3f - addl $8,%esp - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + pushq %rsi # Save the real mode argument + movq %rsi, %rdi # real mode address + leaq _heap(%rip), %rsi # _heap + leaq input_data(%rip), %rdx # input_data + movl input_len(%rip), %eax + movq %rax, %rcx # input_len + movq %rbp, %r8 # output + call decompress_kernel + popq %rsi -/* - * We come here, if we were loaded high. - * We need to move the move-in-place routine down to 0x1000 - * and then start it with the buffer addresses in registers, - * which we got from the stack. - */ -3: - movl %esi,%ebx - movl $move_routine_start,%esi - movl $0x1000,%edi - movl $move_routine_end,%ecx - subl %esi,%ecx - addl $3,%ecx - shrl $2,%ecx - cld - rep - movsl - - popl %esi # discard the address - addl $4,%esp # real mode pointer - popl %esi # low_buffer_start - popl %ecx # lcount - popl %edx # high_buffer_start - popl %eax # hcount - movl $__PHYSICAL_START,%edi - cli # make sure we don't get interrupted - ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine /* - * Routine (template) for moving the decompressed kernel in place, - * if we were high loaded. This _must_ PIC-code ! + * Jump to the decompressed kernel. */ -move_routine_start: - movl %ecx,%ebp - shrl $2,%ecx - rep - movsl - movl %ebp,%ecx - andl $3,%ecx - rep - movsb - movl %edx,%esi - movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 - addl $3,%ecx - shrl $2,%ecx - rep - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START -move_routine_end: + jmp *%rbp - -/* Stack for uncompression */ - .align 32 -user_stack: + .data +gdt: + .word gdt_end - gdt + .long gdt + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +gdt_end: + .bss +/* Stack for uncompression */ + .balign 4 +user_stack: .fill 4096,4,0 -stack_start: - .long user_stack+4096 - .word __KERNEL_DS - +user_stack_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 3755b2e394d..f932b0e8909 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -9,10 +9,95 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 + +#include <linux/linkage.h> #include <linux/screen_info.h> #include <asm/io.h> #include <asm/page.h> +/* WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically + * at run time, but no relocation processing is performed. + * This means that it is not safe to place pointers in static structures. + */ + +/* + * Getting to provable safe in place decompression is hard. + * Worst case behaviours need to be analized. + * Background information: + * + * The file layout is: + * magic[2] + * method[1] + * flags[1] + * timestamp[4] + * extraflags[1] + * os[1] + * compressed data blocks[N] + * crc[4] orig_len[4] + * + * resulting in 18 bytes of non compressed data overhead. + * + * Files divided into blocks + * 1 bit (last block flag) + * 2 bits (block type) + * + * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. + * The smallest block type encoding is always used. + * + * stored: + * 32 bits length in bytes. + * + * fixed: + * magic fixed tree. + * symbols. + * + * dynamic: + * dynamic tree encoding. + * symbols. + * + * + * The buffer for decompression in place is the length of the + * uncompressed data, plus a small amount extra to keep the algorithm safe. + * The compressed data is placed at the end of the buffer. The output + * pointer is placed at the start of the buffer and the input pointer + * is placed where the compressed data starts. Problems will occur + * when the output pointer overruns the input pointer. + * + * The output pointer can only overrun the input pointer if the input + * pointer is moving faster than the output pointer. A condition only + * triggered by data whose compressed form is larger than the uncompressed + * form. + * + * The worst case at the block level is a growth of the compressed data + * of 5 bytes per 32767 bytes. + * + * The worst case internal to a compressed block is very hard to figure. + * The worst case can at least be boundined by having one bit that represents + * 32764 bytes and then all of the rest of the bytes representing the very + * very last byte. + * + * All of which is enough to compute an amount of extra data that is required + * to be safe. To avoid problems at the block level allocating 5 extra bytes + * per 32767 bytes of data is sufficient. To avoind problems internal to a block + * adding an extra 32767 bytes (the worst case uncompressed block size) is + * sufficient, to ensure that in the worst case the decompressed data for + * block will stop the byte before the compressed data for a block begins. + * To avoid problems with the compressed data's meta information an extra 18 + * bytes are needed. Leading to the formula: + * + * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. + * + * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. + * Adding 32768 instead of 32767 just makes for round numbers. + * Adding the decompressor_size is necessary as it musht live after all + * of the data as well. Last I measured the decompressor is about 14K. + * 10K of actuall data and 4K of bss. + * + */ + /* * gzip declarations */ @@ -28,15 +113,20 @@ typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; -#define WSIZE 0x8000 /* Window size must be at least 32k, */ - /* and a power of two */ +#define WSIZE 0x80000000 /* Window size must be at least 32k, + * and a power of two + * We don't actually have a window just + * a huge output buffer so I report + * a 2G windows size, as that should + * always be larger than our output buffer. + */ -static uch *inbuf; /* input buffer */ -static uch window[WSIZE]; /* Sliding window buffer */ +static uch *inbuf; /* input buffer */ +static uch *window; /* Sliding window buffer, (and final output buffer) */ -static unsigned insize = 0; /* valid bytes in inbuf */ -static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ -static unsigned outcnt = 0; /* bytes in output buffer */ +static unsigned insize; /* valid bytes in inbuf */ +static unsigned inptr; /* index of next byte to be processed in inbuf */ +static unsigned outcnt; /* bytes in output buffer */ /* gzip flag byte */ #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ @@ -87,8 +177,6 @@ extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; -static uch *output_data; -static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); @@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -extern int end; -static long free_mem_ptr = (long)&end; +static long free_mem_ptr; static long free_mem_end_ptr; -#define INPLACE_MOVE_ROUTINE 0x1000 -#define LOW_BUFFER_START 0x2000 -#define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 -static unsigned int low_buffer_end, low_buffer_size; -static int high_loaded =0; -static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; +#define HEAP_SIZE 0x7000 static char *vidmem = (char *)0xb8000; static int vidport; @@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n) */ static int fill_inbuf(void) { - if (insize != 0) { - error("ran out of input data"); - } - - inbuf = input_data; - insize = input_len; - inptr = 1; - return inbuf[0]; + error("ran out of input data"); + return 0; } /* =========================================================================== * Write the output window window[0..outcnt-1] and update crc and bytes_out. * (Used for the decompressed data only.) */ -static void flush_window_low(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, *out, ch; - - in = window; - out = &output_data[output_ptr]; - for (n = 0; n < outcnt; n++) { - ch = *out++ = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - output_ptr += (ulg)outcnt; - outcnt = 0; -} - -static void flush_window_high(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - in = window; - for (n = 0; n < outcnt; n++) { - ch = *output_data++ = *in++; - if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - static void flush_window(void) { - if (high_loaded) flush_window_high(); - else flush_window_low(); + /* With my window equal to my output buffer + * I only need to compute the crc here. + */ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + + in = window; + for (n = 0; n < outcnt; n++) { + ch = *in++; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; } static void error(char *x) @@ -281,57 +335,8 @@ static void error(char *x) while(1); /* Halt */ } -static void setup_normal_output_buffer(void) -{ -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); -#endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ - free_mem_end_ptr = (long)real_mode; -} - -struct moveparams { - uch *low_buffer_start; int lcount; - uch *high_buffer_start; int hcount; -}; - -static void setup_output_buffer_if_we_run_high(struct moveparams *mv) -{ - high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); -#endif - mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; - low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX - ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; - low_buffer_size = low_buffer_end - LOW_BUFFER_START; - high_loaded = 1; - free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); - mv->hcount = 0; /* say: we need not to move high_buffer */ - } - else mv->hcount = -1; - mv->high_buffer_start = high_buffer_start; -} - -static void close_output_buffer_if_we_run_high(struct moveparams *mv) -{ - if (bytes_out > low_buffer_size) { - mv->lcount = low_buffer_size; - if (mv->hcount) - mv->hcount = bytes_out - low_buffer_size; - } else { - mv->lcount = bytes_out; - mv->hcount = 0; - } -} - -int decompress_kernel(struct moveparams *mv, void *rmode) +asmlinkage void decompress_kernel(void *rmode, unsigned long heap, + uch *input_data, unsigned long input_len, uch *output) { real_mode = rmode; @@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode) lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); - else setup_output_buffer_if_we_run_high(mv); + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ + insize = input_len; + inptr = 0; + + if ((ulg)output & (__KERNEL_ALIGN - 1)) + error("Destination address not 2M aligned"); + if ((ulg)output >= 0xffffffffffUL) + error("Destination address too large"); makecrc(); putstr(".\nDecompressing Linux..."); gunzip(); putstr("done.\nBooting the kernel.\n"); - if (high_loaded) close_output_buffer_if_we_run_high(mv); - return high_loaded; + return; } diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds new file mode 100644 index 00000000000..94c13e557fb --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.lds @@ -0,0 +1,44 @@ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(startup_64) +SECTIONS +{ + /* Be careful parts of head.S assume startup_32 is at + * address 0. + */ + . = 0; + .text : { + _head = . ; + *(.text.head) + _ehead = . ; + *(.text.compressed) + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); + _end = . ; + . = ALIGN(4096); + pgtable = . ; + . = . + 4096 * 6; + _heap = .; + } +} diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr index 1ed9d791f86..bd1429ce193 100644 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ b/arch/x86_64/boot/compressed/vmlinux.scr @@ -1,9 +1,10 @@ SECTIONS { - .data : { + .text.compressed : { input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - input_data_end = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . - 4; + input_data_end = .; } } diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 770940cc010..e9e33f94969 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -51,6 +51,7 @@ #include <asm/boot.h> #include <asm/e820.h> #include <asm/page.h> +#include <asm/setup.h> /* Signature words to ensure LILO loaded us right */ #define SIG1 0xAA55 @@ -80,7 +81,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0204 # header version number (>= 0x0105) + .word 0x0206 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -155,7 +156,20 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # low memory 0x10000 or higher. ramdisk_max: .long 0xffffffff - +kernel_alignment: .long 0x200000 # physical addr alignment required for + # protected mode relocatable kernel +#ifdef CONFIG_RELOCATABLE +relocatable_kernel: .byte 1 +#else +relocatable_kernel: .byte 0 +#endif +pad2: .byte 0 +pad3: .word 0 + +cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, + #added with boot protocol + #version 2.06 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 @@ -290,64 +304,10 @@ loader_ok: movw %cs,%ax movw %ax,%ds - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define SSE_MASK ((1<<25)|(1<<26)) -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)) -#define REQUIRED_MASK2 (1<<29) - - pushfl /* standard way to check for cpuid */ - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz no_longmode /* cpu has no cpuid */ - movl $0x0,%eax - cpuid - cmpl $0x1,%eax - jb no_longmode /* no cpuid 1 */ - xor %di,%di - cmpl $0x68747541,%ebx /* AuthenticAMD */ - jnz noamd - cmpl $0x69746e65,%edx - jnz noamd - cmpl $0x444d4163,%ecx - jnz noamd - mov $1,%di /* cpu is from AMD */ -noamd: - movl $0x1,%eax - cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz no_longmode - movl $0x80000000,%eax - cpuid - cmpl $0x80000001,%eax - jb no_longmode /* no extended cpuid */ - movl $0x80000001,%eax - cpuid - andl $REQUIRED_MASK2,%edx - xorl $REQUIRED_MASK2,%edx - jnz no_longmode -sse_test: - movl $1,%eax - cpuid - andl $SSE_MASK,%edx - cmpl $SSE_MASK,%edx - je sse_ok - test %di,%di - jz no_longmode /* only try to force SSE on AMD */ - movl $0xc0010015,%ecx /* HWCR */ - rdmsr - btr $15,%eax /* enable SSE */ - wrmsr - xor %di,%di /* don't loop */ - jmp sse_test /* try again */ + call verify_cpu + testl %eax,%eax + jz sse_ok + no_longmode: call beep lea long_mode_panic,%si @@ -357,7 +317,8 @@ no_longmode_loop: long_mode_panic: .string "Your CPU does not support long mode. Use a 32bit distribution." .byte 0 - + +#include "../kernel/verify_cpu.S" sse_ok: popw %ds @@ -846,7 +807,7 @@ gdt_48: # Include video setup & detection code -#include "video.S" +#include "../../i386/boot/video.S" # Setup signature -- must be last setup_sig1: .word SIG1 diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S deleted file mode 100644 index 6090516c9c7..00000000000 --- a/arch/x86_64/boot/video.S +++ /dev/null @@ -1,2043 +0,0 @@ -/* video.S - * - * Display adapter & video mode setup, version 2.13 (14-May-99) - * - * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz> - * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson - * - * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999 - * - * For further information, look at Documentation/svga.txt. - * - */ - -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Enable compacting of mode table */ -#define CONFIG_VIDEO_COMPACT - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Enable local mode list */ -#undef CONFIG_VIDEO_LOCAL - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* Hack that lets you force specific BIOS mode ID and specific dimensions */ -#undef CONFIG_VIDEO_GFX_HACK -#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ -#define VIDEO_GFX_BIOS_BX 0x0102 -#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ - -/* This code uses an extended set of video mode numbers. These include: - * Aliases for standard modes - * NORMAL_VGA (-1) - * EXTENDED_VGA (-2) - * ASK_VGA (-3) - * Video modes numbered by menu position -- NOT RECOMMENDED because of lack - * of compatibility when extending the table. These are between 0x00 and 0xff. - */ -#define VIDEO_FIRST_MENU 0x0000 - -/* Standard BIOS video modes (BIOS number + 0x0100) */ -#define VIDEO_FIRST_BIOS 0x0100 - -/* VESA BIOS video modes (VESA number + 0x0200) */ -#define VIDEO_FIRST_VESA 0x0200 - -/* Video7 special modes (BIOS number + 0x0900) */ -#define VIDEO_FIRST_V7 0x0900 - -/* Special video modes */ -#define VIDEO_FIRST_SPECIAL 0x0f00 -#define VIDEO_80x25 0x0f00 -#define VIDEO_8POINT 0x0f01 -#define VIDEO_80x43 0x0f02 -#define VIDEO_80x28 0x0f03 -#define VIDEO_CURRENT_MODE 0x0f04 -#define VIDEO_80x30 0x0f05 -#define VIDEO_80x34 0x0f06 -#define VIDEO_80x60 0x0f07 -#define VIDEO_GFX_HACK 0x0f08 -#define VIDEO_LAST_SPECIAL 0x0f09 - -/* Video modes given by resolution */ -#define VIDEO_FIRST_RESOLUTION 0x1000 - -/* The "recalculate timings" flag */ -#define VIDEO_RECALC 0x8000 - -/* Positions of various video parameters passed to the kernel */ -/* (see also include/linux/tty.h) */ -#define PARAM_CURSOR_POS 0x00 -#define PARAM_VIDEO_PAGE 0x04 -#define PARAM_VIDEO_MODE 0x06 -#define PARAM_VIDEO_COLS 0x07 -#define PARAM_VIDEO_EGA_BX 0x0a -#define PARAM_VIDEO_LINES 0x0e -#define PARAM_HAVE_VGA 0x0f -#define PARAM_FONT_POINTS 0x10 - -#define PARAM_LFB_WIDTH 0x12 -#define PARAM_LFB_HEIGHT 0x14 -#define PARAM_LFB_DEPTH 0x16 -#define PARAM_LFB_BASE 0x18 -#define PARAM_LFB_SIZE 0x1c -#define PARAM_LFB_LINELENGTH 0x24 -#define PARAM_LFB_COLORS 0x26 -#define PARAM_VESAPM_SEG 0x2e -#define PARAM_VESAPM_OFF 0x30 -#define PARAM_LFB_PAGES 0x32 -#define PARAM_VESA_ATTRIB 0x34 -#define PARAM_CAPABILITIES 0x36 - -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN -#define DO_STORE call store_screen -#else -#define DO_STORE -#endif /* CONFIG_VIDEO_RETAIN */ - -# This is the main entry point called by setup.S -# %ds *must* be pointing to the bootsector -video: pushw %ds # We use different segments - pushw %ds # FS contains original DS - popw %fs - pushw %cs # DS is equal to CS - popw %ds - pushw %cs # ES is equal to CS - popw %es - xorw %ax, %ax - movw %ax, %gs # GS is zero - cld - call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) -#ifdef CONFIG_VIDEO_SELECT - movw %fs:(0x01fa), %ax # User selected video mode - cmpw $ASK_VGA, %ax # Bring up the menu - jz vid2 - - call mode_set # Set the mode - jc vid1 - - leaw badmdt, %si # Invalid mode ID - call prtstr -vid2: call mode_menu -vid1: -#ifdef CONFIG_VIDEO_RETAIN - call restore_screen # Restore screen contents -#endif /* CONFIG_VIDEO_RETAIN */ - call store_edid -#endif /* CONFIG_VIDEO_SELECT */ - call mode_params # Store mode parameters - popw %ds # Restore original DS - ret - -# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. -basic_detect: - movb $0, %fs:(PARAM_HAVE_VGA) - movb $0x12, %ah # Check EGA/VGA - movb $0x10, %bl - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel - cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. - je basret - - incb adapter - movw $0x1a00, %ax # Check EGA or VGA? - int $0x10 - cmpb $0x1a, %al # 1a means VGA... - jne basret # anything else is EGA. - - incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA - incb adapter -basret: ret - -# Store the video mode parameters for later usage by the kernel. -# This is done by asking the BIOS except for the rows/columns -# parameters in the default 80x25 mode -- these are set directly, -# because some very obscure BIOSes supply insane values. -mode_params: -#ifdef CONFIG_VIDEO_SELECT - cmpb $0, graphic_mode - jnz mopar_gr -#endif - movb $0x03, %ah # Read cursor position - xorb %bh, %bh - int $0x10 - movw %dx, %fs:(PARAM_CURSOR_POS) - movb $0x0f, %ah # Read page/mode/width - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_PAGE) - movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width - cmpb $0x7, %al # MDA/HGA => segment differs - jnz mopar0 - - movw $0xb000, video_segment -mopar0: movw %gs:(0x485), %ax # Font size - movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) - movw force_size, %ax # Forced size? - orw %ax, %ax - jz mopar1 - - movb %ah, %fs:(PARAM_VIDEO_COLS) - movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -mopar1: movb $25, %al - cmpb $0, adapter # If we are on CGA/MDA/HGA, the - jz mopar2 # screen must have 25 lines. - - movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS - incb %al # location of max lines. -mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -#ifdef CONFIG_VIDEO_SELECT -# Fetching of VESA frame buffer parameters -mopar_gr: - leaw modelist+1024, %di - movb $0x23, %fs:(PARAM_HAVE_VGA) - movw 16(%di), %ax - movw %ax, %fs:(PARAM_LFB_LINELENGTH) - movw 18(%di), %ax - movw %ax, %fs:(PARAM_LFB_WIDTH) - movw 20(%di), %ax - movw %ax, %fs:(PARAM_LFB_HEIGHT) - movb 25(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_DEPTH) - movb 29(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_PAGES) - movl 40(%di), %eax - movl %eax, %fs:(PARAM_LFB_BASE) - movl 31(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS) - movl 35(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS+4) - movw 0(%di), %ax - movw %ax, %fs:(PARAM_VESA_ATTRIB) - -# get video mem size - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - xorl %eax, %eax - movw 18(%di), %ax - movl %eax, %fs:(PARAM_LFB_SIZE) - -# store mode capabilities - movl 10(%di), %eax - movl %eax, %fs:(PARAM_CAPABILITIES) - -# switching the DAC to 8-bit is for <= 8 bpp only - movw %fs:(PARAM_LFB_DEPTH), %ax - cmpw $8, %ax - jg dac_done - -# get DAC switching capability - xorl %eax, %eax - movb 10(%di), %al - testb $1, %al - jz dac_set - -# attempt to switch DAC to 8-bit - movw $0x4f08, %ax - movw $0x0800, %bx - int $0x10 - cmpw $0x004f, %ax - jne dac_set - movb %bh, dac_size # store actual DAC size - -dac_set: -# set color size to DAC size - movb dac_size, %al - movb %al, %fs:(PARAM_LFB_COLORS+0) - movb %al, %fs:(PARAM_LFB_COLORS+2) - movb %al, %fs:(PARAM_LFB_COLORS+4) - movb %al, %fs:(PARAM_LFB_COLORS+6) - -# set color offsets to 0 - movb $0, %fs:(PARAM_LFB_COLORS+1) - movb $0, %fs:(PARAM_LFB_COLORS+3) - movb $0, %fs:(PARAM_LFB_COLORS+5) - movb $0, %fs:(PARAM_LFB_COLORS+7) - -dac_done: -# get protected mode interface informations - movw $0x4f0a, %ax - xorw %bx, %bx - xorw %di, %di - int $0x10 - cmp $0x004f, %ax - jnz no_pm - - movw %es, %fs:(PARAM_VESAPM_SEG) - movw %di, %fs:(PARAM_VESAPM_OFF) -no_pm: ret - -# The video mode menu -mode_menu: - leaw keymsg, %si # "Return/Space/Timeout" message - call prtstr - call flush -nokey: call getkt - - cmpb $0x0d, %al # ENTER ? - je listm # yes - manual mode selection - - cmpb $0x20, %al # SPACE ? - je defmd1 # no - repeat - - call beep - jmp nokey - -defmd1: ret # No mode chosen? Default 80x25 - -listm: call mode_table # List mode table -listm0: leaw name_bann, %si # Print adapter name - call prtstr - movw card_name, %si - orw %si, %si - jnz an2 - - movb adapter, %al - leaw old_name, %si - orb %al, %al - jz an1 - - leaw ega_name, %si - decb %al - jz an1 - - leaw vga_name, %si - jmp an1 - -an2: call prtstr - leaw svga_name, %si -an1: call prtstr - leaw listhdr, %si # Table header - call prtstr - movb $0x30, %dl # DL holds mode number - leaw modelist, %si -lm1: cmpw $ASK_VGA, (%si) # End? - jz lm2 - - movb %dl, %al # Menu selection number - call prtchr - call prtsp2 - lodsw - call prthw # Mode ID - call prtsp2 - movb 0x1(%si), %al - call prtdec # Rows - movb $0x78, %al # the letter 'x' - call prtchr - lodsw - call prtdec # Columns - movb $0x0d, %al # New line - call prtchr - movb $0x0a, %al - call prtchr - incb %dl # Next character - cmpb $0x3a, %dl - jnz lm1 - - movb $0x61, %dl - jmp lm1 - -lm2: leaw prompt, %si # Mode prompt - call prtstr - leaw edit_buf, %di # Editor buffer -lm3: call getkey - cmpb $0x0d, %al # Enter? - jz lment - - cmpb $0x08, %al # Backspace? - jz lmbs - - cmpb $0x20, %al # Printable? - jc lm3 - - cmpw $edit_buf+4, %di # Enough space? - jz lm3 - - stosb - call prtchr - jmp lm3 - -lmbs: cmpw $edit_buf, %di # Backspace - jz lm3 - - decw %di - movb $0x08, %al - call prtchr - call prtspc - movb $0x08, %al - call prtchr - jmp lm3 - -lment: movb $0, (%di) - leaw crlft, %si - call prtstr - leaw edit_buf, %si - cmpb $0, (%si) # Empty string = default mode - jz lmdef - - cmpb $0, 1(%si) # One character = menu selection - jz mnusel - - cmpw $0x6373, (%si) # "scan" => mode scanning - jnz lmhx - - cmpw $0x6e61, 2(%si) - jz lmscan - -lmhx: xorw %bx, %bx # Else => mode ID in hex -lmhex: lodsb - orb %al, %al - jz lmuse1 - - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmhx1 - - subb $7, %al - andb $0xdf, %al - cmpb $10, %al - jc lmbad - - cmpb $16, %al - jnc lmbad - -lmhx1: shlw $4, %bx - orb %al, %bl - jmp lmhex - -lmuse1: movw %bx, %ax - jmp lmuse - -mnusel: lodsb # Menu selection - xorb %ah, %ah - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmuse - - cmpb $0x61-0x30, %al - jc lmbad - - subb $0x61-0x30-10, %al - cmpb $36, %al - jnc lmbad - -lmuse: call mode_set - jc lmdef - -lmbad: leaw unknt, %si - call prtstr - jmp lm2 -lmscan: cmpb $0, adapter # Scanning only on EGA/VGA - jz lmbad - - movw $0, mt_end # Scanning of modes is - movb $1, scanning # done as new autodetection. - call mode_table - jmp listm0 -lmdef: ret - -# Additional parts of mode_set... (relative jumps, you know) -setv7: # Video7 extended modes - DO_STORE - subb $VIDEO_FIRST_V7>>8, %bh - movw $0x6f05, %ax - int $0x10 - stc - ret - -_setrec: jmp setrec # Ugly... -_set_80x25: jmp set_80x25 - -# Aliases for backward compatibility. -setalias: - movw $VIDEO_80x25, %ax - incw %bx - jz mode_set - - movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al - incw %bx - jnz setbad # Fall-through! - -# Setting of user mode (AX=mode ID) => CF=success -mode_set: - movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S - movw %ax, %bx - cmpb $0xff, %ah - jz setalias - - testb $VIDEO_RECALC>>8, %ah - jnz _setrec - - cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah - jnc setres - - cmpb $VIDEO_FIRST_SPECIAL>>8, %ah - jz setspc - - cmpb $VIDEO_FIRST_V7>>8, %ah - jz setv7 - - cmpb $VIDEO_FIRST_VESA>>8, %ah - jnc check_vesa - - orb %ah, %ah - jz setmenu - - decb %ah - jz setbios - -setbad: clc - movb $0, do_restore # The screen needn't be restored - ret - -setvesa: - DO_STORE - subb $VIDEO_FIRST_VESA>>8, %bh - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz setbad # AH=0 if OK - - stc - ret - -setbios: - DO_STORE - int $0x10 # Standard BIOS mode set call - pushw %bx - movb $0x0f, %ah # Check if really set - int $0x10 - popw %bx - cmpb %bl, %al - jnz setbad - - stc - ret - -setspc: xorb %bh, %bh # Set special mode - cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl - jnc setbad - - addw %bx, %bx - jmp *spec_inits(%bx) - -setmenu: - orb %al, %al # 80x25 is an exception - jz _set_80x25 - - pushw %bx # Set mode chosen from menu - call mode_table # Build the mode table - popw %ax - shlw $2, %ax - addw %ax, %si - cmpw %di, %si - jnc setbad - - movw (%si), %ax # Fetch mode ID -_m_s: jmp mode_set - -setres: pushw %bx # Set mode chosen by resolution - call mode_table - popw %bx - xchgb %bl, %bh -setr1: lodsw - cmpw $ASK_VGA, %ax # End of the list? - jz setbad - - lodsw - cmpw %bx, %ax - jnz setr1 - - movw -4(%si), %ax # Fetch mode ID - jmp _m_s - -check_vesa: -#ifdef CONFIG_FIRMWARE_EDID - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - cmpw $0x004f, %ax - jnz setbad - - movw 4(%di), %ax - movw %ax, vbe_version -#endif - leaw modelist+1024, %di - subb $VIDEO_FIRST_VESA>>8, %bh - movw %bx, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - addb $VIDEO_FIRST_VESA>>8, %bh - cmpw $0x004f, %ax - jnz setbad - - movb (%di), %al # Check capabilities. - andb $0x19, %al - cmpb $0x09, %al - jz setvesa # This is a text mode - - movb (%di), %al # Check capabilities. - andb $0x99, %al - cmpb $0x99, %al - jnz _setbad # Doh! No linear frame buffer. - - subb $VIDEO_FIRST_VESA>>8, %bh - orw $0x4000, %bx # Use linear frame buffer - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz _setbad # AH=0 if OK - - movb $1, graphic_mode # flag graphic mode - movb $0, do_restore # no screen restore - stc - ret - -_setbad: jmp setbad # Ugly... - -# Recalculate vertical display end registers -- this fixes various -# inconsistencies of extended modes on many adapters. Called when -# the VIDEO_RECALC flag is set in the mode ID. - -setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode - call mode_set - jnc rct3 - - movw %gs:(0x485), %ax # Font size in pixels - movb %gs:(0x484), %bl # Number of rows - incb %bl - mulb %bl # Number of visible - decw %ax # scan lines - 1 - movw $0x3d4, %dx - movw %ax, %bx - movb $0x12, %al # Lower 8 bits - movb %bl, %ah - outw %ax, %dx - movb $0x07, %al # Bits 8 and 9 in the overflow register - call inidx - xchgb %al, %ah - andb $0xbd, %ah - shrb %bh - jnc rct1 - orb $0x02, %ah -rct1: shrb %bh - jnc rct2 - orb $0x40, %ah -rct2: movb $0x07, %al - outw %ax, %dx - stc -rct3: ret - -# Table of routines for setting of the special modes. -spec_inits: - .word set_80x25 - .word set_8pixel - .word set_80x43 - .word set_80x28 - .word set_current - .word set_80x30 - .word set_80x34 - .word set_80x60 - .word set_gfx - -# Set the 80x25 mode. If already set, do nothing. -set_80x25: - movw $0x5019, force_size # Override possibly broken BIOS -use_80x25: -#ifdef CONFIG_VIDEO_400_HACK - movw $0x1202, %ax # Force 400 scan lines - movb $0x30, %bl - int $0x10 -#else - movb $0x0f, %ah # Get current mode ID - int $0x10 - cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available - jz st80 # on CGA/MDA/HGA and is also available on EGAM - - cmpw $0x5003, %ax # Unknown mode, force 80x25 color - jnz force3 - -st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 - jz set80 - - movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. - orb %al, %al # Some buggy BIOS'es set 0 rows - jz set80 - - cmpb $24, %al # It's hopefully correct - jz set80 -#endif /* CONFIG_VIDEO_400_HACK */ -force3: DO_STORE - movw $0x0003, %ax # Forced set - int $0x10 -set80: stc - ret - -# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. -set_8pixel: - DO_STORE - call use_80x25 # The base is 80x25 -set_8pt: - movw $0x1112, %ax # Use 8x8 font - xorb %bl, %bl - int $0x10 - movw $0x1200, %ax # Use alternate print screen - movb $0x20, %bl - int $0x10 - movw $0x1201, %ax # Turn off cursor emulation - movb $0x34, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 6-7 - movw $0x0607, %cx - int $0x10 -set_current: - stc - ret - -# Set the 80x28 mode. This mode works on all VGA's, because it's a standard -# 80x25 mode with 14-point fonts instead of 16-point. -set_80x28: - DO_STORE - call use_80x25 # The base is 80x25 -set14: movw $0x1111, %ax # Use 9x14 font - xorb %bl, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 11-12 - movw $0x0b0c, %cx - int $0x10 - stc - ret - -# Set the 80x43 mode. This mode is works on all VGA's. -# It's a 350-scanline mode with 8-pixel font. -set_80x43: - DO_STORE - movw $0x1201, %ax # Set 350 scans - movb $0x30, %bl - int $0x10 - movw $0x0003, %ax # Reset video mode - int $0x10 - jmp set_8pt # Use 8-pixel font - -# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. -set_80x30: - call use_80x25 # Start with real 80x25 - DO_STORE - movw $0x3cc, %dx # Get CRTC port - inb %dx, %al - movb $0xd4, %dl - rorb %al # Mono or color? - jc set48a - - movb $0xb4, %dl -set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) - call outidx - movw $0x0b06, %ax # Vertical total - call outidx - movw $0x3e07, %ax # (Vertical) overflow - call outidx - movw $0xea10, %ax # Vertical sync start - call outidx - movw $0xdf12, %ax # Vertical display end - call outidx - movw $0xe715, %ax # Vertical blank start - call outidx - movw $0x0416, %ax # Vertical blank end - call outidx - pushw %dx - movb $0xcc, %dl # Misc output register (read) - inb %dx, %al - movb $0xc2, %dl # (write) - andb $0x0d, %al # Preserve clock select bits and color bit - orb $0xe2, %al # Set correct sync polarity - outb %al, %dx - popw %dx - movw $0x501e, force_size - stc # That's all. - ret - -# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. -set_80x34: - call set_80x30 # Set 480 scans - call set14 # And 14-pt font - movw $0xdb12, %ax # VGA vertical display end - movw $0x5022, force_size -setvde: call outidx - stc - ret - -# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. -set_80x60: - call set_80x30 # Set 480 scans - call set_8pt # And 8-pt font - movw $0xdf12, %ax # VGA vertical display end - movw $0x503c, force_size - jmp setvde - -# Special hack for ThinkPad graphics -set_gfx: -#ifdef CONFIG_VIDEO_GFX_HACK - movw $VIDEO_GFX_BIOS_AX, %ax - movw $VIDEO_GFX_BIOS_BX, %bx - int $0x10 - movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size - stc -#endif - ret - -#ifdef CONFIG_VIDEO_RETAIN - -# Store screen contents to temporary buffer. -store_screen: - cmpb $0, do_restore # Already stored? - jnz stsr - - testb $CAN_USE_HEAP, loadflags # Have we space for storing? - jz stsr - - pushw %ax - pushw %bx - pushw force_size # Don't force specific size - movw $0, force_size - call mode_params # Obtain params of current mode - popw force_size - movb %fs:(PARAM_VIDEO_LINES), %ah - movb %fs:(PARAM_VIDEO_COLS), %al - movw %ax, %bx # BX=dimensions - mulb %ah - movw %ax, %cx # CX=number of characters - addw %ax, %ax # Calculate image size - addw $modelist+1024+4, %ax - cmpw heap_end_ptr, %ax - jnc sts1 # Unfortunately, out of memory - - movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params - leaw modelist+1024, %di - stosw - movw %bx, %ax - stosw - pushw %ds # Store the screen - movw video_segment, %ds - xorw %si, %si - rep - movsw - popw %ds - incb do_restore # Screen will be restored later -sts1: popw %bx - popw %ax -stsr: ret - -# Restore screen contents from temporary buffer. -restore_screen: - cmpb $0, do_restore # Has the screen been stored? - jz res1 - - call mode_params # Get parameters of current mode - movb %fs:(PARAM_VIDEO_LINES), %cl - movb %fs:(PARAM_VIDEO_COLS), %ch - leaw modelist+1024, %si # Screen buffer - lodsw # Set cursor position - movw %ax, %dx - cmpb %cl, %dh - jc res2 - - movb %cl, %dh - decb %dh -res2: cmpb %ch, %dl - jc res3 - - movb %ch, %dl - decb %dl -res3: movb $0x02, %ah - movb $0x00, %bh - int $0x10 - lodsw # Display size - movb %ah, %dl # DL=number of lines - movb $0, %ah # BX=phys. length of orig. line - movw %ax, %bx - cmpb %cl, %dl # Too many? - jc res4 - - pushw %ax - movb %dl, %al - subb %cl, %al - mulb %bl - addw %ax, %si - addw %ax, %si - popw %ax - movb %cl, %dl -res4: cmpb %ch, %al # Too wide? - jc res5 - - movb %ch, %al # AX=width of src. line -res5: movb $0, %cl - xchgb %ch, %cl - movw %cx, %bp # BP=width of dest. line - pushw %es - movw video_segment, %es - xorw %di, %di # Move the data - addw %bx, %bx # Convert BX and BP to _bytes_ - addw %bp, %bp -res6: pushw %si - pushw %di - movw %ax, %cx - rep - movsw - popw %di - popw %si - addw %bp, %di - addw %bx, %si - decb %dl - jnz res6 - - popw %es # Done -res1: ret -#endif /* CONFIG_VIDEO_RETAIN */ - -# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) -outidx: outb %al, %dx - pushw %ax - movb %ah, %al - incw %dx - outb %al, %dx - decw %dx - popw %ax - ret - -# Build the table of video modes (stored after the setup.S code at the -# `modelist' label. Each video mode record looks like: -# .word MODE-ID (our special mode ID (see above)) -# .byte rows (number of rows) -# .byte columns (number of columns) -# Returns address of the end of the table in DI, the end is marked -# with a ASK_VGA ID. -mode_table: - movw mt_end, %di # Already filled? - orw %di, %di - jnz mtab1x - - leaw modelist, %di # Store standard modes: - movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) - stosl - movb adapter, %al # CGA/MDA/HGA -- no more modes - orb %al, %al - jz mtabe - - decb %al - jnz mtabv - - movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode - stosl - jmp mtabe - -mtab1x: jmp mtab1 - -mtabv: leaw vga_modes, %si # All modes for std VGA - movw $vga_modes_end-vga_modes, %cx - rep # I'm unable to use movsw as I don't know how to store a half - movsb # of the expression above to cx without using explicit shr. - - cmpb $0, scanning # Mode scan requested? - jz mscan1 - - call mode_scan -mscan1: - -#ifdef CONFIG_VIDEO_LOCAL - call local_modes -#endif /* CONFIG_VIDEO_LOCAL */ - -#ifdef CONFIG_VIDEO_VESA - call vesa_modes # Detect VESA VGA modes -#endif /* CONFIG_VIDEO_VESA */ - -#ifdef CONFIG_VIDEO_SVGA - cmpb $0, scanning # Bypass when scanning - jnz mscan2 - - call svga_modes # Detect SVGA cards & modes -mscan2: -#endif /* CONFIG_VIDEO_SVGA */ - -mtabe: - -#ifdef CONFIG_VIDEO_COMPACT - leaw modelist, %si - movw %di, %dx - movw %si, %di -cmt1: cmpw %dx, %si # Scan all modes - jz cmt2 - - leaw modelist, %bx # Find in previous entries - movw 2(%si), %cx -cmt3: cmpw %bx, %si - jz cmt4 - - cmpw 2(%bx), %cx # Found => don't copy this entry - jz cmt5 - - addw $4, %bx - jmp cmt3 - -cmt4: movsl # Copy entry - jmp cmt1 - -cmt5: addw $4, %si # Skip entry - jmp cmt1 - -cmt2: -#endif /* CONFIG_VIDEO_COMPACT */ - - movw $ASK_VGA, (%di) # End marker - movw %di, mt_end -mtab1: leaw modelist, %si # SI=mode list, DI=list end -ret0: ret - -# Modes usable on all standard VGAs -vga_modes: - .word VIDEO_8POINT - .word 0x5032 # 80x50 - .word VIDEO_80x43 - .word 0x502b # 80x43 - .word VIDEO_80x28 - .word 0x501c # 80x28 - .word VIDEO_80x30 - .word 0x501e # 80x30 - .word VIDEO_80x34 - .word 0x5022 # 80x34 - .word VIDEO_80x60 - .word 0x503c # 80x60 -#ifdef CONFIG_VIDEO_GFX_HACK - .word VIDEO_GFX_HACK - .word VIDEO_GFX_DUMMY_RESOLUTION -#endif - -vga_modes_end: -# Detect VESA modes. - -#ifdef CONFIG_VIDEO_VESA -vesa_modes: - cmpb $2, adapter # VGA only - jnz ret0 - - movw %di, %bp # BP=original mode table end - addw $0x200, %di # Buffer space - movw $0x4f00, %ax # VESA Get card info call - int $0x10 - movw %bp, %di - cmpw $0x004f, %ax # Successful? - jnz ret0 - - cmpw $0x4556, 0x200(%di) - jnz ret0 - - cmpw $0x4153, 0x202(%di) - jnz ret0 - - movw $vesa_name, card_name # Set name to "VESA VGA" - pushw %gs - lgsw 0x20e(%di), %si # GS:SI=mode list - movw $128, %cx # Iteration limit -vesa1: -# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. -# XXX: lodsw %gs:(%si), %ax # Get next mode in the list - gs; lodsw - cmpw $0xffff, %ax # End of the table? - jz vesar - - cmpw $0x0080, %ax # Check validity of mode ID - jc vesa2 - - orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff - jz vesan # Certain BIOSes report 0x80-0xff! - - cmpw $0x0800, %ax - jnc vesae - -vesa2: pushw %cx - movw %ax, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - movw %cx, %bx # BX=mode number - addb $VIDEO_FIRST_VESA>>8, %bh - popw %cx - cmpw $0x004f, %ax - jnz vesan # Don't report errors (buggy BIOSES) - - movb (%di), %al # Check capabilities. We require - andb $0x19, %al # a color text mode. - cmpb $0x09, %al - jnz vesan - - cmpw $0xb800, 8(%di) # Standard video memory address required - jnz vesan - - testb $2, (%di) # Mode characteristics supplied? - movw %bx, (%di) # Store mode number - jz vesa3 - - xorw %dx, %dx - movw 0x12(%di), %bx # Width - orb %bh, %bh - jnz vesan - - movb %bl, 0x3(%di) - movw 0x14(%di), %ax # Height - orb %ah, %ah - jnz vesan - - movb %al, 2(%di) - mulb %bl - cmpw $8193, %ax # Small enough for Linux console driver? - jnc vesan - - jmp vesaok - -vesa3: subw $0x8108, %bx # This mode has no detailed info specified, - jc vesan # so it must be a standard VESA mode. - - cmpw $5, %bx - jnc vesan - - movw vesa_text_mode_table(%bx), %ax - movw %ax, 2(%di) -vesaok: addw $4, %di # The mode is valid. Store it. -vesan: loop vesa1 # Next mode. Limit exceeded => error -vesae: leaw vesaer, %si - call prtstr - movw %bp, %di # Discard already found modes. -vesar: popw %gs - ret - -# Dimensions of standard VESA text modes -vesa_text_mode_table: - .byte 60, 80 # 0108 - .byte 25, 132 # 0109 - .byte 43, 132 # 010A - .byte 50, 132 # 010B - .byte 60, 132 # 010C -#endif /* CONFIG_VIDEO_VESA */ - -# Scan for video modes. A bit dirty, but should work. -mode_scan: - movw $0x0100, %cx # Start with mode 0 -scm1: movb $0, %ah # Test the mode - movb %cl, %al - int $0x10 - movb $0x0f, %ah - int $0x10 - cmpb %cl, %al - jnz scm2 # Mode not set - - movw $0x3c0, %dx # Test if it's a text mode - movb $0x10, %al # Mode bits - call inidx - andb $0x03, %al - jnz scm2 - - movb $0xce, %dl # Another set of mode bits - movb $0x06, %al - call inidx - shrb %al - jc scm2 - - movb $0xd4, %dl # Cursor location - movb $0x0f, %al - call inidx - orb %al, %al - jnz scm2 - - movw %cx, %ax # Ok, store the mode - stosw - movb %gs:(0x484), %al # Number of rows - incb %al - stosb - movw %gs:(0x44a), %ax # Number of columns - stosb -scm2: incb %cl - jns scm1 - - movw $0x0003, %ax # Return back to mode 3 - int $0x10 - ret - -tstidx: outw %ax, %dx # OUT DX,AX and inidx -inidx: outb %al, %dx # Read from indexed VGA register - incw %dx # AL=index, DX=index reg port -> AL=data - inb %dx, %al - decw %dx - ret - -# Try to detect type of SVGA card and supply (usually approximate) video -# mode table for it. - -#ifdef CONFIG_VIDEO_SVGA -svga_modes: - leaw svga_table, %si # Test all known SVGA adapters -dosvga: lodsw - movw %ax, %bp # Default mode table - orw %ax, %ax - jz didsv1 - - lodsw # Pointer to test routine - pushw %si - pushw %di - pushw %es - movw $0xc000, %bx - movw %bx, %es - call *%ax # Call test routine - popw %es - popw %di - popw %si - orw %bp, %bp - jz dosvga - - movw %bp, %si # Found, copy the modes - movb svga_prefix, %ah -cpsvga: lodsb - orb %al, %al - jz didsv - - stosw - movsw - jmp cpsvga - -didsv: movw %si, card_name # Store pointer to card name -didsv1: ret - -# Table of all known SVGA cards. For each card, we store a pointer to -# a table of video modes supported by the card and a pointer to a routine -# used for testing of presence of the card. The video mode table is always -# followed by the name of the card or the chipset. -svga_table: - .word ati_md, ati_test - .word oak_md, oak_test - .word paradise_md, paradise_test - .word realtek_md, realtek_test - .word s3_md, s3_test - .word chips_md, chips_test - .word video7_md, video7_test - .word cirrus5_md, cirrus5_test - .word cirrus6_md, cirrus6_test - .word cirrus1_md, cirrus1_test - .word ahead_md, ahead_test - .word everex_md, everex_test - .word genoa_md, genoa_test - .word trident_md, trident_test - .word tseng_md, tseng_test - .word 0 - -# Test routines and mode tables: - -# S3 - The test algorithm was taken from the SuperProbe package -# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org -s3_test: - movw $0x0f35, %cx # we store some constants in cl/ch - movw $0x03d4, %dx - movb $0x38, %al - call inidx - movb %al, %bh # store current CRT-register 0x38 - movw $0x0038, %ax - call outidx # disable writing to special regs - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl # save the current value of CRT reg 0x35 - andb $0xf0, %al # clear bits 0-3 - movb %al, %ah - movb %cl, %al # and write it to CRT reg 0x35 - call outidx - call inidx # now read it back - andb %ch, %al # clear the upper 4 bits - jz s3_2 # the first test failed. But we have a - - movb %bl, %ah # second chance - movb %cl, %al - call outidx - jmp s3_1 # do the other tests - -s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 - orb %bl, %ah # set the upper 4 bits of ah with the orig value - call outidx # write ... - call inidx # ... and reread - andb %cl, %al # turn off the upper 4 bits - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al # setting lower 4 bits was successful => bad - je no_s3 # writing is allowed => this is not an S3 - -s3_1: movw $0x4838, %ax # allow writing to special regs by putting - call outidx # magic number into CRT-register 0x38 - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl - andb $0xf0, %al - movb %al, %ah - movb %cl, %al - call outidx - call inidx - andb %ch, %al - jnz no_s3 # no, we can't write => no S3 - - movw %cx, %ax - orb %bl, %ah - call outidx - call inidx - andb %ch, %al - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al - jne no_s31 # writing not possible => no S3 - movb $0x30, %al - call inidx # now get the S3 id ... - leaw idS3, %di - movw $0x10, %cx - repne - scasb - je no_s31 - - movb %bh, %ah - movb $0x38, %al - jmp s3rest - -no_s3: movb $0x35, %al # restore CRT register 0x35 - movb %bl, %ah - call outidx -no_s31: xorw %bp, %bp # Detection failed -s3rest: movb %bh, %ah - movb $0x38, %al # restore old value of CRT register 0x38 - jmp outidx - -idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 - .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 - -s3_md: .byte 0x54, 0x2b, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0 - .ascii "S3" - .byte 0 - -# ATI cards. -ati_test: - leaw idati, %si - movw $0x31, %di - movw $0x09, %cx - repe - cmpsb - je atiok - - xorw %bp, %bp -atiok: ret - -idati: .ascii "761295520" - -ati_md: .byte 0x23, 0x19, 0x84 - .byte 0x33, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x64 - .byte 0x21, 0x19, 0x64 - .byte 0x58, 0x21, 0x50 - .byte 0x5b, 0x1e, 0x50 - .byte 0 - .ascii "ATI" - .byte 0 - -# AHEAD -ahead_test: - movw $0x200f, %ax - movw $0x3ce, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - cmpb $0x20, %al - je isahed - - cmpb $0x21, %al - je isahed - - xorw %bp, %bp -isahed: ret - -ahead_md: - .byte 0x22, 0x2c, 0x84 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x2f, 0x32, 0xa0 - .byte 0x32, 0x22, 0x50 - .byte 0x34, 0x42, 0x50 - .byte 0 - .ascii "Ahead" - .byte 0 - -# Chips & Tech. -chips_test: - movw $0x3c3, %dx - inb %dx, %al - orb $0x10, %al - outb %al, %dx - movw $0x104, %dx - inb %dx, %al - movb %al, %bl - movw $0x3c3, %dx - inb %dx, %al - andb $0xef, %al - outb %al, %dx - cmpb $0xa5, %bl - je cantok - - xorw %bp, %bp -cantok: ret - -chips_md: - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x32, 0x84 - .byte 0 - .ascii "Chips & Technologies" - .byte 0 - -# Cirrus Logic 5X0 -cirrus1_test: - movw $0x3d4, %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - xorb %al, %al - outb %al, %dx - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - xorb %ah, %ah - shlb $4, %al - movw %ax, %cx - movb %bh, %al - shrb $4, %al - addw %ax, %cx - shlw $8, %cx - addw $6, %cx - movw %cx, %ax - movw $0x3c4, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - andb %al, %al - jnz nocirr - - movb %bh, %al - outb %al, %dx - inb %dx, %al - cmpb $0x01, %al - je iscirr - -nocirr: xorw %bp, %bp -iscirr: movw $0x3d4, %dx - movb %bl, %al - xorb %ah, %ah - shlw $8, %ax - addw $0x0c, %ax - outw %ax, %dx - ret - -cirrus1_md: - .byte 0x1f, 0x19, 0x84 - .byte 0x20, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x84 - .byte 0x31, 0x25, 0x64 - .byte 0 - .ascii "Cirrus Logic 5X0" - .byte 0 - -# Cirrus Logic 54XX -cirrus5_test: - movw $0x3c4, %dx - movb $6, %al - call inidx - movb %al, %bl # BL=backup - movw $6, %ax - call tstidx - cmpb $0x0f, %al - jne c5fail - - movw $0x1206, %ax - call tstidx - cmpb $0x12, %al - jne c5fail - - movb $0x1e, %al - call inidx - movb %al, %bh - movb %bh, %ah - andb $0xc0, %ah - movb $0x1e, %al - call tstidx - andb $0x3f, %al - jne c5xx - - movb $0x1e, %al - movb %bh, %ah - orb $0x3f, %ah - call tstidx - xorb $0x3f, %al - andb $0x3f, %al -c5xx: pushf - movb $0x1e, %al - movb %bh, %ah - outw %ax, %dx - popf - je c5done - -c5fail: xorw %bp, %bp -c5done: movb $6, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus5_md: - .byte 0x14, 0x19, 0x84 - .byte 0x54, 0x2b, 0x84 - .byte 0 - .ascii "Cirrus Logic 54XX" - .byte 0 - -# Cirrus Logic 64XX -- no known extra modes, but must be identified, because -# it's misidentified by the Ahead test. -cirrus6_test: - movw $0x3ce, %dx - movb $0x0a, %al - call inidx - movb %al, %bl # BL=backup - movw $0xce0a, %ax - call tstidx - orb %al, %al - jne c2fail - - movw $0xec0a, %ax - call tstidx - cmpb $0x01, %al - jne c2fail - - movb $0xaa, %al - call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. - shrb $4, %al - subb $4, %al - jz c6done - - decb %al - jz c6done - - subb $2, %al - jz c6done - - decb %al - jz c6done - -c2fail: xorw %bp, %bp -c6done: movb $0x0a, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus6_md: - .byte 0 - .ascii "Cirrus Logic 64XX" - .byte 0 - -# Everex / Trident -everex_test: - movw $0x7000, %ax - xorw %bx, %bx - int $0x10 - cmpb $0x70, %al - jne noevrx - - shrw $4, %dx - cmpw $0x678, %dx - je evtrid - - cmpw $0x236, %dx - jne evrxok - -evtrid: leaw trident_md, %bp -evrxok: ret - -noevrx: xorw %bp, %bp - ret - -everex_md: - .byte 0x03, 0x22, 0x50 - .byte 0x04, 0x3c, 0x50 - .byte 0x07, 0x2b, 0x64 - .byte 0x08, 0x4b, 0x64 - .byte 0x0a, 0x19, 0x84 - .byte 0x0b, 0x2c, 0x84 - .byte 0x16, 0x1e, 0x50 - .byte 0x18, 0x1b, 0x64 - .byte 0x21, 0x40, 0xa0 - .byte 0x40, 0x1e, 0x84 - .byte 0 - .ascii "Everex/Trident" - .byte 0 - -# Genoa. -genoa_test: - leaw idgenoa, %si # Check Genoa 'clues' - xorw %ax, %ax - movb %es:(0x37), %al - movw %ax, %di - movw $0x04, %cx - decw %si - decw %di -l1: incw %si - incw %di - movb (%si), %al - testb %al, %al - jz l2 - - cmpb %es:(%di), %al -l2: loope l1 - orw %cx, %cx - je isgen - - xorw %bp, %bp -isgen: ret - -idgenoa: .byte 0x77, 0x00, 0x99, 0x66 - -genoa_md: - .byte 0x58, 0x20, 0x50 - .byte 0x5a, 0x2a, 0x64 - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x1d, 0x84 - .byte 0x62, 0x20, 0x84 - .byte 0x63, 0x2c, 0x84 - .byte 0x64, 0x3c, 0x84 - .byte 0x6b, 0x4f, 0x64 - .byte 0x72, 0x3c, 0x50 - .byte 0x74, 0x42, 0x50 - .byte 0x78, 0x4b, 0x64 - .byte 0 - .ascii "Genoa" - .byte 0 - -# OAK -oak_test: - leaw idoakvga, %si - movw $0x08, %di - movw $0x08, %cx - repe - cmpsb - je isoak - - xorw %bp, %bp -isoak: ret - -idoakvga: .ascii "OAK VGA " - -oak_md: .byte 0x4e, 0x3c, 0x50 - .byte 0x4f, 0x3c, 0x84 - .byte 0x50, 0x19, 0x84 - .byte 0x51, 0x2b, 0x84 - .byte 0 - .ascii "OAK" - .byte 0 - -# WD Paradise. -paradise_test: - leaw idparadise, %si - movw $0x7d, %di - movw $0x04, %cx - repe - cmpsb - je ispara - - xorw %bp, %bp -ispara: ret - -idparadise: .ascii "VGA=" - -paradise_md: - .byte 0x41, 0x22, 0x50 - .byte 0x47, 0x1c, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0x54, 0x2c, 0x84 - .byte 0 - .ascii "Paradise" - .byte 0 - -# Trident. -trident_test: - movw $0x3c4, %dx - movb $0x0e, %al - outb %al, %dx - incw %dx - inb %dx, %al - xchgb %al, %ah - xorb %al, %al - outb %al, %dx - inb %dx, %al - xchgb %ah, %al - movb %al, %bl # Strange thing ... in the book this wasn't - andb $0x02, %bl # necessary but it worked on my card which - jz setb2 # is a trident. Without it the screen goes - # blurred ... - andb $0xfd, %al - jmp clrb2 - -setb2: orb $0x02, %al -clrb2: outb %al, %dx - andb $0x0f, %ah - cmpb $0x02, %ah - je istrid - - xorw %bp, %bp -istrid: ret - -trident_md: - .byte 0x50, 0x1e, 0x50 - .byte 0x51, 0x2b, 0x50 - .byte 0x52, 0x3c, 0x50 - .byte 0x57, 0x19, 0x84 - .byte 0x58, 0x1e, 0x84 - .byte 0x59, 0x2b, 0x84 - .byte 0x5a, 0x3c, 0x84 - .byte 0 - .ascii "Trident" - .byte 0 - -# Tseng. -tseng_test: - movw $0x3cd, %dx - inb %dx, %al # Could things be this simple ! :-) - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - movb %al, %ah - movb %bl, %al - outb %al, %dx - cmpb $0x55, %ah - je istsen - -isnot: xorw %bp, %bp -istsen: ret - -tseng_md: - .byte 0x26, 0x3c, 0x50 - .byte 0x2a, 0x28, 0x64 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x22, 0x2c, 0x84 - .byte 0x21, 0x3c, 0x84 - .byte 0 - .ascii "Tseng" - .byte 0 - -# Video7. -video7_test: - movw $0x3cc, %dx - inb %dx, %al - movw $0x3b4, %dx - andb $0x01, %al - jz even7 - - movw $0x3d4, %dx -even7: movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - decw %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - movb %bl, %al - outb %al, %dx - movb $0x55, %al - xorb $0xea, %al - cmpb %bh, %al - jne isnot - - movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching - ret - -video7_md: - .byte 0x40, 0x2b, 0x50 - .byte 0x43, 0x3c, 0x50 - .byte 0x44, 0x3c, 0x64 - .byte 0x41, 0x19, 0x84 - .byte 0x42, 0x2c, 0x84 - .byte 0x45, 0x1c, 0x84 - .byte 0 - .ascii "Video 7" - .byte 0 - -# Realtek VGA -realtek_test: - leaw idrtvga, %si - movw $0x45, %di - movw $0x0b, %cx - repe - cmpsb - je isrt - - xorw %bp, %bp -isrt: ret - -idrtvga: .ascii "REALTEK VGA" - -realtek_md: - .byte 0x1a, 0x3c, 0x50 - .byte 0x1b, 0x19, 0x84 - .byte 0x1c, 0x1e, 0x84 - .byte 0x1d, 0x2b, 0x84 - .byte 0x1e, 0x3c, 0x84 - .byte 0 - .ascii "REALTEK" - .byte 0 - -#endif /* CONFIG_VIDEO_SVGA */ - -# User-defined local mode table (VGA only) -#ifdef CONFIG_VIDEO_LOCAL -local_modes: - leaw local_mode_table, %si -locm1: lodsw - orw %ax, %ax - jz locm2 - - stosw - movsw - jmp locm1 - -locm2: ret - -# This is the table of local video modes which can be supplied manually -# by the user. Each entry consists of mode ID (word) and dimensions -# (byte for column count and another byte for row count). These modes -# are placed before all SVGA and VESA modes and override them if table -# compacting is enabled. The table must end with a zero word followed -# by NUL-terminated video adapter name. -local_mode_table: - .word 0x0100 # Example: 40x25 - .byte 25,40 - .word 0 - .ascii "Local" - .byte 0 -#endif /* CONFIG_VIDEO_LOCAL */ - -# Read a key and return the ASCII code in al, scan code in ah -getkey: xorb %ah, %ah - int $0x16 - ret - -# Read a key with a timeout of 30 seconds. -# The hardware clock is used to get the time. -getkt: call gettime - addb $30, %al # Wait 30 seconds - cmpb $60, %al - jl lminute - - subb $60, %al -lminute: - movb %al, %cl -again: movb $0x01, %ah - int $0x16 - jnz getkey # key pressed, so get it - - call gettime - cmpb %cl, %al - jne again - - movb $0x20, %al # timeout, return `space' - ret - -# Flush the keyboard buffer -flush: movb $0x01, %ah - int $0x16 - jz empty - - xorb %ah, %ah - int $0x16 - jmp flush - -empty: ret - -# Print hexadecimal number. -prthw: pushw %ax - movb %ah, %al - call prthb - popw %ax -prthb: pushw %ax - shrb $4, %al - call prthn - popw %ax - andb $0x0f, %al -prthn: cmpb $0x0a, %al - jc prth1 - - addb $0x07, %al -prth1: addb $0x30, %al - jmp prtchr - -# Print decimal number in al -prtdec: pushw %ax - pushw %cx - xorb %ah, %ah - movb $0x0a, %cl - idivb %cl - cmpb $0x09, %al - jbe lt100 - - call prtdec - jmp skip10 - -lt100: addb $0x30, %al - call prtchr -skip10: movb %ah, %al - addb $0x30, %al - call prtchr - popw %cx - popw %ax - ret - -store_edid: -#ifdef CONFIG_FIRMWARE_EDID - pushw %es # just save all registers - pushw %ax - pushw %bx - pushw %cx - pushw %dx - pushw %di - - pushw %fs - popw %es - - movl $0x13131313, %eax # memset block with 0x13 - movw $32, %cx - movw $0x140, %di - cld - rep - stosl - - cmpw $0x0200, vbe_version # only do EDID on >= VBE2.0 - jl no_edid - - pushw %es # save ES - xorw %di, %di # Report Capability - pushw %di - popw %es # ES:DI must be 0:0 - movw $0x4f15, %ax - xorw %bx, %bx - xorw %cx, %cx - int $0x10 - popw %es # restore ES - - cmpb $0x00, %ah # call successful - jne no_edid - - cmpb $0x4f, %al # function supported - jne no_edid - - movw $0x4f15, %ax # do VBE/DDC - movw $0x01, %bx - movw $0x00, %cx - movw $0x01, %dx - movw $0x140, %di - int $0x10 - -no_edid: - popw %di # restore all registers - popw %dx - popw %cx - popw %bx - popw %ax - popw %es -#endif - ret - -# VIDEO_SELECT-only variables -mt_end: .word 0 # End of video mode table if built -edit_buf: .space 6 # Line editor buffer -card_name: .word 0 # Pointer to adapter name -scanning: .byte 0 # Performing mode scan -do_restore: .byte 0 # Screen contents altered during mode change -svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes -graphic_mode: .byte 0 # Graphic mode with a linear frame buffer -dac_size: .byte 6 # DAC bit depth -vbe_version: .word 0 # VBE bios version - -# Status messages -keymsg: .ascii "Press <RETURN> to see video modes available, " - .ascii "<SPACE> to continue or wait 30 secs" - .byte 0x0d, 0x0a, 0 - -listhdr: .byte 0x0d, 0x0a - .ascii "Mode: COLSxROWS:" - -crlft: .byte 0x0d, 0x0a, 0 - -prompt: .byte 0x0d, 0x0a - .asciz "Enter mode number or `scan': " - -unknt: .asciz "Unknown mode ID. Try again." - -badmdt: .ascii "You passed an undefined mode number." - .byte 0x0d, 0x0a, 0 - -vesaer: .ascii "Error: Scanning of VESA modes failed. Please " - .ascii "report to <mj@ucw.cz>." - .byte 0x0d, 0x0a, 0 - -old_name: .asciz "CGA/MDA/HGA" - -ega_name: .asciz "EGA" - -svga_name: .ascii " " - -vga_name: .asciz "VGA" - -vesa_name: .asciz "VESA" - -name_bann: .asciz "Video adapter: " -#endif /* CONFIG_VIDEO_SELECT */ - -# Other variables: -adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA -video_segment: .word 0xb800 # Video memory segment -force_size: .word 0 # Use this size instead of the one in BIOS vars diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index b26378815b9..941a7e3aa5f 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.21-rc3 -# Wed Mar 7 15:29:47 2007 +# Linux kernel version: 2.6.21-git3 +# Tue May 1 07:30:48 2007 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -118,11 +118,11 @@ CONFIG_X86_PC=y # CONFIG_X86_VSMP is not set # CONFIG_MK8 is not set # CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_MCORE2=y +# CONFIG_GENERIC_CPU is not set +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -174,6 +174,7 @@ CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y # CONFIG_CC_STACKPROTECTOR is not set @@ -182,7 +183,6 @@ CONFIG_HZ_250=y # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 -# CONFIG_REORDER is not set CONFIG_K8_NB=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y @@ -218,7 +218,6 @@ CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y # CONFIG_ACPI_ASUS is not set -# CONFIG_ACPI_IBM is not set # CONFIG_ACPI_TOSHIBA is not set CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set @@ -243,7 +242,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -299,7 +298,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NETDEBUG is not set CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -334,6 +332,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_IPV6_OPTIMISTIC_DAD is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set @@ -389,6 +388,13 @@ CONFIG_IPV6_SIT=y # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set # CONFIG_IEEE80211 is not set # @@ -409,10 +415,6 @@ CONFIG_FW_LOADER=y # Connector - unified userspace <-> kernelspace linker # # CONFIG_CONNECTOR is not set - -# -# Memory Technology Devices (MTD) -# # CONFIG_MTD is not set # @@ -459,6 +461,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set +# CONFIG_THINKPAD_ACPI is not set # # ATA/ATAPI/MFM/RLL support @@ -494,7 +497,6 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -525,7 +527,6 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y # CONFIG_BLK_DEV_HD is not set # @@ -584,11 +585,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set # CONFIG_SCSI_AIC94XX is not set # CONFIG_SCSI_ARCMSR is not set -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=y -CONFIG_MEGARAID_MAILBOX=y +# CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set -CONFIG_MEGARAID_SAS=y +# CONFIG_MEGARAID_SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set @@ -608,6 +607,7 @@ CONFIG_MEGARAID_SAS=y # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set # @@ -636,6 +636,7 @@ CONFIG_SATA_ACPI=y # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set # CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set # CONFIG_PATA_CMD64X is not set # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set @@ -687,7 +688,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set -CONFIG_FUSION_SAS=y +# CONFIG_FUSION_SAS is not set CONFIG_FUSION_MAX_SGE=128 # CONFIG_FUSION_CTL is not set @@ -700,19 +701,22 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set # -# Device Drivers +# Controllers +# + +# +# Texas Instruments PCILynx requires I2C # -# CONFIG_IEEE1394_PCILYNX is not set CONFIG_IEEE1394_OHCI1394=y # -# Protocol Drivers +# Protocols # # CONFIG_IEEE1394_VIDEO1394 is not set # CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y @@ -775,7 +779,8 @@ CONFIG_TULIP=y # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -# CONFIG_AMD8111_ETH is not set +CONFIG_AMD8111_ETH=y +# CONFIG_AMD8111E_NAPI is not set # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y @@ -837,9 +842,10 @@ CONFIG_S2IO=m # CONFIG_TR is not set # -# Wireless LAN (non-hamradio) +# Wireless LAN # -# CONFIG_NET_RADIO is not set +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set # # Wan interfaces @@ -853,7 +859,6 @@ CONFIG_S2IO=m # CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -987,57 +992,7 @@ CONFIG_HPET_MMAP=y # # I2C support # -CONFIG_I2C=m -CONFIG_I2C_CHARDEV=m - -# -# I2C Algorithms -# -# CONFIG_I2C_ALGOBIT is not set -# CONFIG_I2C_ALGOPCF is not set -# CONFIG_I2C_ALGOPCA is not set - -# -# I2C Hardware Bus support -# -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_I801 is not set -# CONFIG_I2C_I810 is not set -# CONFIG_I2C_PIIX4 is not set -CONFIG_I2C_ISA=m -# CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PASEMI is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_VIA is not set -# CONFIG_I2C_VIAPRO is not set -# CONFIG_I2C_VOODOO3 is not set -# CONFIG_I2C_PCA_ISA is not set - -# -# Miscellaneous I2C Chip support -# -# CONFIG_SENSORS_DS1337 is not set -# CONFIG_SENSORS_DS1374 is not set -# CONFIG_SENSORS_EEPROM is not set -# CONFIG_SENSORS_PCF8574 is not set -# CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_MAX6875 is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_I2C is not set # # SPI support @@ -1053,54 +1008,8 @@ CONFIG_I2C_ISA=m # # Hardware Monitoring support # -CONFIG_HWMON=y +# CONFIG_HWMON is not set # CONFIG_HWMON_VID is not set -# CONFIG_SENSORS_ABITUGURU is not set -# CONFIG_SENSORS_ADM1021 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1029 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_K8TEMP is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_FSCHER is not set -# CONFIG_SENSORS_FSCPOS is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -# CONFIG_SENSORS_IT87 is not set -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_PC87427 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47M192 is not set -CONFIG_SENSORS_SMSC47B397=m -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT1211 is not set -# CONFIG_SENSORS_VT8231 is not set -# CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83791D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83793 is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_HDAPS is not set -# CONFIG_HWMON_DEBUG_CHIP is not set # # Multifunction device drivers @@ -1147,8 +1056,9 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OBSOLETE_OSS is not set +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set +# CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set @@ -1163,6 +1073,14 @@ CONFIG_HID=y # CONFIG_HID_DEBUG is not set # +# USB Input Devices +# +CONFIG_USB_HID=y +# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set + +# # USB support # CONFIG_USB_ARCH_HAS_HCD=y @@ -1175,6 +1093,7 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y +# CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set @@ -1225,10 +1144,6 @@ CONFIG_USB_STORAGE=y # # USB Input Devices # -CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set # CONFIG_USB_ACECAD is not set @@ -1556,7 +1471,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 071100ea125..185399baaf6 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -5,6 +5,11 @@ * This tricks binfmt_elf.c into loading 32bit binaries using lots * of ugly preprocessor tricks. Talk about very very poor man's inheritance. */ +#define __ASM_X86_64_ELF_H 1 + +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + #include <linux/types.h> #include <linux/stddef.h> #include <linux/rwsem.h> @@ -50,9 +55,6 @@ struct elf_phdr; #undef ELF_ARCH #define ELF_ARCH EM_386 -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 - #define ELF_DATA ELFDATA2LSB #define USE_ELF_CORE_DUMP 1 @@ -136,7 +138,7 @@ struct elf_prpsinfo #define user user32 -#define __ASM_X86_64_ELF_H 1 +#undef elf_read_implies_exec #define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) //#include <asm/ia32.h> #include <linux/elf.h> diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 796df6992f6..c48087db6f7 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -481,11 +481,7 @@ ia32_sys_call_table: .quad sys_symlink .quad sys_lstat .quad sys_readlink /* 85 */ -#ifdef CONFIG_IA32_AOUT .quad sys_uselib -#else - .quad quiet_ni_syscall -#endif .quad sys_swapon .quad sys_reboot .quad compat_sys_old_readdir diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 568ff0df89e..fc4419ff035 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -13,6 +13,7 @@ #include <asm/proto.h> #include <asm/tlbflush.h> #include <asm/ia32_unistd.h> +#include <asm/vsyscall32.h> extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index bb47e86f3d0..4d94c51803d 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,8 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ + perfctr-watchdog.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -21,8 +22,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o -obj-y += io_apic.o mpparse.o \ - genapic.o genapic_cluster.o genapic_flat.o +obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o @@ -58,3 +58,4 @@ i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o pcspeaker-y += ../../i386/kernel/pcspeaker.o +perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index e1548fbe95a..195b7034a14 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -60,19 +60,6 @@ extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); -static pgd_t low_ptr; - -static void init_low_mapping(void) -{ - pgd_t *slot0 = pgd_offset(current->mm, 0UL); - low_ptr = *slot0; - /* FIXME: We're playing with the current task's page tables here, which - * is potentially dangerous on SMP systems. - */ - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); - local_flush_tlb(); -} - /** * acpi_save_state_mem - save kernel state * @@ -81,8 +68,6 @@ static void init_low_mapping(void) */ int acpi_save_state_mem(void) { - init_low_mapping(); - memcpy((void *)acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); acpi_copy_wakeup_routine(acpi_wakeup_address); @@ -95,8 +80,6 @@ int acpi_save_state_mem(void) */ void acpi_restore_state_mem(void) { - set_pgd(pgd_offset(current->mm, 0UL), low_ptr); - local_flush_tlb(); } /** @@ -109,10 +92,11 @@ void acpi_restore_state_mem(void) */ void __init acpi_reserve_bootmem(void) { - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); + "ACPI: Wakeup code way too big, will crash on attempt" + " to suspend\n"); } static int __init acpi_sleep_setup(char *str) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 185faa911db..8550a6ffa27 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -1,6 +1,7 @@ .text #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/pgtable.h> #include <asm/page.h> #include <asm/msr.h> @@ -30,22 +31,28 @@ wakeup_code: cld # setup data segment movw %cs, %ax - movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss - mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + # Private stack is needed for ASUS board + mov $(wakeup_stack - wakeup_code), %sp - pushl $0 # Kill any dangerous flags + pushl $0 # Kill any dangerous flags popfl movl real_magic - wakeup_code, %eax cmpl $0x12345678, %eax jne bogus_real_magic + call verify_cpu # Verify the cpu supports long + # mode + testl %eax, %eax + jnz no_longmode + testl $1, video_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax - movw %ax, %ds # Bios might have played with that + movw %ax, %ds # Bios might have played with that movw %ax, %ss 1: @@ -61,12 +68,15 @@ wakeup_code: movb $0xa2, %al ; outb %al, $0x80 - lidt %ds:idt_48a - wakeup_code - xorl %eax, %eax - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl $(gdta - wakeup_code), %eax - movl %eax, gdt_48a +2 - wakeup_code + mov %ds, %ax # Find 32bit wakeup_code addr + movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) + shll $4, %esi + # Fix up the vectors + addl %esi, wakeup_32_vector - wakeup_code + addl %esi, wakeup_long64_vector - wakeup_code + addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer + + lidtl %ds:idt_48a - wakeup_code lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate @@ -75,86 +85,63 @@ wakeup_code: jmp 1f 1: - .byte 0x66, 0xea # prefix + jmpi-opcode - .long wakeup_32 - __START_KERNEL_map - .word __KERNEL_CS + ljmpl *(wakeup_32_vector - wakeup_code) + + .balign 4 +wakeup_32_vector: + .long wakeup_32 - wakeup_code + .word __KERNEL32_CS, 0 .code32 wakeup_32: # Running in this code, but at low address; paging is not yet turned on. movb $0xa5, %al ; outb %al, $0x80 - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe bogus_cpu - wbinvd - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc bogus_cpu - movl %edx,%edi - - movw $__KERNEL_DS, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - - movw $__KERNEL_DS, %ax - movw %ax, %ss + movl $__KERNEL_DS, %eax + movl %eax, %ds - mov $(wakeup_stack - __START_KERNEL_map), %esp - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic + movw $0x0e00 + 'i', %ds:(0xb8012) + movb $0xa8, %al ; outb %al, $0x80; /* * Prepare for entering 64bits mode */ - /* Enable PAE mode and PGE */ + /* Enable PAE */ xorl %eax, %eax btsl $5, %eax - btsl $7, %eax movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax + leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax movl %eax, %cr3 - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - /* Fool rdmsr and reset %eax to avoid dependences */ - xorl %eax, %eax + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + /* Enable Long Mode */ + xorl %eax, %eax btsl $_EFER_LME, %eax - /* Enable System Call */ - btsl $_EFER_SCE, %eax - /* No Execute supported? */ + /* No Execute supported? */ btl $20,%edi jnc 1f btsl $_EFER_NX, %eax -1: /* Make changes effective */ +1: movl $MSR_EFER, %ecx + xorl %edx, %edx wrmsr - wbinvd xorl %eax, %eax btsl $31, %eax /* Enable paging and in turn activate Long Mode */ btsl $0, %eax /* Enable protected mode */ - btsl $1, %eax /* Enable MP */ - btsl $4, %eax /* Enable ET */ - btsl $5, %eax /* Enable NE */ - btsl $16, %eax /* Enable WP */ - btsl $18, %eax /* Enable AM */ /* Make changes effective */ movl %eax, %cr0 + /* At this point: CR4.PAE must be 1 CS.L must be 0 @@ -162,11 +149,6 @@ wakeup_32: Next instruction must be a branch This must be on identity-mapped page */ - jmp reach_compatibility_mode -reach_compatibility_mode: - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - /* * At this point we're in long mode but in 32bit compatibility mode * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn @@ -174,24 +156,19 @@ reach_compatibility_mode: * the new gdt/idt that has __KERNEL_CS with CS.L = 1. */ - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - - /* Load new GDT with the 64bit segment using 32bit descriptor */ - movl $(pGDT32 - __START_KERNEL_map), %eax - lgdt (%eax) - - movl $(wakeup_jumpvector - __START_KERNEL_map), %eax /* Finally jump in 64bit mode */ - ljmp *(%eax) + ljmp *(wakeup_long64_vector - wakeup_code)(%esi) -wakeup_jumpvector: - .long wakeup_long64 - __START_KERNEL_map - .word __KERNEL_CS + .balign 4 +wakeup_long64_vector: + .long wakeup_long64 - wakeup_code + .word __KERNEL_CS, 0 .code64 - /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ + /* Hooray, we are in Long 64-bit mode (but still running in + * low memory) + */ wakeup_long64: /* * We must switch to a new descriptor in kernel space for the GDT @@ -199,7 +176,15 @@ wakeup_long64: * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr - __START_KERNEL_map + lgdt cpu_gdt_descr + + movw $0x0e00 + 'n', %ds:(0xb8014) + movb $0xa9, %al ; outb %al, $0x80 + + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic movw $0x0e00 + 'u', %ds:(0xb8016) @@ -211,75 +196,58 @@ wakeup_long64: movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_esp, %rsp + movq saved_rsp, %rsp movw $0x0e00 + 'x', %ds:(0xb8018) - movq saved_ebx, %rbx - movq saved_edi, %rdi - movq saved_esi, %rsi - movq saved_ebp, %rbp + movq saved_rbx, %rbx + movq saved_rdi, %rdi + movq saved_rsi, %rsi + movq saved_rbp, %rbp movw $0x0e00 + '!', %ds:(0xb801a) - movq saved_eip, %rax + movq saved_rip, %rax jmp *%rax .code32 .align 64 gdta: + /* Its good to keep gdt in sync with one in trampoline.S */ .word 0, 0, 0, 0 # dummy - - .word 0, 0, 0, 0 # unused - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -# this is 64bit descriptor for code - .word 0xFFFF - .word 0 - .word 0x9A00 # code read/exec - .word 0x00AF # as above, but it is long mode and with D=0 + /* ??? Why I need the accessed bit set in order for this to work? */ + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS idt_48a: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48a: - .word 0x8000 # gdt limit=2048, + .word 0x800 # gdt limit=2048, # 256 GDT entries - .word 0, 0 # gdt base (filled in later) - + .long gdta - wakeup_code # gdt base (relocated in later) -real_save_gdt: .word 0 - .quad 0 real_magic: .quad 0 video_mode: .quad 0 video_flags: .quad 0 +.code16 bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 + movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic -bogus_32_magic: +.code64 +bogus_64_magic: movb $0xb3,%al ; outb %al,$0x80 - jmp bogus_32_magic + jmp bogus_64_magic -bogus_31_magic: - movb $0xb1,%al ; outb %al,$0x80 - jmp bogus_31_magic - -bogus_cpu: - movb $0xbc,%al ; outb %al,$0x80 - jmp bogus_cpu +.code16 +no_longmode: + movb $0xbc,%al ; outb %al,$0x80 + jmp no_longmode +#include "../verify_cpu.S" /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes @@ -301,6 +269,7 @@ bogus_cpu: #define VIDEO_FIRST_V7 0x0900 # Setting of user mode (AX=mode ID) => CF=success +.code16 mode_seta: movw %ax, %bx #if 0 @@ -346,21 +315,18 @@ check_vesaa: _setbada: jmp setbada - .code64 -bogus_magic: - movw $0x0e00 + 'B', %ds:(0xb8018) - jmp bogus_magic - -bogus_magic2: - movw $0x0e00 + '2', %ds:(0xb8018) - jmp bogus_magic2 - - wakeup_stack_begin: # Stack grows down .org 0xff0 wakeup_stack: # Just below end of page +.org 0x1000 +ENTRY(wakeup_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + ENTRY(wakeup_end) ## @@ -373,28 +339,11 @@ ENTRY(wakeup_end) # # Returned address is location of code in low memory (past data and stack) # + .code64 ENTRY(acpi_copy_wakeup_routine) pushq %rax - pushq %rcx pushq %rdx - sgdt saved_gdt - sidt saved_idt - sldt saved_ldt - str saved_tss - - movq %cr3, %rdx - movq %rdx, saved_cr3 - movq %cr4, %rdx - movq %rdx, saved_cr4 - movq %cr0, %rdx - movq %rdx, saved_cr0 - sgdt real_save_gdt - wakeup_start (,%rdi) - movl $MSR_EFER, %ecx - rdmsr - movl %eax, saved_efer - movl %edx, saved_efer2 - movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) movl acpi_video_flags, %edx @@ -403,21 +352,13 @@ ENTRY(acpi_copy_wakeup_routine) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic - - # make sure %cr4 is set correctly (features, etc) - movl saved_cr4 - __START_KERNEL_map, %eax - movq %rax, %cr4 + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic - movl saved_cr0 - __START_KERNEL_map, %eax - movq %rax, %cr0 - jmp 1f # Flush pipelines -1: # restore the regs we used popq %rdx - popq %rcx popq %rax ENTRY(do_suspend_lowlevel_s4bios) ret @@ -450,13 +391,13 @@ do_suspend_lowlevel: movq %r15, saved_context_r15(%rip) pushfq ; popq saved_context_eflags(%rip) - movq $.L97, saved_eip(%rip) + movq $.L97, saved_rip(%rip) - movq %rsp,saved_esp - movq %rbp,saved_ebp - movq %rbx,saved_ebx - movq %rdi,saved_edi - movq %rsi,saved_esi + movq %rsp,saved_rsp + movq %rbp,saved_rbp + movq %rbx,saved_rbx + movq %rdi,saved_rdi + movq %rsi,saved_rsi addq $8, %rsp movl $3, %edi @@ -503,25 +444,12 @@ do_suspend_lowlevel: .data ALIGN -ENTRY(saved_ebp) .quad 0 -ENTRY(saved_esi) .quad 0 -ENTRY(saved_edi) .quad 0 -ENTRY(saved_ebx) .quad 0 +ENTRY(saved_rbp) .quad 0 +ENTRY(saved_rsi) .quad 0 +ENTRY(saved_rdi) .quad 0 +ENTRY(saved_rbx) .quad 0 -ENTRY(saved_eip) .quad 0 -ENTRY(saved_esp) .quad 0 +ENTRY(saved_rip) .quad 0 +ENTRY(saved_rsp) .quad 0 ENTRY(saved_magic) .quad 0 - -ALIGN -# saved registers -saved_gdt: .quad 0,0 -saved_idt: .quad 0,0 -saved_ldt: .quad 0 -saved_tss: .quad 0 - -saved_cr0: .quad 0 -saved_cr3: .quad 0 -saved_cr4: .quad 0 -saved_efer: .quad 0 -saved_efer2: .quad 0 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index b487396c4c5..a52af582059 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -51,7 +51,6 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { - pg_data_t *nd0 = NODE_DATA(0); u32 aper_size; void *p; @@ -65,12 +64,12 @@ static u32 __init allocate_aperture(void) * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ - p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); + p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { printk("Cannot allocate aperture memory hole (%p,%uK)\n", p, aper_size>>10); if (p) - free_bootmem_node(nd0, __pa(p), aper_size); + free_bootmem(__pa(p), aper_size); return 0; } printk("Mapping aperture over %d KB of RAM @ %lx\n", diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index bd3e45d47c3..d198f7d82e5 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -68,6 +68,28 @@ int using_apic_timer __read_mostly = 0; static void apic_pm_activate(void); +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned int safe_apic_wait_icr_idle(void) +{ + unsigned int send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; @@ -817,14 +839,15 @@ static void setup_APIC_timer(unsigned int clocks) static int __init calibrate_APIC_clock(void) { - int apic, apic_start, tsc, tsc_start; + unsigned apic, apic_start; + unsigned long tsc, tsc_start; int result; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(4000000000); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -835,15 +858,15 @@ static int __init calibrate_APIC_clock(void) } else #endif { - rdtscl(tsc_start); + rdtscll(tsc_start); do { apic = apic_read(APIC_TMCCT); - rdtscl(tsc); + rdtscll(tsc); } while ((tsc - tsc_start) < TICK_COUNT && - (apic - apic_start) < TICK_COUNT); + (apic_start - apic) < TICK_COUNT); - result = (apic_start - apic) * 1000L * cpu_khz / + result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); } printk("result %d\n", result); diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 96687e2beb2..778953bc636 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -21,6 +21,14 @@ #define BLANK() asm volatile("\n->" : : ) +#define __NO_STUBS 1 +#undef __SYSCALL +#undef _ASM_X86_64_UNISTD_H_ +#define __SYSCALL(nr, sym) [nr] = 1, +static char syscalls[] = { +#include <asm/unistd.h> +}; + int main(void) { #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) @@ -71,5 +79,7 @@ int main(void) DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); return 0; } diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c new file mode 100644 index 00000000000..12b585b5345 --- /dev/null +++ b/arch/x86_64/kernel/bugs.c @@ -0,0 +1,21 @@ +/* + * arch/x86_64/kernel/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/alternative.h> +#include <asm/processor.h> + +void __init check_bugs(void) +{ + identify_cpu(&boot_cpu_data); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); +} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a490fabfcf4..be8965427a9 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -25,7 +25,7 @@ #include <asm/bootsetup.h> #include <asm/sections.h> -struct e820map e820 __initdata; +struct e820map e820; /* * PFN of last memory page. @@ -98,7 +98,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) * This function checks if any part of the range <start,end> is mapped * with type. */ -int __meminit +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; @@ -112,6 +112,7 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type) } return 0; } +EXPORT_SYMBOL_GPL(e820_any_mapped); /* * This function checks if the entire range <start,end> is mapped with type. diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index fede55a5399..990d9c218a5 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -71,18 +71,6 @@ static void __init ati_bugs(void) } } -static void intel_bugs(void) -{ - u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); - -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - struct chipset { u16 vendor; void (*f)(void); @@ -92,7 +80,6 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, - { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 47b6d90349d..92213d2b7c1 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -11,11 +11,10 @@ #ifdef __i386__ #include <asm/setup.h> -#define VGABASE (__ISA_IO_base + 0xb8000) #else #include <asm/bootsetup.h> -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif +#define VGABASE (__ISA_IO_base + 0xb8000) static int max_ypos = 25, max_xpos = 80; static int current_ypos = 25, current_xpos = 0; @@ -176,7 +175,7 @@ static noinline long simnow(long cmd, long a, long b, long c) return ret; } -void __init simnow_init(char *str) +static void __init simnow_init(char *str) { char *fn = "klog"; if (*str == '=') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed4350ced3d..fa984b53e7e 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -701,6 +701,7 @@ END(spurious_interrupt) CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* push real oldrax to the rdi slot */ CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -710,6 +711,7 @@ END(spurious_interrupt) XCPT_FRAME pushq %rax CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -817,6 +819,7 @@ paranoid_schedule\trace: */ KPROBE_ENTRY(error_entry) _frame RDI + CFI_REL_OFFSET rax,0 /* rdi slot contains rax, oldrax contains error code */ cld subq $14*8,%rsp @@ -824,6 +827,7 @@ KPROBE_ENTRY(error_entry) movq %rsi,13*8(%rsp) CFI_REL_OFFSET rsi,RSI movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + CFI_REGISTER rax,rsi movq %rdx,12*8(%rsp) CFI_REL_OFFSET rdx,RDX movq %rcx,11*8(%rsp) @@ -857,6 +861,7 @@ error_swapgs: swapgs error_sti: movq %rdi,RDI(%rsp) + CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist deleted file mode 100644 index 7ae18ec1245..00000000000 --- a/arch/x86_64/kernel/functionlist +++ /dev/null @@ -1,1284 +0,0 @@ -*(.text.flush_thread) -*(.text.check_poison_obj) -*(.text.copy_page) -*(.text.__set_personality) -*(.text.gart_map_sg) -*(.text.kmem_cache_free) -*(.text.find_get_page) -*(.text._raw_spin_lock) -*(.text.ide_outb) -*(.text.unmap_vmas) -*(.text.copy_page_range) -*(.text.kprobe_handler) -*(.text.__handle_mm_fault) -*(.text.__d_lookup) -*(.text.copy_user_generic) -*(.text.__link_path_walk) -*(.text.get_page_from_freelist) -*(.text.kmem_cache_alloc) -*(.text.drive_cmd_intr) -*(.text.ia32_setup_sigcontext) -*(.text.huge_pte_offset) -*(.text.do_page_fault) -*(.text.page_remove_rmap) -*(.text.release_pages) -*(.text.ide_end_request) -*(.text.__mutex_lock_slowpath) -*(.text.__find_get_block) -*(.text.kfree) -*(.text.vfs_read) -*(.text._raw_spin_unlock) -*(.text.free_hot_cold_page) -*(.text.fget_light) -*(.text.schedule) -*(.text.memcmp) -*(.text.touch_atime) -*(.text.__might_sleep) -*(.text.__down_read_trylock) -*(.text.arch_pick_mmap_layout) -*(.text.find_vma) -*(.text.__make_request) -*(.text.do_generic_mapping_read) -*(.text.mutex_lock_interruptible) -*(.text.__generic_file_aio_read) -*(.text._atomic_dec_and_lock) -*(.text.__wake_up_bit) -*(.text.add_to_page_cache) -*(.text.cache_alloc_debugcheck_after) -*(.text.vm_normal_page) -*(.text.mutex_debug_check_no_locks_freed) -*(.text.net_rx_action) -*(.text.__find_first_zero_bit) -*(.text.put_page) -*(.text._raw_read_lock) -*(.text.__delay) -*(.text.dnotify_parent) -*(.text.do_path_lookup) -*(.text.do_sync_read) -*(.text.do_lookup) -*(.text.bit_waitqueue) -*(.text.file_read_actor) -*(.text.strncpy_from_user) -*(.text.__pagevec_lru_add_active) -*(.text.fget) -*(.text.dput) -*(.text.__strnlen_user) -*(.text.inotify_inode_queue_event) -*(.text.rw_verify_area) -*(.text.ide_intr) -*(.text.inotify_dentry_parent_queue_event) -*(.text.permission) -*(.text.memscan) -*(.text.hpet_rtc_interrupt) -*(.text.do_mmap_pgoff) -*(.text.current_fs_time) -*(.text.vfs_getattr) -*(.text.kmem_flagcheck) -*(.text.mark_page_accessed) -*(.text.free_pages_and_swap_cache) -*(.text.generic_fillattr) -*(.text.__block_prepare_write) -*(.text.__set_page_dirty_nobuffers) -*(.text.link_path_walk) -*(.text.find_get_pages_tag) -*(.text.ide_do_request) -*(.text.__alloc_pages) -*(.text.generic_permission) -*(.text.mod_page_state_offset) -*(.text.free_pgd_range) -*(.text.generic_file_buffered_write) -*(.text.number) -*(.text.ide_do_rw_disk) -*(.text.__brelse) -*(.text.__mod_page_state_offset) -*(.text.rotate_reclaimable_page) -*(.text.find_vma_prepare) -*(.text.find_vma_prev) -*(.text.lru_cache_add_active) -*(.text.__kmalloc_track_caller) -*(.text.smp_invalidate_interrupt) -*(.text.handle_IRQ_event) -*(.text.__find_get_block_slow) -*(.text.do_wp_page) -*(.text.do_select) -*(.text.set_user_nice) -*(.text.sys_read) -*(.text.do_munmap) -*(.text.csum_partial) -*(.text.__do_softirq) -*(.text.may_open) -*(.text.getname) -*(.text.get_empty_filp) -*(.text.__fput) -*(.text.remove_mapping) -*(.text.filp_ctor) -*(.text.poison_obj) -*(.text.unmap_region) -*(.text.test_set_page_writeback) -*(.text.__do_page_cache_readahead) -*(.text.sock_def_readable) -*(.text.ide_outl) -*(.text.shrink_zone) -*(.text.rb_insert_color) -*(.text.get_request) -*(.text.sys_pread64) -*(.text.spin_bug) -*(.text.ide_outsl) -*(.text.mask_and_ack_8259A) -*(.text.filemap_nopage) -*(.text.page_add_file_rmap) -*(.text.find_lock_page) -*(.text.tcp_poll) -*(.text.__mark_inode_dirty) -*(.text.file_ra_state_init) -*(.text.generic_file_llseek) -*(.text.__pagevec_lru_add) -*(.text.page_cache_readahead) -*(.text.n_tty_receive_buf) -*(.text.zonelist_policy) -*(.text.vma_adjust) -*(.text.test_clear_page_dirty) -*(.text.sync_buffer) -*(.text.do_exit) -*(.text.__bitmap_weight) -*(.text.alloc_pages_current) -*(.text.get_unused_fd) -*(.text.zone_watermark_ok) -*(.text.cpuset_update_task_memory_state) -*(.text.__bitmap_empty) -*(.text.sys_munmap) -*(.text.__inode_dir_notify) -*(.text.__generic_file_aio_write_nolock) -*(.text.__pte_alloc) -*(.text.sys_select) -*(.text.vm_acct_memory) -*(.text.vfs_write) -*(.text.__lru_add_drain) -*(.text.prio_tree_insert) -*(.text.generic_file_aio_read) -*(.text.vma_merge) -*(.text.block_write_full_page) -*(.text.__page_set_anon_rmap) -*(.text.apic_timer_interrupt) -*(.text.release_console_sem) -*(.text.sys_write) -*(.text.sys_brk) -*(.text.dup_mm) -*(.text.read_current_timer) -*(.text.ll_rw_block) -*(.text.blk_rq_map_sg) -*(.text.dbg_userword) -*(.text.__block_commit_write) -*(.text.cache_grow) -*(.text.copy_strings) -*(.text.release_task) -*(.text.do_sync_write) -*(.text.unlock_page) -*(.text.load_elf_binary) -*(.text.__follow_mount) -*(.text.__getblk) -*(.text.do_sys_open) -*(.text.current_kernel_time) -*(.text.call_rcu) -*(.text.write_chan) -*(.text.vsnprintf) -*(.text.dummy_inode_setsecurity) -*(.text.submit_bh) -*(.text.poll_freewait) -*(.text.bio_alloc_bioset) -*(.text.skb_clone) -*(.text.page_waitqueue) -*(.text.__mutex_lock_interruptible_slowpath) -*(.text.get_index) -*(.text.csum_partial_copy_generic) -*(.text.bad_range) -*(.text.remove_vma) -*(.text.cp_new_stat) -*(.text.alloc_arraycache) -*(.text.test_clear_page_writeback) -*(.text.strsep) -*(.text.open_namei) -*(.text._raw_read_unlock) -*(.text.get_vma_policy) -*(.text.__down_write_trylock) -*(.text.find_get_pages) -*(.text.tcp_rcv_established) -*(.text.generic_make_request) -*(.text.__block_write_full_page) -*(.text.cfq_set_request) -*(.text.sys_inotify_init) -*(.text.split_vma) -*(.text.__mod_timer) -*(.text.get_options) -*(.text.vma_link) -*(.text.mpage_writepages) -*(.text.truncate_complete_page) -*(.text.tcp_recvmsg) -*(.text.sigprocmask) -*(.text.filemap_populate) -*(.text.sys_close) -*(.text.inotify_dev_queue_event) -*(.text.do_task_stat) -*(.text.__dentry_open) -*(.text.unlink_file_vma) -*(.text.__pollwait) -*(.text.packet_rcv_spkt) -*(.text.drop_buffers) -*(.text.free_pgtables) -*(.text.generic_file_direct_write) -*(.text.copy_process) -*(.text.netif_receive_skb) -*(.text.dnotify_flush) -*(.text.print_bad_pte) -*(.text.anon_vma_unlink) -*(.text.sys_mprotect) -*(.text.sync_sb_inodes) -*(.text.find_inode_fast) -*(.text.dummy_inode_readlink) -*(.text.putname) -*(.text.init_smp_flush) -*(.text.dbg_redzone2) -*(.text.sk_run_filter) -*(.text.may_expand_vm) -*(.text.generic_file_aio_write) -*(.text.find_next_zero_bit) -*(.text.file_kill) -*(.text.audit_getname) -*(.text.arch_unmap_area_topdown) -*(.text.alloc_page_vma) -*(.text.tcp_transmit_skb) -*(.text.rb_next) -*(.text.dbg_redzone1) -*(.text.generic_file_mmap) -*(.text.vfs_fstat) -*(.text.sys_time) -*(.text.page_lock_anon_vma) -*(.text.get_unmapped_area) -*(.text.remote_llseek) -*(.text.__up_read) -*(.text.fd_install) -*(.text.eventpoll_init_file) -*(.text.dma_alloc_coherent) -*(.text.create_empty_buffers) -*(.text.__mutex_unlock_slowpath) -*(.text.dup_fd) -*(.text.d_alloc) -*(.text.tty_ldisc_try) -*(.text.sys_stime) -*(.text.__rb_rotate_right) -*(.text.d_validate) -*(.text.rb_erase) -*(.text.path_release) -*(.text.memmove) -*(.text.invalidate_complete_page) -*(.text.clear_inode) -*(.text.cache_estimate) -*(.text.alloc_buffer_head) -*(.text.smp_call_function_interrupt) -*(.text.flush_tlb_others) -*(.text.file_move) -*(.text.balance_dirty_pages_ratelimited) -*(.text.vma_prio_tree_add) -*(.text.timespec_trunc) -*(.text.mempool_alloc) -*(.text.iget_locked) -*(.text.d_alloc_root) -*(.text.cpuset_populate_dir) -*(.text.anon_vma_prepare) -*(.text.sys_newstat) -*(.text.alloc_page_interleave) -*(.text.__path_lookup_intent_open) -*(.text.__pagevec_free) -*(.text.inode_init_once) -*(.text.free_vfsmnt) -*(.text.__user_walk_fd) -*(.text.cfq_idle_slice_timer) -*(.text.sys_mmap) -*(.text.sys_llseek) -*(.text.prio_tree_remove) -*(.text.filp_close) -*(.text.file_permission) -*(.text.vma_prio_tree_remove) -*(.text.tcp_ack) -*(.text.nameidata_to_filp) -*(.text.sys_lseek) -*(.text.percpu_counter_mod) -*(.text.igrab) -*(.text.__bread) -*(.text.alloc_inode) -*(.text.filldir) -*(.text.__rb_rotate_left) -*(.text.irq_affinity_write_proc) -*(.text.init_request_from_bio) -*(.text.find_or_create_page) -*(.text.tty_poll) -*(.text.tcp_sendmsg) -*(.text.ide_wait_stat) -*(.text.free_buffer_head) -*(.text.flush_signal_handlers) -*(.text.tcp_v4_rcv) -*(.text.nr_blockdev_pages) -*(.text.locks_remove_flock) -*(.text.__iowrite32_copy) -*(.text.do_filp_open) -*(.text.try_to_release_page) -*(.text.page_add_new_anon_rmap) -*(.text.kmem_cache_size) -*(.text.eth_type_trans) -*(.text.try_to_free_buffers) -*(.text.schedule_tail) -*(.text.proc_lookup) -*(.text.no_llseek) -*(.text.kfree_skbmem) -*(.text.do_wait) -*(.text.do_mpage_readpage) -*(.text.vfs_stat_fd) -*(.text.tty_write) -*(.text.705) -*(.text.sync_page) -*(.text.__remove_shared_vm_struct) -*(.text.__kfree_skb) -*(.text.sock_poll) -*(.text.get_request_wait) -*(.text.do_sigaction) -*(.text.do_brk) -*(.text.tcp_event_data_recv) -*(.text.read_chan) -*(.text.pipe_writev) -*(.text.__emul_lookup_dentry) -*(.text.rtc_get_rtc_time) -*(.text.print_objinfo) -*(.text.file_update_time) -*(.text.do_signal) -*(.text.disable_8259A_irq) -*(.text.blk_queue_bounce) -*(.text.__anon_vma_link) -*(.text.__vma_link) -*(.text.vfs_rename) -*(.text.sys_newlstat) -*(.text.sys_newfstat) -*(.text.sys_mknod) -*(.text.__show_regs) -*(.text.iput) -*(.text.get_signal_to_deliver) -*(.text.flush_tlb_page) -*(.text.debug_mutex_wake_waiter) -*(.text.copy_thread) -*(.text.clear_page_dirty_for_io) -*(.text.buffer_io_error) -*(.text.vfs_permission) -*(.text.truncate_inode_pages_range) -*(.text.sys_recvfrom) -*(.text.remove_suid) -*(.text.mark_buffer_dirty) -*(.text.local_bh_enable) -*(.text.get_zeroed_page) -*(.text.get_vmalloc_info) -*(.text.flush_old_exec) -*(.text.dummy_inode_permission) -*(.text.__bio_add_page) -*(.text.prio_tree_replace) -*(.text.notify_change) -*(.text.mntput_no_expire) -*(.text.fput) -*(.text.__end_that_request_first) -*(.text.wake_up_bit) -*(.text.unuse_mm) -*(.text.shrink_icache_memory) -*(.text.sched_balance_self) -*(.text.__pmd_alloc) -*(.text.pipe_poll) -*(.text.normal_poll) -*(.text.__free_pages) -*(.text.follow_mount) -*(.text.cdrom_start_packet_command) -*(.text.blk_recount_segments) -*(.text.bio_put) -*(.text.__alloc_skb) -*(.text.__wake_up) -*(.text.vm_stat_account) -*(.text.sys_fcntl) -*(.text.sys_fadvise64) -*(.text._raw_write_unlock) -*(.text.__pud_alloc) -*(.text.alloc_page_buffers) -*(.text.vfs_llseek) -*(.text.sockfd_lookup) -*(.text._raw_write_lock) -*(.text.put_compound_page) -*(.text.prune_dcache) -*(.text.pipe_readv) -*(.text.mempool_free) -*(.text.make_ahead_window) -*(.text.lru_add_drain) -*(.text.constant_test_bit) -*(.text.__clear_user) -*(.text.arch_unmap_area) -*(.text.anon_vma_link) -*(.text.sys_chroot) -*(.text.setup_arg_pages) -*(.text.radix_tree_preload) -*(.text.init_rwsem) -*(.text.generic_osync_inode) -*(.text.generic_delete_inode) -*(.text.do_sys_poll) -*(.text.dev_queue_xmit) -*(.text.default_llseek) -*(.text.__writeback_single_inode) -*(.text.vfs_ioctl) -*(.text.__up_write) -*(.text.unix_poll) -*(.text.sys_rt_sigprocmask) -*(.text.sock_recvmsg) -*(.text.recalc_bh_state) -*(.text.__put_unused_fd) -*(.text.process_backlog) -*(.text.locks_remove_posix) -*(.text.lease_modify) -*(.text.expand_files) -*(.text.end_buffer_read_nobh) -*(.text.d_splice_alias) -*(.text.debug_mutex_init_waiter) -*(.text.copy_from_user) -*(.text.cap_vm_enough_memory) -*(.text.show_vfsmnt) -*(.text.release_sock) -*(.text.pfifo_fast_enqueue) -*(.text.half_md4_transform) -*(.text.fs_may_remount_ro) -*(.text.do_fork) -*(.text.copy_hugetlb_page_range) -*(.text.cache_free_debugcheck) -*(.text.__tcp_select_window) -*(.text.task_handoff_register) -*(.text.sys_open) -*(.text.strlcpy) -*(.text.skb_copy_datagram_iovec) -*(.text.set_up_list3s) -*(.text.release_open_intent) -*(.text.qdisc_restart) -*(.text.n_tty_chars_in_buffer) -*(.text.inode_change_ok) -*(.text.__downgrade_write) -*(.text.debug_mutex_unlock) -*(.text.add_timer_randomness) -*(.text.sock_common_recvmsg) -*(.text.set_bh_page) -*(.text.printk_lock) -*(.text.path_release_on_umount) -*(.text.ip_output) -*(.text.ide_build_dmatable) -*(.text.__get_user_8) -*(.text.end_buffer_read_sync) -*(.text.__d_path) -*(.text.d_move) -*(.text.del_timer) -*(.text.constant_test_bit) -*(.text.blockable_page_cache_readahead) -*(.text.tty_read) -*(.text.sys_readlink) -*(.text.sys_faccessat) -*(.text.read_swap_cache_async) -*(.text.pty_write_room) -*(.text.page_address_in_vma) -*(.text.kthread) -*(.text.cfq_exit_io_context) -*(.text.__tcp_push_pending_frames) -*(.text.sys_pipe) -*(.text.submit_bio) -*(.text.pid_revalidate) -*(.text.page_referenced_file) -*(.text.lock_sock) -*(.text.get_page_state_node) -*(.text.generic_block_bmap) -*(.text.do_setitimer) -*(.text.dev_queue_xmit_nit) -*(.text.copy_from_read_buf) -*(.text.__const_udelay) -*(.text.console_conditional_schedule) -*(.text.wake_up_new_task) -*(.text.wait_for_completion_interruptible) -*(.text.tcp_rcv_rtt_update) -*(.text.sys_mlockall) -*(.text.set_fs_altroot) -*(.text.schedule_timeout) -*(.text.nr_free_pagecache_pages) -*(.text.nf_iterate) -*(.text.mapping_tagged) -*(.text.ip_queue_xmit) -*(.text.ip_local_deliver) -*(.text.follow_page) -*(.text.elf_map) -*(.text.dummy_file_permission) -*(.text.dispose_list) -*(.text.dentry_open) -*(.text.dentry_iput) -*(.text.bio_alloc) -*(.text.wait_on_page_bit) -*(.text.vfs_readdir) -*(.text.vfs_lstat) -*(.text.seq_escape) -*(.text.__posix_lock_file) -*(.text.mm_release) -*(.text.kref_put) -*(.text.ip_rcv) -*(.text.__iget) -*(.text.free_pages) -*(.text.find_mergeable_anon_vma) -*(.text.find_extend_vma) -*(.text.dummy_inode_listsecurity) -*(.text.bio_add_page) -*(.text.__vm_enough_memory) -*(.text.vfs_stat) -*(.text.tty_paranoia_check) -*(.text.tcp_read_sock) -*(.text.tcp_data_queue) -*(.text.sys_uname) -*(.text.sys_renameat) -*(.text.__strncpy_from_user) -*(.text.__mutex_init) -*(.text.__lookup_hash) -*(.text.kref_get) -*(.text.ip_route_input) -*(.text.__insert_inode_hash) -*(.text.do_sock_write) -*(.text.blk_done_softirq) -*(.text.__wake_up_sync) -*(.text.__vma_link_rb) -*(.text.tty_ioctl) -*(.text.tracesys) -*(.text.sys_getdents) -*(.text.sys_dup) -*(.text.stub_execve) -*(.text.sha_transform) -*(.text.radix_tree_tag_clear) -*(.text.put_unused_fd) -*(.text.put_files_struct) -*(.text.mpage_readpages) -*(.text.may_delete) -*(.text.kmem_cache_create) -*(.text.ip_mc_output) -*(.text.interleave_nodes) -*(.text.groups_search) -*(.text.generic_drop_inode) -*(.text.generic_commit_write) -*(.text.fcntl_setlk) -*(.text.exit_mmap) -*(.text.end_page_writeback) -*(.text.__d_rehash) -*(.text.debug_mutex_free_waiter) -*(.text.csum_ipv6_magic) -*(.text.count) -*(.text.cleanup_rbuf) -*(.text.check_spinlock_acquired_node) -*(.text.can_vma_merge_after) -*(.text.bio_endio) -*(.text.alloc_pidmap) -*(.text.write_ldt) -*(.text.vmtruncate_range) -*(.text.vfs_create) -*(.text.__user_walk) -*(.text.update_send_head) -*(.text.unmap_underlying_metadata) -*(.text.tty_ldisc_deref) -*(.text.tcp_setsockopt) -*(.text.tcp_send_ack) -*(.text.sys_pause) -*(.text.sys_gettimeofday) -*(.text.sync_dirty_buffer) -*(.text.strncmp) -*(.text.release_posix_timer) -*(.text.proc_file_read) -*(.text.prepare_to_wait) -*(.text.locks_mandatory_locked) -*(.text.interruptible_sleep_on_timeout) -*(.text.inode_sub_bytes) -*(.text.in_group_p) -*(.text.hrtimer_try_to_cancel) -*(.text.filldir64) -*(.text.fasync_helper) -*(.text.dummy_sb_pivotroot) -*(.text.d_lookup) -*(.text.d_instantiate) -*(.text.__d_find_alias) -*(.text.cpu_idle_wait) -*(.text.cond_resched_lock) -*(.text.chown_common) -*(.text.blk_congestion_wait) -*(.text.activate_page) -*(.text.unlock_buffer) -*(.text.tty_wakeup) -*(.text.tcp_v4_do_rcv) -*(.text.tcp_current_mss) -*(.text.sys_openat) -*(.text.sys_fchdir) -*(.text.strnlen_user) -*(.text.strnlen) -*(.text.strchr) -*(.text.sock_common_getsockopt) -*(.text.skb_checksum) -*(.text.remove_wait_queue) -*(.text.rb_replace_node) -*(.text.radix_tree_node_ctor) -*(.text.pty_chars_in_buffer) -*(.text.profile_hit) -*(.text.prio_tree_left) -*(.text.pgd_clear_bad) -*(.text.pfifo_fast_dequeue) -*(.text.page_referenced) -*(.text.open_exec) -*(.text.mmput) -*(.text.mm_init) -*(.text.__ide_dma_off_quietly) -*(.text.ide_dma_intr) -*(.text.hrtimer_start) -*(.text.get_io_context) -*(.text.__get_free_pages) -*(.text.find_first_zero_bit) -*(.text.file_free_rcu) -*(.text.dummy_socket_sendmsg) -*(.text.do_unlinkat) -*(.text.do_arch_prctl) -*(.text.destroy_inode) -*(.text.can_vma_merge_before) -*(.text.block_sync_page) -*(.text.block_prepare_write) -*(.text.bio_init) -*(.text.arch_ptrace) -*(.text.wake_up_inode) -*(.text.wait_on_retry_sync_kiocb) -*(.text.vma_prio_tree_next) -*(.text.tcp_rcv_space_adjust) -*(.text.__tcp_ack_snd_check) -*(.text.sys_utime) -*(.text.sys_recvmsg) -*(.text.sys_mremap) -*(.text.sys_bdflush) -*(.text.sleep_on) -*(.text.set_page_dirty_lock) -*(.text.seq_path) -*(.text.schedule_timeout_interruptible) -*(.text.sched_fork) -*(.text.rt_run_flush) -*(.text.profile_munmap) -*(.text.prepare_binprm) -*(.text.__pagevec_release_nonlru) -*(.text.m_show) -*(.text.lookup_mnt) -*(.text.__lookup_mnt) -*(.text.lock_timer_base) -*(.text.is_subdir) -*(.text.invalidate_bh_lru) -*(.text.init_buffer_head) -*(.text.ifind_fast) -*(.text.ide_dma_start) -*(.text.__get_page_state) -*(.text.flock_to_posix_lock) -*(.text.__find_symbol) -*(.text.do_futex) -*(.text.do_execve) -*(.text.dirty_writeback_centisecs_handler) -*(.text.dev_watchdog) -*(.text.can_share_swap_page) -*(.text.blkdev_put) -*(.text.bio_get_nr_vecs) -*(.text.xfrm_compile_policy) -*(.text.vma_prio_tree_insert) -*(.text.vfs_lstat_fd) -*(.text.__user_path_lookup_open) -*(.text.thread_return) -*(.text.tcp_send_delayed_ack) -*(.text.sock_def_error_report) -*(.text.shrink_slab) -*(.text.serial_out) -*(.text.seq_read) -*(.text.secure_ip_id) -*(.text.search_binary_handler) -*(.text.proc_pid_unhash) -*(.text.pagevec_lookup) -*(.text.new_inode) -*(.text.memcpy_toiovec) -*(.text.locks_free_lock) -*(.text.__lock_page) -*(.text.__lock_buffer) -*(.text.load_module) -*(.text.is_bad_inode) -*(.text.invalidate_inode_buffers) -*(.text.insert_vm_struct) -*(.text.inode_setattr) -*(.text.inode_add_bytes) -*(.text.ide_read_24) -*(.text.ide_get_error_location) -*(.text.ide_do_drive_cmd) -*(.text.get_locked_pte) -*(.text.get_filesystem_list) -*(.text.generic_file_open) -*(.text.follow_down) -*(.text.find_next_bit) -*(.text.__find_first_bit) -*(.text.exit_mm) -*(.text.exec_keys) -*(.text.end_buffer_write_sync) -*(.text.end_bio_bh_io_sync) -*(.text.dummy_socket_shutdown) -*(.text.d_rehash) -*(.text.d_path) -*(.text.do_ioctl) -*(.text.dget_locked) -*(.text.copy_thread_group_keys) -*(.text.cdrom_end_request) -*(.text.cap_bprm_apply_creds) -*(.text.blk_rq_bio_prep) -*(.text.__bitmap_intersects) -*(.text.bio_phys_segments) -*(.text.bio_free) -*(.text.arch_get_unmapped_area_topdown) -*(.text.writeback_in_progress) -*(.text.vfs_follow_link) -*(.text.tcp_rcv_state_process) -*(.text.tcp_check_space) -*(.text.sys_stat) -*(.text.sys_rt_sigreturn) -*(.text.sys_rt_sigaction) -*(.text.sys_remap_file_pages) -*(.text.sys_pwrite64) -*(.text.sys_fchownat) -*(.text.sys_fchmodat) -*(.text.strncat) -*(.text.strlcat) -*(.text.strcmp) -*(.text.steal_locks) -*(.text.sock_create) -*(.text.sk_stream_rfree) -*(.text.sk_stream_mem_schedule) -*(.text.skip_atoi) -*(.text.sk_alloc) -*(.text.show_stat) -*(.text.set_fs_pwd) -*(.text.set_binfmt) -*(.text.pty_unthrottle) -*(.text.proc_symlink) -*(.text.pipe_release) -*(.text.pageout) -*(.text.n_tty_write_wakeup) -*(.text.n_tty_ioctl) -*(.text.nr_free_zone_pages) -*(.text.migration_thread) -*(.text.mempool_free_slab) -*(.text.meminfo_read_proc) -*(.text.max_sane_readahead) -*(.text.lru_cache_add) -*(.text.kill_fasync) -*(.text.kernel_read) -*(.text.invalidate_mapping_pages) -*(.text.inode_has_buffers) -*(.text.init_once) -*(.text.inet_sendmsg) -*(.text.idedisk_issue_flush) -*(.text.generic_file_write) -*(.text.free_more_memory) -*(.text.__free_fdtable) -*(.text.filp_dtor) -*(.text.exit_sem) -*(.text.exit_itimers) -*(.text.error_interrupt) -*(.text.end_buffer_async_write) -*(.text.eligible_child) -*(.text.elf_map) -*(.text.dump_task_regs) -*(.text.dummy_task_setscheduler) -*(.text.dummy_socket_accept) -*(.text.dummy_file_free_security) -*(.text.__down_read) -*(.text.do_sock_read) -*(.text.do_sigaltstack) -*(.text.do_mremap) -*(.text.current_io_context) -*(.text.cpu_swap_callback) -*(.text.copy_vma) -*(.text.cap_bprm_set_security) -*(.text.blk_insert_request) -*(.text.bio_map_kern_endio) -*(.text.bio_hw_segments) -*(.text.bictcp_cong_avoid) -*(.text.add_interrupt_randomness) -*(.text.wait_for_completion) -*(.text.version_read_proc) -*(.text.unix_write_space) -*(.text.tty_ldisc_ref_wait) -*(.text.tty_ldisc_put) -*(.text.try_to_wake_up) -*(.text.tcp_v4_tw_remember_stamp) -*(.text.tcp_try_undo_dsack) -*(.text.tcp_may_send_now) -*(.text.sys_waitid) -*(.text.sys_sched_getparam) -*(.text.sys_getppid) -*(.text.sys_getcwd) -*(.text.sys_dup2) -*(.text.sys_chmod) -*(.text.sys_chdir) -*(.text.sprintf) -*(.text.sock_wfree) -*(.text.sock_aio_write) -*(.text.skb_drop_fraglist) -*(.text.skb_dequeue) -*(.text.set_close_on_exec) -*(.text.set_brk) -*(.text.seq_puts) -*(.text.SELECT_DRIVE) -*(.text.sched_exec) -*(.text.return_EIO) -*(.text.remove_from_page_cache) -*(.text.rcu_start_batch) -*(.text.__put_task_struct) -*(.text.proc_pid_readdir) -*(.text.proc_get_inode) -*(.text.prepare_to_wait_exclusive) -*(.text.pipe_wait) -*(.text.pipe_new) -*(.text.pdflush_operation) -*(.text.__pagevec_release) -*(.text.pagevec_lookup_tag) -*(.text.packet_rcv) -*(.text.n_tty_set_room) -*(.text.nr_free_pages) -*(.text.__net_timestamp) -*(.text.mpage_end_io_read) -*(.text.mod_timer) -*(.text.__memcpy) -*(.text.mb_cache_shrink_fn) -*(.text.lock_rename) -*(.text.kstrdup) -*(.text.is_ignored) -*(.text.int_very_careful) -*(.text.inotify_inode_is_dead) -*(.text.inotify_get_cookie) -*(.text.inode_get_bytes) -*(.text.init_timer) -*(.text.init_dev) -*(.text.inet_getname) -*(.text.ide_map_sg) -*(.text.__ide_dma_end) -*(.text.hrtimer_get_remaining) -*(.text.get_task_mm) -*(.text.get_random_int) -*(.text.free_pipe_info) -*(.text.filemap_write_and_wait_range) -*(.text.exit_thread) -*(.text.enter_idle) -*(.text.end_that_request_first) -*(.text.end_8259A_irq) -*(.text.dummy_file_alloc_security) -*(.text.do_group_exit) -*(.text.debug_mutex_init) -*(.text.cpuset_exit) -*(.text.cpu_idle) -*(.text.copy_semundo) -*(.text.copy_files) -*(.text.chrdev_open) -*(.text.cdrom_transfer_packet_command) -*(.text.cdrom_mode_sense) -*(.text.blk_phys_contig_segment) -*(.text.blk_get_queue) -*(.text.bio_split) -*(.text.audit_alloc) -*(.text.anon_pipe_buf_release) -*(.text.add_wait_queue_exclusive) -*(.text.add_wait_queue) -*(.text.acct_process) -*(.text.account) -*(.text.zeromap_page_range) -*(.text.yield) -*(.text.writeback_acquire) -*(.text.worker_thread) -*(.text.wait_on_page_writeback_range) -*(.text.__wait_on_buffer) -*(.text.vscnprintf) -*(.text.vmalloc_to_pfn) -*(.text.vgacon_save_screen) -*(.text.vfs_unlink) -*(.text.vfs_rmdir) -*(.text.unregister_md_personality) -*(.text.unlock_new_inode) -*(.text.unix_stream_sendmsg) -*(.text.unix_stream_recvmsg) -*(.text.unhash_process) -*(.text.udp_v4_lookup_longway) -*(.text.tty_ldisc_flush) -*(.text.tty_ldisc_enable) -*(.text.tty_hung_up_p) -*(.text.tty_buffer_free_all) -*(.text.tso_fragment) -*(.text.try_to_del_timer_sync) -*(.text.tcp_v4_err) -*(.text.tcp_unhash) -*(.text.tcp_seq_next) -*(.text.tcp_select_initial_window) -*(.text.tcp_sacktag_write_queue) -*(.text.tcp_cwnd_validate) -*(.text.sys_vhangup) -*(.text.sys_uselib) -*(.text.sys_symlink) -*(.text.sys_signal) -*(.text.sys_poll) -*(.text.sys_mount) -*(.text.sys_kill) -*(.text.sys_ioctl) -*(.text.sys_inotify_add_watch) -*(.text.sys_getuid) -*(.text.sys_getrlimit) -*(.text.sys_getitimer) -*(.text.sys_getgroups) -*(.text.sys_ftruncate) -*(.text.sysfs_lookup) -*(.text.sys_exit_group) -*(.text.stub_fork) -*(.text.sscanf) -*(.text.sock_map_fd) -*(.text.sock_get_timestamp) -*(.text.__sock_create) -*(.text.smp_call_function_single) -*(.text.sk_stop_timer) -*(.text.skb_copy_and_csum_datagram) -*(.text.__skb_checksum_complete) -*(.text.single_next) -*(.text.sigqueue_alloc) -*(.text.shrink_dcache_parent) -*(.text.select_idle_routine) -*(.text.run_workqueue) -*(.text.run_local_timers) -*(.text.remove_inode_hash) -*(.text.remove_dquot_ref) -*(.text.register_binfmt) -*(.text.read_cache_pages) -*(.text.rb_last) -*(.text.pty_open) -*(.text.proc_root_readdir) -*(.text.proc_pid_flush) -*(.text.proc_pident_lookup) -*(.text.proc_fill_super) -*(.text.proc_exe_link) -*(.text.posix_locks_deadlock) -*(.text.pipe_iov_copy_from_user) -*(.text.opost) -*(.text.nf_register_hook) -*(.text.netif_rx_ni) -*(.text.m_start) -*(.text.mpage_writepage) -*(.text.mm_alloc) -*(.text.memory_open) -*(.text.mark_buffer_async_write) -*(.text.lru_add_drain_all) -*(.text.locks_init_lock) -*(.text.locks_delete_lock) -*(.text.lock_hrtimer_base) -*(.text.load_script) -*(.text.__kill_fasync) -*(.text.ip_mc_sf_allow) -*(.text.__ioremap) -*(.text.int_with_check) -*(.text.int_sqrt) -*(.text.install_thread_keyring) -*(.text.init_page_buffers) -*(.text.inet_sock_destruct) -*(.text.idle_notifier_register) -*(.text.ide_execute_command) -*(.text.ide_end_drive_cmd) -*(.text.__ide_dma_host_on) -*(.text.hrtimer_run_queues) -*(.text.hpet_mask_rtc_irq_bit) -*(.text.__get_zone_counts) -*(.text.get_zone_counts) -*(.text.get_write_access) -*(.text.get_fs_struct) -*(.text.get_dirty_limits) -*(.text.generic_readlink) -*(.text.free_hot_page) -*(.text.finish_wait) -*(.text.find_inode) -*(.text.find_first_bit) -*(.text.__filemap_fdatawrite_range) -*(.text.__filemap_copy_from_user_iovec) -*(.text.exit_aio) -*(.text.elv_set_request) -*(.text.elv_former_request) -*(.text.dup_namespace) -*(.text.dupfd) -*(.text.dummy_socket_getsockopt) -*(.text.dummy_sb_post_mountroot) -*(.text.dummy_quotactl) -*(.text.dummy_inode_rename) -*(.text.__do_SAK) -*(.text.do_pipe) -*(.text.do_fsync) -*(.text.d_instantiate_unique) -*(.text.d_find_alias) -*(.text.deny_write_access) -*(.text.dentry_unhash) -*(.text.d_delete) -*(.text.datagram_poll) -*(.text.cpuset_fork) -*(.text.cpuid_read) -*(.text.copy_namespace) -*(.text.cond_resched) -*(.text.check_version) -*(.text.__change_page_attr) -*(.text.cfq_slab_kill) -*(.text.cfq_completed_request) -*(.text.cdrom_pc_intr) -*(.text.cdrom_decode_status) -*(.text.cap_capset_check) -*(.text.blk_put_request) -*(.text.bio_fs_destructor) -*(.text.bictcp_min_cwnd) -*(.text.alloc_chrdev_region) -*(.text.add_element) -*(.text.acct_update_integrals) -*(.text.write_boundary_block) -*(.text.writeback_release) -*(.text.writeback_inodes) -*(.text.wake_up_state) -*(.text.__wake_up_locked) -*(.text.wake_futex) -*(.text.wait_task_inactive) -*(.text.__wait_on_freeing_inode) -*(.text.wait_noreap_copyout) -*(.text.vmstat_start) -*(.text.vgacon_do_font_op) -*(.text.vfs_readv) -*(.text.vfs_quota_sync) -*(.text.update_queue) -*(.text.unshare_files) -*(.text.unmap_vm_area) -*(.text.unix_socketpair) -*(.text.unix_release_sock) -*(.text.unix_detach_fds) -*(.text.unix_create1) -*(.text.unix_bind) -*(.text.udp_sendmsg) -*(.text.udp_rcv) -*(.text.udp_queue_rcv_skb) -*(.text.uart_write) -*(.text.uart_startup) -*(.text.uart_open) -*(.text.tty_vhangup) -*(.text.tty_termios_baud_rate) -*(.text.tty_release) -*(.text.tty_ldisc_ref) -*(.text.throttle_vm_writeout) -*(.text.058) -*(.text.tcp_xmit_probe_skb) -*(.text.tcp_v4_send_check) -*(.text.tcp_v4_destroy_sock) -*(.text.tcp_sync_mss) -*(.text.tcp_snd_test) -*(.text.tcp_slow_start) -*(.text.tcp_send_fin) -*(.text.tcp_rtt_estimator) -*(.text.tcp_parse_options) -*(.text.tcp_ioctl) -*(.text.tcp_init_tso_segs) -*(.text.tcp_init_cwnd) -*(.text.tcp_getsockopt) -*(.text.tcp_fin) -*(.text.tcp_connect) -*(.text.tcp_cong_avoid) -*(.text.__tcp_checksum_complete_user) -*(.text.task_dumpable) -*(.text.sys_wait4) -*(.text.sys_utimes) -*(.text.sys_symlinkat) -*(.text.sys_socketpair) -*(.text.sys_rmdir) -*(.text.sys_readahead) -*(.text.sys_nanosleep) -*(.text.sys_linkat) -*(.text.sys_fstat) -*(.text.sysfs_readdir) -*(.text.sys_execve) -*(.text.sysenter_tracesys) -*(.text.sys_chown) -*(.text.stub_clone) -*(.text.strrchr) -*(.text.strncpy) -*(.text.stopmachine_set_state) -*(.text.sock_sendmsg) -*(.text.sock_release) -*(.text.sock_fasync) -*(.text.sock_close) -*(.text.sk_stream_write_space) -*(.text.sk_reset_timer) -*(.text.skb_split) -*(.text.skb_recv_datagram) -*(.text.skb_queue_tail) -*(.text.sk_attach_filter) -*(.text.si_swapinfo) -*(.text.simple_strtoll) -*(.text.set_termios) -*(.text.set_task_comm) -*(.text.set_shrinker) -*(.text.set_normalized_timespec) -*(.text.set_brk) -*(.text.serial_in) -*(.text.seq_printf) -*(.text.secure_dccp_sequence_number) -*(.text.rwlock_bug) -*(.text.rt_hash_code) -*(.text.__rta_fill) -*(.text.__request_resource) -*(.text.relocate_new_kernel) -*(.text.release_thread) -*(.text.release_mem) -*(.text.rb_prev) -*(.text.rb_first) -*(.text.random_poll) -*(.text.__put_super_and_need_restart) -*(.text.pty_write) -*(.text.ptrace_stop) -*(.text.proc_self_readlink) -*(.text.proc_root_lookup) -*(.text.proc_root_link) -*(.text.proc_pid_make_inode) -*(.text.proc_pid_attr_write) -*(.text.proc_lookupfd) -*(.text.proc_delete_inode) -*(.text.posix_same_owner) -*(.text.posix_block_lock) -*(.text.poll_initwait) -*(.text.pipe_write) -*(.text.pipe_read_fasync) -*(.text.pipe_ioctl) -*(.text.pdflush) -*(.text.pci_user_read_config_dword) -*(.text.page_readlink) -*(.text.null_lseek) -*(.text.nf_hook_slow) -*(.text.netlink_sock_destruct) -*(.text.netlink_broadcast) -*(.text.neigh_resolve_output) -*(.text.name_to_int) -*(.text.mwait_idle) -*(.text.mutex_trylock) -*(.text.mutex_debug_check_no_locks_held) -*(.text.m_stop) -*(.text.mpage_end_io_write) -*(.text.mpage_alloc) -*(.text.move_page_tables) -*(.text.mounts_open) -*(.text.__memset) -*(.text.memcpy_fromiovec) -*(.text.make_8259A_irq) -*(.text.lookup_user_key_possessed) -*(.text.lookup_create) -*(.text.locks_insert_lock) -*(.text.locks_alloc_lock) -*(.text.kthread_should_stop) -*(.text.kswapd) -*(.text.kobject_uevent) -*(.text.kobject_get_path) -*(.text.kobject_get) -*(.text.klist_children_put) -*(.text.__ip_route_output_key) -*(.text.ip_flush_pending_frames) -*(.text.ip_compute_csum) -*(.text.ip_append_data) -*(.text.ioc_set_batching) -*(.text.invalidate_inode_pages) -*(.text.__invalidate_device) -*(.text.install_arg_page) -*(.text.in_sched_functions) -*(.text.inotify_unmount_inodes) -*(.text.init_once) -*(.text.init_cdrom_command) -*(.text.inet_stream_connect) -*(.text.inet_sk_rebuild_header) -*(.text.inet_csk_addr2sockaddr) -*(.text.inet_create) -*(.text.ifind) -*(.text.ide_setup_dma) -*(.text.ide_outsw) -*(.text.ide_fixstring) -*(.text.ide_dma_setup) -*(.text.ide_cdrom_packet) -*(.text.ide_cd_put) -*(.text.ide_build_sglist) -*(.text.i8259A_shutdown) -*(.text.hung_up_tty_ioctl) -*(.text.hrtimer_nanosleep) -*(.text.hrtimer_init) -*(.text.hrtimer_cancel) -*(.text.hash_futex) -*(.text.group_send_sig_info) -*(.text.grab_cache_page_nowait) -*(.text.get_wchan) -*(.text.get_stack) -*(.text.get_page_state) -*(.text.getnstimeofday) -*(.text.get_node) -*(.text.get_kprobe) -*(.text.generic_unplug_device) -*(.text.free_task) -*(.text.frag_show) -*(.text.find_next_zero_string) -*(.text.filp_open) -*(.text.fillonedir) -*(.text.exit_io_context) -*(.text.exit_idle) -*(.text.exact_lock) -*(.text.eth_header) -*(.text.dummy_unregister_security) -*(.text.dummy_socket_post_create) -*(.text.dummy_socket_listen) -*(.text.dummy_quota_on) -*(.text.dummy_inode_follow_link) -*(.text.dummy_file_receive) -*(.text.dummy_file_mprotect) -*(.text.dummy_file_lock) -*(.text.dummy_file_ioctl) -*(.text.dummy_bprm_post_apply_creds) -*(.text.do_writepages) -*(.text.__down_interruptible) -*(.text.do_notify_resume) -*(.text.do_acct_process) -*(.text.del_timer_sync) -*(.text.default_rebuild_header) -*(.text.d_callback) -*(.text.dcache_readdir) -*(.text.ctrl_dumpfamily) -*(.text.cpuset_rmdir) -*(.text.copy_strings_kernel) -*(.text.con_write_room) -*(.text.complete_all) -*(.text.collect_sigign_sigcatch) -*(.text.clear_user) -*(.text.check_unthrottle) -*(.text.cdrom_release) -*(.text.cdrom_newpc_intr) -*(.text.cdrom_ioctl) -*(.text.cdrom_check_status) -*(.text.cdev_put) -*(.text.cdev_add) -*(.text.cap_ptrace) -*(.text.cap_bprm_secureexec) -*(.text.cache_alloc_refill) -*(.text.bmap) -*(.text.blk_run_queue) -*(.text.blk_queue_dma_alignment) -*(.text.blk_ordered_req_seq) -*(.text.blk_backing_dev_unplug) -*(.text.__bitmap_subset) -*(.text.__bitmap_and) -*(.text.bio_unmap_user) -*(.text.__bforget) -*(.text.bd_forget) -*(.text.bad_pipe_w) -*(.text.bad_get_user) -*(.text.audit_free) -*(.text.anon_vma_ctor) -*(.text.anon_pipe_buf_map) -*(.text.alloc_sock_iocb) -*(.text.alloc_fdset) -*(.text.aio_kick_handler) -*(.text.__add_entropy_words) -*(.text.add_disk_randomness) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 0b3603adf56..47496a40e84 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -11,120 +11,54 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> +#include <linux/module.h> #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/init.h> -#include <linux/module.h> #include <asm/smp.h> #include <asm/ipi.h> +#include <asm/genapic.h> -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -struct genapic *genapic = &apic_flat; -struct genapic *genapic_force; +struct genapic __read_mostly *genapic = &apic_flat; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ -void __init clustered_apic_check(void) +void __init setup_apic_routing(void) { - long i; - u8 clusters, max_cluster; - u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int max_apic = 0; - - /* genapic selection can be forced because of certain quirks. - */ - if (genapic_force) { - genapic = genapic_force; - goto print; - } - -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { - genapic = &apic_cluster; - goto print; - } -#endif - - memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - if (id > max_apic) - max_apic = id; - cluster_cnt[APIC_CLUSTERID(id)]++; - } - - /* Don't use clustered mode on AMD platforms. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) genapic = &apic_physflat; -#ifndef CONFIG_HOTPLUG_CPU - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (max_apic <= 8) - genapic = &apic_flat; + else #endif - goto print; - } - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } - - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { -#ifdef CONFIG_HOTPLUG_CPU - /* Don't use APIC shortcuts in CPU hotplug to avoid races */ - genapic = &apic_physflat; -#else + if (cpus_weight(cpu_possible_map) <= 8) genapic = &apic_flat; -#endif - } else - genapic = &apic_cluster; + else + genapic = &apic_physflat; -print: printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } -/* Same for both flat and clustered. */ +/* Same for both flat and physical. */ void send_IPI_self(int vector) { diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c deleted file mode 100644 index 73d76308b95..00000000000 --- a/arch/x86_64/kernel/genapic_cluster.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. - * (A more realistic maximum is around 230 CPUs.) - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> -#include <asm/smp.h> -#include <asm/ipi.h> - - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void cluster_init_apic_ldr(void) -{ - unsigned long val, id; - long i, count; - u8 lid; - u8 my_id = hard_smp_processor_id(); - u8 my_cluster = APIC_CLUSTER(my_id); - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = NR_CPUS; --i >= 0; ) { - lid = x86_cpu_to_log_apicid[i]; - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } - /* - * We only have a 4 wide bitmap in cluster mode. There's no way - * to get above 60 CPUs and still give each one it's own bit. - * But, we're using physical IRQ delivery, so we don't care. - * Use bit 3 for the 4th through Nth CPU in each cluster. - */ - if (count >= XAPIC_DEST_CPUS_SHIFT) - count = 3; - id = my_cluster | (1UL << count); - x86_cpu_to_log_apicid[smp_processor_id()] = id; - apic_write(APIC_DFR, APIC_DFR_CLUSTER); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - -static cpumask_t cluster_target_cpus(void) -{ - return cpumask_of_cpu(0); -} - -static cpumask_t cluster_vector_allocation_domain(int cpu) -{ - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; -} - -static void cluster_send_IPI_mask(cpumask_t mask, int vector) -{ - send_IPI_mask_sequence(mask, vector); -} - -static void cluster_send_IPI_allbutself(int vector) -{ - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - cluster_send_IPI_mask(mask, vector); -} - -static void cluster_send_IPI_all(int vector) -{ - cluster_send_IPI_mask(cpu_online_map, vector); -} - -static int cluster_apic_id_registered(void) -{ - return 1; -} - -static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -/* cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static unsigned int phys_pkg_id(int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -struct genapic apic_cluster = { - .name = "clustered", - .int_delivery_mode = dest_Fixed, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .target_cpus = cluster_target_cpus, - .vector_allocation_domain = cluster_vector_allocation_domain, - .apic_id_registered = cluster_apic_id_registered, - .init_apic_ldr = cluster_init_apic_ldr, - .send_IPI_all = cluster_send_IPI_all, - .send_IPI_allbutself = cluster_send_IPI_allbutself, - .send_IPI_mask = cluster_send_IPI_mask, - .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 7c01db8fa9d..ecb01eefdd2 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -8,6 +8,7 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ +#include <linux/errno.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -16,6 +17,7 @@ #include <linux/init.h> #include <asm/smp.h> #include <asm/ipi.h> +#include <asm/genapic.h> static cpumask_t flat_target_cpus(void) { @@ -60,31 +62,10 @@ static void flat_init_apic_ldr(void) static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long cfg; unsigned long flags; local_irq_save(flags); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); + __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 598a4d0351f..1fab487dee8 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,6 +5,7 @@ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> + * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> */ @@ -13,97 +14,131 @@ #include <linux/init.h> #include <asm/desc.h> #include <asm/segment.h> +#include <asm/pgtable.h> #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> - + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE - * because we need identity-mapped pages on setup so define __START_KERNEL to - * 0x100000 for this stage - * + * because we need identity-mapped pages. + * */ .text .section .bootstrap.text - .code32 - .globl startup_32 -/* %bx: 1 if coming from smp trampoline on secondary cpu */ -startup_32: - + .code64 + .globl startup_64 +startup_64: + /* - * At this point the CPU runs in 32bit protected mode (CS.D = 1) with - * paging disabled and the point of this file is to switch to 64bit - * long mode with a kernel mapping for kerneland to jump into the - * kernel virtual addresses. - * There is no stack until we set one up. + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86_64/boot/compressed/head.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. */ - /* Initialize the %ds segment register */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - - /* Load new GDT with the 64bit segments using 32bit descriptor */ - lgdt pGDT32 - __START_KERNEL_map - - /* If the CPU doesn't support CPUID this will double fault. - * Unfortunately it is hard to check for CPUID without a stack. + /* Compute the delta between the address I am compiled to run at and the + * address I am actually running at. */ - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe no_long_mode - /* Check if long mode is implemented */ - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc no_long_mode - - /* - * Prepare for entering 64bits mode + leaq _text(%rip), %rbp + subq $_text - __START_KERNEL_map, %rbp + + /* Is the address not 2M aligned? */ + movq %rbp, %rax + andl $~LARGE_PAGE_MASK, %eax + testl %eax, %eax + jnz bad_address + + /* Is the address too large? */ + leaq _text(%rip), %rdx + movq $PGDIR_SIZE, %rax + cmpq %rax, %rdx + jae bad_address + + /* Fixup the physical addresses in the page table */ + addq %rbp, init_level4_pgt + 0(%rip) + addq %rbp, init_level4_pgt + (258*8)(%rip) + addq %rbp, init_level4_pgt + (511*8)(%rip) + + addq %rbp, level3_ident_pgt + 0(%rip) + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + + /* Add an Identity mapping if I am above 1G */ + leaq _text(%rip), %rdi + andq $LARGE_PAGE_MASK, %rdi + + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andq $(PTRS_PER_PUD - 1), %rax + jz ident_complete + + leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx + leaq level3_ident_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) + + movq %rdi, %rax + shrq $PMD_SHIFT, %rax + andq $(PTRS_PER_PMD - 1), %rax + leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq level2_spare_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) +ident_complete: + + /* Fixup the kernel text+data virtual addresses + */ + leaq level2_kernel_pgt(%rip), %rdi + leaq 4096(%rdi), %r8 + /* See if it is a valid page table entry */ +1: testq $1, 0(%rdi) + jz 2f + addq %rbp, 0(%rdi) + /* Go to the next page */ +2: addq $8, %rdi + cmp %r8, %rdi + jne 1b + + /* Fixup phys_base */ + addq %rbp, phys_base(%rip) - /* Enable PAE mode */ - xorl %eax, %eax - btsl $5, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - movl $(boot_level4_pgt - __START_KERNEL_map), %eax - movl %eax, %cr3 - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - - /* Enable Long Mode */ - btsl $_EFER_LME, %eax - - /* Make changes effective */ - wrmsr +#ifdef CONFIG_SMP + addq %rbp, trampoline_level4_pgt + 0(%rip) + addq %rbp, trampoline_level4_pgt + (511*8)(%rip) +#endif +#ifdef CONFIG_ACPI_SLEEP + addq %rbp, wakeup_level4_pgt + 0(%rip) + addq %rbp, wakeup_level4_pgt + (511*8)(%rip) +#endif - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - /* Make changes effective */ - movl %eax, %cr0 - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + /* Due to ENTRY(), sometimes the empty space gets filled with + * zeros. Better take a jmp than relying on empty space being + * filled with 0x90 (nop) */ - ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) - - .code64 - .org 0x100 - .globl startup_64 -startup_64: - /* We come here either from startup_32 - * or directly from a 64bit bootloader. - * Since we may have come directly from a bootloader we - * reload the page tables here. + jmp secondary_startup_64 +ENTRY(secondary_startup_64) + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded a mapped page table. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. */ /* Enable PAE mode and PGE */ @@ -113,9 +148,15 @@ startup_64: movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(boot_level4_pgt - __START_KERNEL_map), %rax + movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax movq %rax, %cr3 + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + jmp *%rax +1: + /* Check if nx is implemented */ movl $0x80000001, %eax cpuid @@ -124,17 +165,11 @@ startup_64: /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr - - /* Enable System Call */ - btsl $_EFER_SCE, %eax - - /* No Execute supported? */ - btl $20,%edi + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax -1: - /* Make changes effective */ - wrmsr +1: wrmsr /* Make changes effective */ /* Setup cr0 */ #define CR0_PM 1 /* protected mode */ @@ -161,7 +196,7 @@ startup_64: * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr + lgdt cpu_gdt_descr(%rip) /* set up data segments. actually 0 would do too */ movl $__KERNEL_DS,%eax @@ -212,6 +247,9 @@ initial_code: init_rsp: .quad init_thread_union+THREAD_SIZE-8 +bad_address: + jmp bad_address + ENTRY(early_idt_handler) cmpl $2,early_recursion_flag(%rip) jz 1f @@ -240,110 +278,66 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" -.code32 -ENTRY(no_long_mode) - /* This isn't an x86-64 CPU so hang */ -1: - jmp 1b - -.org 0xf00 - .globl pGDT32 -pGDT32: - .word gdt_end-cpu_gdt_table-1 - .long cpu_gdt_table-__START_KERNEL_map - -.org 0xf10 -ljumpvector: - .long startup_64-__START_KERNEL_map - .word __KERNEL_CS +.balign PAGE_SIZE -ENTRY(stext) -ENTRY(_stext) - - $page = 0 #define NEXT_PAGE(name) \ - $page = $page + 1; \ - .org $page * 0x1000; \ - phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \ + .balign PAGE_SIZE; \ ENTRY(name) +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << 21) + (PERM) ; \ + i = i + 1 ; \ + .endr + + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ NEXT_PAGE(init_level4_pgt) - /* This gets initialized in x86_64_start_kernel */ - .fill 512,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) - .quad phys_level2_ident_pgt | 0x007 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 511,8,0 NEXT_PAGE(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad phys_level2_kernel_pgt | 0x007 + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 1,8,0 NEXT_PAGE(level2_ident_pgt) - /* 40MB for bootup. */ - i = 0 - .rept 20 - .quad i << 21 | 0x083 - i = i + 1 - .endr - /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ - .globl temp_boot_pmds -temp_boot_pmds: - .fill 492,8,0 - + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + NEXT_PAGE(level2_kernel_pgt) /* 40MB kernel mapping. The kernel code cannot be bigger than that. When you change this change KERNEL_TEXT_SIZE in page.h too. */ /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - i = 0 - .rept 20 - .quad i << 21 | 0x183 - i = i + 1 - .endr + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_TEXT_SIZE/PMD_SIZE) /* Module mapping starts here */ - .fill 492,8,0 + .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 -NEXT_PAGE(level3_physmem_pgt) - .quad phys_level2_kernel_pgt | 0x007 /* so that __va works even before pagetable_init */ - .fill 511,8,0 +NEXT_PAGE(level2_spare_pgt) + .fill 512,8,0 +#undef PMDS #undef NEXT_PAGE .data - -#ifdef CONFIG_ACPI_SLEEP - .align PAGE_SIZE -ENTRY(wakeup_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 -#endif - -#ifndef CONFIG_HOTPLUG_CPU - __INITDATA -#endif - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ - .align PAGE_SIZE -ENTRY(boot_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 - - .data - .align 16 .globl cpu_gdt_descr cpu_gdt_descr: @@ -357,6 +351,10 @@ gdt: .endr #endif +ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout @@ -370,13 +368,13 @@ gdt: ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ + .quad 0x00af9b000000ffff /* __KERNEL_CS */ + .quad 0x00cf93000000ffff /* __KERNEL_DS */ + .quad 0x00cffb000000ffff /* __USER32_CS */ + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affb000000ffff /* __USER_CS */ .quad 0x0 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5f197b0a330..213d90e0475 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -18,8 +18,16 @@ #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include <asm/sections.h> +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -29,25 +37,24 @@ static void __init clear_bss(void) } #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC_ADDR 0x20 #define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 +#define OLD_CL_OFFSET 0x22 static void __init copy_bootdata(char *real_mode_data) { - int new_data; + unsigned long new_data; char * command_line; memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { return; } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); } - command_line = (char *) ((u64)(new_data)); + command_line = __va(new_data); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } @@ -55,26 +62,30 @@ void __init x86_64_start_kernel(char * real_mode_data) { int i; + /* + * Make sure kernel is aligned to 2MB address. Catching it at compile + * time is better. Change your config file and compile the kernel + * for a 2MB aligned address (CONFIG_PHYSICAL_START) + */ + BUILD_BUG_ON(CONFIG_PHYSICAL_START & (__KERNEL_ALIGN - 1)); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + /* Make NULL pointers segfault */ + zap_identity_mappings(); + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); early_printk("Kernel alive\n"); - /* - * switch to init_level4_pgt from boot_level4_pgt - */ - memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - for (i = 0; i < NR_CPUS; i++) cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); - copy_bootdata(real_mode_data); + copy_bootdata(__va(real_mode_data)); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index b7d2b76b92d..2a2df14dab7 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -907,10 +907,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -void __init UNEXPECTED_IO_APIC(void) -{ -} - void __apicdebuginit print_IO_APIC(void) { int apic, i; @@ -946,40 +942,16 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); if (reg_01.bits.version >= 0x10) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); @@ -1407,8 +1379,7 @@ static void irq_complete_move(unsigned int irq) vector = ~get_irq_regs()->orig_rax; me = smp_processor_id(); - if ((vector == cfg->vector) && - cpu_isset(smp_processor_id(), cfg->domain)) { + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index 745b1f0f494..387d347b0e0 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -16,6 +16,7 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/thread_info.h> +#include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 0497e3bd5bf..a8bb33c1a8f 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = __pa(kexec_pgd); + page_list[PA_PGD] = __pa_symbol(&kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0); page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0); page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0); page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1); page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1); page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; page_list[PA_TABLE_PAGE] = diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8011a8e1c7d..fa267268247 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -323,10 +323,13 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) #endif /* CONFIG_X86_MCE_INTEL */ /* - * Periodic polling timer for "silent" machine check errors. + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). */ static int check_interval = 5 * 60; /* 5 minutes */ +static int next_interval; /* in jiffies */ static void mcheck_timer(struct work_struct *work); static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); @@ -339,7 +342,6 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - schedule_delayed_work(&mcheck_work, check_interval * HZ); /* * It's ok to read stale data here for notify_user and @@ -349,17 +351,30 @@ static void mcheck_timer(struct work_struct *work) * writes. */ if (notify_user && console_logged) { + static unsigned long last_print; + unsigned long now = jiffies; + + /* if we logged an MCE, reduce the polling interval */ + next_interval = max(next_interval/2, HZ/100); notify_user = 0; clear_bit(0, &console_logged); - printk(KERN_INFO "Machine check events logged\n"); + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; + printk(KERN_INFO "Machine check events logged\n"); + } + } else { + next_interval = min(next_interval*2, check_interval*HZ); } + + schedule_delayed_work(&mcheck_work, next_interval); } static __init int periodic_mcheck_init(void) { - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); return 0; } __initcall(periodic_mcheck_init); @@ -597,12 +612,13 @@ static int mce_resume(struct sys_device *dev) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (check_interval) + if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ on_each_cpu(mce_init, NULL, 1, 1); - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); } static struct sysdev_class mce_sysclass = { diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 455aa0b932f..d0dc4891599 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } } } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index dfab9f16736..6cd2b30e2ff 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,28 +27,11 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/mce.h> -#include <asm/intel_arch_perfmon.h> int unknown_nmi_panic; int nmi_watchdog_enabled; int panic_on_unrecovered_nmi; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; /* nmi_active: @@ -63,191 +46,11 @@ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - else - return (msr - MSR_P4_BPU_PERFCTR0); - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - else - return (msr - MSR_P4_BSU_ESCR0); - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)); -} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) - __release_perfctr_nmi(cpu, msr); -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); - } - return 0; -} - /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -277,23 +80,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - unsigned int retval = hz; - - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { - retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } - return retval; -} - int __init check_nmi_watchdog (void) { int *counts; @@ -322,14 +108,14 @@ int __init check_nmi_watchdog (void) mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -344,13 +130,8 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(counts); return 0; @@ -379,57 +160,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} static void __acpi_nmi_disable(void *__unused) { @@ -515,275 +245,9 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - /* Simulator may not support it */ - if (checking_wrmsrl(evntsel_msr, 0UL)) - goto fail2; - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail2: - __release_evntsel_nmi(-1, evntsel_msr); -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - void setup_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) + if (__get_cpu_var(wd_enabled) == 1) return; /* cheap hack to support suspend/resume */ @@ -791,62 +255,31 @@ void setup_apic_nmi_watchdog(void *unused) if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - if (!setup_p4_watchdog()) - return; - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - stop_p4_watchdog(); - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -885,9 +318,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -934,55 +365,20 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog timer interrupt */ - goto done; - } - - /* only Intel uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* - * ArchPerfom/Core Duo needs to re-unmask - * the apic vector - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* ARCH_PERFMON has 32 bit counter writes */ - wrmsr(wd->perfctr_msr, - (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - } else { - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } else - printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1067,12 +463,4 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 04480c3b68f..5bd20b542c1 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -507,7 +507,7 @@ error: return ret; } -static struct dma_mapping_ops calgary_dma_ops = { +static const struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 0bae862e9a5..0a762e10f2b 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -556,7 +556,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static struct dma_mapping_ops gart_dma_ops = { +static const struct dma_mapping_ops gart_dma_ops = { .mapping_error = NULL, .map_single = gart_map_single, .map_simple = gart_map_simple, diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index df09ab05a1b..6dade0c867c 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -79,7 +79,7 @@ void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, { } -struct dma_mapping_ops nommu_dma_ops = { +const struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .unmap_single = nommu_unmap_single, .map_sg = nommu_map_sg, diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index eb18be5a656..4b4569abc60 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -12,7 +12,7 @@ int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); -struct dma_mapping_ops swiotlb_dma_ops = { +const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index d8d5ccc245c..4f21765078b 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -288,16 +288,18 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup (char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 3d98b696881..db30b5bcef6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -79,6 +79,8 @@ int bootloader_type; unsigned long saved_video_mode; +int force_mwait __cpuinitdata; + /* * Early DMI memory */ @@ -205,10 +207,10 @@ static void discover_ebda(void) * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); ebda_addr <<= 4; - ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + ebda_size = *(unsigned short *)__va(ebda_addr); /* Round EBDA up to pages */ if (ebda_size == 0) @@ -243,11 +245,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; + init_mm.pgd = __va(__pa_symbol(&init_level4_pgt)); - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; + code_resource.start = __pa_symbol(&_text); + code_resource.end = __pa_symbol(&_etext)-1; + data_resource.start = __pa_symbol(&_etext); + data_resource.end = __pa_symbol(&_edata)-1; early_identify_cpu(&boot_cpu_data); @@ -274,8 +277,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - zap_low_mappings(0); - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). @@ -329,15 +330,8 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP @@ -612,6 +606,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + + /* Family 10 doesn't support C states in MWAIT so don't use it */ + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -987,9 +985,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, /* tsc invariant mapped to constant_tsc */ - NULL, - /* nothing */ /* constant_tsc - moved to flags */ + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ }; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 6a70b55f719..64379a80d76 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -103,9 +103,9 @@ void __init setup_per_cpu_areas(void) if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); - ptr = alloc_bootmem(size); + ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); @@ -201,7 +201,6 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 49ec324cd14..c819625f331 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -141,7 +141,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); + printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); #endif if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) @@ -301,7 +301,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -463,7 +463,7 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", + printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); #endif diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index af1ec4d23cf..22abae4e9f3 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -76,7 +76,7 @@ static inline void leave_mm(int cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); + load_cr3(init_mm.pgd); } /* @@ -452,42 +452,34 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, } EXPORT_SYMBOL(smp_call_function); -void smp_stop_cpu(void) +static void stop_this_cpu(void *dummy) { - unsigned long flags; + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); for (;;) halt(); } void smp_send_stop(void) { - int nolock = 0; + int nolock; + unsigned long flags; + if (reboot_force) return; + /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have panicked anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + nolock = !spin_trylock(&call_lock); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); if (!nolock) spin_unlock(&call_lock); - - local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index cd4643a3702..4d9dacfae57 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,7 +60,6 @@ #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/numa.h> -#include <asm/genapic.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -68,7 +67,6 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -392,7 +390,8 @@ static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned int status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); @@ -402,7 +401,9 @@ static void inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -430,8 +431,8 @@ static void inquire_remote_apic(int apicid) */ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; Dprintk("Asserting INIT.\n"); @@ -447,12 +448,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -465,12 +461,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mb(); atomic_set(&init_deasserted, 1); @@ -509,12 +500,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. @@ -945,6 +931,12 @@ int __cpuinit __cpu_up(unsigned int cpu) return -ENOSYS; } + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); @@ -965,13 +957,6 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - - if (num_online_cpus() > 8 && genapic == &apic_flat) { - printk(KERN_WARNING - "flat APIC routing can't be used with > 8 cpus\n"); - BUG(); - } - err = 0; return err; diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 91f7e678bae..6a5a98f2a75 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -12,6 +12,10 @@ #include <asm/proto.h> #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/mtrr.h> + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; struct saved_context saved_context; @@ -33,7 +37,6 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* EFER should be constant for kernel version, no need to handle it. */ /* * segment registers */ @@ -46,10 +49,12 @@ void __save_processor_state(struct saved_context *ctxt) rdmsrl(MSR_FS_BASE, ctxt->fs_base); rdmsrl(MSR_GS_BASE, ctxt->gs_base); rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); /* * control registers */ + rdmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); @@ -75,6 +80,7 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + wrmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); @@ -219,4 +225,15 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} #endif /* CONFIG_SOFTWARE_SUSPEND */ diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index bfbe00763c6..16d183f67bc 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -71,9 +71,10 @@ loop: jmp loop done: /* go back to the original page tables */ - leaq init_level4_pgt(%rip), %rax - subq $__START_KERNEL_map, %rax - movq %rax, %cr3 + movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax + movq %rax, %cr3 + /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c index 213fd6ab789..63d592c276c 100644 --- a/arch/x86_64/kernel/syscall.c +++ b/arch/x86_64/kernel/syscall.c @@ -3,6 +3,7 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> +#include <asm/asm-offsets.h> #define __NO_STUBS diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 75d73a9aa9f..0652e173813 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -39,13 +39,11 @@ #include <asm/proto.h> #include <asm/hpet.h> #include <asm/sections.h> -#include <linux/cpufreq.h> #include <linux/hpet.h> #include <asm/apic.h> #include <asm/hpet.h> - -extern void i8254_timer_resume(void); -extern int using_apic_timer; +#include <asm/mpspec.h> +#include <asm/nmi.h> static char *timename = NULL; @@ -252,6 +250,51 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } +/* calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency */ +#define TICK_COUNT 100000000 +static unsigned int __init tsc_calibrate_cpu_khz(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -285,7 +328,7 @@ static unsigned int __init pit_calibrate_tsc(void) #define PIT_MODE 0x43 #define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) +static void __pit_init(int val, u8 mode) { unsigned long flags; @@ -301,12 +344,12 @@ void __init pit_init(void) __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ } -void __init pit_stop_interrupt(void) +void pit_stop_interrupt(void) { __pit_init(0, 0x30); /* mode 0 */ } -void __init stop_timer_interrupt(void) +void stop_timer_interrupt(void) { char *name; if (hpet_address) { @@ -339,23 +382,29 @@ void __init time_init(void) if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; - cpu_khz = hpet_calibrate_tsc(); + tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { pit_init(); - cpu_khz = pit_calibrate_tsc(); + tsc_khz = pit_calibrate_tsc(); timename = "PIT"; } + cpu_khz = tsc_khz; + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 16) + cpu_khz = tsc_calibrate_cpu_khz(); + if (unsynchronized_tsc()) - mark_tsc_unstable(); + mark_tsc_unstable("TSCs unsynchronized"); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - set_cyc2ns_scale(cpu_khz); + set_cyc2ns_scale(tsc_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index c79b99a9e2f..e7e2764c461 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -3,6 +3,7 @@ * Trampoline.S Derived from Setup.S by Linus Torvalds * * 4 Jan 1997 Michael Chastain: changed to gnu as. + * 15 Sept 2005 Eric Biederman: 64bit PIC support * * Entry: CS:IP point to the start of our code, we are * in real mode with no stack, but the rest of the @@ -17,15 +18,20 @@ * and IP is zero. Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * * If you work on this file, check the object module with objdump * --full-contents --reloc to make sure there are no relocation - * entries. For the GDT entry we do hand relocation in smpboot.c - * because of 64bit linker limitations. + * entries. */ #include <linux/linkage.h> -#include <asm/segment.h> +#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/msr.h> +#include <asm/segment.h> .data @@ -33,15 +39,33 @@ ENTRY(trampoline_data) r_base = . + cli # We should be safe anyway wbinvd mov %cs, %ax # Code and data in the same place mov %ax, %ds + mov %ax, %es + mov %ax, %ss - cli # We should be safe anyway movl $0xA5A5A5A5, trampoline_data - r_base # write marker for master knows we're running + # Setup stack + movw $(trampoline_stack_end - r_base), %sp + + call verify_cpu # Verify the cpu supports long mode + testl %eax, %eax # Check for return code + jnz no_longmode + + mov %cs, %ax + movzx %ax, %esi # Find the 32bit trampoline location + shll $4, %esi + + # Fixup the vectors + addl %esi, startup_32_vector - r_base + addl %esi, startup_64_vector - r_base + addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default @@ -49,23 +73,94 @@ r_base = . * to 32 bit. */ - lidtl idt_48 - r_base # load idt with 0, 0 - lgdtl gdt_48 - r_base # load gdt with whatever is appropriate + lidtl tidt - r_base # load idt with 0, 0 + lgdtl tgdt - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit lmsw %ax # into protected mode - # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S - ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector - r_base) + + .code32 + .balign 4 +startup_32: + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + + xorl %eax, %eax + btsl $5, %eax # Enable PAE mode + movl %eax, %cr4 + + # Setup trampoline 4 level pagetables + leal (trampoline_level4_pgt - r_base)(%esi), %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + movl $(1 << _EFER_LME), %eax # Enable Long Mode + xorl %edx, %edx + wrmsr + + xorl %eax, %eax + btsl $31, %eax # Enable paging and in turn activate Long Mode + btsl $0, %eax # Enable protected mode + movl %eax, %cr0 + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmp *(startup_64_vector - r_base)(%esi) + + .code64 + .balign 4 +startup_64: + # Now jump into the kernel using virtual addresses + movq $secondary_startup_64, %rax + jmp *%rax + + .code16 +no_longmode: + hlt + jmp no_longmode +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; -idt_48: +tidt: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L -gdt_48: - .short GDT_ENTRIES*8 - 1 # gdt limit - .long cpu_gdt_table-__START_KERNEL_map + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 +tgdt: + .short tgdt_end - tgdt # gdt limit + .long tgdt - r_base + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_end: + + .balign 4 +startup_32_vector: + .long startup_32 - r_base + .word __KERNEL32_CS, 0 + + .balign 4 +startup_64_vector: + .long startup_64 - r_base + .word __KERNEL_CS, 0 + +trampoline_stack: + .org 0x1000 +trampoline_stack_end: +ENTRY(trampoline_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE -.globl trampoline_end -trampoline_end: +ENTRY(trampoline_end) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 09d2e8a10a4..d76fc32d459 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -426,8 +426,7 @@ void show_registers(struct pt_regs *regs) const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - rsp = regs->rsp; - + rsp = regs->rsp; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -438,7 +437,6 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - printk("Stack: "); _show_stack(NULL, regs, (unsigned long*)rsp); @@ -581,10 +579,20 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (user_mode(regs)) { + /* + * We want error_code and trap_no set for userspace + * faults and kernelspace faults which result in + * die(), but not kernelspace faults which are fixed + * up. die() gives the process no chance to handle + * the signal and notice the kernel fault information, + * so that won't result in polluting the information + * about previously queued, but not yet delivered, + * faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (exception_trace && unhandled_signal(tsk, signr)) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", @@ -605,8 +613,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, fixup = search_exception_tables(regs->rip); if (fixup) regs->rip = fixup->fixup; - else + else { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } } @@ -682,10 +693,10 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, conditional_sti(regs); - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (user_mode(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", @@ -704,6 +715,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, regs->rip = fixup->fixup; return; } + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 1a0edbbffaa..48f9a8e6aa9 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -13,6 +13,8 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); static unsigned int cyc2ns_scale __read_mostly; @@ -77,7 +79,7 @@ static void handle_cpufreq_delayed_get(struct work_struct *v) static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned long tsc_khz_ref = 0; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -99,7 +101,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; + tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || @@ -107,12 +109,12 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(cpu_khz_ref); + set_cyc2ns_scale(tsc_khz_ref); return 0; } @@ -197,10 +199,11 @@ static struct clocksource clocksource_tsc = { .vread = vread_tsc, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); @@ -213,7 +216,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); void __init init_tsc_clocksource(void) { if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); if (check_tsc_unstable()) clocksource_tsc.rating = 0; diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c index 014f0db45df..355f5f506c8 100644 --- a/arch/x86_64/kernel/tsc_sync.c +++ b/arch/x86_64/kernel/tsc_sync.c @@ -50,7 +50,7 @@ static __cpuinit void check_tsc_warp(void) /* * The measurement runs for 20 msecs: */ - end = start + cpu_khz * 20ULL; + end = start + tsc_khz * 20ULL; now = start; for (i = 0; ; i++) { @@ -138,7 +138,7 @@ void __cpuinit check_tsc_sync_source(int cpu) printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); - mark_tsc_unstable(); + mark_tsc_unstable("check_tsc_sync_source failed"); nr_warps = 0; max_warp = 0; last_tsc = 0; diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S new file mode 100644 index 00000000000..e035f594819 --- /dev/null +++ b/arch/x86_64/kernel/verify_cpu.S @@ -0,0 +1,119 @@ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. + * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * Following are the current usage. + * + * This file is included by both 16bit and 32bit code. + * + * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) + * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) + * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) + * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) + * + * verify_cpu, returns the status of cpu check in register %eax. + * 0: Success 1: Failure + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. + */ + +#include <asm/cpufeature.h> + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + + /* minimum CPUID flags for x86-64 as defined by AMD */ +#define M(x) (1<<(x)) +#define M2(a,b) M(a)|M(b) +#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d) + +#define SSE_MASK \ + (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2)) +#define REQUIRED_MASK1 \ + (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\ + M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\ + M(X86_FEATURE_FXSR)) +#define REQUIRED_MASK2 \ + (M(X86_FEATURE_LM - 32)) + + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz verify_cpu_no_longmode # cpu has no cpuid + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb verify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz verify_cpu_noamd + cmpl $0x69746e65,%edx + jnz verify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz verify_cpu_noamd + mov $1,%di # cpu is from AMD + +verify_cpu_noamd: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz verify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb verify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK2,%edx + xorl $REQUIRED_MASK2,%edx + jnz verify_cpu_no_longmode + +verify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je verify_cpu_sse_ok + test %di,%di + jz verify_cpu_no_longmode # only try to force SSE on AMD + movl $0xc0010015,%ecx # HWCR + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp verify_cpu_sse_test # try again + +verify_cpu_no_longmode: + popfl # Restore caller passed flags + movl $1,%eax + ret +verify_cpu_sse_ok: + popfl # Restore caller passed flags + xorl %eax, %eax + ret diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5176ecf006e..88cfa50b424 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -29,9 +29,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) - /* Then all the functions that are "hot" in profiles, to group them - onto the same hugetlb entry */ - #include "functionlist" + _stext = .; /* Then the rest */ *(.text) SCHED_TEXT @@ -50,10 +48,10 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - RODATA - BUG_TABLE + RODATA + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -94,6 +92,12 @@ SECTIONS { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); @@ -101,10 +105,6 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } @@ -194,7 +194,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b43c698cf7d..dc32cef9619 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -45,14 +45,34 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __pa_vsymbol(x) \ + ({unsigned long v; \ + extern char __vsyscall_0; \ + asm("" : "=r" (v) : "0" (x)); \ + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) +/* + * vsyscall_gtod_data contains data that is : + * - readonly from vsyscalls + * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) + * Try to keep this structure as small as possible to avoid cache line ping pongs + */ struct vsyscall_gtod_data_t { - seqlock_t lock; - int sysctl_enabled; - struct timeval wall_time_tv; + seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + + int sysctl_enabled; struct timezone sys_tz; - cycle_t offset_base; - struct clocksource clock; + struct { /* extract of a clocksource struct */ + cycle_t (*vread)(void); + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; }; int __vgetcpu_mode __section_vgetcpu_mode; @@ -68,9 +88,13 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock = *clock; - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -105,7 +129,8 @@ static __always_inline long time_syscall(long *t) static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; - unsigned long seq, mult, shift, nsec_delta; + unsigned seq; + unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -121,21 +146,20 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; - *tv = __vsyscall_gtod_data.wall_time_tv; - + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ - nsec_delta = (cycle_delta * mult) >> shift; + nsec += (cycle_delta * mult) >> shift; - /* convert to usecs and add to timespec: */ - tv->tv_usec += nsec_delta / NSEC_PER_USEC; - while (tv->tv_usec > USEC_PER_SEC) { + while (nsec >= NSEC_PER_SEC) { tv->tv_sec += 1; - tv->tv_usec -= USEC_PER_SEC; + nsec -= NSEC_PER_SEC; } + tv->tv_usec = nsec / NSEC_PER_USEC; } int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) @@ -151,11 +175,13 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - else if (t) - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; - return __vsyscall_gtod_data.wall_time_tv.tv_sec; + result = __vsyscall_gtod_data.wall_time_sec; + if (t) + *t = result; + return result; } /* Fast way to get current CPU and node. @@ -224,10 +250,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, return ret; /* gcc has some trouble with __va(__pa()), so just do it this way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); + map1 = ioremap(__pa_vsymbol(&vsysc1), 2); if (!map1) return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); + map2 = ioremap(__pa_vsymbol(&vsysc2), 2); if (!map2) { ret = -ENOMEM; goto out; diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 6ada7231f3a..de99dba2c51 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -585,7 +585,7 @@ do_sigbus: } DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; +LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { @@ -605,8 +605,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); - for (page = pgd_list; page; - page = (struct page *)page->index) { + list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); if (pgd_none(*pgd)) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index ec31534eb10..282b0a8f00a 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,10 +22,12 @@ #include <linux/bootmem.h> #include <linux/proc_fs.h> #include <linux/pci.h> +#include <linux/pfn.h> #include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/memory_hotplug.h> +#include <linux/nmi.h> #include <asm/processor.h> #include <asm/system.h> @@ -46,7 +48,7 @@ #define Dprintk(x...) #endif -struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; @@ -72,6 +74,11 @@ void show_mem(void) for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* this loop can take a while with 256 GB and 4k pages + so update the NMI watchdog */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + touch_nmi_watchdog(); + } page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -167,23 +174,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) unsigned long __initdata table_start, table_end; -extern pmd_t temp_boot_pmds[]; - -static struct temp_map { - pmd_t *pmd; - void *address; - int allocated; -} temp_mappings[] __initdata = { - { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, - { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, - {} -}; - -static __meminit void *alloc_low_page(int *index, unsigned long *phys) +static __meminit void *alloc_low_page(unsigned long *phys) { - struct temp_map *ti; - int i; - unsigned long pfn = table_end++, paddr; + unsigned long pfn = table_end++; void *adr; if (after_bootmem) { @@ -194,57 +187,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys) if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); - for (i = 0; temp_mappings[i].allocated; i++) { - if (!temp_mappings[i].pmd) - panic("alloc_low_page: ran out of temp mappings"); - } - ti = &temp_mappings[i]; - paddr = (pfn << PAGE_SHIFT) & PMD_MASK; - set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); - ti->allocated = 1; - __flush_tlb(); - adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + + adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *index = i; - *phys = pfn * PAGE_SIZE; - return adr; -} + *phys = pfn * PAGE_SIZE; + return adr; +} -static __meminit void unmap_low_page(int i) +static __meminit void unmap_low_page(void *adr) { - struct temp_map *ti; if (after_bootmem) return; - ti = &temp_mappings[i]; - set_pmd(ti->pmd, __pmd(0)); - ti->allocated = 0; + early_iounmap(adr, PAGE_SIZE); } /* Must run before zap_low_mappings */ __init void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long map = round_down(addr, LARGE_PAGE_SIZE); - - /* actually usually some more */ - if (size >= LARGE_PAGE_SIZE) { - return NULL; + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; + + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; } - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - map += LARGE_PAGE_SIZE; - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } /* To avoid virtual aliases later */ __init void early_iounmap(void *addr, unsigned long size) { - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) - printk("early_iounmap: bad address %p\n", addr); - set_pmd(temp_mappings[0].pmd, __pmd(0)); - set_pmd(temp_mappings[1].pmd, __pmd(0)); + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); __flush_tlb(); } @@ -289,7 +288,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - int map; unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -307,12 +305,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne continue; } - pmd = alloc_low_page(&map, &pmd_phys); + pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(map); + unmap_low_page(pmd); } __flush_tlb(); } @@ -364,7 +362,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - int map; unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); pud_t *pud; @@ -372,7 +369,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else - pud = alloc_low_page(&map, &pud_phys); + pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; if (next > end) @@ -380,7 +377,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(map); + unmap_low_page(pud); } if (!after_bootmem) @@ -388,21 +385,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) __flush_tlb_all(); } -void __cpuinit zap_low_mappings(int cpu) -{ - if (cpu == 0) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - } else { - /* - * For AP's, zap the low identity mappings by changing the cr3 - * to init_level4_pgt and doing local flush tlb all - */ - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - } - __flush_tlb_all(); -} - #ifndef CONFIG_NUMA void __init paging_init(void) { @@ -579,15 +561,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - -#ifdef CONFIG_SMP - /* - * Sync boot_level4_pgt mappings with the init_level4_pgt - * except for the low identity mappings which are already zapped - * in init_level4_pgt. This sync-up is essential for AP's bringup - */ - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); -#endif } void free_init_pages(char *what, unsigned long begin, unsigned long end) @@ -597,37 +570,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) + change_page_attr_addr(addr, 1, __pgprot(0)); + __free_page(page); totalram_pages++; } + if (addr > __START_KERNEL_map) + global_flush_tlb(); } void free_initmem(void) { - memset(__initdata_begin, POISON_FREE_INITDATA, - __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = PFN_ALIGN(__va(__pa_symbol(&_etext))); +#endif + size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start; + change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk ("Write protecting the kernel read-only data: %luk\n", - (__end_rodata - __start_rodata) >> 10); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. @@ -642,7 +622,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index b5b8dba28b4..f983c75825d 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) int found = 0; u32 reg; unsigned numnodes; - nodemask_t nodes_parsed; unsigned dualcore = 0; - nodes_clear(nodes_parsed); - if (!early_pci_allowed()) return -1; @@ -65,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); @@ -102,7 +101,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid, (base>>8)&3, (limit>>8) & 3); return -1; } - if (node_isset(nodeid, nodes_parsed)) { + if (node_isset(nodeid, node_possible_map)) { printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; @@ -155,7 +154,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, nodes_parsed); + node_set(nodeid, node_possible_map); } if (!found) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 41b8fb06992..51548947ad3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -273,125 +273,213 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -int numa_fake __initdata = 0; +#define E820_ADDR_HOLE_SIZE(start, end) \ + (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ + PAGE_SHIFT) +char *cmdline __initdata; /* - * This function is used to find out if the start and end correspond to - * different zones. + * Setups up nid to range from addr to addr + size. If the end boundary is + * greater than max_addr, then max_addr is used instead. The return value is 0 + * if there is additional memory left for allocation past addr and -1 otherwise. + * addr is adjusted to be at the end of the node. */ -int zone_cross_over(unsigned long start, unsigned long end) +static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, + u64 size, u64 max_addr) { - if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && - (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) - return 1; - return 0; + int ret = 0; + nodes[nid].start = *addr; + *addr += size; + if (*addr >= max_addr) { + *addr = max_addr; + ret = -1; + } + nodes[nid].end = *addr; + node_set(nid, node_possible_map); + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + nodes[nid].start, nodes[nid].end, + (nodes[nid].end - nodes[nid].start) >> 20); + return ret; } -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +/* + * Splits num_nodes nodes up equally starting at node_start. The return value + * is the number of nodes split up and addr is adjusted to be at the end of the + * last node allocated. + */ +static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, + int num_nodes) { - int i, big; - struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz, old_sz; - unsigned long hole_size; - unsigned long start, end; - unsigned long max_addr = (end_pfn << PAGE_SHIFT); - - start = (start_pfn << PAGE_SHIFT); - hole_size = e820_hole_size(start, max_addr); - sz = (max_addr - start - hole_size) / numa_fake; - - /* Kludge needed for the hash function */ - - old_sz = sz; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - */ - sz &= FAKE_NODE_MIN_HASH_MASK; + unsigned int big; + u64 size; + int i; + if (num_nodes <= 0) + return -1; + if (num_nodes > MAX_NUMNODES) + num_nodes = MAX_NUMNODES; + size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + num_nodes; /* - * We ensure that each node is at least 64MB big. Smaller than this - * size can cause VM hiccups. + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the leftovers. */ - if (sz == 0) { - printk(KERN_INFO "Not enough memory for %d nodes. Reducing " - "the number of nodes\n", numa_fake); - numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Number of fake nodes will be = %d\n", - numa_fake); - sz = FAKE_NODE_MIN_SIZE; + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / + FAKE_NODE_MIN_SIZE; + + /* Round down to nearest FAKE_NODE_MIN_SIZE. */ + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + printk(KERN_ERR "Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; } - /* - * Find out how many nodes can get an extra NODE_MIN_SIZE granule. - * This logic ensures the extra memory gets distributed among as many - * nodes as possible (as compared to one single node getting all that - * extra memory. - */ - big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " - "%d\n", - (sz >> 20), (hole_size >> 20), big); - memset(&nodes,0,sizeof(nodes)); - end = start; - for (i = 0; i < numa_fake; i++) { - /* - * In case we are not able to allocate enough memory for all - * the nodes, we reduce the number of fake nodes. - */ - if (end >= max_addr) { - numa_fake = i - 1; - break; - } - start = nodes[i].start = end; - /* - * Final node can have all the remaining memory. - */ - if (i == numa_fake-1) - sz = max_addr - start; - end = nodes[i].start + sz; - /* - * Fir "big" number of nodes get extra granule. - */ + + for (i = node_start; i < num_nodes + node_start; i++) { + u64 end = *addr + size; if (i < big) end += FAKE_NODE_MIN_SIZE; /* - * Iterate over the range to ensure that this node gets at - * least sz amount of RAM (excluding holes) + * The final node can have the remaining system RAM. Other + * nodes receive roughly the same amount of available pages. */ - while ((end - start - e820_hole_size(start, end)) < sz) { - end += FAKE_NODE_MIN_SIZE; - if (end >= max_addr) - break; + if (i == num_nodes + node_start - 1) + end = max_addr; + else + while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < + size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + break; + } + return i - node_start + 1; +} + +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, u64 size) +{ + int i = node_start; + size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, size, max_addr)) + ; + return i - node_start; +} + +/* + * Sets up the system RAM area from start_pfn to end_pfn according to the + * numa=fake command-line option. + */ +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + struct bootnode nodes[MAX_NUMNODES]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 max_addr = end_pfn << PAGE_SHIFT; + int num_nodes = 0; + int coeff_flag; + int coeff = -1; + int num = 0; + u64 size; + int i; + + memset(&nodes, 0, sizeof(nodes)); + /* + * If the numa=fake command-line is just a single number N, split the + * system RAM into N fake nodes. + */ + if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, + simple_strtol(cmdline, NULL, 0)); + if (num_nodes < 0) + return num_nodes; + goto out; + } + + /* Parse the command line. */ + for (coeff_flag = 0; ; cmdline++) { + if (*cmdline && isdigit(*cmdline)) { + num = num * 10 + *cmdline - '0'; + continue; } - /* - * Look at the next node to make sure there is some real memory - * to map. Bad things happen when the only memory present - * in a zone on a fake node is IO hole. - */ - while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { - if (zone_cross_over(start, end + sz)) { - end = (MAX_DMA32_PFN << PAGE_SHIFT); + if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } + if (!*cmdline || *cmdline == ',') { + if (!coeff_flag) + coeff = 1; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + * Command-line coefficients are in megabytes. + */ + size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; + if (size) + for (i = 0; i < coeff; i++, num_nodes++) + if (setup_node_range(num_nodes, nodes, + &addr, size, max_addr) < 0) + goto done; + if (!*cmdline) break; - } - if (end >= max_addr) + coeff_flag = 0; + coeff = -1; + } + num = 0; + } +done: + if (!num_nodes) + return -1; + /* Fill remainder of system RAM, if appropriate. */ + if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) break; - end += FAKE_NODE_MIN_SIZE; + num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; } - if (end > max_addr) - end = max_addr; - nodes[i].end = end; - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", - i, - nodes[i].start, nodes[i].end, - (nodes[i].end - nodes[i].start) >> 20); - node_set_online(i); - } - memnode_shift = compute_hash_shift(nodes, numa_fake); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); - return -1; - } - for_each_online_node(i) { + } +out: + memnode_shift = compute_hash_shift(nodes, num_nodes); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. NUMA emulation " + "disabled.\n"); + return -1; + } + + /* + * We need to vacate all active ranges that may have been registered by + * SRAT. + */ + remove_all_active_ranges(); + for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -399,26 +487,32 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) numa_init_array(); return 0; } -#endif +#undef E820_ADDR_HOLE_SIZE +#endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; + nodes_clear(node_possible_map); + #ifdef CONFIG_NUMA_EMU - if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) return; + nodes_clear(node_possible_map); #endif printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); @@ -432,6 +526,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); + node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); @@ -486,11 +581,8 @@ static __init int numa_setup(char *opt) if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU - if(!strncmp(opt, "fake=", 5)) { - numa_fake = simple_strtoul(opt+5,NULL,0); ; - if (numa_fake >= MAX_NUMNODES) - numa_fake = MAX_NUMNODES; - } + if (!strncmp(opt, "fake=", 5)) + cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 081409aa345..bf4aa8dd425 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, SetPagePrivate(base); page_private(base) = 0; - address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { @@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage) * No more special protections in this 2/4MB area - revert to a * large page again. */ -static void revert_page(unsigned long address, pgprot_t ref_prot) +static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t large_pte; - unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); @@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ struct page *split; ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); + split = split_large_page(pfn << PAGE_SHIFT, prot, + ref_prot2); if (!split) return -ENOMEM; set_pte(kpte, mk_pte(split, ref_prot2)); @@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, if (page_private(kpte_page) == 0) { save_page(kpte_page); - revert_page(address, ref_prot); + revert_page(address, pfn, ref_prot); } return 0; } @@ -180,22 +178,32 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { - int err = 0; + unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; + int err = 0, kernel_map = 0; int i; + if (address >= __START_KERNEL_map + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { + address = (unsigned long)__va(__pa(address)); + kernel_map = 1; + } + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, address += PAGE_SIZE) { unsigned long pfn = __pa(address) >> PAGE_SHIFT; - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; + if (!kernel_map || pte_present(pfn_pte(0, prot))) { + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + } /* Handle kernel mapping too which aliases part of the * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { + if ((pfn >= phys_base_pfn) && + ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) { unsigned long addr2; pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); + addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT); /* Make sure the kernel mappings stay executable */ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); err = __change_page_attr(addr2, pfn, prot2, diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 2efe215fc76..1e76bb0a727 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -419,19 +419,21 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } + node_possible_map = nodes_parsed; + /* Finally register nodes */ - for_each_node_mask(i, nodes_parsed) + for_each_node_mask(i, node_possible_map) setup_node_bootmem(i, nodes[i].start, nodes[i].end); /* Try again in case setup_node_bootmem missed one due to missing bootmem */ - for_each_node_mask(i, nodes_parsed) + for_each_node_mask(i, node_possible_map) if (!node_online(i)) setup_node_bootmem(i, nodes[i].start, nodes[i].end); for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; - if (!node_isset(cpu_to_node[i], nodes_parsed)) + if (!node_isset(cpu_to_node[i], node_possible_map)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index ab6370054ce..4fbd66a52a8 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -198,7 +198,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; |