diff options
author | Tony Lindgren <tony@atomide.com> | 2011-05-24 00:45:06 -0700 |
---|---|---|
committer | Tony Lindgren <tony@atomide.com> | 2011-05-24 00:45:06 -0700 |
commit | 9b28b11e2a648f07c8481b9666ccf1c088e1ab74 (patch) | |
tree | ac3db2d4ae69e393d8423bb8c9304c75023dc805 /arch | |
parent | b7679ab3f70482ff4b75a8c735c8224ebedb6020 (diff) | |
parent | 99aa18278e867574d72201b806f82ace07d4804b (diff) |
Merge branch 'for_2.6.40/pm-cleanup' of ssh://master.kernel.org/pub/scm/linux/kernel/git/khilman/linux-omap-pm into omap-for-linus
Diffstat (limited to 'arch')
37 files changed, 451 insertions, 543 deletions
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index 058937bf5a7..b1834166922 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -452,10 +452,14 @@ #define __NR_fanotify_init 494 #define __NR_fanotify_mark 495 #define __NR_prlimit64 496 +#define __NR_name_to_handle_at 497 +#define __NR_open_by_handle_at 498 +#define __NR_clock_adjtime 499 +#define __NR_syncfs 500 #ifdef __KERNEL__ -#define NR_SYSCALLS 497 +#define NR_SYSCALLS 501 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index a6a1de9db16..15f999d41c7 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -498,23 +498,27 @@ sys_call_table: .quad sys_ni_syscall /* sys_timerfd */ .quad sys_eventfd .quad sys_recvmmsg - .quad sys_fallocate /* 480 */ + .quad sys_fallocate /* 480 */ .quad sys_timerfd_create .quad sys_timerfd_settime .quad sys_timerfd_gettime .quad sys_signalfd4 - .quad sys_eventfd2 /* 485 */ + .quad sys_eventfd2 /* 485 */ .quad sys_epoll_create1 .quad sys_dup3 .quad sys_pipe2 .quad sys_inotify_init1 - .quad sys_preadv /* 490 */ + .quad sys_preadv /* 490 */ .quad sys_pwritev .quad sys_rt_tgsigqueueinfo .quad sys_perf_event_open .quad sys_fanotify_init - .quad sys_fanotify_mark /* 495 */ + .quad sys_fanotify_mark /* 495 */ .quad sys_prlimit64 + .quad sys_name_to_handle_at + .quad sys_open_by_handle_at + .quad sys_clock_adjtime + .quad sys_syncfs /* 500 */ .size sys_call_table, . - sys_call_table .type sys_call_table, @object diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 918e8e0b72f..818e74ed45d 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -375,8 +375,7 @@ static struct clocksource clocksource_rpcc = { static inline void register_rpcc_clocksource(long cycle_freq) { - clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4); - clocksource_register(&clocksource_rpcc); + clocksource_register_hz(&clocksource_rpcc, cycle_freq); } #else /* !CONFIG_SMP */ static inline void register_rpcc_clocksource(long cycle_freq) diff --git a/arch/arm/mach-omap2/board-3430sdp.c b/arch/arm/mach-omap2/board-3430sdp.c index 99218a5299c..52dbdf3ab66 100644 --- a/arch/arm/mach-omap2/board-3430sdp.c +++ b/arch/arm/mach-omap2/board-3430sdp.c @@ -59,24 +59,6 @@ #define TWL4030_MSECURE_GPIO 22 -/* FIXME: These values need to be updated based on more profiling on 3430sdp*/ -static struct cpuidle_params omap3_cpuidle_params_table[] = { - /* C1 */ - {1, 2, 2, 5}, - /* C2 */ - {1, 10, 10, 30}, - /* C3 */ - {1, 50, 50, 300}, - /* C4 */ - {1, 1500, 1800, 4000}, - /* C5 */ - {1, 2500, 7500, 12000}, - /* C6 */ - {1, 3000, 8500, 15000}, - /* C7 */ - {1, 10000, 30000, 300000}, -}; - static uint32_t board_keymap[] = { KEY(0, 0, KEY_LEFT), KEY(0, 1, KEY_RIGHT), @@ -800,7 +782,6 @@ static void __init omap_3430sdp_init(void) omap3_mux_init(board_mux, OMAP_PACKAGE_CBB); omap_board_config = sdp3430_config; omap_board_config_size = ARRAY_SIZE(sdp3430_config); - omap3_pm_init_cpuidle(omap3_cpuidle_params_table); omap3430_i2c_init(); omap_display_init(&sdp3430_dss_data); if (omap_rev() > OMAP3430_REV_ES1_0) diff --git a/arch/arm/mach-omap2/board-rx51.c b/arch/arm/mach-omap2/board-rx51.c index f8ba20a14e6..fec4cac8fa0 100644 --- a/arch/arm/mach-omap2/board-rx51.c +++ b/arch/arm/mach-omap2/board-rx51.c @@ -58,21 +58,25 @@ static struct platform_device leds_gpio = { }, }; +/* + * cpuidle C-states definition override from the default values. + * The 'exit_latency' field is the sum of sleep and wake-up latencies. + */ static struct cpuidle_params rx51_cpuidle_params[] = { /* C1 */ - {1, 110, 162, 5}, + {110 + 162, 5 , 1}, /* C2 */ - {1, 106, 180, 309}, + {106 + 180, 309, 1}, /* C3 */ - {0, 107, 410, 46057}, + {107 + 410, 46057, 0}, /* C4 */ - {0, 121, 3374, 46057}, + {121 + 3374, 46057, 0}, /* C5 */ - {1, 855, 1146, 46057}, + {855 + 1146, 46057, 1}, /* C6 */ - {0, 7580, 4134, 484329}, + {7580 + 4134, 484329, 0}, /* C7 */ - {1, 7505, 15274, 484329}, + {7505 + 15274, 484329, 1}, }; static struct omap_lcd_config rx51_lcd_config = { diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c index 1c240eff391..4bf6e6e8b10 100644 --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -36,36 +36,6 @@ #ifdef CONFIG_CPU_IDLE -#define OMAP3_MAX_STATES 7 -#define OMAP3_STATE_C1 0 /* C1 - MPU WFI + Core active */ -#define OMAP3_STATE_C2 1 /* C2 - MPU WFI + Core inactive */ -#define OMAP3_STATE_C3 2 /* C3 - MPU CSWR + Core inactive */ -#define OMAP3_STATE_C4 3 /* C4 - MPU OFF + Core iactive */ -#define OMAP3_STATE_C5 4 /* C5 - MPU RET + Core RET */ -#define OMAP3_STATE_C6 5 /* C6 - MPU OFF + Core RET */ -#define OMAP3_STATE_C7 6 /* C7 - MPU OFF + Core OFF */ - -#define OMAP3_STATE_MAX OMAP3_STATE_C7 - -#define CPUIDLE_FLAG_CHECK_BM 0x10000 /* use omap3_enter_idle_bm() */ - -struct omap3_processor_cx { - u8 valid; - u8 type; - u32 sleep_latency; - u32 wakeup_latency; - u32 mpu_state; - u32 core_state; - u32 threshold; - u32 flags; - const char *desc; -}; - -struct omap3_processor_cx omap3_power_states[OMAP3_MAX_STATES]; -struct omap3_processor_cx current_cx_state; -struct powerdomain *mpu_pd, *core_pd, *per_pd; -struct powerdomain *cam_pd; - /* * The latencies/thresholds for various C states have * to be configured from the respective board files. @@ -75,27 +45,31 @@ struct powerdomain *cam_pd; */ static struct cpuidle_params cpuidle_params_table[] = { /* C1 */ - {1, 2, 2, 5}, + {2 + 2, 5, 1}, /* C2 */ - {1, 10, 10, 30}, + {10 + 10, 30, 1}, /* C3 */ - {1, 50, 50, 300}, + {50 + 50, 300, 1}, /* C4 */ - {1, 1500, 1800, 4000}, + {1500 + 1800, 4000, 1}, /* C5 */ - {1, 2500, 7500, 12000}, + {2500 + 7500, 12000, 1}, /* C6 */ - {1, 3000, 8500, 15000}, + {3000 + 8500, 15000, 1}, /* C7 */ - {1, 10000, 30000, 300000}, + {10000 + 30000, 300000, 1}, }; +#define OMAP3_NUM_STATES ARRAY_SIZE(cpuidle_params_table) -static int omap3_idle_bm_check(void) -{ - if (!omap3_can_sleep()) - return 1; - return 0; -} +/* Mach specific information to be recorded in the C-state driver_data */ +struct omap3_idle_statedata { + u32 mpu_state; + u32 core_state; + u8 valid; +}; +struct omap3_idle_statedata omap3_idle_data[OMAP3_NUM_STATES]; + +struct powerdomain *mpu_pd, *core_pd, *per_pd, *cam_pd; static int _cpuidle_allow_idle(struct powerdomain *pwrdm, struct clockdomain *clkdm) @@ -122,12 +96,10 @@ static int _cpuidle_deny_idle(struct powerdomain *pwrdm, static int omap3_enter_idle(struct cpuidle_device *dev, struct cpuidle_state *state) { - struct omap3_processor_cx *cx = cpuidle_get_statedata(state); + struct omap3_idle_statedata *cx = cpuidle_get_statedata(state); struct timespec ts_preidle, ts_postidle, ts_idle; u32 mpu_state = cx->mpu_state, core_state = cx->core_state; - current_cx_state = *cx; - /* Used to keep track of the total time in idle */ getnstimeofday(&ts_preidle); @@ -140,7 +112,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev, if (omap_irq_pending() || need_resched()) goto return_sleep_time; - if (cx->type == OMAP3_STATE_C1) { + /* Deny idle for C1 */ + if (state == &dev->states[0]) { pwrdm_for_each_clkdm(mpu_pd, _cpuidle_deny_idle); pwrdm_for_each_clkdm(core_pd, _cpuidle_deny_idle); } @@ -148,7 +121,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev, /* Execute ARM wfi */ omap_sram_idle(); - if (cx->type == OMAP3_STATE_C1) { + /* Re-allow idle for C1 */ + if (state == &dev->states[0]) { pwrdm_for_each_clkdm(mpu_pd, _cpuidle_allow_idle); pwrdm_for_each_clkdm(core_pd, _cpuidle_allow_idle); } @@ -164,41 +138,53 @@ return_sleep_time: } /** - * next_valid_state - Find next valid c-state + * next_valid_state - Find next valid C-state * @dev: cpuidle device - * @state: Currently selected c-state + * @state: Currently selected C-state * * If the current state is valid, it is returned back to the caller. * Else, this function searches for a lower c-state which is still - * valid (as defined in omap3_power_states[]). + * valid. + * + * A state is valid if the 'valid' field is enabled and + * if it satisfies the enable_off_mode condition. */ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev, - struct cpuidle_state *curr) + struct cpuidle_state *curr) { struct cpuidle_state *next = NULL; - struct omap3_processor_cx *cx; + struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr); + u32 mpu_deepest_state = PWRDM_POWER_RET; + u32 core_deepest_state = PWRDM_POWER_RET; - cx = (struct omap3_processor_cx *)cpuidle_get_statedata(curr); + if (enable_off_mode) { + mpu_deepest_state = PWRDM_POWER_OFF; + /* + * Erratum i583: valable for ES rev < Es1.2 on 3630. + * CORE OFF mode is not supported in a stable form, restrict + * instead the CORE state to RET. + */ + if (!IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) + core_deepest_state = PWRDM_POWER_OFF; + } /* Check if current state is valid */ - if (cx->valid) { + if ((cx->valid) && + (cx->mpu_state >= mpu_deepest_state) && + (cx->core_state >= core_deepest_state)) { return curr; } else { - u8 idx = OMAP3_STATE_MAX; + int idx = OMAP3_NUM_STATES - 1; - /* - * Reach the current state starting at highest C-state - */ - for (; idx >= OMAP3_STATE_C1; idx--) { + /* Reach the current state starting at highest C-state */ + for (; idx >= 0; idx--) { if (&dev->states[idx] == curr) { next = &dev->states[idx]; break; } } - /* - * Should never hit this condition. - */ + /* Should never hit this condition */ WARN_ON(next == NULL); /* @@ -206,17 +192,17 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev, * Start search from the next (lower) state. */ idx--; - for (; idx >= OMAP3_STATE_C1; idx--) { - struct omap3_processor_cx *cx; - + for (; idx >= 0; idx--) { cx = cpuidle_get_statedata(&dev->states[idx]); - if (cx->valid) { + if ((cx->valid) && + (cx->mpu_state >= mpu_deepest_state) && + (cx->core_state >= core_deepest_state)) { next = &dev->states[idx]; break; } } /* - * C1 and C2 are always valid. + * C1 is always valid. * So, no need to check for 'next==NULL' outside this loop. */ } @@ -229,36 +215,22 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev, * @dev: cpuidle device * @state: The target state to be programmed * - * Used for C states with CPUIDLE_FLAG_CHECK_BM flag set. This - * function checks for any pending activity and then programs the - * device to the specified or a safer state. + * This function checks for any pending activity and then programs + * the device to the specified or a safer state. */ static int omap3_enter_idle_bm(struct cpuidle_device *dev, struct cpuidle_state *state) { - struct cpuidle_state *new_state = next_valid_state(dev, state); - u32 core_next_state, per_next_state = 0, per_saved_state = 0; - u32 cam_state; - struct omap3_processor_cx *cx; + struct cpuidle_state *new_state; + u32 core_next_state, per_next_state = 0, per_saved_state = 0, cam_state; + struct omap3_idle_statedata *cx; int ret; - if ((state->flags & CPUIDLE_FLAG_CHECK_BM) && omap3_idle_bm_check()) { - BUG_ON(!dev->safe_state); + if (!omap3_can_sleep()) { new_state = dev->safe_state; goto select_state; } - cx = cpuidle_get_statedata(state); - core_next_state = cx->core_state; - - /* - * FIXME: we currently manage device-specific idle states - * for PER and CORE in combination with CPU-specific - * idle states. This is wrong, and device-specific - * idle management needs to be separated out into - * its own code. - */ - /* * Prevent idle completely if CAM is active. * CAM does not have wakeup capability in OMAP3. @@ -270,9 +242,19 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev, } /* + * FIXME: we currently manage device-specific idle states + * for PER and CORE in combination with CPU-specific + * idle states. This is wrong, and device-specific + * idle management needs to be separated out into + * its own code. + */ + + /* * Prevent PER off if CORE is not in retention or off as this * would disable PER wakeups completely. */ + cx = cpuidle_get_statedata(state); + core_next_state = cx->core_state; per_next_state = per_saved_state = pwrdm_read_next_pwrst(per_pd); if ((per_next_state == PWRDM_POWER_OFF) && (core_next_state > PWRDM_POWER_RET)) @@ -282,6 +264,8 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev, if (per_next_state != per_saved_state) pwrdm_set_next_pwrst(per_pd, per_next_state); + new_state = next_valid_state(dev, state); + select_state: dev->last_state = new_state; ret = omap3_enter_idle(dev, new_state); @@ -295,31 +279,6 @@ select_state: DEFINE_PER_CPU(struct cpuidle_device, omap3_idle_dev); -/** - * omap3_cpuidle_update_states() - Update the cpuidle states - * @mpu_deepest_state: Enable states up to and including this for mpu domain - * @core_deepest_state: Enable states up to and including this for core domain - * - * This goes through the list of states available and enables and disables the - * validity of C states based on deepest state that can be achieved for the - * variable domain - */ -void omap3_cpuidle_update_states(u32 mpu_deepest_state, u32 core_deepest_state) -{ - int i; - - for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) { - struct omap3_processor_cx *cx = &omap3_power_states[i]; - - if ((cx->mpu_state >= mpu_deepest_state) && - (cx->core_state >= core_deepest_state)) { - cx->valid = 1; - } else { - cx->valid = 0; - } - } -} - void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params) { int i; @@ -327,212 +286,109 @@ void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params) if (!cpuidle_board_params) return; - for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) { - cpuidle_params_table[i].valid = - cpuidle_board_params[i].valid; - cpuidle_params_table[i].sleep_latency = - cpuidle_board_params[i].sleep_latency; - cpuidle_params_table[i].wake_latency = - cpuidle_board_params[i].wake_latency; - cpuidle_params_table[i].threshold = - cpuidle_board_params[i].threshold; + for (i = 0; i < OMAP3_NUM_STATES; i++) { + cpuidle_params_table[i].valid = cpuidle_board_params[i].valid; + cpuidle_params_table[i].exit_latency = + cpuidle_board_params[i].exit_latency; + cpuidle_params_table[i].target_residency = + cpuidle_board_params[i].target_residency; } return; } -/* omap3_init_power_states - Initialises the OMAP3 specific C states. - * - * Below is the desciption of each C state. - * C1 . MPU WFI + Core active - * C2 . MPU WFI + Core inactive - * C3 . MPU CSWR + Core inactive - * C4 . MPU OFF + Core inactive - * C5 . MPU CSWR + Core CSWR - * C6 . MPU OFF + Core CSWR - * C7 . MPU OFF + Core OFF - */ -void omap_init_power_states(void) -{ - /* C1 . MPU WFI + Core active */ - omap3_power_states[OMAP3_STATE_C1].valid = - cpuidle_params_table[OMAP3_STATE_C1].valid; - omap3_power_states[OMAP3_STATE_C1].type = OMAP3_STATE_C1; - omap3_power_states[OMAP3_STATE_C1].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C1].sleep_latency; - omap3_power_states[OMAP3_STATE_C1].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C1].wake_latency; - omap3_power_states[OMAP3_STATE_C1].threshold = - cpuidle_params_table[OMAP3_STATE_C1].threshold; - omap3_power_states[OMAP3_STATE_C1].mpu_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C1].core_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C1].flags = CPUIDLE_FLAG_TIME_VALID; - omap3_power_states[OMAP3_STATE_C1].desc = "MPU ON + CORE ON"; - - /* C2 . MPU WFI + Core inactive */ - omap3_power_states[OMAP3_STATE_C2].valid = - cpuidle_params_table[OMAP3_STATE_C2].valid; - omap3_power_states[OMAP3_STATE_C2].type = OMAP3_STATE_C2; - omap3_power_states[OMAP3_STATE_C2].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C2].sleep_latency; - omap3_power_states[OMAP3_STATE_C2].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C2].wake_latency; - omap3_power_states[OMAP3_STATE_C2].threshold = - cpuidle_params_table[OMAP3_STATE_C2].threshold; - omap3_power_states[OMAP3_STATE_C2].mpu_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C2].core_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C2].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C2].desc = "MPU ON + CORE ON"; - - /* C3 . MPU CSWR + Core inactive */ - omap3_power_states[OMAP3_STATE_C3].valid = - cpuidle_params_table[OMAP3_STATE_C3].valid; - omap3_power_states[OMAP3_STATE_C3].type = OMAP3_STATE_C3; - omap3_power_states[OMAP3_STATE_C3].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C3].sleep_latency; - omap3_power_states[OMAP3_STATE_C3].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C3].wake_latency; - omap3_power_states[OMAP3_STATE_C3].threshold = - cpuidle_params_table[OMAP3_STATE_C3].threshold; - omap3_power_states[OMAP3_STATE_C3].mpu_state = PWRDM_POWER_RET; - omap3_power_states[OMAP3_STATE_C3].core_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C3].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C3].desc = "MPU RET + CORE ON"; - - /* C4 . MPU OFF + Core inactive */ - omap3_power_states[OMAP3_STATE_C4].valid = - cpuidle_params_table[OMAP3_STATE_C4].valid; - omap3_power_states[OMAP3_STATE_C4].type = OMAP3_STATE_C4; - omap3_power_states[OMAP3_STATE_C4].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C4].sleep_latency; - omap3_power_states[OMAP3_STATE_C4].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C4].wake_latency; - omap3_power_states[OMAP3_STATE_C4].threshold = - cpuidle_params_table[OMAP3_STATE_C4].threshold; - omap3_power_states[OMAP3_STATE_C4].mpu_state = PWRDM_POWER_OFF; - omap3_power_states[OMAP3_STATE_C4].core_state = PWRDM_POWER_ON; - omap3_power_states[OMAP3_STATE_C4].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C4].desc = "MPU OFF + CORE ON"; - - /* C5 . MPU CSWR + Core CSWR*/ - omap3_power_states[OMAP3_STATE_C5].valid = - cpuidle_params_table[OMAP3_STATE_C5].valid; - omap3_power_states[OMAP3_STATE_C5].type = OMAP3_STATE_C5; - omap3_power_states[OMAP3_STATE_C5].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C5].sleep_latency; - omap3_power_states[OMAP3_STATE_C5].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C5].wake_latency; - omap3_power_states[OMAP3_STATE_C5].threshold = - cpuidle_params_table[OMAP3_STATE_C5].threshold; - omap3_power_states[OMAP3_STATE_C5].mpu_state = PWRDM_POWER_RET; - omap3_power_states[OMAP3_STATE_C5].core_state = PWRDM_POWER_RET; - omap3_power_states[OMAP3_STATE_C5].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C5].desc = "MPU RET + CORE RET"; - - /* C6 . MPU OFF + Core CSWR */ - omap3_power_states[OMAP3_STATE_C6].valid = - cpuidle_params_table[OMAP3_STATE_C6].valid; - omap3_power_states[OMAP3_STATE_C6].type = OMAP3_STATE_C6; - omap3_power_states[OMAP3_STATE_C6].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C6].sleep_latency; - omap3_power_states[OMAP3_STATE_C6].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C6].wake_latency; - omap3_power_states[OMAP3_STATE_C6].threshold = - cpuidle_params_table[OMAP3_STATE_C6].threshold; - omap3_power_states[OMAP3_STATE_C6].mpu_state = PWRDM_POWER_OFF; - omap3_power_states[OMAP3_STATE_C6].core_state = PWRDM_POWER_RET; - omap3_power_states[OMAP3_STATE_C6].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C6].desc = "MPU OFF + CORE RET"; - - /* C7 . MPU OFF + Core OFF */ - omap3_power_states[OMAP3_STATE_C7].valid = - cpuidle_params_table[OMAP3_STATE_C7].valid; - omap3_power_states[OMAP3_STATE_C7].type = OMAP3_STATE_C7; - omap3_power_states[OMAP3_STATE_C7].sleep_latency = - cpuidle_params_table[OMAP3_STATE_C7].sleep_latency; - omap3_power_states[OMAP3_STATE_C7].wakeup_latency = - cpuidle_params_table[OMAP3_STATE_C7].wake_latency; - omap3_power_states[OMAP3_STATE_C7].threshold = - cpuidle_params_table[OMAP3_STATE_C7].threshold; - omap3_power_states[OMAP3_STATE_C7].mpu_state = PWRDM_POWER_OFF; - omap3_power_states[OMAP3_STATE_C7].core_state = PWRDM_POWER_OFF; - omap3_power_states[OMAP3_STATE_C7].flags = CPUIDLE_FLAG_TIME_VALID | - CPUIDLE_FLAG_CHECK_BM; - omap3_power_states[OMAP3_STATE_C7].desc = "MPU OFF + CORE OFF"; - - /* - * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot - * enable OFF mode in a stable form for previous revisions. - * we disable C7 state as a result. - */ - if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) { - omap3_power_states[OMAP3_STATE_C7].valid = 0; - cpuidle_params_table[OMAP3_STATE_C7].valid = 0; - pr_warn("%s: core off state C7 disabled due to i583\n", - __func__); - } -} - struct cpuidle_driver omap3_idle_driver = { .name = "omap3_idle", .owner = THIS_MODULE, }; +/* Helper to fill the C-state common data and register the driver_data */ +static inline struct omap3_idle_statedata *_fill_cstate( + struct cpuidle_device *dev, + int idx, const char *descr) +{ + struct omap3_idle_statedata *cx = &omap3_idle_data[idx]; + struct cpuidle_state *state = &dev->states[idx]; + + state->exit_latency = cpuidle_params_table[idx].exit_latency; + state->target_residency = cpuidle_params_table[idx].target_residency; + state->flags = CPUIDLE_FLAG_TIME_VALID; + state->enter = omap3_enter_idle_bm; + cx->valid = cpuidle_params_table[idx].valid; + sprintf(state->name, "C%d", idx + 1); + strncpy(state->desc, descr, CPUIDLE_DESC_LEN); + cpuidle_set_statedata(state, cx); + + return cx; +} + /** * omap3_idle_init - Init routine for OMAP3 idle * - * Registers the OMAP3 specific cpuidle driver with the cpuidle + * Registers the OMAP3 specific cpuidle driver to the cpuidle * framework with the valid set of states. */ int __init omap3_idle_init(void) { - int i, count = 0; - struct omap3_processor_cx *cx; - struct cpuidle_state *state; struct cpuidle_device *dev; + struct omap3_idle_statedata *cx; mpu_pd = pwrdm_lookup("mpu_pwrdm"); core_pd = pwrdm_lookup("core_pwrdm"); per_pd = pwrdm_lookup("per_pwrdm"); cam_pd = pwrdm_lookup("cam_pwrdm"); - omap_init_power_states(); cpuidle_register_driver(&omap3_idle_driver); - dev = &per_cpu(omap3_idle_dev, smp_processor_id()); - for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) { - cx = &omap3_power_states[i]; - state = &dev->states[count]; - - if (!cx->valid) - continue; - cpuidle_set_statedata(state, cx); - state->exit_latency = cx->sleep_latency + cx->wakeup_latency; - state->target_residency = cx->threshold; - state->flags = cx->flags; - state->enter = (state->flags & CPUIDLE_FLAG_CHECK_BM) ? - omap3_enter_idle_bm : omap3_enter_idle; - if (cx->type == OMAP3_STATE_C1) - dev->safe_state = state; - sprintf(state->name, "C%d", count+1); - strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN); - count++; - } + /* C1 . MPU WFI + Core active */ + cx = _fill_cstate(dev, 0, "MPU ON + CORE ON"); + (&dev->states[0])->enter = omap3_enter_idle; + dev->safe_state = &dev->states[0]; + cx->valid = 1; /* C1 is always valid */ + cx->mpu_state = PWRDM_POWER_ON; + cx->core_state = PWRDM_POWER_ON; - if (!count) - return -EINVAL; - dev->state_count = count; + /* C2 . MPU WFI + Core inactive */ + cx = _fill_cstate(dev, 1, "MPU ON + CORE ON"); + cx->mpu_state = PWRDM_POWER_ON; + cx->core_state = PWRDM_POWER_ON; + + /* C3 . MPU CSWR + Core inactive */ + cx = _fill_cstate(dev, 2, "MPU RET + CORE ON"); + cx->mpu_state = PWRDM_POWER_RET; + cx->core_state = PWRDM_POWER_ON; - if (enable_off_mode) - omap3_cpuidle_update_states(PWRDM_POWER_OFF, PWRDM_POWER_OFF); - else - omap3_cpuidle_update_states(PWRDM_POWER_RET, PWRDM_POWER_RET); + /* C4 . MPU OFF + Core inactive */ + cx = _fill_cstate(dev, 3, "MPU OFF + CORE ON"); + cx->mpu_state = PWRDM_POWER_OFF; + cx->core_state = PWRDM_POWER_ON; + + /* C5 . MPU RET + Core RET */ + cx = _fill_cstate(dev, 4, "MPU RET + CORE RET"); + cx->mpu_state = PWRDM_POWER_RET; + cx->core_state = PWRDM_POWER_RET; + + /* C6 . MPU OFF + Core RET */ + cx = _fill_cstate(dev, 5, "MPU OFF + CORE RET"); + cx->mpu_state = PWRDM_POWER_OFF; + cx->core_state = PWRDM_POWER_RET; + + /* C7 . MPU OFF + Core OFF */ + cx = _fill_cstate(dev, 6, "MPU OFF + CORE OFF"); + /* + * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot + * enable OFF mode in a stable form for previous revisions. + * We disable C7 state as a result. + */ + if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) { + cx->valid = 0; + pr_warn("%s: core off state C7 disabled due to i583\n", + __func__); + } + cx->mpu_state = PWRDM_POWER_OFF; + cx->core_state = PWRDM_POWER_OFF; + dev->state_count = OMAP3_NUM_STATES; if (cpuidle_register_device(dev)) { printk(KERN_ERR "%s: CPUidle register device failed\n", __func__); diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h index 797bfd12b64..45bcfce7735 100644 --- a/arch/arm/mach-omap2/pm.h +++ b/arch/arm/mach-omap2/pm.h @@ -36,11 +36,16 @@ static inline int omap4_opp_init(void) } #endif +/* + * cpuidle mach specific parameters + * + * The board code can override the default C-states definition using + * omap3_pm_init_cpuidle + */ struct cpuidle_params { - u8 valid; - u32 sleep_latency; - u32 wake_latency; - u32 threshold; + u32 exit_latency; /* exit_latency = sleep + wake-up latencies */ + u32 target_residency; + u8 valid; /* validates the C-state */ }; #if defined(CONFIG_PM) && defined(CONFIG_CPU_IDLE) @@ -73,10 +78,6 @@ extern u32 sleep_while_idle; #define sleep_while_idle 0 #endif -#if defined(CONFIG_CPU_IDLE) -extern void omap3_cpuidle_update_states(u32, u32); -#endif - #if defined(CONFIG_PM_DEBUG) && defined(CONFIG_DEBUG_FS) extern void pm_dbg_update_time(struct powerdomain *pwrdm, int prev); extern int pm_dbg_regset_save(int reg_set); diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 0c5e3a46a3a..c155c9d1c82 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -779,18 +779,6 @@ void omap3_pm_off_mode_enable(int enable) else state = PWRDM_POWER_RET; -#ifdef CONFIG_CPU_IDLE - /* - * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot - * enable OFF mode in a stable form for previous revisions, restrict - * instead to RET - */ - if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) - omap3_cpuidle_update_states(state, PWRDM_POWER_RET); - else - omap3_cpuidle_update_states(state, state); -#endif - list_for_each_entry(pwrst, &pwrst_list, node) { if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583) && pwrst->pwrdm == core_pwrdm && @@ -895,8 +883,6 @@ static int __init omap3_pm_init(void) pm_errata_configure(); - printk(KERN_ERR "Power Management for TI OMAP3.\n"); - /* XXX prcm_setup_regs needs to be before enabling hw * supervised mode for powerdomains */ prcm_setup_regs(); diff --git a/arch/mips/ar7/gpio.c b/arch/mips/ar7/gpio.c index 425dfa5d6e1..bb571bcdb8f 100644 --- a/arch/mips/ar7/gpio.c +++ b/arch/mips/ar7/gpio.c @@ -325,9 +325,7 @@ int __init ar7_gpio_init(void) size = 0x1f; } - gpch->regs = ioremap_nocache(AR7_REGS_GPIO, - AR7_REGS_GPIO + 0x10); - + gpch->regs = ioremap_nocache(AR7_REGS_GPIO, size); if (!gpch->regs) { printk(KERN_ERR "%s: failed to ioremap regs\n", gpch->chip.label); diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index 655f849bd08..7aa37ddfca4 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -5,7 +5,9 @@ #include <asm/cache.h> #include <asm-generic/dma-coherent.h> +#ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */ #include <dma-coherence.h> +#endif extern struct dma_map_ops *mips_dma_map_ops; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 71350f7f2d8..e9b3af27d84 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -374,7 +374,8 @@ void __noreturn die(const char *str, struct pt_regs *regs) unsigned long dvpret = dvpe(); #endif /* CONFIG_MIPS_MT_SMTC */ - notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP) + sig = 0; console_verbose(); spin_lock_irq(&die_lock); @@ -383,9 +384,6 @@ void __noreturn die(const char *str, struct pt_regs *regs) mips_mt_regdump(dvpret); #endif /* CONFIG_MIPS_MT_SMTC */ - if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP) - sig = 0; - printk("%s[#%d]:\n", str, ++die_counter); show_registers(regs); add_taint(TAINT_DIE); diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c index 37de05d595e..6c47dfeb7be 100644 --- a/arch/mips/rb532/gpio.c +++ b/arch/mips/rb532/gpio.c @@ -185,7 +185,7 @@ int __init rb532_gpio_init(void) struct resource *r; r = rb532_gpio_reg0_res; - rb532_gpio_chip->regbase = ioremap_nocache(r->start, r->end - r->start); + rb532_gpio_chip->regbase = ioremap_nocache(r->start, resource_size(r)); if (!rb532_gpio_chip->regbase) { printk(KERN_ERR "rb532: cannot remap GPIO register 0\n"); diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 188272934cf..104faa8aa23 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -318,17 +318,20 @@ static const struct platform_suspend_ops mpc83xx_suspend_ops = { .end = mpc83xx_suspend_end, }; +static struct of_device_id pmc_match[]; static int pmc_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct device_node *np = ofdev->dev.of_node; struct resource res; struct pmc_type *type; int ret = 0; - if (!ofdev->dev.of_match) + match = of_match_device(pmc_match, &ofdev->dev); + if (!match) return -EINVAL; - type = ofdev->dev.of_match->data; + type = match->data; if (!of_device_is_available(np)) return -ENODEV; diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index d5679dc1e20..01cd2f08951 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -304,8 +304,10 @@ static int __devinit fsl_msi_setup_hwirq(struct fsl_msi *msi, return 0; } +static const struct of_device_id fsl_of_msi_ids[]; static int __devinit fsl_of_msi_probe(struct platform_device *dev) { + const struct of_device_id *match; struct fsl_msi *msi; struct resource res; int err, i, j, irq_index, count; @@ -316,9 +318,10 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) u32 offset; static const u32 all_avail[] = { 0, NR_MSI_IRQS }; - if (!dev->dev.of_match) + match = of_match_device(fsl_of_msi_ids, &dev->dev); + if (!match) return -EINVAL; - features = dev->dev.of_match->data; + features = match->data; printk(KERN_DEBUG "Setting up Freescale MSI support\n"); diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index f679c57644d..1e34f29e58b 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -165,7 +165,7 @@ static int __devinit apc_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata apc_match[] = { +static struct of_device_id apc_match[] = { { .name = APC_OBPNAME, }, diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c index 948068a083f..d1840dbdaa2 100644 --- a/arch/sparc/kernel/pci_sabre.c +++ b/arch/sparc/kernel/pci_sabre.c @@ -452,8 +452,10 @@ static void __devinit sabre_pbm_init(struct pci_pbm_info *pbm, sabre_scan_bus(pbm, &op->dev); } +static const struct of_device_id sabre_match[]; static int __devinit sabre_probe(struct platform_device *op) { + const struct of_device_id *match; const struct linux_prom64_registers *pr_regs; struct device_node *dp = op->dev.of_node; struct pci_pbm_info *pbm; @@ -463,7 +465,8 @@ static int __devinit sabre_probe(struct platform_device *op) const u32 *vdma; u64 clear_irq; - hummingbird_p = op->dev.of_match && (op->dev.of_match->data != NULL); + match = of_match_device(sabre_match, &op->dev); + hummingbird_p = match && (match->data != NULL); if (!hummingbird_p) { struct device_node *cpu_dp; diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index fecfcb2063c..283fbc329a4 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c @@ -1458,11 +1458,15 @@ out_err: return err; } +static const struct of_device_id schizo_match[]; static int __devinit schizo_probe(struct platform_device *op) { - if (!op->dev.of_match) + const struct of_device_id *match; + + match = of_match_device(schizo_match, &op->dev); + if (!match) return -EINVAL; - return __schizo_init(op, (unsigned long) op->dev.of_match->data); + return __schizo_init(op, (unsigned long)match->data); } /* The ordering of this table is very important. Some Tomatillo diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index 93d7b4465f8..6a585d39358 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -69,7 +69,7 @@ static int __devinit pmc_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata pmc_match[] = { +static struct of_device_id pmc_match[] = { { .name = PMC_OBPNAME, }, diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb7085..850a1360c0d 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -53,6 +53,7 @@ cpumask_t smp_commenced_mask = CPU_MASK_NONE; void __cpuinit smp_store_cpu_info(int id) { int cpu_node; + int mid; cpu_data(id).udelay_val = loops_per_jiffy; @@ -60,10 +61,13 @@ void __cpuinit smp_store_cpu_info(int id) cpu_data(id).clock_tick = prom_getintdefault(cpu_node, "clock-frequency", 0); cpu_data(id).prom_node = cpu_node; - cpu_data(id).mid = cpu_get_hwmid(cpu_node); + mid = cpu_get_hwmid(cpu_node); - if (cpu_data(id).mid < 0) - panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); + if (mid < 0) { + printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08d", id, cpu_node); + mid = 0; + } + cpu_data(id).mid = mid; } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 4e236391b63..96046a4024c 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -168,7 +168,7 @@ static int __devinit clock_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata clock_match[] = { +static struct of_device_id clock_match[] = { { .name = "eeprom", }, diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S index 3632cb34e91..0084c3361e1 100644 --- a/arch/sparc/lib/checksum_32.S +++ b/arch/sparc/lib/checksum_32.S @@ -289,10 +289,16 @@ cc_end_cruft: /* Also, handle the alignment code out of band. */ cc_dword_align: - cmp %g1, 6 - bl,a ccte + cmp %g1, 16 + bge 1f + srl %g1, 1, %o3 +2: cmp %o3, 0 + be,a ccte andcc %g1, 0xf, %o3 - andcc %o0, 0x1, %g0 + andcc %o3, %o0, %g0 ! Check %o0 only (%o1 has the same last 2 bits) + be,a 2b + srl %o3, 1, %o3 +1: andcc %o0, 0x1, %g0 bne ccslow andcc %o0, 0x2, %g0 be 1f diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 6ea77979531..42827cafa6a 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -5,6 +5,7 @@ #include <stdio.h> #include <stdlib.h> +#include <unistd.h> #include <errno.h> #include <signal.h> #include <string.h> @@ -75,6 +76,26 @@ void setup_hostinfo(char *buf, int len) host.release, host.version, host.machine); } +/* + * We cannot use glibc's abort(). It makes use of tgkill() which + * has no effect within UML's kernel threads. + * After that glibc would execute an invalid instruction to kill + * the calling process and UML crashes with SIGSEGV. + */ +static inline void __attribute__ ((noreturn)) uml_abort(void) +{ + sigset_t sig; + + fflush(NULL); + + if (!sigemptyset(&sig) && !sigaddset(&sig, SIGABRT)) + sigprocmask(SIG_UNBLOCK, &sig, 0); + + for (;;) + if (kill(getpid(), SIGABRT) < 0) + exit(127); +} + void os_dump_core(void) { int pid; @@ -116,5 +137,5 @@ void os_dump_core(void) while ((pid = waitpid(-1, NULL, WNOHANG | __WALL)) > 0) os_kill_ptraced_process(pid, 0); - abort(); + uml_abort(); } diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index d87988bacf3..34595d5e103 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -78,6 +78,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723d1f3..d56187c6b83 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 3e094af443c..130f1eeee5f 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -94,6 +94,8 @@ /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +#define UV_LB_SUBNODEID 0x10 + /* * number of entries in the destination side payload queue */ @@ -124,7 +126,7 @@ * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nodeid' field of the header corresponds to the + * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ struct bau_target_uvhubmask { @@ -176,7 +178,7 @@ struct bau_msg_payload { struct bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid of the */ + unsigned int base_dest_nasid:15; /* nasid of the */ /* bits 20:6 */ /* first bit in uvhub map */ unsigned int command:8; /* message type */ /* bits 28:21 */ @@ -378,6 +380,10 @@ struct ptc_stats { unsigned long d_rcanceled; /* number of messages canceled by resets */ }; +struct hub_and_pnode { + short uvhub; + short pnode; +}; /* * one per-cpu; to locate the software tables */ @@ -399,10 +405,12 @@ struct bau_control { int baudisabled; int set_bau_off; short cpu; + short osnode; short uvhub_cpu; short uvhub; short cpus_in_socket; short cpus_in_uvhub; + short partition_base_pnode; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -422,15 +430,16 @@ struct bau_control { int congested_period; cycles_t period_time; long period_requests; + struct hub_and_pnode *target_hub_and_pnode; }; static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); } -static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) +static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp) { - __set_bit(uvhub, &dstp->bits[0]); + __set_bit(pnode, &dstp->bits[0]); } static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, int nbits) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a501741c233..4298002d0c8 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -398,6 +398,8 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; + spinlock_t nmi_lock; + unsigned long nmi_count; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 20cafeac745..f5bb64a823d 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x00778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5 : 64; /* RW, W1CS */ + } s; +}; #endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 643ebf2e2ad..d3d859035af 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,6 +68,17 @@ struct x86_init_oem { }; /** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. + */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + +/** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call * @pagetable_setup_done: platform specific post paging_init() call @@ -123,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 33b10a0fc09..7acd2d2ac96 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -37,6 +37,13 @@ #include <asm/smp.h> #include <asm/x86_init.h> #include <asm/emergency-restart.h> +#include <asm/nmi.h> + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { + unsigned long real_uv_nmi; + int bid; + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. */ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -720,8 +756,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -747,6 +784,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb9eb29a52d..6f9d1f6063e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,7 +613,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif /* As a rule processors have APIC timer running in deep C states */ - if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) + if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); /* @@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev); */ const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf), + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); EXPORT_SYMBOL_GPL(amd_erratum_400); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 167f97b5596..bb0adad3514 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -509,6 +509,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97..0f034460260 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -446,18 +446,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c969fd9d156..f1a6244d7d9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - preempt_disable(); + local_irq_save(flags); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { @@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + local_irq_restore(flags); } static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128..75ef4b18e9b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 286d289b039..37b8b0fe832 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } +void __init native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); +} + struct map_range { unsigned long start; unsigned long end; @@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ if (!after_bootmem && pgt_buf_end > pgt_buf_start) - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 7cb6424317f..c58e0ea39ef 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int tcpu; - int uvhub; int locals = 0; int remotes = 0; int hubs = 0; + int tcpu; + int tpnode; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; struct bau_control *tbcp; + struct hub_and_pnode *hpp; /* kernel was booted 'nobau' */ if (nobau) @@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) + /* + * The distribution vector is a bit map of pnodes, relative + * to the partition base pnode (and the partition base nasid + * in the header). + * Translate cpu to pnode and hub using an array stored + * in local memory. + */ + hpp = &bcp->socket_master->target_hub_and_pnode[tcpu]; + tpnode = hpp->pnode - bcp->partition_base_pnode; + bau_uvhub_set(tpnode, &bau_desc->distribution); + if (hpp->uvhub == bcp->uvhub) locals++; else remotes++; @@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) * an interrupt, but causes an error message to be returned to * the sender. */ -static void uv_enable_timeouts(void) +static void __init uv_enable_timeouts(void) { int uvhub; int nuvhubs; @@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void) } /* - * initialize the sending side's sending buffers + * Initialize the sending side's sending buffers. */ static void -uv_activation_descriptor_init(int node, int pnode) +uv_activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; @@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; + /* the 14-bit pnode */ uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, (n << UV_DESC_BASE_PNODE_SHIFT | m)); - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each * cpu even though we only use the first one; one descriptor can * describe a broadcast to 256 uv hubs. */ @@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode) memset(bd2, 0, sizeof(struct bau_desc)); bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the nasid of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. + * The base_dest_nasid set in the message header is the nasid + * of the first uvhub in the partition. The bit map will + * indicate destination pnode numbers relative to that base. + * They may not be consecutive if nasid striding is being used. */ - bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); - bd2->header.dest_subnodeid = 0x10; /* the LB */ + bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); + bd2->header.dest_subnodeid = UV_LB_SUBNODEID; bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.int_both = 1; /* @@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode) /* * Initialization of each UV hub's structures */ -static void __init uv_init_uvhub(int uvhub, int vector) +static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode) { int node; int pnode; @@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector) node = uvhub_to_first_node(uvhub); pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); + uv_activation_descriptor_init(node, pnode, base_pnode); uv_payload_queue_init(node, pnode); /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS + * The below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS. */ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, @@ -1491,10 +1500,11 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static int __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode) { int i; int cpu; + int tcpu; int pnode; int uvhub; int have_hmaster; @@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; + if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) { + printk(KERN_EMERG + "cpu %d pnode %d-%d beyond %d; BAU disabled\n", + cpu, pnode, base_part_pnode, + UV_DISTRIBUTION_SIZE); + return 1; + } + bcp->osnode = cpu_to_node(cpu); + bcp->partition_base_pnode = uv_partition_base_pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; @@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs) bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); + socket = bcp->osnode & 1; bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; @@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs) nextsocket: socket++; socket_mask = (socket_mask >> 1); + /* each socket gets a local array of pnodes/hubs */ + bcp = smaster; + bcp->target_hub_and_pnode = kmalloc_node( + sizeof(struct hub_and_pnode) * + num_possible_cpus(), GFP_KERNEL, bcp->osnode); + memset(bcp->target_hub_and_pnode, 0, + sizeof(struct hub_and_pnode) * + num_possible_cpus()); + for_each_present_cpu(tcpu) { + bcp->target_hub_and_pnode[tcpu].pnode = + uv_cpu_hub_info(tcpu)->pnode; + bcp->target_hub_and_pnode[tcpu].uvhub = + uv_cpu_hub_info(tcpu)->numa_blade_id; + } } } kfree(uvhub_descs); @@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - if (uv_init_per_cpu(nuvhubs)) { - nobau = 1; - return 0; - } - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (uv_blade_nr_possible_cpus(uvhub) && (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) uv_partition_base_pnode = uv_blade_to_pnode(uvhub); + } + + if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) { + nobau = 1; + return 0; + } vector = UV_BAU_MESSAGE; for_each_possible_blade(uvhub) if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); + uv_init_uvhub(uvhub, vector, uv_partition_base_pnode); uv_enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 55c965b38c2..0684f3c74d5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) +{ + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } +} + static void xen_post_allocator_init(void); static __init void xen_pagetable_setup_done(pgd_t *base) @@ -1463,119 +1477,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } -#ifdef CONFIG_X86_64 -static __initdata u64 __last_pgt_set_rw = 0; -static __initdata u64 __pgt_buf_start = 0; -static __initdata u64 __pgt_buf_end = 0; -static __initdata u64 __pgt_buf_top = 0; -/* - * As a consequence of the commit: - * - * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e - * Author: Yinghai Lu <yinghai@kernel.org> - * Date: Fri Dec 17 16:58:28 2010 -0800 - * - * x86-64, mm: Put early page table high - * - * at some point init_memory_mapping is going to reach the pagetable pages - * area and map those pages too (mapping them as normal memory that falls - * in the range of addresses passed to init_memory_mapping as argument). - * Some of those pages are already pagetable pages (they are in the range - * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and - * everything is fine. - * Some of these pages are not pagetable pages yet (they fall in the range - * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they - * are going to be mapped RW. When these pages become pagetable pages and - * are hooked into the pagetable, xen will find that the guest has already - * a RW mapping of them somewhere and fail the operation. - * The reason Xen requires pagetables to be RO is that the hypervisor needs - * to verify that the pagetables are valid before using them. The validation - * operations are called "pinning". - * - * In order to fix the issue we mark all the pages in the entire range - * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation - * is completed only the range pgt_buf_start-pgt_buf_end is reserved by - * init_memory_mapping. Hence the kernel is going to crash as soon as one - * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those - * ranges are RO). - * - * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ - * the init_memory_mapping has completed (in a perfect world we would - * call this function from init_memory_mapping, but lets ignore that). - * - * Because we are called _after_ init_memory_mapping the pgt_buf_[start, - * end,top] have all changed to new values (b/c init_memory_mapping - * is called and setting up another new page-table). Hence, the first time - * we enter this function, we save away the pgt_buf_start value and update - * the pgt_buf_[end,top]. - * - * When we detect that the "old" pgt_buf_start through pgt_buf_end - * PFNs have been reserved (so memblock_x86_reserve_range has been called), - * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. - * - * And then we update those "old" pgt_buf_[end|top] with the new ones - * so that we can redo this on the next pagetable. - */ -static __init void mark_rw_past_pgt(void) { - - if (pgt_buf_end > pgt_buf_start) { - u64 addr, size; - - /* Save it away. */ - if (!__pgt_buf_start) { - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - /* If we get the range that starts at __pgt_buf_end that means - * the range is reserved, and that in 'init_memory_mapping' - * the 'memblock_x86_reserve_range' has been called with the - * outdated __pgt_buf_start, __pgt_buf_end (the "new" - * pgt_buf_[start|end|top] refer now to a new pagetable. - * Note: we are called _after_ the pgt_buf_[..] have been - * updated.*/ - - addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), - &size, PAGE_SIZE); - - /* Still not reserved, meaning 'memblock_x86_reserve_range' - * hasn't been called yet. Update the _end and _top.*/ - if (addr == PFN_PHYS(__pgt_buf_start)) { - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - - /* OK, the area is reserved, meaning it is time for us to - * set RW for the old end->top PFNs. */ - - /* ..unless we had already done this. */ - if (__pgt_buf_end == __last_pgt_set_rw) - return; - - addr = PFN_PHYS(__pgt_buf_end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", - PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); - - while (addr < PFN_PHYS(__pgt_buf_top)) { - make_lowmem_page_readwrite(__va(addr)); - addr += PAGE_SIZE; - } - /* And update everything so that we are ready for the next - * pagetable (the one created for regions past 4GB) */ - __last_pgt_set_rw = __pgt_buf_end; - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - } - return; -} -#else -static __init void mark_rw_past_pgt(void) { } -#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1602,14 +1503,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) unsigned long pfn = pte_pfn(pte); /* - * A bit of optimization. We do not need to call the workaround - * when xen_set_pte_init is called with a PTE with 0 as PFN. - * That is b/c the pagetable at that point are just being populated - * with empty values and we can save some cycles by not calling - * the 'memblock' code.*/ - if (pfn) - mark_rw_past_pgt(); - /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an * early_ioremap fixmap slot as a freshly allocated page, make sure @@ -2118,8 +2011,6 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { - mark_rw_past_pgt(); - #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif @@ -2228,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; |