diff options
Diffstat (limited to 'arch/x86/kernel')
55 files changed, 825 insertions, 513 deletions
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a3ddad18aaa..fa2161d5003 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c25210e6ac8..22d7d050905 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -29,9 +29,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EXIT_LOOP_COUNT 10000000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -185,7 +182,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = to_pages(address, size); + unsigned pages = iommu_num_pages(address, size); address &= PAGE_MASK; @@ -557,8 +554,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = to_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -667,7 +664,7 @@ static int get_device_resources(struct device *dev, _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ - if (_bdf >= amd_iommu_last_bdf) { + if (_bdf > amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; *bdf = 0xffff; @@ -767,7 +764,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned int pages; int i; - pages = to_pages(paddr, size); + pages = iommu_num_pages(paddr, size); paddr &= PAGE_MASK; address = dma_ops_alloc_addresses(dev, dma_dom, pages); @@ -802,7 +799,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = to_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; @@ -1085,7 +1082,7 @@ void prealloc_protection_domains(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { devid = (dev->bus->number << 8) | dev->devfn; - if (devid >= amd_iommu_last_bdf) + if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; if (domain_for_device(devid)) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c9d8ff2eb13..d9a9da597e7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -732,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) set_device_exclusion_range(m->devid, m); break; case ACPI_IVMD_TYPE_ALL: - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) set_device_exclusion_range(i, m); break; case ACPI_IVMD_TYPE_RANGE: @@ -934,7 +934,7 @@ int __init amd_iommu_init(void) /* * let all alias entries point to itself */ - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; /* diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d6c89835837..039a8d4aaf6 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1720,15 +1720,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static int __init apic_set_verbosity(char *str) +static int __init apic_set_verbosity(char *arg) { - if (strcmp("debug", str) == 0) + if (!arg) + return -EINVAL; + + if (strcmp(arg, "debug") == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) + else if (strcmp(arg, "verbose") == 0) apic_verbosity = APIC_VERBOSE; - return 1; + + return 0; } -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9b441331e..9ee24e6bc4b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,7 +219,6 @@ #include <linux/time.h> #include <linux/sched.h> #include <linux/pm.h> -#include <linux/pm_legacy.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c9b58a806e8..c8e315f1aa8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -50,6 +50,8 @@ static double __initdata y = 3145727.0; */ static void __init check_fpu(void) { + s32 fdiv_bug; + if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); @@ -74,8 +76,10 @@ static void __init check_fpu(void) "fistpl %0\n\t" "fwait\n\t" "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) + : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); + + boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk("Hmm, FPU with FDIV bug.\n"); } diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index cb7a5715596..efae3b22a0f 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -235,9 +235,9 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + tristate "VIA C7 Enhanced PowerSaver" select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL + depends on X86_32 help This adds the CPUFreq driver for VIA C7 processors. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9f..dd097b83583 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) cpumask_t saved_mask = current->cpus_allowed; unsigned int i; - for_each_cpu_mask(i, cmd->mask) { + for_each_cpu_mask_nr(i, cmd->mask) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 94619c22f56..e4a4bf870e9 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -44,7 +44,7 @@ struct s_elan_multiplier { * It is important that the frequencies * are listed in ascending order here! */ -struct s_elan_multiplier elan_multiplier[] = { +static struct s_elan_multiplier elan_multiplier[] = { {1000, 0x02, 0x18}, {2000, 0x02, 0x10}, {4000, 0x02, 0x08}, diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5d..f1685fb91fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask(i, policy->cpus) + for_each_cpu_mask_nr(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 206791eb46e..4e7271999a7 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid) return 800 + (fid * 100); } - /* Return a frequency in KHz, given an input fid */ static u32 find_khz_freq_from_fid(u32 fid) { @@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p return data[pstate].frequency; } - /* Return the vco fid for an input fid * * Each "low" fid has corresponding "high" fid, and you can get to "low" fids @@ -166,7 +164,6 @@ static void fidvid_msr_init(void) wrmsr(MSR_FIDVID_CTL, lo, hi); } - /* write the new fid value along with the other control fields to the msr */ static int write_new_fid(struct powernow_k8_data *data, u32 fid) { @@ -740,44 +737,63 @@ static int find_psb_table(struct powernow_k8_data *data) #ifdef CONFIG_X86_POWERNOW_K8_ACPI static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) + if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) return; - data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; + data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; +} + + +static struct acpi_processor_performance *acpi_perf_data; +static int preregister_valid; + +static int powernow_k8_cpu_preinit_acpi(void) +{ + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) + return -ENODEV; + + if (acpi_processor_preregister_performance(acpi_perf_data)) + return -ENODEV; + else + preregister_valid = 1; + return 0; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val; + int cpu = 0; - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { + if (data->acpi_data->state_count <= 1) { dprintk("No ACPI P-States\n"); goto err_out; } - if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data.control_register.space_id, - data->acpi_data.status_register.space_id); + data->acpi_data->control_register.space_id, + data->acpi_data->status_register.space_id); goto err_out; } /* fill in data->powernow_table */ powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); + * (data->acpi_data->state_count + 1)), GFP_KERNEL); if (!powernow_table) { dprintk("powernow_table memory alloc failure\n"); goto err_out; @@ -790,12 +806,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if (ret_val) goto err_out_mem; - powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; + powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data->state_count].index = 0; data->powernow_table = powernow_table; /* fill in data */ - data->numps = data->acpi_data.state_count; + data->numps = data->acpi_data->state_count; if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -803,16 +819,31 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); + /* determine affinity, from ACPI if available */ + if (preregister_valid) { + if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || + (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) + data->starting_core_affinity = data->acpi_data->shared_cpu_map; + else + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + } else { + /* best guess from family if not */ + if (cpu_family == CPU_HW_PSTATE) + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + else + data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu); + } + return 0; err_out_mem: kfree(powernow_table); err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + acpi_processor_unregister_performance(data->acpi_data, data->cpu); /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ - data->acpi_data.state_count = 0; + data->acpi_data->state_count = 0; return -ENODEV; } @@ -824,10 +855,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 index; - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; + index = data->acpi_data->states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); @@ -843,7 +874,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf powernow_table[i].index = index; - powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; + powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; } return 0; } @@ -852,16 +883,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf { int i; int cntlofreq = 0; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 fid; u32 vid; if (data->exttype) { - fid = data->acpi_data.states[i].status & EXT_FID_MASK; - vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; + fid = data->acpi_data->states[i].status & EXT_FID_MASK; + vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; } else { - fid = data->acpi_data.states[i].control & FID_MASK; - vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + fid = data->acpi_data->states[i].control & FID_MASK; + vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -902,10 +933,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf cntlofreq = i; } - if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -915,11 +946,12 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + if (data->acpi_data->state_count) + acpi_processor_unregister_performance(data->acpi_data, data->cpu); } #else +static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } @@ -966,7 +998,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -974,7 +1006,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -997,7 +1029,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1005,7 +1037,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1104,7 +1136,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol) static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; - cpumask_t oldmask; + cpumask_t oldmask = CPU_MASK_ALL; int rc; if (!cpu_online(pol->cpu)) @@ -1177,10 +1209,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* run on any CPU again */ set_cpus_allowed_ptr(current, &oldmask); - if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); - else - pol->cpus = per_cpu(cpu_core_map, pol->cpu); + pol->cpus = data->starting_core_affinity; data->available_cores = &(pol->cpus); /* Take a crude guess here. @@ -1303,6 +1332,7 @@ static int __cpuinit powernowk8_init(void) } if (supported_cpus == num_online_cpus()) { + powernow_k8_cpu_preinit_acpi(); printk(KERN_INFO PFX "Found %d %s " "processors (%d cpu cores) (" VERSION ")\n", num_online_nodes(), @@ -1319,6 +1349,10 @@ static void __exit powernowk8_exit(void) dprintk("exit\n"); cpufreq_unregister_driver(&cpufreq_amd64_driver); + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI + free_percpu(acpi_perf_data); +#endif } MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index ab48cfed4d9..a62612cd4be 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -33,12 +33,13 @@ struct powernow_k8_data { #ifdef CONFIG_X86_POWERNOW_K8_ACPI /* the acpi table needs to be kept. it's only available if ACPI was * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; + struct acpi_processor_performance *acpi_data; #endif /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ cpumask_t *available_cores; + cpumask_t starting_core_affinity; }; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67..15e13c01cc3 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -28,7 +28,8 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@lists.linux.org.uk" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) +#define dprintk(msg...) \ + cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) #define INTEL_MSR_RANGE (0xffff) @@ -66,11 +67,12 @@ struct cpu_model struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ }; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); /* Operating points for current CPU */ -static struct cpu_model *centrino_model[NR_CPUS]; -static const struct cpu_id *centrino_cpu[NR_CPUS]; +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); static struct cpufreq_driver centrino_driver; @@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) return -ENOENT; } - centrino_model[policy->cpu] = model; + per_cpu(centrino_model, policy->cpu) = model; dprintk("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); @@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) } #else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) { if ((c->x86 == x->x86) && (c->x86_model == x->x86_model) && @@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) * for centrino, as some DSDTs are buggy. * Ideally, this can be done using the acpi_data structure. */ - if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { msr = (msr >> 8) & 0xff; return msr * 100000; } - if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) return 0; msr &= 0xffff; - for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == centrino_model[cpu]->op_points[i].index) - return centrino_model[cpu]->op_points[i].frequency; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; } if (failsafe) - return centrino_model[cpu]->op_points[i-1].frequency; + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; else return 0; } @@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) int i; /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) @@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) break; if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { + if (!per_cpu(centrino_cpu, policy->cpu)) { dprintk("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); @@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; } } freq = get_cur_freq(policy->cpu); - - policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ policy->cur = freq; dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); if (ret) return (ret); - cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); return 0; } @@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - if (!centrino_model[cpu]) + if (!per_cpu(centrino_model, cpu)) return -ENODEV; cpufreq_frequency_table_put_attr(cpu); - centrino_model[cpu] = NULL; + per_cpu(centrino_model, cpu) = NULL; return 0; } @@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) */ static int centrino_verify (struct cpufreq_policy *policy) { - return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); } /** * centrino_setpolicy - set a new CPUFreq policy * @policy: new policy * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) * * Sets a new CPUFreq policy. */ +struct allmasks { + cpumask_t online_policy_cpus; + cpumask_t saved_mask; + cpumask_t set_mask; + cpumask_t covered_cpus; +}; + static int centrino_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int newstate = 0; unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; - cpumask_t online_policy_cpus; - cpumask_t saved_mask; - cpumask_t set_mask; - cpumask_t covered_cpus; int retval = 0; unsigned int j, k, first_cpu, tmp; - - if (unlikely(centrino_model[cpu] == NULL)) - return -ENODEV; + CPUMASK_ALLOC(allmasks); + CPUMASK_PTR(online_policy_cpus, allmasks); + CPUMASK_PTR(saved_mask, allmasks); + CPUMASK_PTR(set_mask, allmasks); + CPUMASK_PTR(covered_cpus, allmasks); + + if (unlikely(allmasks == NULL)) + return -ENOMEM; + + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { + retval = -ENODEV; + goto out; + } if (unlikely(cpufreq_frequency_table_target(policy, - centrino_model[cpu]->op_points, + per_cpu(centrino_model, cpu)->op_points, target_freq, relation, &newstate))) { - return -EINVAL; + retval = -EINVAL; + goto out; } #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); #else - online_policy_cpus = policy->cpus; + *online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; + *saved_mask = current->cpus_allowed; first_cpu = 1; - cpus_clear(covered_cpus); - for_each_cpu_mask(j, online_policy_cpus) { + cpus_clear(*covered_cpus); + for_each_cpu_mask_nr(j, *online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq */ - cpus_clear(set_mask); + cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(set_mask, set_mask, online_policy_cpus); + cpus_or(*set_mask, *set_mask, *online_policy_cpus); else - cpu_set(j, set_mask); + cpu_set(j, *set_mask); - set_cpus_allowed_ptr(current, &set_mask); + set_cpus_allowed_ptr(current, set_mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - msr = centrino_model[cpu]->op_points[newstate].index; + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - cpu_set(j, covered_cpus); + cpu_set(j, *covered_cpus); preempt_enable(); } - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(covered_cpus)) { - for_each_cpu_mask(j, covered_cpus) { + if (!cpus_empty(*covered_cpus)) + for_each_cpu_mask_nr(j, *covered_cpus) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } - } tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, *online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); + retval = 0; + goto out; migrate_end: preempt_enable(); - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); +out: + CPUMASK_FREE(allmasks); + return retval; } static struct freq_attr* centrino_attr[] = { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fd..191f7263c61 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index ff517f0b8cc..6b0a10b002f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) int sibling; this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { + for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394..65a339678ec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, char __user *buf = ubuf; int i, err; - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); if (!cpu_tsc) return -ENOMEM; @@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ if (end == buf) return -EINVAL; \ @@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static ssize_t show_trigger(struct sys_device *s, char *buf) +static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); return strlen(trigger) + 1; } -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf,size_t siz) { char *p; int len; @@ -806,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e119..88736cadbaa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_free; - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #endif /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c..d5ae2243f0b 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); #define define_therm_throt_sysdev_show_func(name) \ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c..a26c480b949 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) + if ((*pos) < nr_cpu_ids && cpu_online(*pos)) return &cpu_data(*pos); return NULL; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2de5fa2bbf7..14b11b3be31 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - "cpu%d", cpu); + dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + NULL, "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cdfd94cc6b1..109792bc7cf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,16 @@ #include <asm/ftrace.h> #include <asm/irq_vectors.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: @@ -333,7 +343,8 @@ sysenter_past_esp: /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry + jnz sysenter_audit +sysenter_do_call: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) @@ -343,7 +354,8 @@ sysenter_past_esp: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work + jne sysexit_audit +sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -351,6 +363,45 @@ sysenter_past_esp: TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8410e26f418..89434d43960 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -53,6 +53,12 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + .code64 #ifdef CONFIG_FTRACE @@ -351,6 +357,7 @@ ENTRY(system_call_after_swapgs) GET_THREAD_INFO(%rcx) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys +system_call_fastpath: cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -402,16 +409,16 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - - /* Really a signal */ +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -422,8 +429,45 @@ badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq %rax,%rsi /* second arg, syscall return value */ + cmpq $0,%rax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + /* Do syscall tracing */ tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + jz auditsys +#endif SAVE_REST movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi @@ -448,6 +492,7 @@ tracesys: * Has correct top of stack, but partial stack frame. */ .globl int_ret_from_sys_call + .globl int_with_check int_ret_from_sys_call: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 1fa8be5bd21..eaff0bbb144 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -99,3 +99,4 @@ int is_uv_system(void) { return uv_system_type != UV_NONE; } +EXPORT_SYMBOL_GPL(is_uv_system); diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee..786548a62d3 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 3c392934069..2cfcbded888 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -98,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector) { unsigned int cpu; - for (cpu = 0; cpu < NR_CPUS; ++cpu) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, mask)) uv_send_IPI_one(cpu, vector); } @@ -132,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441ca..a7010c3a377 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +598,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0ea6a19bfdf..ad2b15a1334 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -468,7 +468,7 @@ void hpet_disable(void) #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; -static unsigned long hpet_prev_update_sec; +static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static unsigned long hpet_t1_cmp; @@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags |= bit_mask; + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + if (!oldbits) hpet_rtc_timer_init(); @@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost %d interrupts\n", + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", lost_ints); } } @@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { - rtc_int_flag = RTC_UF; + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index de9aa0e3a9c..09cddb57bec 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -57,7 +57,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +DEFINE_SPINLOCK(vector_lock); int timer_through_8259 __initdata; @@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq) return vector; } -void setup_vector_irq(int cpu) -{ -} - static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 64a46affd85..61a83b70c18 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -101,7 +101,7 @@ int timer_through_8259 __initdata; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -697,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + static int __assign_irq_vector(int irq, cpumask_t mask) { /* @@ -732,7 +745,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) return 0; } - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { cpumask_t domain, new_mask; int new_cpu; int vector, offset; @@ -753,7 +766,7 @@ next: continue; if (vector == IA32_SYSCALL_VECTOR) goto next; - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -763,7 +776,7 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; cfg->domain = domain; @@ -795,14 +808,14 @@ static void __clear_irq_vector(int irq) vector = cfg->vector; cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; cpus_clear(cfg->domain); } -static void __setup_vector_irq(int cpu) +void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ @@ -825,14 +838,6 @@ static void __setup_vector_irq(int cpu) } } -void setup_vector_irq(int cpu) -{ - spin_lock(&vector_lock); - __setup_vector_irq(smp_processor_id()); - spin_unlock(&vector_lock); -} - - static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) @@ -1373,12 +1378,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - mask = cpumask_of_cpu(first_cpu(cfg->domain)); - send_IPI_mask(mask, cfg->vector); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0373e88de95..1f26fd9ec4f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -43,10 +43,11 @@ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ + asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ - "jmp common_interrupt"); + "jmp common_interrupt\n" \ + ".previous"); #define BI(x,y) \ BUILD_IRQ(x##y) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 43c019f85f0..6c27679ec6a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)p->ainsn.insn; } -/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); + kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c49ff2dc8f8..0ed5f939b90 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -63,12 +63,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + if (!cpus_equal(current->mm->cpu_vm_mask, + cpumask_of_cpu(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55a..9fe478d9840 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -22,6 +22,7 @@ #include <asm/cpufeature.h> #include <asm/desc.h> #include <asm/system.h> +#include <asm/cacheflush.h> #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -85,10 +86,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); tracer_disable(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, PAGE_SIZE/2); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a..c43caa3a91f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a0..652fa5c38eb 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu) mutex_unlock(µcode_mutex); } -static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; char *end; @@ -655,9 +657,7 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) if (end == buf) return -EINVAL; if (val == 1) { - cpumask_t old; - - old = current->cpus_allowed; + cpumask_t old = current->cpus_allowed; get_online_cpus(); set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); @@ -674,14 +674,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return sz; } -static ssize_t version_show(struct sys_device *dev, char *buf) +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->rev); } -static ssize_t pf_show(struct sys_device *dev, char *buf) +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index 0e867676b5a..6ba87830d4b 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/bug.h> diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6ae005ccaed..678090508a6 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -83,7 +83,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) if (x86_quirks->mpc_oem_bus_info) x86_quirks->mpc_oem_bus_info(m, str); else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -154,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -163,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m) static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, @@ -235,7 +235,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, @@ -695,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a153b3905f6..9fd80955244 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - "msr%d", cpu); + dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + NULL, "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 097d8a6797f..94da4d52d79 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = native_pgd_val, .make_pte = native_make_pte, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 151f2d171f7..02d19328525 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <linux/crash_dump.h> #include <linux/dma-mapping.h> #include <linux/bitops.h> #include <linux/pci_ids.h> @@ -36,6 +37,7 @@ #include <linux/delay.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> + #include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> @@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); static void calioc2_tce_cache_blast(struct iommu_table *tbl); static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); static struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, @@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev, } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); @@ -474,7 +459,6 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; @@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - npages = num_dma_pages(dma_handle, size); iommu_free(tbl, dma_handle, npages); } @@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -541,7 +511,7 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, @@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); if (is_calgary(dev->device)) tbl->chip_ops = &calgary_chip_ops; @@ -1209,6 +1183,10 @@ static int __init calgary_init(void) if (ret) return ret; + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) @@ -1230,6 +1208,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1251,6 +1239,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) return (val != 0xffffffff); } +/* + * calgary_init_bitmap_from_tce_table(): + * Funtion for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} + +/* + * get_tce_space_from_tar(): + * Function for kdump case. Get the tce tables from first kernel + * by reading the contents of the base adress register of calgary iommu + */ +static void get_tce_space_from_tar(void) +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + void __init detect_calgary(void) { int bus; @@ -1394,7 +1438,8 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size((is_kdump_kernel() ? + saved_max_pfn : max_pfn) * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1412,10 +1457,16 @@ void __init detect_calgary(void) if (calgary_bus_has_devices(bus, pci_device) || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } calgary_found = 1; } } @@ -1430,6 +1481,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4213c00dff..87d4d6964ec 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,7 +11,7 @@ static int forbid_dac __read_mostly; -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -123,6 +123,14 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } + +unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +{ + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); + + return size >> PAGE_SHIFT; +} +EXPORT_SYMBOL(iommu_num_pages); #endif /* @@ -192,136 +200,19 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -#ifdef CONFIG_X86_32 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - - pages >>= PAGE_SHIFT; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -static int dma_release_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -#else -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) -#define dma_release_coherent(dev, order, vaddr) (0) -#endif /* CONFIG_X86_32 */ - int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", - dev->bus_id); + dev_info(dev, "PCI: Disallowing DAC for device\n"); return 0; } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -342,8 +233,7 @@ int dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", - dev->bus_id, mask); + dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } @@ -369,6 +259,7 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { + struct dma_mapping_ops *ops = get_dma_ops(dev); void *memory = NULL; struct page *page; unsigned long dma_mask = 0; @@ -378,7 +269,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; if (!dev) { @@ -437,8 +328,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Let low level make its own zone decisions */ gfp &= ~(GFP_DMA32|GFP_DMA); - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, dma_handle, gfp); return NULL; } @@ -450,14 +341,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } } - if (dma_ops->alloc_coherent) { + if (ops->alloc_coherent) { free_pages((unsigned long)memory, get_order(size)); gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + return ops->alloc_coherent(dev, size, dma_handle, gfp); } - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), + if (ops->map_simple) { + *dma_handle = ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -479,12 +370,14 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_coherent(dev, order, vaddr)) + if (dma_release_from_coherent(dev, order, vaddr)) return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); + if (ops->unmap_single) + ops->unmap_single(dev, bus, size, 0); free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index be60961f869..49285f8fd4d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -67,9 +67,6 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -198,9 +195,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) * out. Hopefully no network devices use single mappings that big. */ - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -243,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { - unsigned long npages = to_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages); int i; @@ -306,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -389,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = to_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -472,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += to_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) @@ -694,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 792b9179eff..3f91f71cdc3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif -} - - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 20df839b9c2..c4ce0332759 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7218bccd176..7b6e44a7c62 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -130,7 +130,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c78090dd0c5..87d7dfdcf46 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -121,7 +121,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { rmb(); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9dcf39c0297..724adfc63cb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -411,10 +411,9 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; + int reboot_cpu_id = 0; #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470..703310a9902 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,44 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are + * used to save some data for jumping back + */ +#define DATA(offset) (PAGE_SIZE/2+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +171,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +240,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +351,28 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ec952aa5394..68b48e3fbcb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -597,13 +597,21 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); pre_setup_arch_hook(); - early_cpu_init(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_cpu_init(); early_ioremap_init(); +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; @@ -665,9 +673,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; -#ifdef CONFIG_X86_64 - early_cpu_init(); -#endif strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; @@ -680,7 +685,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; #endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); } #ifdef CONFIG_PCI @@ -791,10 +796,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); -#ifdef CONFIG_X86_64 - dma32_reserve_bootmem(); -#endif - #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -809,20 +810,21 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); +#ifdef CONFIG_X86_64 + /* + * dma32_reserve_bootmem() allocates bootmem which may conflict + * with the crashkernel command line, so do that after + * reserve_crashkernel() + */ + dma32_reserve_bootmem(); +#endif + reserve_ibft_region(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be after max_low_pfn is determined, and before kernel - * pagetables are setup. - */ - vmi_init(); -#endif - paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); paravirt_pagetable_setup_done(swapper_pg_dir); @@ -859,12 +861,6 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); ioapic_init_mappings(); -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) - if (def_to_bigsmp) - printk(KERN_WARNING "More than 8 CPUs detected and " - "CONFIG_X86_PC cannot handle it.\nUse " - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -#endif kvm_guest_init(); e820_reserve_resources(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c00..76e305e064f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -197,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 5cede1045ce..0c727f64e79 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -662,8 +662,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); - clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index b95a0a60905..02b02583f5b 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -54,6 +54,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +/* + * Signal frame handlers. + */ + +static inline int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.xstate->fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *) + buf); + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +static inline int restore_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + return restore_fpu_checking((__force struct i387_fxsave_struct *)buf); +} /* * Do a signal return; undo the signal stack. @@ -497,9 +550,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a9331a42f76..6112c363234 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused) * for which cpus receive the IPI. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). + * + * We need to hold vector_lock so there the set of online cpus + * does not change while we are assigning vectors to cpus. Holding + * this lock ensures we don't half assign or remove an irq from a cpu. */ ipi_call_lock_irq(); -#ifdef CONFIG_X86_IO_APIC - setup_vector_irq(smp_processor_id()); -#endif + lock_vector_lock(); + __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); + unlock_vector_lock(); ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -438,7 +442,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (c->phys_proc_id == cpu_data(i).phys_proc_id && c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); @@ -461,7 +465,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) return; } - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpu_set(i, c->llc_shared_map); @@ -990,7 +994,17 @@ int __cpuinit native_cpu_up(unsigned int cpu) flush_tlb_all(); low_mappings = 1; +#ifdef CONFIG_X86_PC + if (def_to_bigsmp && apicid > 8) { + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" + "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); + err = -1; + } else + err = do_boot_cpu(apicid, cpu); +#else err = do_boot_cpu(apicid, cpu); +#endif zap_low_mappings(); low_mappings = 0; @@ -1219,7 +1233,7 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); /*/ * last thread sibling in this cpu core going down @@ -1228,7 +1242,7 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } - for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); @@ -1336,7 +1350,9 @@ int __cpu_disable(void) remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); remove_cpu_from_maps(cpu); + unlock_vector_lock(); fixup_irqs(cpu_online_map); return 0; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5f..d44395ff34c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,9 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 + .long sys_eventfd2 + .long sys_epoll_create1 + .long sys_dup3 /* 330 */ + .long sys_pipe2 + .long sys_inotify_init1 diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 0a1b1a9d922..6ca515d6db5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -37,6 +37,7 @@ #include <asm/timer.h> #include <asm/vmi_time.h> #include <asm/kmap_types.h> +#include <asm/setup.h> /* Convenient for calling VMI functions indirectly in the ROM */ typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); @@ -683,7 +684,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); } /* |