diff options
Diffstat (limited to 'arch')
27 files changed, 666 insertions, 357 deletions
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8f146a4b475..bbea636ff68 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -32,6 +32,7 @@ #include <asm/leds.h> #include <asm/processor.h> #include <asm/uaccess.h> +#include <asm/mach/time.h> extern const char *processor_modes[]; extern void setup_mm_for_reboot(char mode); @@ -85,8 +86,10 @@ EXPORT_SYMBOL(pm_power_off); void default_idle(void) { local_irq_disable(); - if (!need_resched() && !hlt_counter) + if (!need_resched() && !hlt_counter) { + timer_dyn_reprogram(); arch_idle(); + } local_irq_enable(); } diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 06054c9ba07..1b7fcd50c3e 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -424,15 +424,19 @@ static int timer_dyn_tick_disable(void) return ret; } +/* + * Reprogram the system timer for at least the calculated time interval. + * This function should be called from the idle thread with IRQs disabled, + * immediately before sleeping. + */ void timer_dyn_reprogram(void) { struct dyn_tick_timer *dyn_tick = system_timer->dyn_tick; - unsigned long flags; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock(&xtime_lock); if (dyn_tick->state & DYN_TICK_ENABLED) dyn_tick->reprogram(next_timer_interrupt() - jiffies); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock(&xtime_lock); } static ssize_t timer_show_dyn_tick(struct sys_device *dev, char *buf) diff --git a/arch/arm/mach-aaec2000/Makefile.boot b/arch/arm/mach-aaec2000/Makefile.boot new file mode 100644 index 00000000000..8f5a8b7c53c --- /dev/null +++ b/arch/arm/mach-aaec2000/Makefile.boot @@ -0,0 +1 @@ + zreladdr-y := 0xf0008000 diff --git a/arch/arm/mach-omap/usb.c b/arch/arm/mach-omap/usb.c index 6e805d451d0..7f37857b1a2 100644 --- a/arch/arm/mach-omap/usb.c +++ b/arch/arm/mach-omap/usb.c @@ -288,8 +288,8 @@ static void usb_release(struct device *dev) static struct resource udc_resources[] = { /* order is significant! */ { /* registers */ - .start = IO_ADDRESS(UDC_BASE), - .end = IO_ADDRESS(UDC_BASE + 0xff), + .start = UDC_BASE, + .end = UDC_BASE + 0xff, .flags = IORESOURCE_MEM, }, { /* general IRQ */ .start = IH2_BASE + 20, @@ -355,8 +355,8 @@ static struct platform_device ohci_device = { static struct resource otg_resources[] = { /* order is significant! */ { - .start = IO_ADDRESS(OTG_BASE), - .end = IO_ADDRESS(OTG_BASE + 0xff), + .start = OTG_BASE, + .end = OTG_BASE + 0xff, .flags = IORESOURCE_MEM, }, { .start = IH2_BASE + 8, diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index c08710b1ff0..6dcb23d64bf 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -522,6 +522,69 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) printk(KERN_INFO "Freeing %s memory: %dK\n", s, size); } +static inline void +free_memmap(int node, unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *start_pg, *end_pg; + unsigned long pg, pgend; + + /* + * Convert start_pfn/end_pfn to a struct page pointer. + */ + start_pg = pfn_to_page(start_pfn); + end_pg = pfn_to_page(end_pfn); + + /* + * Convert to physical addresses, and + * round start upwards and end downwards. + */ + pg = PAGE_ALIGN(__pa(start_pg)); + pgend = __pa(end_pg) & PAGE_MASK; + + /* + * If there are free pages between these, + * free the section of the memmap array. + */ + if (pg < pgend) + free_bootmem_node(NODE_DATA(node), pg, pgend - pg); +} + +/* + * The mem_map array can get very big. Free the unused area of the memory map. + */ +static void __init free_unused_memmap_node(int node, struct meminfo *mi) +{ + unsigned long bank_start, prev_bank_end = 0; + unsigned int i; + + /* + * [FIXME] This relies on each bank being in address order. This + * may not be the case, especially if the user has provided the + * information on the command line. + */ + for (i = 0; i < mi->nr_banks; i++) { + if (mi->bank[i].size == 0 || mi->bank[i].node != node) + continue; + + bank_start = mi->bank[i].start >> PAGE_SHIFT; + if (bank_start < prev_bank_end) { + printk(KERN_ERR "MEM: unordered memory banks. " + "Not freeing memmap.\n"); + break; + } + + /* + * If we had a previous bank, and there is a space + * between the current bank and the previous, free it. + */ + if (prev_bank_end && prev_bank_end != bank_start) + free_memmap(node, prev_bank_end, bank_start); + + prev_bank_end = (mi->bank[i].start + + mi->bank[i].size) >> PAGE_SHIFT; + } +} + /* * mem_init() marks the free areas in the mem_map and tells us how much * memory is free. This is done after various parts of the system have @@ -540,16 +603,12 @@ void __init mem_init(void) max_mapnr = virt_to_page(high_memory) - mem_map; #endif - /* - * We may have non-contiguous memory. - */ - if (meminfo.nr_banks != 1) - create_memmap_holes(&meminfo); - /* this will put all unused low memory onto the freelists */ for_each_online_node(node) { pg_data_t *pgdat = NODE_DATA(node); + free_unused_memmap_node(node, &meminfo); + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); } diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 2c2b93d77d4..052ab443ec4 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -169,7 +169,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) memzero(new_pgd, FIRST_KERNEL_PGD_NR * sizeof(pgd_t)); + /* + * Copy over the kernel and IO PGD entries + */ init_pgd = pgd_offset_k(0); + memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, + (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); + + clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); if (!vectors_high()) { /* @@ -198,14 +205,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } - /* - * Copy over the kernel and IO PGD entries - */ - memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, - (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); - - clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - return new_pgd; no_pte: @@ -698,75 +697,3 @@ void __init iotable_init(struct map_desc *io_desc, int nr) for (i = 0; i < nr; i++) create_mapping(io_desc + i); } - -static inline void -free_memmap(int node, unsigned long start_pfn, unsigned long end_pfn) -{ - struct page *start_pg, *end_pg; - unsigned long pg, pgend; - - /* - * Convert start_pfn/end_pfn to a struct page pointer. - */ - start_pg = pfn_to_page(start_pfn); - end_pg = pfn_to_page(end_pfn); - - /* - * Convert to physical addresses, and - * round start upwards and end downwards. - */ - pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; - - /* - * If there are free pages between these, - * free the section of the memmap array. - */ - if (pg < pgend) - free_bootmem_node(NODE_DATA(node), pg, pgend - pg); -} - -static inline void free_unused_memmap_node(int node, struct meminfo *mi) -{ - unsigned long bank_start, prev_bank_end = 0; - unsigned int i; - - /* - * [FIXME] This relies on each bank being in address order. This - * may not be the case, especially if the user has provided the - * information on the command line. - */ - for (i = 0; i < mi->nr_banks; i++) { - if (mi->bank[i].size == 0 || mi->bank[i].node != node) - continue; - - bank_start = mi->bank[i].start >> PAGE_SHIFT; - if (bank_start < prev_bank_end) { - printk(KERN_ERR "MEM: unordered memory banks. " - "Not freeing memmap.\n"); - break; - } - - /* - * If we had a previous bank, and there is a space - * between the current bank and the previous, free it. - */ - if (prev_bank_end && prev_bank_end != bank_start) - free_memmap(node, prev_bank_end, bank_start); - - prev_bank_end = PAGE_ALIGN(mi->bank[i].start + - mi->bank[i].size) >> PAGE_SHIFT; - } -} - -/* - * The mem_map array can get very big. Free - * the unused area of the memory map. - */ -void __init create_memmap_holes(struct meminfo *mi) -{ - int node; - - for_each_online_node(node) - free_unused_memmap_node(node, mi); -} diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 30c1dfbb052..6d3a79e5fef 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -6,7 +6,7 @@ # To add an entry into this database, please see Documentation/arm/README, # or contact rmk@arm.linux.org.uk # -# Last update: Thu Mar 24 14:34:50 2005 +# Last update: Thu Jun 23 20:19:33 2005 # # machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number # @@ -243,7 +243,7 @@ yoho ARCH_YOHO YOHO 231 jasper ARCH_JASPER JASPER 232 dsc25 ARCH_DSC25 DSC25 233 omap_innovator MACH_OMAP_INNOVATOR OMAP_INNOVATOR 234 -ramses ARCH_RAMSES RAMSES 235 +mnci ARCH_RAMSES RAMSES 235 s28x ARCH_S28X S28X 236 mport3 ARCH_MPORT3 MPORT3 237 pxa_eagle250 ARCH_PXA_EAGLE250 PXA_EAGLE250 238 @@ -323,7 +323,7 @@ nimbra29x ARCH_NIMBRA29X NIMBRA29X 311 nimbra210 ARCH_NIMBRA210 NIMBRA210 312 hhp_d95xx ARCH_HHP_D95XX HHP_D95XX 313 labarm ARCH_LABARM LABARM 314 -m825xx ARCH_M825XX M825XX 315 +comcerto ARCH_M825XX M825XX 315 m7100 SA1100_M7100 M7100 316 nipc2 ARCH_NIPC2 NIPC2 317 fu7202 ARCH_FU7202 FU7202 318 @@ -724,3 +724,66 @@ lpc22xx MACH_LPC22XX LPC22XX 715 omap_comet3 MACH_COMET3 COMET3 716 omap_comet4 MACH_COMET4 COMET4 717 csb625 MACH_CSB625 CSB625 718 +fortunet2 MACH_FORTUNET2 FORTUNET2 719 +s5h2200 MACH_S5H2200 S5H2200 720 +optorm920 MACH_OPTORM920 OPTORM920 721 +adsbitsyxb MACH_ADSBITSYXB ADSBITSYXB 722 +adssphere MACH_ADSSPHERE ADSSPHERE 723 +adsportal MACH_ADSPORTAL ADSPORTAL 724 +ln2410sbc MACH_LN2410SBC LN2410SBC 725 +cb3rufc MACH_CB3RUFC CB3RUFC 726 +mp2usb MACH_MP2USB MP2USB 727 +ntnp425c MACH_NTNP425C NTNP425C 728 +colibri MACH_COLIBRI COLIBRI 729 +pcm7220 MACH_PCM7220 PCM7220 730 +gateway7001 MACH_GATEWAY7001 GATEWAY7001 731 +pcm027 MACH_PCM027 PCM027 732 +cmpxa MACH_CMPXA CMPXA 733 +anubis MACH_ANUBIS ANUBIS 734 +ite8152 MACH_ITE8152 ITE8152 735 +lpc3xxx MACH_LPC3XXX LPC3XXX 736 +puppeteer MACH_PUPPETEER PUPPETEER 737 +vt001 MACH_MACH_VADATECH MACH_VADATECH 738 +e570 MACH_E570 E570 739 +x50 MACH_X50 X50 740 +recon MACH_RECON RECON 741 +xboardgp8 MACH_XBOARDGP8 XBOARDGP8 742 +fpic2 MACH_FPIC2 FPIC2 743 +akita MACH_AKITA AKITA 744 +a81 MACH_A81 A81 745 +svm_sc25x MACH_SVM_SC25X SVM_SC25X 746 +vt020 MACH_VADATECH020 VADATECH020 747 +tli MACH_TLI TLI 748 +edb9315lc MACH_EDB9315LC EDB9315LC 749 +passec MACH_PASSEC PASSEC 750 +ds_tiger MACH_DS_TIGER DS_TIGER 751 +e310 MACH_E310 E310 752 +e330 MACH_E330 E330 753 +rt3000 MACH_RT3000 RT3000 754 +nokia770 MACH_NOKIA770 NOKIA770 755 +pnx0106 MACH_PNX0106 PNX0106 756 +hx21xx MACH_HX21XX HX21XX 757 +faraday MACH_FARADAY FARADAY 758 +sbc9312 MACH_SBC9312 SBC9312 759 +batman MACH_BATMAN BATMAN 760 +jpd201 MACH_JPD201 JPD201 761 +mipsa MACH_MIPSA MIPSA 762 +kacom MACH_KACOM KACOM 763 +swarcocpu MACH_SWARCOCPU SWARCOCPU 764 +swarcodsl MACH_SWARCODSL SWARCODSL 765 +blueangel MACH_BLUEANGEL BLUEANGEL 766 +hairygrama MACH_HAIRYGRAMA HAIRYGRAMA 767 +banff MACH_BANFF BANFF 768 +carmeva MACH_CARMEVA CARMEVA 769 +sam255 MACH_SAM255 SAM255 770 +ppm10 MACH_PPM10 PPM10 771 +edb9315a MACH_EDB9315A EDB9315A 772 +sunset MACH_SUNSET SUNSET 773 +stargate2 MACH_STARGATE2 STARGATE2 774 +intelmote2 MACH_INTELMOTE2 INTELMOTE2 775 +trizeps4 MACH_TRIZEPS4 TRIZEPS4 776 +mainstone2 MACH_MAINSTONE2 MAINSTONE2 777 +ez_ixp42x MACH_EZ_IXP42X EZ_IXP42X 778 +tapwave_zodiac MACH_TAPWAVE_ZODIAC TAPWAVE_ZODIAC 779 +universalmeter MACH_UNIVERSALMETER UNIVERSALMETER 780 +hicoarm9 MACH_HICOARM9 HICOARM9 781 diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 3762f6b35ab..fc8b1752176 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -127,48 +127,23 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)®s->esp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. - */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -286,36 +261,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = ((unsigned long *) ®s->esp) - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)®s->esp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->eip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->eip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -403,8 +401,7 @@ static inline int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; /*Restore back the original saved kprobes variables and continue. */ @@ -534,3 +531,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 5f8cfa6b794..ba243a4cc11 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -617,6 +617,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -695,6 +722,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) handle_io_bitmap(next, tss); + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 442a6e937b1..3db9a04aec6 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -289,3 +289,5 @@ ENTRY(sys_call_table) .long sys_add_key .long sys_request_key .long sys_keyctl + .long sys_ioprio_set + .long sys_ioprio_get /* 290 */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index b1d5d3d5276..785a51b0ad8 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1577,8 +1577,8 @@ sys_call_table: data8 sys_add_key data8 sys_request_key data8 sys_keyctl - data8 sys_ni_syscall - data8 sys_ni_syscall // 1275 + data8 sys_ioprio_set + data8 sys_ioprio_get // 1275 data8 sys_set_zone_reclaim data8 sys_ni_syscall data8 sys_ni_syscall diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 5978823d5c6..3aa3167edbe 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -34,6 +34,7 @@ #include <asm/pgtable.h> #include <asm/kdebug.h> +#include <asm/sections.h> extern void jprobe_inst_return(void); @@ -263,13 +264,33 @@ static inline void get_kprobe_inst(bundle_t *bundle, uint slot, } } +/* Returns non-zero if the addr is in the Interrupt Vector Table */ +static inline int in_ivt_functions(unsigned long addr) +{ + return (addr >= (unsigned long)__start_ivt_text + && addr < (unsigned long)__end_ivt_text); +} + static int valid_kprobe_addr(int template, int slot, unsigned long addr) { if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { - printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", - addr); + printk(KERN_WARNING "Attempting to insert unaligned kprobe " + "at 0x%lx\n", addr); return -EINVAL; } + + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " + "IVT functions at 0x%lx\n", addr); + return -EINVAL; + } + + if (slot == 1 && bundle_encoding[template][1] != L) { + printk(KERN_WARNING "Inserting kprobes on slot #1 " + "is not supported\n"); + return -EINVAL; + } + return 0; } @@ -290,6 +311,94 @@ static inline void set_current_kprobe(struct kprobe *p) current_kprobe = p; } +static void kretprobe_trampoline(void) +{ +} + +/* + * At this point the target function has been tricked into + * returning into our trampoline. Lookup the associated instance + * and then: + * - call the handler function + * - cleanup by marking the instance as unused + * - long jump back to the original return address + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address = + ((struct fnptr *)kretprobe_trampoline)->ip; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->cr_iip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + int arch_prepare_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long) p->addr; @@ -492,8 +601,8 @@ static int pre_kprobes_handler(struct die_args *args) if (p->pre_handler && p->pre_handler(p, regs)) /* * Our pre-handler is specifically requesting that we just - * do a return. This is handling the case where the - * pre-handler is really our special jprobe pre-handler. + * do a return. This is used for both the jprobe pre-handler + * and the kretprobe trampoline */ return 1; @@ -599,3 +708,14 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) *regs = jprobe_saved_regs; return 1; } + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + trampoline_p.addr = + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + return register_kprobe(&trampoline_p); +} diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index ebb71f3d6d1..6e35bff05d5 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/kprobes.h> #include <asm/cpu.h> #include <asm/delay.h> @@ -707,6 +708,13 @@ kernel_thread_helper (int (*fn)(void *), void *arg) void flush_thread (void) { + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); @@ -721,6 +729,14 @@ flush_thread (void) void exit_thread (void) { + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + ia64_drop_fpu(current); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index b9f0db4c1b0..a676e79e068 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -8,6 +8,11 @@ #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) #include <asm-generic/vmlinux.lds.h> +#define IVT_TEXT \ + VMLINUX_SYMBOL(__start_ivt_text) = .; \ + *(.text.ivt) \ + VMLINUX_SYMBOL(__end_ivt_text) = .; + OUTPUT_FORMAT("elf64-ia64-little") OUTPUT_ARCH(ia64) ENTRY(phys_start) @@ -39,7 +44,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { - *(.text.ivt) + IVT_TEXT *(.text) SCHED_TEXT LOCK_TEXT diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 508026ae584..65ee15396ff 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -457,7 +457,7 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs) if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index b6a63a49a23..191a8def3bd 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table) .long sys_request_key /* 270 */ .long sys_keyctl .long sys_waitid + .long sys_ioprio_set + .long sys_ioprio_get diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 334ef4150d9..6164a2b3473 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -606,9 +606,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, struct page *page = pfn_to_page(pfn); if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - if (vma->vm_mm == current->active_mm) + if (vma->vm_mm == current->active_mm) { +#ifdef CONFIG_8xx + /* On 8xx, cache control instructions (particularly + * "dcbst" from flush_dcache_icache) fault as write + * operation if there is an unpopulated TLB entry + * for the address in question. To workaround that, + * we invalidate the TLB here, thus avoiding dcbst + * misbehaviour. + */ + _tlbie(address); +#endif __flush_dcache_icache((void *) address); - else + } else flush_dcache_icache_page(page); set_bit(PG_arch_1, &page->flags); } diff --git a/arch/ppc/platforms/pmac_sleep.S b/arch/ppc/platforms/pmac_sleep.S index f459ade1bd6..016a7464915 100644 --- a/arch/ppc/platforms/pmac_sleep.S +++ b/arch/ppc/platforms/pmac_sleep.S @@ -46,7 +46,7 @@ .section .text .align 5 -#if defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ_PMAC) +#if defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ_PMAC) /* This gets called by via-pmu.c late during the sleep process. * The PMU was already send the sleep command and will shut us down @@ -382,7 +382,7 @@ turn_on_mmu: isync rfi -#endif /* defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ) */ +#endif /* defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ) */ .section .data .balign L1_CACHE_LINE_SIZE diff --git a/arch/ppc/platforms/pmac_time.c b/arch/ppc/platforms/pmac_time.c index de60ccc7db9..778ce4fec36 100644 --- a/arch/ppc/platforms/pmac_time.c +++ b/arch/ppc/platforms/pmac_time.c @@ -206,7 +206,7 @@ via_calibrate_decr(void) return 1; } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * Reset the time after a sleep. */ @@ -238,7 +238,7 @@ time_sleep_notify(struct pmu_sleep_notifier *self, int when) static struct pmu_sleep_notifier time_sleep_notifier __pmacdata = { time_sleep_notify, SLEEP_LEVEL_MISC, }; -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* * Query the OF and get the decr frequency. @@ -251,9 +251,9 @@ pmac_calibrate_decr(void) struct device_node *cpu; unsigned int freq, *fp; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM pmu_register_sleep_notifier(&time_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* We assume MacRISC2 machines have correct device-tree * calibration. That's better since the VIA itself seems diff --git a/arch/ppc/platforms/sandpoint.c b/arch/ppc/platforms/sandpoint.c index 70e58f43f2b..8b149c2fc54 100644 --- a/arch/ppc/platforms/sandpoint.c +++ b/arch/ppc/platforms/sandpoint.c @@ -324,6 +324,7 @@ sandpoint_setup_arch(void) pdata[1].irq = 0; pdata[1].mapbase = 0; } + } printk(KERN_INFO "Motorola SPS Sandpoint Test Platform\n"); printk(KERN_INFO "Port by MontaVista Software, Inc. (source@mvista.com)\n"); diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c index b45d8268bf9..ad39b86ca92 100644 --- a/arch/ppc/syslib/open_pic.c +++ b/arch/ppc/syslib/open_pic.c @@ -370,8 +370,9 @@ void __init openpic_init(int offset) /* Initialize IPI interrupts */ if ( ppc_md.progress ) ppc_md.progress("openpic: ipi",0x3bb); for (i = 0; i < OPENPIC_NUM_IPI; i++) { - /* Disabled, Priority 10..13 */ - openpic_initipi(i, 10+i, OPENPIC_VEC_IPI+i+offset); + /* Disabled, increased priorities 10..13 */ + openpic_initipi(i, OPENPIC_PRIORITY_IPI_BASE+i, + OPENPIC_VEC_IPI+i+offset); /* IPIs are per-CPU */ irq_desc[OPENPIC_VEC_IPI+i+offset].status |= IRQ_PER_CPU; irq_desc[OPENPIC_VEC_IPI+i+offset].handler = &open_pic_ipi; @@ -399,8 +400,9 @@ void __init openpic_init(int offset) if (sense & IRQ_SENSE_MASK) irq_desc[i+offset].status = IRQ_LEVEL; - /* Enabled, Priority 8 */ - openpic_initirq(i, 8, i+offset, (sense & IRQ_POLARITY_MASK), + /* Enabled, Default priority */ + openpic_initirq(i, OPENPIC_PRIORITY_DEFAULT, i+offset, + (sense & IRQ_POLARITY_MASK), (sense & IRQ_SENSE_MASK)); /* Processor 0 */ openpic_mapirq(i, CPU_MASK_CPU0, CPU_MASK_NONE); @@ -656,6 +658,18 @@ static void __init openpic_maptimer(u_int timer, cpumask_t cpumask) } /* + * Change the priority of an interrupt + */ +void __init +openpic_set_irq_priority(u_int irq, u_int pri) +{ + check_arg_irq(irq); + openpic_safe_writefield(&ISR[irq - open_pic_irq_offset]->Vector_Priority, + OPENPIC_PRIORITY_MASK, + pri << OPENPIC_PRIORITY_SHIFT); +} + +/* * Initalize the interrupt source which will generate an NMI. * This raises the interrupt's priority from 8 to 9. * @@ -665,9 +679,7 @@ void __init openpic_init_nmi_irq(u_int irq) { check_arg_irq(irq); - openpic_safe_writefield(&ISR[irq - open_pic_irq_offset]->Vector_Priority, - OPENPIC_PRIORITY_MASK, - 9 << OPENPIC_PRIORITY_SHIFT); + openpic_set_irq_priority(irq, OPENPIC_PRIORITY_NMI); } /* diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index 782ce3efa2c..1d2ff6d6b0b 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -36,6 +36,8 @@ #include <asm/kdebug.h> #include <asm/sstep.h> +static DECLARE_MUTEX(kprobe_mutex); + static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_saved_msr; static struct kprobe *kprobe_prev; @@ -54,6 +56,15 @@ int arch_prepare_kprobe(struct kprobe *p) printk("Cannot register a kprobe on rfid or mtmsrd\n"); ret = -EINVAL; } + + /* insn must be on a special executable page on ppc64 */ + if (!ret) { + up(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + down(&kprobe_mutex); + if (!p->ainsn.insn) + ret = -ENOMEM; + } return ret; } @@ -79,16 +90,22 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); } static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { + kprobe_opcode_t insn = *p->ainsn.insn; + regs->msr |= MSR_SE; - /*single step inline if it a breakpoint instruction*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) + + /* single step inline if it is a trap variant */ + if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn)) regs->nip = (unsigned long)p->addr; else - regs->nip = (unsigned long)&p->ainsn.insn; + regs->nip = (unsigned long)p->ainsn.insn; } static inline void save_previous_kprobe(void) @@ -105,6 +122,23 @@ static inline void restore_previous_kprobe(void) kprobe_saved_msr = kprobe_saved_msr_prev; } +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + static inline int kprobe_handler(struct pt_regs *regs) { struct kprobe *p; @@ -195,6 +229,78 @@ no_kprobe: } /* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +void kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->nip = orig_ret_address; + + unlock_kprobes(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + +/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" * instruction. To avoid the SMP problems that can occur when we @@ -205,9 +311,10 @@ no_kprobe: static void resume_execution(struct kprobe *p, struct pt_regs *regs) { int ret; + unsigned int insn = *p->ainsn.insn; regs->nip = (unsigned long)p->addr; - ret = emulate_step(regs, p->ainsn.insn[0]); + ret = emulate_step(regs, insn); if (ret == 0) regs->nip = (unsigned long)p->addr + 4; } @@ -331,3 +438,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) memcpy(regs, &jprobe_saved_regs, sizeof(struct pt_regs)); return 1; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/ppc64/kernel/ppc_ksyms.c b/arch/ppc64/kernel/ppc_ksyms.c index b230a63fe4c..705742f4eec 100644 --- a/arch/ppc64/kernel/ppc_ksyms.c +++ b/arch/ppc64/kernel/ppc_ksyms.c @@ -75,6 +75,7 @@ EXPORT_SYMBOL(giveup_fpu); EXPORT_SYMBOL(giveup_altivec); #endif EXPORT_SYMBOL(__flush_icache_range); +EXPORT_SYMBOL(flush_dcache_range); #ifdef CONFIG_SMP #ifdef CONFIG_PPC_ISERIES diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index aba89554d89..f7cae05e40f 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -36,6 +36,7 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/utsname.h> +#include <linux/kprobes.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -307,6 +308,8 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { + kprobe_flush_task(current); + #ifndef CONFIG_SMP if (last_task_used_math == current) last_task_used_math = NULL; @@ -321,6 +324,7 @@ void flush_thread(void) { struct thread_info *t = current_thread_info(); + kprobe_flush_task(current); if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 2348a75e050..2a532db9138 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -91,6 +91,7 @@ unsigned long tb_to_xs; unsigned tb_to_us; unsigned long processor_freq; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL_GPL(rtc_lock); unsigned long tb_to_ns_scale; unsigned long tb_to_ns_shift; diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 4e680f87a75..acd2a778ebe 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -38,7 +38,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/preempt.h> -#include <linux/moduleloader.h> + #include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/kdebug.h> @@ -51,8 +51,6 @@ static struct kprobe *kprobe_prev; static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; -static kprobe_opcode_t *get_insn_slot(void); -static void free_insn_slot(kprobe_opcode_t *slot); void jprobe_return_end(void); /* copy of the kernel stack at the probe fire time */ @@ -274,48 +272,23 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)regs->rsp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. - */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -428,36 +401,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = (unsigned long *)regs->rsp - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->rip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->rip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -550,8 +546,7 @@ int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; /* Restore the original saved kprobes variables and continue. */ @@ -682,111 +677,12 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -/* - * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. - * By default on x86_64, pages we get from kmalloc or vmalloc are not - * executable. Single-stepping an instruction on such a page yields an - * oops. So instead of storing the instruction copies in their respective - * kprobe objects, we allocate a page, map it executable, and store all the - * instruction copies there. (We can allocate additional pages if somebody - * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE - * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) - * bytes. - */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) -struct kprobe_insn_page { - struct hlist_node hlist; - kprobe_opcode_t *insns; /* page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; - int nused; +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler }; -static struct hlist_head kprobe_insn_pages; - -/** - * get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -static kprobe_opcode_t *get_insn_slot(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->nused < INSNS_PER_PAGE) { - int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (!kip->slot_used[i]) { - kip->slot_used[i] = 1; - kip->nused++; - return kip->insns + (i*MAX_INSN_SIZE); - } - } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; - } - } - - /* All out of space. Need to allocate a new page. Use slot 0.*/ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { - return NULL; - } - - /* - * For the %rip-relative displacement fixups to be doable, we - * need our instruction copy to be within +/- 2GB of any data it - * might access via %rip. That is, within 2GB of where the - * kernel image and loaded module images reside. So we allocate - * a page in the module loading area. - */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, 0, INSNS_PER_PAGE); - kip->slot_used[0] = 1; - kip->nused = 1; - return kip->insns; -} - -/** - * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). - */ -static void free_insn_slot(kprobe_opcode_t *slot) +int __init arch_init(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->insns <= slot - && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } - } - return; - } - } + return register_kprobe(&trampoline_p); } diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 1d91271796e..7577f9d7a75 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -482,6 +482,33 @@ out: } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * This special macro can be used to load a debugging register */ #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) @@ -599,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * } } + disable_tsc(prev_p, next_p); + return prev_p; } |