diff options
Diffstat (limited to 'arch/ia64/mm')
-rw-r--r-- | arch/ia64/mm/Makefile | 12 | ||||
-rw-r--r-- | arch/ia64/mm/contig.c | 299 | ||||
-rw-r--r-- | arch/ia64/mm/discontig.c | 737 | ||||
-rw-r--r-- | arch/ia64/mm/extable.c | 90 | ||||
-rw-r--r-- | arch/ia64/mm/fault.c | 261 | ||||
-rw-r--r-- | arch/ia64/mm/hugetlbpage.c | 357 | ||||
-rw-r--r-- | arch/ia64/mm/init.c | 597 | ||||
-rw-r--r-- | arch/ia64/mm/numa.c | 49 | ||||
-rw-r--r-- | arch/ia64/mm/tlb.c | 190 |
9 files changed, 2592 insertions, 0 deletions
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile new file mode 100644 index 00000000000..7078f67887e --- /dev/null +++ b/arch/ia64/mm/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the ia64-specific parts of the memory manager. +# + +obj-y := init.o fault.o tlb.o extable.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_DISCONTIGMEM) += discontig.o +ifndef CONFIG_DISCONTIGMEM +obj-y += contig.o +endif diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c new file mode 100644 index 00000000000..6daf15ac894 --- /dev/null +++ b/arch/ia64/mm/contig.c @@ -0,0 +1,299 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved. + * + * Routines used by ia64 machines with contiguous (or virtually contiguous) + * memory. + */ +#include <linux/config.h> +#include <linux/bootmem.h> +#include <linux/efi.h> +#include <linux/mm.h> +#include <linux/swap.h> + +#include <asm/meminit.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/mca.h> + +#ifdef CONFIG_VIRTUAL_MEM_MAP +static unsigned long num_dma_physpages; +#endif + +/** + * show_mem - display a memory statistics summary + * + * Just walks the pages in the system and describes where they're allocated. + */ +void +show_mem (void) +{ + int i, total = 0, reserved = 0; + int shared = 0, cached = 0; + + printk("Mem-info:\n"); + show_free_areas(); + + printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + i = max_mapnr; + while (i-- > 0) { + if (!pfn_valid(i)) + continue; + total++; + if (PageReserved(mem_map+i)) + reserved++; + else if (PageSwapCache(mem_map+i)) + cached++; + else if (page_count(mem_map + i)) + shared += page_count(mem_map + i) - 1; + } + printk("%d pages of RAM\n", total); + printk("%d reserved pages\n", reserved); + printk("%d pages shared\n", shared); + printk("%d pages swap cached\n", cached); + printk("%ld pages in page table cache\n", pgtable_cache_size); +} + +/* physical address where the bootmem map is located */ +unsigned long bootmap_start; + +/** + * find_max_pfn - adjust the maximum page number callback + * @start: start of range + * @end: end of range + * @arg: address of pointer to global max_pfn variable + * + * Passed as a callback function to efi_memmap_walk() to determine the highest + * available page frame number in the system. + */ +int +find_max_pfn (unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfnp = arg, pfn; + + pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT; + if (pfn > *max_pfnp) + *max_pfnp = pfn; + return 0; +} + +/** + * find_bootmap_location - callback to find a memory area for the bootmap + * @start: start of region + * @end: end of region + * @arg: unused callback data + * + * Find a place to put the bootmap and return its starting address in + * bootmap_start. This address must be page-aligned. + */ +int +find_bootmap_location (unsigned long start, unsigned long end, void *arg) +{ + unsigned long needed = *(unsigned long *)arg; + unsigned long range_start, range_end, free_start; + int i; + +#if IGNORE_PFN0 + if (start == PAGE_OFFSET) { + start += PAGE_SIZE; + if (start >= end) + return 0; + } +#endif + + free_start = PAGE_OFFSET; + + for (i = 0; i < num_rsvd_regions; i++) { + range_start = max(start, free_start); + range_end = min(end, rsvd_region[i].start & PAGE_MASK); + + free_start = PAGE_ALIGN(rsvd_region[i].end); + + if (range_end <= range_start) + continue; /* skip over empty range */ + + if (range_end - range_start >= needed) { + bootmap_start = __pa(range_start); + return -1; /* done */ + } + + /* nothing more available in this segment */ + if (range_end == end) + return 0; + } + return 0; +} + +/** + * find_memory - setup memory map + * + * Walk the EFI memory map and find usable memory for the system, taking + * into account reserved areas. + */ +void +find_memory (void) +{ + unsigned long bootmap_size; + + reserve_memory(); + + /* first find highest page frame number */ + max_pfn = 0; + efi_memmap_walk(find_max_pfn, &max_pfn); + + /* how many bytes to cover all the pages */ + bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT; + + /* look for a location to hold the bootmap */ + bootmap_start = ~0UL; + efi_memmap_walk(find_bootmap_location, &bootmap_size); + if (bootmap_start == ~0UL) + panic("Cannot find %ld bytes for bootmap\n", bootmap_size); + + bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn); + + /* Free all available memory, then mark bootmem-map as being in use. */ + efi_memmap_walk(filter_rsvd_memory, free_bootmem); + reserve_bootmem(bootmap_start, bootmap_size); + + find_initrd(); +} + +#ifdef CONFIG_SMP +/** + * per_cpu_init - setup per-cpu variables + * + * Allocate and setup per-cpu data areas. + */ +void * +per_cpu_init (void) +{ + void *cpu_data; + int cpu; + + /* + * get_free_pages() cannot be used before cpu_init() done. BSP + * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls + * get_zeroed_page(). + */ + if (smp_processor_id() == 0) { + cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, + PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); + __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } + } + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; +} +#endif /* CONFIG_SMP */ + +static int +count_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + *count += (end - start) >> PAGE_SHIFT; + return 0; +} + +#ifdef CONFIG_VIRTUAL_MEM_MAP +static int +count_dma_pages (u64 start, u64 end, void *arg) +{ + unsigned long *count = arg; + + if (start < MAX_DMA_ADDRESS) + *count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT; + return 0; +} +#endif + +/* + * Set up the page tables. + */ + +void +paging_init (void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; +#ifdef CONFIG_VIRTUAL_MEM_MAP + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long max_gap; +#endif + + /* initialize mem_map[] */ + + memset(zones_size, 0, sizeof(zones_size)); + + num_physpages = 0; + efi_memmap_walk(count_pages, &num_physpages); + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + +#ifdef CONFIG_VIRTUAL_MEM_MAP + memset(zholes_size, 0, sizeof(zholes_size)); + + num_dma_physpages = 0; + efi_memmap_walk(count_dma_pages, &num_dma_physpages); + + if (max_low_pfn < max_dma) { + zones_size[ZONE_DMA] = max_low_pfn; + zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; + } else { + zones_size[ZONE_DMA] = max_dma; + zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; + if (num_physpages > num_dma_physpages) { + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + zholes_size[ZONE_NORMAL] = + ((max_low_pfn - max_dma) - + (num_physpages - num_dma_physpages)); + } + } + + max_gap = 0; + efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); + if (max_gap < LARGE_GAP) { + vmem_map = (struct page *) 0; + free_area_init_node(0, &contig_page_data, zones_size, 0, + zholes_size); + } else { + unsigned long map_size; + + /* allocate virtual_mem_map */ + + map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmalloc_end -= map_size; + vmem_map = (struct page *) vmalloc_end; + efi_memmap_walk(create_mem_map_page_table, NULL); + + NODE_DATA(0)->node_mem_map = vmem_map; + free_area_init_node(0, &contig_page_data, zones_size, + 0, zholes_size); + + printk("Virtual mem_map starts at 0x%p\n", mem_map); + } +#else /* !CONFIG_VIRTUAL_MEM_MAP */ + if (max_low_pfn < max_dma) + zones_size[ZONE_DMA] = max_low_pfn; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + } + free_area_init(zones_size); +#endif /* !CONFIG_VIRTUAL_MEM_MAP */ + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); +} diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c new file mode 100644 index 00000000000..3456a9b6971 --- /dev/null +++ b/arch/ia64/mm/discontig.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Tony Luck <tony.luck@intel.com> + * Copyright (c) 2002 NEC Corp. + * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com> + * Copyright (c) 2004 Silicon Graphics, Inc + * Russ Anderson <rja@sgi.com> + * Jesse Barnes <jbarnes@sgi.com> + * Jack Steiner <steiner@sgi.com> + */ + +/* + * Platform initialization for Discontig Memory + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/nodemask.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/meminit.h> +#include <asm/numa.h> +#include <asm/sections.h> + +/* + * Track per-node information needed to setup the boot memory allocator, the + * per-node areas, and the real VM. + */ +struct early_node_data { + struct ia64_node_data *node_data; + pg_data_t *pgdat; + unsigned long pernode_addr; + unsigned long pernode_size; + struct bootmem_data bootmem_data; + unsigned long num_physpages; + unsigned long num_dma_physpages; + unsigned long min_pfn; + unsigned long max_pfn; +}; + +static struct early_node_data mem_data[MAX_NUMNODES] __initdata; + +/** + * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node + * + * This function will move nodes with only CPUs (no memory) + * to a node with memory which is at the minimum numa_slit distance. + * Any reassigments will result in the compression of the nodes + * and renumbering the nid values where appropriate. + * The static declarations below are to avoid large stack size which + * makes the code not re-entrant. + */ +static void __init reassign_cpu_only_nodes(void) +{ + struct node_memblk_s *p; + int i, j, k, nnode, nid, cpu, cpunid, pxm; + u8 cslit, slit; + static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata; + static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata; + static int node_flip[MAX_NUMNODES] __initdata; + static int old_nid_map[NR_CPUS] __initdata; + + for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++) + if (!test_bit(p->nid, (void *) nodes_with_mem)) { + set_bit(p->nid, (void *) nodes_with_mem); + nnode++; + } + + /* + * All nids with memory. + */ + if (nnode == num_online_nodes()) + return; + + /* + * Change nids and attempt to migrate CPU-only nodes + * to the best numa_slit (closest neighbor) possible. + * For reassigned CPU nodes a nid can't be arrived at + * until after this loop because the target nid's new + * identity might not have been established yet. So + * new nid values are fabricated above num_online_nodes() and + * mapped back later to their true value. + */ + /* MCD - This code is a bit complicated, but may be unnecessary now. + * We can now handle much more interesting node-numbering. + * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES + * and that there be no holes in the numbering 0..numnodes + * has become simply 0 <= nid <= MAX_NUMNODES. + */ + nid = 0; + for_each_online_node(i) { + if (test_bit(i, (void *) nodes_with_mem)) { + /* + * Save original nid value for numa_slit + * fixup and node_cpuid reassignments. + */ + node_flip[nid] = i; + + if (i == nid) { + nid++; + continue; + } + + for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++) + if (p->nid == i) + p->nid = nid; + + cpunid = nid; + nid++; + } else + cpunid = MAX_NUMNODES; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node_cpuid[cpu].nid == i) { + /* + * For nodes not being reassigned just + * fix the cpu's nid and reverse pxm map + */ + if (cpunid < MAX_NUMNODES) { + pxm = nid_to_pxm_map[i]; + pxm_to_nid_map[pxm] = + node_cpuid[cpu].nid = cpunid; + continue; + } + + /* + * For nodes being reassigned, find best node by + * numa_slit information and then make a temporary + * nid value based on current nid and num_online_nodes(). + */ + slit = 0xff; + k = 2*num_online_nodes(); + for_each_online_node(j) { + if (i == j) + continue; + else if (test_bit(j, (void *) nodes_with_mem)) { + cslit = numa_slit[i * num_online_nodes() + j]; + if (cslit < slit) { + k = num_online_nodes() + j; + slit = cslit; + } + } + } + + /* save old nid map so we can update the pxm */ + old_nid_map[cpu] = node_cpuid[cpu].nid; + node_cpuid[cpu].nid = k; + } + } + + /* + * Fixup temporary nid values for CPU-only nodes. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node_cpuid[cpu].nid == (2*num_online_nodes())) { + pxm = nid_to_pxm_map[old_nid_map[cpu]]; + pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1; + } else { + for (i = 0; i < nnode; i++) { + if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes())) + continue; + + pxm = nid_to_pxm_map[old_nid_map[cpu]]; + pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i; + break; + } + } + + /* + * Fix numa_slit by compressing from larger + * nid array to reduced nid array. + */ + for (i = 0; i < nnode; i++) + for (j = 0; j < nnode; j++) + numa_slit_fix[i * nnode + j] = + numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]]; + + memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit)); + + nodes_clear(node_online_map); + for (i = 0; i < nnode; i++) + node_set_online(i); + + return; +} + +/* + * To prevent cache aliasing effects, align per-node structures so that they + * start at addresses that are strided by node number. + */ +#define NODEDATA_ALIGN(addr, node) \ + ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE) + +/** + * build_node_maps - callback to setup bootmem structs for each node + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * We allocate a struct bootmem_data for each piece of memory that we wish to + * treat as a virtually contiguous block (i.e. each node). Each such block + * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down + * if necessary. Any non-existent pages will simply be part of the virtual + * memmap. We also update min_low_pfn and max_low_pfn here as we receive + * memory ranges from the caller. + */ +static int __init build_node_maps(unsigned long start, unsigned long len, + int node) +{ + unsigned long cstart, epfn, end = start + len; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; + + epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT; + cstart = GRANULEROUNDDOWN(start); + + if (!bdp->node_low_pfn) { + bdp->node_boot_start = cstart; + bdp->node_low_pfn = epfn; + } else { + bdp->node_boot_start = min(cstart, bdp->node_boot_start); + bdp->node_low_pfn = max(epfn, bdp->node_low_pfn); + } + + min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT); + max_low_pfn = max(max_low_pfn, bdp->node_low_pfn); + + return 0; +} + +/** + * early_nr_phys_cpus_node - return number of physical cpus on a given node + * @node: node to check + * + * Count the number of physical cpus on @node. These are cpus that actually + * exist. We can't use nr_cpus_node() yet because + * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been + * called yet. + */ +static int early_nr_phys_cpus_node(int node) +{ + int cpu, n = 0; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node == node_cpuid[cpu].nid) + if ((cpu == 0) || node_cpuid[cpu].phys_id) + n++; + + return n; +} + + +/** + * early_nr_cpus_node - return number of cpus on a given node + * @node: node to check + * + * Count the number of cpus on @node. We can't use nr_cpus_node() yet because + * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been + * called yet. Note that node 0 will also count all non-existent cpus. + */ +static int early_nr_cpus_node(int node) +{ + int cpu, n = 0; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (node == node_cpuid[cpu].nid) + n++; + + return n; +} + +/** + * find_pernode_space - allocate memory for memory map and per-node structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * This routine reserves space for the per-cpu data struct, the list of + * pg_data_ts and the per-node data struct. Each node will have something like + * the following in the first chunk of addr. space large enough to hold it. + * + * ________________________ + * | | + * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first + * | PERCPU_PAGE_SIZE * | start and length big enough + * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus. + * |------------------------| + * | local pg_data_t * | + * |------------------------| + * | local ia64_node_data | + * |------------------------| + * | ??? | + * |________________________| + * + * Once this space has been set aside, the bootmem maps are initialized. We + * could probably move the allocation of the per-cpu and ia64_node_data space + * outside of this function and use alloc_bootmem_node(), but doing it here + * is straightforward and we get the alignments we want so... + */ +static int __init find_pernode_space(unsigned long start, unsigned long len, + int node) +{ + unsigned long epfn, cpu, cpus, phys_cpus; + unsigned long pernodesize = 0, pernode, pages, mapsize; + void *cpu_data; + struct bootmem_data *bdp = &mem_data[node].bootmem_data; + + epfn = (start + len) >> PAGE_SHIFT; + + pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT); + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + + /* + * Make sure this memory falls within this node's usable memory + * since we may have thrown some away in build_maps(). + */ + if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn) + return 0; + + /* Don't setup this node's local space twice... */ + if (mem_data[node].pernode_addr) + return 0; + + /* + * Calculate total size needed, incl. what's necessary + * for good alignment and alias prevention. + */ + cpus = early_nr_cpus_node(node); + phys_cpus = early_nr_phys_cpus_node(node); + pernodesize += PERCPU_PAGE_SIZE * cpus; + pernodesize += node * L1_CACHE_BYTES; + pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); + pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + pernodesize = PAGE_ALIGN(pernodesize); + pernode = NODEDATA_ALIGN(start, node); + + /* Is this range big enough for what we want to store here? */ + if (start + len > (pernode + pernodesize + mapsize)) { + mem_data[node].pernode_addr = pernode; + mem_data[node].pernode_size = pernodesize; + memset(__va(pernode), 0, pernodesize); + + cpu_data = (void *)pernode; + pernode += PERCPU_PAGE_SIZE * cpus; + pernode += node * L1_CACHE_BYTES; + + mem_data[node].pgdat = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + mem_data[node].node_data = __va(pernode); + pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + + mem_data[node].pgdat->bdata = bdp; + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); + + /* + * Copy the static per-cpu data into the region we + * just set aside and then setup __per_cpu_offset + * for each CPU on this node. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (node == node_cpuid[cpu].nid) { + memcpy(__va(cpu_data), __phys_per_cpu_start, + __per_cpu_end - __per_cpu_start); + __per_cpu_offset[cpu] = (char*)__va(cpu_data) - + __per_cpu_start; + cpu_data += PERCPU_PAGE_SIZE; + } + } + } + + return 0; +} + +/** + * free_node_bootmem - free bootmem allocator memory for use + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * Simply calls the bootmem allocator to free the specified ranged from + * the given pg_data_t's bdata struct. After this function has been called + * for all the entries in the EFI memory map, the bootmem allocator will + * be ready to service allocation requests. + */ +static int __init free_node_bootmem(unsigned long start, unsigned long len, + int node) +{ + free_bootmem_node(mem_data[node].pgdat, start, len); + + return 0; +} + +/** + * reserve_pernode_space - reserve memory for per-node space + * + * Reserve the space used by the bootmem maps & per-node space in the boot + * allocator so that when we actually create the real mem maps we don't + * use their memory. + */ +static void __init reserve_pernode_space(void) +{ + unsigned long base, size, pages; + struct bootmem_data *bdp; + int node; + + for_each_online_node(node) { + pg_data_t *pdp = mem_data[node].pgdat; + + bdp = pdp->bdata; + + /* First the bootmem_map itself */ + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + base = __pa(bdp->node_bootmem_map); + reserve_bootmem_node(pdp, base, size); + + /* Now the per-node space */ + size = mem_data[node].pernode_size; + base = __pa(mem_data[node].pernode_addr); + reserve_bootmem_node(pdp, base, size); + } +} + +/** + * initialize_pernode_data - fixup per-cpu & per-node pointers + * + * Each node's per-node area has a copy of the global pg_data_t list, so + * we copy that to each node here, as well as setting the per-cpu pointer + * to the local node data structure. The active_cpus field of the per-node + * structure gets setup by the platform_cpu_init() function later. + */ +static void __init initialize_pernode_data(void) +{ + int cpu, node; + pg_data_t *pgdat_list[MAX_NUMNODES]; + + for_each_online_node(node) + pgdat_list[node] = mem_data[node].pgdat; + + /* Copy the pg_data_t list to each node and init the node field */ + for_each_online_node(node) { + memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, + sizeof(pgdat_list)); + } + + /* Set the node_data pointer for each per-cpu struct */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + node = node_cpuid[cpu].nid; + per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; + } +} + +/** + * find_memory - walk the EFI memory map and setup the bootmem allocator + * + * Called early in boot to setup the bootmem allocator, and to + * allocate the per-cpu and per-node structures. + */ +void __init find_memory(void) +{ + int node; + + reserve_memory(); + + if (num_online_nodes() == 0) { + printk(KERN_ERR "node info missing!\n"); + node_set_online(0); + } + + min_low_pfn = -1; + max_low_pfn = 0; + + if (num_online_nodes() > 1) + reassign_cpu_only_nodes(); + + /* These actually end up getting called by call_pernode_memory() */ + efi_memmap_walk(filter_rsvd_memory, build_node_maps); + efi_memmap_walk(filter_rsvd_memory, find_pernode_space); + + /* + * Initialize the boot memory maps in reverse order since that's + * what the bootmem allocator expects + */ + for (node = MAX_NUMNODES - 1; node >= 0; node--) { + unsigned long pernode, pernodesize, map; + struct bootmem_data *bdp; + + if (!node_online(node)) + continue; + + bdp = &mem_data[node].bootmem_data; + pernode = mem_data[node].pernode_addr; + pernodesize = mem_data[node].pernode_size; + map = pernode + pernodesize; + + /* Sanity check... */ + if (!pernode) + panic("pernode space for node %d " + "could not be allocated!", node); + + init_bootmem_node(mem_data[node].pgdat, + map>>PAGE_SHIFT, + bdp->node_boot_start>>PAGE_SHIFT, + bdp->node_low_pfn); + } + + efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); + + reserve_pernode_space(); + initialize_pernode_data(); + + max_pfn = max_low_pfn; + + find_initrd(); +} + +/** + * per_cpu_init - setup per-cpu variables + * + * find_pernode_space() does most of this already, we just need to set + * local_per_cpu_offset + */ +void *per_cpu_init(void) +{ + int cpu; + + if (smp_processor_id() == 0) { + for (cpu = 0; cpu < NR_CPUS; cpu++) { + per_cpu(local_per_cpu_offset, cpu) = + __per_cpu_offset[cpu]; + } + } + + return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; +} + +/** + * show_mem - give short summary of memory stats + * + * Shows a simple page count of reserved and used pages in the system. + * For discontig machines, it does this on a per-pgdat basis. + */ +void show_mem(void) +{ + int i, total_reserved = 0; + int total_shared = 0, total_cached = 0; + unsigned long total_present = 0; + pg_data_t *pgdat; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + unsigned long present = pgdat->node_present_pages; + int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + for(i = 0; i < pgdat->node_spanned_pages; i++) { + if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) + continue; + if (PageReserved(pgdat->node_mem_map+i)) + reserved++; + else if (PageSwapCache(pgdat->node_mem_map+i)) + cached++; + else if (page_count(pgdat->node_mem_map+i)) + shared += page_count(pgdat->node_mem_map+i)-1; + } + total_present += present; + total_reserved += reserved; + total_cached += cached; + total_shared += shared; + printk("\t%ld pages of RAM\n", present); + printk("\t%d reserved pages\n", reserved); + printk("\t%d pages shared\n", shared); + printk("\t%d pages swap cached\n", cached); + } + printk("%ld pages of RAM\n", total_present); + printk("%d reserved pages\n", total_reserved); + printk("%d pages shared\n", total_shared); + printk("%d pages swap cached\n", total_cached); + printk("Total of %ld pages in page table cache\n", pgtable_cache_size); + printk("%d free buffer pages\n", nr_free_buffer_pages()); +} + +/** + * call_pernode_memory - use SRAT to call callback functions with node info + * @start: physical start of range + * @len: length of range + * @arg: function to call for each range + * + * efi_memmap_walk() knows nothing about layout of memory across nodes. Find + * out to which node a block of memory belongs. Ignore memory that we cannot + * identify, and split blocks that run across multiple nodes. + * + * Take this opportunity to round the start address up and the end address + * down to page boundaries. + */ +void call_pernode_memory(unsigned long start, unsigned long len, void *arg) +{ + unsigned long rs, re, end = start + len; + void (*func)(unsigned long, unsigned long, int); + int i; + + start = PAGE_ALIGN(start); + end &= PAGE_MASK; + if (start >= end) + return; + + func = arg; + + if (!num_node_memblks) { + /* No SRAT table, so assume one node (node 0) */ + if (start < end) + (*func)(start, end - start, 0); + return; + } + + for (i = 0; i < num_node_memblks; i++) { + rs = max(start, node_memblk[i].start_paddr); + re = min(end, node_memblk[i].start_paddr + + node_memblk[i].size); + + if (rs < re) + (*func)(rs, re - rs, node_memblk[i].nid); + + if (re == end) + break; + } +} + +/** + * count_node_pages - callback to build per-node memory info structures + * @start: physical start of range + * @len: length of range + * @node: node where this range resides + * + * Each node has it's own number of physical pages, DMAable pages, start, and + * end page frame number. This routine will be called by call_pernode_memory() + * for each piece of usable memory and will setup these values for each node. + * Very similar to build_maps(). + */ +static __init int count_node_pages(unsigned long start, unsigned long len, int node) +{ + unsigned long end = start + len; + + mem_data[node].num_physpages += len >> PAGE_SHIFT; + if (start <= __pa(MAX_DMA_ADDRESS)) + mem_data[node].num_dma_physpages += + (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; + start = GRANULEROUNDDOWN(start); + start = ORDERROUNDDOWN(start); + end = GRANULEROUNDUP(end); + mem_data[node].max_pfn = max(mem_data[node].max_pfn, + end >> PAGE_SHIFT); + mem_data[node].min_pfn = min(mem_data[node].min_pfn, + start >> PAGE_SHIFT); + + return 0; +} + +/** + * paging_init - setup page tables + * + * paging_init() sets up the page tables for each node of the system and frees + * the bootmem allocator memory for general use. + */ +void __init paging_init(void) +{ + unsigned long max_dma; + unsigned long zones_size[MAX_NR_ZONES]; + unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long pfn_offset = 0; + int node; + + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + + /* so min() will work in count_node_pages */ + for_each_online_node(node) + mem_data[node].min_pfn = ~0UL; + + efi_memmap_walk(filter_rsvd_memory, count_node_pages); + + for_each_online_node(node) { + memset(zones_size, 0, sizeof(zones_size)); + memset(zholes_size, 0, sizeof(zholes_size)); + + num_physpages += mem_data[node].num_physpages; + + if (mem_data[node].min_pfn >= max_dma) { + /* All of this node's memory is above ZONE_DMA */ + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_physpages; + } else if (mem_data[node].max_pfn < max_dma) { + /* All of this node's memory is in ZONE_DMA */ + zones_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = mem_data[node].max_pfn - + mem_data[node].min_pfn - + mem_data[node].num_dma_physpages; + } else { + /* This node has memory in both zones */ + zones_size[ZONE_DMA] = max_dma - + mem_data[node].min_pfn; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - + mem_data[node].num_dma_physpages; + zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - + max_dma; + zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - + (mem_data[node].num_physpages - + mem_data[node].num_dma_physpages); + } + + if (node == 0) { + vmalloc_end -= + PAGE_ALIGN(max_low_pfn * sizeof(struct page)); + vmem_map = (struct page *) vmalloc_end; + + efi_memmap_walk(create_mem_map_page_table, NULL); + printk("Virtual mem_map starts at 0x%p\n", vmem_map); + } + + pfn_offset = mem_data[node].min_pfn; + + NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; + free_area_init_node(node, NODE_DATA(node), zones_size, + pfn_offset, zholes_size); + } + + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); +} diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c new file mode 100644 index 00000000000..6d259e34f35 --- /dev/null +++ b/arch/ia64/mm/extable.c @@ -0,0 +1,90 @@ +/* + * Kernel exception handling table support. Derived from arch/alpha/mm/extable.c. + * + * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <linux/config.h> +#include <linux/sort.h> + +#include <asm/uaccess.h> +#include <asm/module.h> + +static int cmp_ex(const void *a, const void *b) +{ + const struct exception_table_entry *l = a, *r = b; + u64 lip = (u64) &l->addr + l->addr; + u64 rip = (u64) &r->addr + r->addr; + + /* avoid overflow */ + if (lip > rip) + return 1; + if (lip < rip) + return -1; + return 0; +} + +static void swap_ex(void *a, void *b, int size) +{ + struct exception_table_entry *l = a, *r = b, tmp; + u64 delta = (u64) r - (u64) l; + + tmp = *l; + l->addr = r->addr + delta; + l->cont = r->cont + delta; + r->addr = tmp.addr - delta; + r->cont = tmp.cont - delta; +} + +/* + * Sort the exception table. It's usually already sorted, but there + * may be unordered entries due to multiple text sections (such as the + * .init text section). Note that the exception-table-entries contain + * location-relative addresses, which requires a bit of care during + * sorting to avoid overflows in the offset members (e.g., it would + * not be safe to make a temporary copy of an exception-table entry on + * the stack, because the stack may be more than 2GB away from the + * exception-table). + */ +void sort_extable (struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + sort(start, finish - start, sizeof(struct exception_table_entry), + cmp_ex, swap_ex); +} + +const struct exception_table_entry * +search_extable (const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long ip) +{ + const struct exception_table_entry *mid; + unsigned long mid_ip; + long diff; + + while (first <= last) { + mid = &first[(last - first)/2]; + mid_ip = (u64) &mid->addr + mid->addr; + diff = mid_ip - ip; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid + 1; + else + last = mid - 1; + } + return NULL; +} + +void +ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e) +{ + long fix = (u64) &e->cont + e->cont; + + regs->r8 = -EFAULT; + if (fix & 4) + regs->r9 = 0; + regs->cr_iip = fix & ~0xf; + ia64_psr(regs)->ri = fix & 0x3; /* set continuation slot number */ +} diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c new file mode 100644 index 00000000000..da859125aae --- /dev/null +++ b/arch/ia64/mm/fault.c @@ -0,0 +1,261 @@ +/* + * MMU fault handling support. + * + * Copyright (C) 1998-2002 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> + +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> + +extern void die (char *, struct pt_regs *, long); + +/* + * This routine is analogous to expand_stack() but instead grows the + * register backing store (which grows towards higher addresses). + * Since the register backing store is access sequentially, we + * disallow growing the RBS by more than a page at a time. Note that + * the VM_GROWSUP flag can be set on any VM area but that's fine + * because the total process size is still limited by RLIMIT_STACK and + * RLIMIT_AS. + */ +static inline long +expand_backing_store (struct vm_area_struct *vma, unsigned long address) +{ + unsigned long grow; + + grow = PAGE_SIZE >> PAGE_SHIFT; + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur + || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) + return -ENOMEM; + vma->vm_end += PAGE_SIZE; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); + return 0; +} + +/* + * Return TRUE if ADDRESS points at a page in the kernel's mapped segment + * (inside region 5, on ia64) and that page is present. + */ +static int +mapped_kernel_page_is_present (unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || pud_bad(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + return 0; + + ptep = pte_offset_kernel(pmd, address); + if (!ptep) + return 0; + + pte = *ptep; + return pte_present(pte); +} + +void +ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs) +{ + int signal = SIGSEGV, code = SEGV_MAPERR; + struct vm_area_struct *vma, *prev_vma; + struct mm_struct *mm = current->mm; + struct siginfo si; + unsigned long mask; + + /* + * If we're in an interrupt or have no user context, we must not take the fault.. + */ + if (in_atomic() || !mm) + goto no_context; + +#ifdef CONFIG_VIRTUAL_MEM_MAP + /* + * If fault is in region 5 and we are in the kernel, we may already + * have the mmap_sem (pfn_valid macro is called during mmap). There + * is no vma for region 5 addr's anyway, so skip getting the semaphore + * and go directly to the exception handling code. + */ + + if ((REGION_NUMBER(address) == 5) && !user_mode(regs)) + goto bad_area_no_up; +#endif + + down_read(&mm->mmap_sem); + + vma = find_vma_prev(mm, address, &prev_vma); + if (!vma) + goto bad_area; + + /* find_vma_prev() returns vma such that address < vma->vm_end or NULL */ + if (address < vma->vm_start) + goto check_expansion; + + good_area: + code = SEGV_ACCERR; + + /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ + +# define VM_READ_BIT 0 +# define VM_WRITE_BIT 1 +# define VM_EXEC_BIT 2 + +# if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \ + || (1 << VM_EXEC_BIT) != VM_EXEC) +# error File is out of sync with <linux/mm.h>. Please update. +# endif + + mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) + | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT) + | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT)); + + if ((vma->vm_flags & mask) != mask) + goto bad_area; + + survive: + /* + * If for any reason at all we couldn't handle the fault, make + * sure we exit gracefully rather than endlessly redo the + * fault. + */ + switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) { + case VM_FAULT_MINOR: + ++current->min_flt; + break; + case VM_FAULT_MAJOR: + ++current->maj_flt; + break; + case VM_FAULT_SIGBUS: + /* + * We ran out of memory, or some other thing happened + * to us that made us unable to handle the page fault + * gracefully. + */ + signal = SIGBUS; + goto bad_area; + case VM_FAULT_OOM: + goto out_of_memory; + default: + BUG(); + } + up_read(&mm->mmap_sem); + return; + + check_expansion: + if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) + || REGION_OFFSET(address) >= RGN_MAP_LIMIT) + goto bad_area; + if (expand_stack(vma, address)) + goto bad_area; + } else { + vma = prev_vma; + if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) + || REGION_OFFSET(address) >= RGN_MAP_LIMIT) + goto bad_area; + if (expand_backing_store(vma, address)) + goto bad_area; + } + goto good_area; + + bad_area: + up_read(&mm->mmap_sem); +#ifdef CONFIG_VIRTUAL_MEM_MAP + bad_area_no_up: +#endif + if ((isr & IA64_ISR_SP) + || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) + { + /* + * This fault was due to a speculative load or lfetch.fault, set the "ed" + * bit in the psr to ensure forward progress. (Target register will get a + * NaT for ld.s, lfetch will be canceled.) + */ + ia64_psr(regs)->ed = 1; + return; + } + if (user_mode(regs)) { + si.si_signo = signal; + si.si_errno = 0; + si.si_code = code; + si.si_addr = (void __user *) address; + si.si_isr = isr; + si.si_flags = __ISR_VALID; + force_sig_info(signal, &si, current); + return; + } + + no_context: + if (isr & IA64_ISR_SP) { + /* + * This fault was due to a speculative load set the "ed" bit in the psr to + * ensure forward progress (target register will get a NaT). + */ + ia64_psr(regs)->ed = 1; + return; + } + + if (ia64_done_with_exception(regs)) + return; + + /* + * Since we have no vma's for region 5, we might get here even if the address is + * valid, due to the VHPT walker inserting a non present translation that becomes + * stale. If that happens, the non present fault handler already purged the stale + * translation, which fixed the problem. So, we check to see if the translation is + * valid, and return if it is. + */ + if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to terminate things + * with extreme prejudice. + */ + bust_spinlocks(1); + + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address); + else + printk(KERN_ALERT "Unable to handle kernel paging request at " + "virtual address %016lx\n", address); + die("Oops", regs, isr); + bust_spinlocks(0); + do_exit(SIGKILL); + return; + + out_of_memory: + up_read(&mm->mmap_sem); + if (current->pid == 1) { + yield(); + down_read(&mm->mmap_sem); + goto survive; + } + printk(KERN_CRIT "VM: killing process %s\n", current->comm); + if (user_mode(regs)) + do_exit(SIGKILL); + goto no_context; +} diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c new file mode 100644 index 00000000000..40ad8328ffd --- /dev/null +++ b/arch/ia64/mm/hugetlbpage.c @@ -0,0 +1,357 @@ +/* + * IA-64 Huge TLB Page Support for Kernel. + * + * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com> + * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com> + * + * Sep, 2003: add numa support + * Feb, 2004: dynamic hugetlb page size via boot parameter + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <asm/mman.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; + +static pte_t * +huge_pte_alloc (struct mm_struct *mm, unsigned long addr) +{ + unsigned long taddr = htlbpage_to_page(addr); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, taddr); + pud = pud_alloc(mm, pgd, taddr); + if (pud) { + pmd = pmd_alloc(mm, pud, taddr); + if (pmd) + pte = pte_alloc_map(mm, pmd, taddr); + } + return pte; +} + +static pte_t * +huge_pte_offset (struct mm_struct *mm, unsigned long addr) +{ + unsigned long taddr = htlbpage_to_page(addr); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, taddr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, taddr); + if (pud_present(*pud)) { + pmd = pmd_offset(pud, taddr); + if (pmd_present(*pmd)) + pte = pte_offset_map(pmd, taddr); + } + } + + return pte; +} + +#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } + +static void +set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, int write_access) +{ + pte_t entry; + + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + if (write_access) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = pte_mkyoung(entry); + mk_pte_huge(entry); + set_pte(page_table, entry); + return; +} +/* + * This function checks for proper alignment of input addr and len parameters. + */ +int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + if (REGION_NUMBER(addr) != REGION_HPAGE) + return -EINVAL; + + return 0; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + set_pte(dst_pte, entry); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + addr += HPAGE_SIZE; + } + return 0; +nomem: + return -ENOMEM; +} + +int +follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *st, int *length, int i) +{ + pte_t *ptep, pte; + unsigned long start = *st; + unsigned long pstart; + int len = *length; + struct page *page; + + do { + pstart = start & HPAGE_MASK; + ptep = huge_pte_offset(mm, start); + pte = *ptep; + +back1: + page = pte_page(pte); + if (pages) { + page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT); + get_page(page); + pages[i] = page; + } + if (vmas) + vmas[i] = vma; + i++; + len--; + start += PAGE_SIZE; + if (((start & HPAGE_MASK) == pstart) && len && + (start < vma->vm_end)) + goto back1; + } while (len && start < vma->vm_end); + *length = len; + *st = start; + return i; +} + +struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write) +{ + struct page *page; + pte_t *ptep; + + if (REGION_NUMBER(addr) != REGION_HPAGE) + return ERR_PTR(-EINVAL); + + ptep = huge_pte_offset(mm, addr); + if (!ptep || pte_none(*ptep)) + return NULL; + page = pte_page(*ptep); + page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); + return page; +} +int pmd_huge(pmd_t pmd) +{ + return 0; +} +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) +{ + return NULL; +} + +/* + * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset + * are hugetlb region specific. + */ +void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & HUGETLB_PGDIR_MASK; + unsigned long last = end + HUGETLB_PGDIR_SIZE - 1; + struct mm_struct *mm = tlb->mm; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end; + break; + } +no_mmaps: + if (last < first) /* for arches with discontiguous pgd indices */ + return; + clear_page_range(tlb, first, last); +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + struct page *page; + + BUG_ON(start & (HPAGE_SIZE - 1)); + BUG_ON(end & (HPAGE_SIZE - 1)); + + for (address = start; address < end; address += HPAGE_SIZE) { + pte = huge_pte_offset(mm, address); + if (pte_none(*pte)) + continue; + page = pte_page(*pte); + put_page(page); + pte_clear(mm, address, pte); + } + add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + if (!pte_none(*pte)) + continue; + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + page_cache_release(page); + goto out; + } + } + set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct vm_area_struct *vmm; + + if (len > RGN_MAP_LIMIT) + return -ENOMEM; + if (len & ~HPAGE_MASK) + return -EINVAL; + /* This code assumes that REGION_HPAGE != 0. */ + if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1))) + addr = HPAGE_REGION_BASE; + else + addr = ALIGN(addr, HPAGE_SIZE); + for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { + /* At this point: (!vmm || addr < vmm->vm_end). */ + if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT) + return -ENOMEM; + if (!vmm || (addr + len) <= vmm->vm_start) + return addr; + addr = ALIGN(vmm->vm_end, HPAGE_SIZE); + } +} + +static int __init hugetlb_setup_sz(char *str) +{ + u64 tr_pages; + unsigned long long size; + + if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0) + /* + * shouldn't happen, but just in case. + */ + tr_pages = 0x15557000UL; + + size = memparse(str, &str); + if (*str || (size & (size-1)) || !(tr_pages & size) || + size <= PAGE_SIZE || + size >= (1UL << PAGE_SHIFT << MAX_ORDER)) { + printk(KERN_WARNING "Invalid huge page size specified\n"); + return 1; + } + + hpage_shift = __ffs(size); + /* + * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT + * override here with new page shift. + */ + ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2); + return 1; +} +__setup("hugepagesz=", hugetlb_setup_sz); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c new file mode 100644 index 00000000000..65cf839573e --- /dev/null +++ b/arch/ia64/mm/init.c @@ -0,0 +1,597 @@ +/* + * Initialize MMU support. + * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> + +#include <linux/bootmem.h> +#include <linux/efi.h> +#include <linux/elf.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/personality.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/bitops.h> + +#include <asm/a.out.h> +#include <asm/dma.h> +#include <asm/ia32.h> +#include <asm/io.h> +#include <asm/machvec.h> +#include <asm/numa.h> +#include <asm/patch.h> +#include <asm/pgalloc.h> +#include <asm/sal.h> +#include <asm/sections.h> +#include <asm/system.h> +#include <asm/tlb.h> +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/mca.h> + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +extern void ia64_tlb_init (void); + +unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; + +#ifdef CONFIG_VIRTUAL_MEM_MAP +unsigned long vmalloc_end = VMALLOC_END_INIT; +EXPORT_SYMBOL(vmalloc_end); +struct page *vmem_map; +EXPORT_SYMBOL(vmem_map); +#endif + +static int pgt_cache_water[2] = { 25, 50 }; + +struct page *zero_page_memmap_ptr; /* map entry for zero page */ +EXPORT_SYMBOL(zero_page_memmap_ptr); + +void +check_pgt_cache (void) +{ + int low, high; + + low = pgt_cache_water[0]; + high = pgt_cache_water[1]; + + preempt_disable(); + if (pgtable_cache_size > (u64) high) { + do { + if (pgd_quicklist) + free_page((unsigned long)pgd_alloc_one_fast(NULL)); + if (pmd_quicklist) + free_page((unsigned long)pmd_alloc_one_fast(NULL, 0)); + } while (pgtable_cache_size > (u64) low); + } + preempt_enable(); +} + +void +lazy_mmu_prot_update (pte_t pte) +{ + unsigned long addr; + struct page *page; + + if (!pte_exec(pte)) + return; /* not an executable page... */ + + page = pte_page(pte); + addr = (unsigned long) page_address(page); + + if (test_bit(PG_arch_1, &page->flags)) + return; /* i-cache is already coherent with d-cache */ + + flush_icache_range(addr, addr + PAGE_SIZE); + set_bit(PG_arch_1, &page->flags); /* mark page as clean */ +} + +inline void +ia64_set_rbs_bot (void) +{ + unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16; + + if (stack_size > MAX_USER_STACK_SIZE) + stack_size = MAX_USER_STACK_SIZE; + current->thread.rbs_bot = STACK_TOP - stack_size; +} + +/* + * This performs some platform-dependent address space initialization. + * On IA-64, we want to setup the VM area for the register backing + * store (which grows upwards) and install the gateway page which is + * used for signal trampolines, etc. + */ +void +ia64_init_addr_space (void) +{ + struct vm_area_struct *vma; + + ia64_set_rbs_bot(); + + /* + * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore + * the problem. When the process attempts to write to the register backing store + * for the first time, it will get a SEGFAULT in this case. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (vma) { + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = current->mm; + vma->vm_start = current->thread.rbs_bot & PAGE_MASK; + vma->vm_end = vma->vm_start + PAGE_SIZE; + vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; + vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP; + down_write(¤t->mm->mmap_sem); + if (insert_vm_struct(current->mm, vma)) { + up_write(¤t->mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); + return; + } + up_write(¤t->mm->mmap_sem); + } + + /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ + if (!(current->personality & MMAP_PAGE_ZERO)) { + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (vma) { + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = current->mm; + vma->vm_end = PAGE_SIZE; + vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + down_write(¤t->mm->mmap_sem); + if (insert_vm_struct(current->mm, vma)) { + up_write(¤t->mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); + return; + } + up_write(¤t->mm->mmap_sem); + } + } +} + +void +free_initmem (void) +{ + unsigned long addr, eaddr; + + addr = (unsigned long) ia64_imva(__init_begin); + eaddr = (unsigned long) ia64_imva(__init_end); + while (addr < eaddr) { + ClearPageReserved(virt_to_page(addr)); + set_page_count(virt_to_page(addr), 1); + free_page(addr); + ++totalram_pages; + addr += PAGE_SIZE; + } + printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n", + (__init_end - __init_begin) >> 10); +} + +void +free_initrd_mem (unsigned long start, unsigned long end) +{ + struct page *page; + /* + * EFI uses 4KB pages while the kernel can use 4KB or bigger. + * Thus EFI and the kernel may have different page sizes. It is + * therefore possible to have the initrd share the same page as + * the end of the kernel (given current setup). + * + * To avoid freeing/using the wrong page (kernel sized) we: + * - align up the beginning of initrd + * - align down the end of initrd + * + * | | + * |=============| a000 + * | | + * | | + * | | 9000 + * |/////////////| + * |/////////////| + * |=============| 8000 + * |///INITRD////| + * |/////////////| + * |/////////////| 7000 + * | | + * |KKKKKKKKKKKKK| + * |=============| 6000 + * |KKKKKKKKKKKKK| + * |KKKKKKKKKKKKK| + * K=kernel using 8KB pages + * + * In this example, we must free page 8000 ONLY. So we must align up + * initrd_start and keep initrd_end as is. + */ + start = PAGE_ALIGN(start); + end = end & PAGE_MASK; + + if (start < end) + printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10); + + for (; start < end; start += PAGE_SIZE) { + if (!virt_addr_valid(start)) + continue; + page = virt_to_page(start); + ClearPageReserved(page); + set_page_count(page, 1); + free_page(start); + ++totalram_pages; + } +} + +/* + * This installs a clean page in the kernel's page table. + */ +struct page * +put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (!PageReserved(page)) + printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n", + page_address(page)); + + pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */ + + spin_lock(&init_mm.page_table_lock); + { + pud = pud_alloc(&init_mm, pgd, address); + if (!pud) + goto out; + + pmd = pmd_alloc(&init_mm, pud, address); + if (!pmd) + goto out; + pte = pte_alloc_map(&init_mm, pmd, address); + if (!pte) + goto out; + if (!pte_none(*pte)) { + pte_unmap(pte); + goto out; + } + set_pte(pte, mk_pte(page, pgprot)); + pte_unmap(pte); + } + out: spin_unlock(&init_mm.page_table_lock); + /* no need for flush_tlb */ + return page; +} + +static void +setup_gate (void) +{ + struct page *page; + + /* + * Map the gate page twice: once read-only to export the ELF headers etc. and once + * execute-only page to enable privilege-promotion via "epc": + */ + page = virt_to_page(ia64_imva(__start_gate_section)); + put_kernel_page(page, GATE_ADDR, PAGE_READONLY); +#ifdef HAVE_BUGGY_SEGREL + page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE)); + put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE); +#else + put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE); +#endif + ia64_patch_gate(); +} + +void __devinit +ia64_mmu_init (void *my_cpu_data) +{ + unsigned long psr, pta, impl_va_bits; + extern void __devinit tlb_init (void); + +#ifdef CONFIG_DISABLE_VHPT +# define VHPT_ENABLE_BIT 0 +#else +# define VHPT_ENABLE_BIT 1 +#endif + + /* Pin mapping for percpu area into TLB */ + psr = ia64_clear_ic(); + ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR, + pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)), + PERCPU_PAGE_SHIFT); + + ia64_set_psr(psr); + ia64_srlz_i(); + + /* + * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped + * address space. The IA-64 architecture guarantees that at least 50 bits of + * virtual address space are implemented but if we pick a large enough page size + * (e.g., 64KB), the mapped address space is big enough that it will overlap with + * VMLPT. I assume that once we run on machines big enough to warrant 64KB pages, + * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a + * problem in practice. Alternatively, we could truncate the top of the mapped + * address space to not permit mappings that would overlap with the VMLPT. + * --davidm 00/12/06 + */ +# define pte_bits 3 +# define mapped_space_bits (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT) + /* + * The virtual page table has to cover the entire implemented address space within + * a region even though not all of this space may be mappable. The reason for + * this is that the Access bit and Dirty bit fault handlers perform + * non-speculative accesses to the virtual page table, so the address range of the + * virtual page table itself needs to be covered by virtual page table. + */ +# define vmlpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits) +# define POW2(n) (1ULL << (n)) + + impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61))); + + if (impl_va_bits < 51 || impl_va_bits > 61) + panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1); + + /* place the VMLPT at the end of each page-table mapped region: */ + pta = POW2(61) - POW2(vmlpt_bits); + + if (POW2(mapped_space_bits) >= pta) + panic("mm/init: overlap between virtually mapped linear page table and " + "mapped kernel space!"); + /* + * Set the (virtually mapped linear) page table address. Bit + * 8 selects between the short and long format, bits 2-7 the + * size of the table, and bit 0 whether the VHPT walker is + * enabled. + */ + ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT); + + ia64_tlb_init(); + +#ifdef CONFIG_HUGETLB_PAGE + ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2); + ia64_srlz_d(); +#endif +} + +#ifdef CONFIG_VIRTUAL_MEM_MAP + +int +create_mem_map_page_table (u64 start, u64 end, void *arg) +{ + unsigned long address, start_page, end_page; + struct page *map_start, *map_end; + int node; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + map_start = vmem_map + (__pa(start) >> PAGE_SHIFT); + map_end = vmem_map + (__pa(end) >> PAGE_SHIFT); + + start_page = (unsigned long) map_start & PAGE_MASK; + end_page = PAGE_ALIGN((unsigned long) map_end); + node = paddr_to_nid(__pa(start)); + + for (address = start_page; address < end_page; address += PAGE_SIZE) { + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); + pud = pud_offset(pgd, address); + + if (pud_none(*pud)) + pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); + pmd = pmd_offset(pud, address); + + if (pmd_none(*pmd)) + pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)); + pte = pte_offset_kernel(pmd, address); + + if (pte_none(*pte)) + set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT, + PAGE_KERNEL)); + } + return 0; +} + +struct memmap_init_callback_data { + struct page *start; + struct page *end; + int nid; + unsigned long zone; +}; + +static int +virtual_memmap_init (u64 start, u64 end, void *arg) +{ + struct memmap_init_callback_data *args; + struct page *map_start, *map_end; + + args = (struct memmap_init_callback_data *) arg; + map_start = vmem_map + (__pa(start) >> PAGE_SHIFT); + map_end = vmem_map + (__pa(end) >> PAGE_SHIFT); + + if (map_start < args->start) + map_start = args->start; + if (map_end > args->end) + map_end = args->end; + + /* + * We have to initialize "out of bounds" struct page elements that fit completely + * on the same pages that were allocated for the "in bounds" elements because they + * may be referenced later (and found to be "reserved"). + */ + map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page); + map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end) + / sizeof(struct page)); + + if (map_start < map_end) + memmap_init_zone((unsigned long)(map_end - map_start), + args->nid, args->zone, page_to_pfn(map_start)); + return 0; +} + +void +memmap_init (unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn) +{ + if (!vmem_map) + memmap_init_zone(size, nid, zone, start_pfn); + else { + struct page *start; + struct memmap_init_callback_data args; + + start = pfn_to_page(start_pfn); + args.start = start; + args.end = start + size; + args.nid = nid; + args.zone = zone; + + efi_memmap_walk(virtual_memmap_init, &args); + } +} + +int +ia64_pfn_valid (unsigned long pfn) +{ + char byte; + struct page *pg = pfn_to_page(pfn); + + return (__get_user(byte, (char __user *) pg) == 0) + && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK)) + || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0)); +} +EXPORT_SYMBOL(ia64_pfn_valid); + +int +find_largest_hole (u64 start, u64 end, void *arg) +{ + u64 *max_gap = arg; + + static u64 last_end = PAGE_OFFSET; + + /* NOTE: this algorithm assumes efi memmap table is ordered */ + + if (*max_gap < (start - last_end)) + *max_gap = start - last_end; + last_end = end; + return 0; +} +#endif /* CONFIG_VIRTUAL_MEM_MAP */ + +static int +count_reserved_pages (u64 start, u64 end, void *arg) +{ + unsigned long num_reserved = 0; + unsigned long *count = arg; + + for (; start < end; start += PAGE_SIZE) + if (PageReserved(virt_to_page(start))) + ++num_reserved; + *count += num_reserved; + return 0; +} + +/* + * Boot command-line option "nolwsys" can be used to disable the use of any light-weight + * system call handler. When this option is in effect, all fsyscalls will end up bubbling + * down into the kernel and calling the normal (heavy-weight) syscall handler. This is + * useful for performance testing, but conceivably could also come in handy for debugging + * purposes. + */ + +static int nolwsys; + +static int __init +nolwsys_setup (char *s) +{ + nolwsys = 1; + return 1; +} + +__setup("nolwsys", nolwsys_setup); + +void +mem_init (void) +{ + long reserved_pages, codesize, datasize, initsize; + unsigned long num_pgt_pages; + pg_data_t *pgdat; + int i; + static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel; + +#ifdef CONFIG_PCI + /* + * This needs to be called _after_ the command line has been parsed but _before_ + * any drivers that may need the PCI DMA interface are initialized or bootmem has + * been freed. + */ + platform_dma_init(); +#endif + +#ifndef CONFIG_DISCONTIGMEM + if (!mem_map) + BUG(); + max_mapnr = max_low_pfn; +#endif + + high_memory = __va(max_low_pfn * PAGE_SIZE); + + kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE); + kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); + kclist_add(&kcore_kernel, _stext, _end - _stext); + + for_each_pgdat(pgdat) + totalram_pages += free_all_bootmem_node(pgdat); + + reserved_pages = 0; + efi_memmap_walk(count_reserved_pages, &reserved_pages); + + codesize = (unsigned long) _etext - (unsigned long) _stext; + datasize = (unsigned long) _edata - (unsigned long) _etext; + initsize = (unsigned long) __init_end - (unsigned long) __init_begin; + + printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, " + "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10), + num_physpages << (PAGE_SHIFT - 10), codesize >> 10, + reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10); + + /* + * Allow for enough (cached) page table pages so that we can map the entire memory + * at least once. Each task also needs a couple of page tables pages, so add in a + * fudge factor for that (don't use "threads-max" here; that would be wrong!). + * Don't allow the cache to be more than 10% of total memory, though. + */ +# define NUM_TASKS 500 /* typical number of tasks */ + num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS; + if (num_pgt_pages > nr_free_pages() / 10) + num_pgt_pages = nr_free_pages() / 10; + if (num_pgt_pages > (u64) pgt_cache_water[1]) + pgt_cache_water[1] = num_pgt_pages; + + /* + * For fsyscall entrpoints with no light-weight handler, use the ordinary + * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry + * code can tell them apart. + */ + for (i = 0; i < NR_syscalls; ++i) { + extern unsigned long fsyscall_table[NR_syscalls]; + extern unsigned long sys_call_table[NR_syscalls]; + + if (!fsyscall_table[i] || nolwsys) + fsyscall_table[i] = sys_call_table[i] | 1; + } + setup_gate(); + +#ifdef CONFIG_IA32_SUPPORT + ia32_mem_init(); +#endif +} diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c new file mode 100644 index 00000000000..77118bbf3d8 --- /dev/null +++ b/arch/ia64/mm/numa.c @@ -0,0 +1,49 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * This file contains NUMA specific variables and functions which can + * be split away from DISCONTIGMEM and are used on NUMA machines with + * contiguous memory. + * + * 2002/08/07 Erich Focht <efocht@ess.nec.de> + */ + +#include <linux/config.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/node.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <asm/mmzone.h> +#include <asm/numa.h> + + +/* + * The following structures are usually initialized by ACPI or + * similar mechanisms and describe the NUMA characteristics of the machine. + */ +int num_node_memblks; +struct node_memblk_s node_memblk[NR_NODE_MEMBLKS]; +struct node_cpuid_s node_cpuid[NR_CPUS]; +/* + * This is a matrix with "distances" between nodes, they should be + * proportional to the memory access latency ratios. + */ +u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES]; + +/* Identify which cnode a physical address resides on */ +int +paddr_to_nid(unsigned long paddr) +{ + int i; + + for (i = 0; i < num_node_memblks; i++) + if (paddr >= node_memblk[i].start_paddr && + paddr < node_memblk[i].start_paddr + node_memblk[i].size) + break; + + return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0); +} diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c new file mode 100644 index 00000000000..464557e4ed8 --- /dev/null +++ b/arch/ia64/mm/tlb.c @@ -0,0 +1,190 @@ +/* + * TLB support routines. + * + * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 08/02/00 A. Mallick <asit.k.mallick@intel.com> + * Modified RID allocation for SMP + * Goutham Rao <goutham.rao@intel.com> + * IPI based ptc implementation and A-step IPI implementation. + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/delay.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/pal.h> +#include <asm/tlbflush.h> + +static struct { + unsigned long mask; /* mask of supported purge page-sizes */ + unsigned long max_bits; /* log2() of largest supported purge page-size */ +} purge; + +struct ia64_ctx ia64_ctx = { + .lock = SPIN_LOCK_UNLOCKED, + .next = 1, + .limit = (1 << 15) - 1, /* start out with the safe (architected) limit */ + .max_ctx = ~0U +}; + +DEFINE_PER_CPU(u8, ia64_need_tlb_flush); + +/* + * Acquire the ia64_ctx.lock before calling this function! + */ +void +wrap_mmu_context (struct mm_struct *mm) +{ + unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx; + struct task_struct *tsk; + int i; + + if (ia64_ctx.next > max_ctx) + ia64_ctx.next = 300; /* skip daemons */ + ia64_ctx.limit = max_ctx + 1; + + /* + * Scan all the task's mm->context and set proper safe range + */ + + read_lock(&tasklist_lock); + repeat: + for_each_process(tsk) { + if (!tsk->mm) + continue; + tsk_context = tsk->mm->context; + if (tsk_context == ia64_ctx.next) { + if (++ia64_ctx.next >= ia64_ctx.limit) { + /* empty range: reset the range limit and start over */ + if (ia64_ctx.next > max_ctx) + ia64_ctx.next = 300; + ia64_ctx.limit = max_ctx + 1; + goto repeat; + } + } + if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit)) + ia64_ctx.limit = tsk_context; + } + read_unlock(&tasklist_lock); + /* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */ + { + int cpu = get_cpu(); /* prevent preemption/migration */ + for (i = 0; i < NR_CPUS; ++i) + if (cpu_online(i) && (i != cpu)) + per_cpu(ia64_need_tlb_flush, i) = 1; + put_cpu(); + } + local_flush_tlb_all(); +} + +void +ia64_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbits) +{ + static DEFINE_SPINLOCK(ptcg_lock); + + /* HW requires global serialization of ptc.ga. */ + spin_lock(&ptcg_lock); + { + do { + /* + * Flush ALAT entries also. + */ + ia64_ptcga(start, (nbits<<2)); + ia64_srlz_i(); + start += (1UL << nbits); + } while (start < end); + } + spin_unlock(&ptcg_lock); +} + +void +local_flush_tlb_all (void) +{ + unsigned long i, j, flags, count0, count1, stride0, stride1, addr; + + addr = local_cpu_data->ptce_base; + count0 = local_cpu_data->ptce_count[0]; + count1 = local_cpu_data->ptce_count[1]; + stride0 = local_cpu_data->ptce_stride[0]; + stride1 = local_cpu_data->ptce_stride[1]; + + local_irq_save(flags); + for (i = 0; i < count0; ++i) { + for (j = 0; j < count1; ++j) { + ia64_ptce(addr); + addr += stride1; + } + addr += stride0; + } + local_irq_restore(flags); + ia64_srlz_i(); /* srlz.i implies srlz.d */ +} + +void +flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long size = end - start; + unsigned long nbits; + + if (mm != current->active_mm) { + /* this does happen, but perhaps it's not worth optimizing for? */ +#ifdef CONFIG_SMP + flush_tlb_all(); +#else + mm->context = 0; +#endif + return; + } + + nbits = ia64_fls(size + 0xfff); + while (unlikely (((1UL << nbits) & purge.mask) == 0) && (nbits < purge.max_bits)) + ++nbits; + if (nbits > purge.max_bits) + nbits = purge.max_bits; + start &= ~((1UL << nbits) - 1); + +# ifdef CONFIG_SMP + platform_global_tlb_purge(start, end, nbits); +# else + do { + ia64_ptcl(start, (nbits<<2)); + start += (1UL << nbits); + } while (start < end); +# endif + + ia64_srlz_i(); /* srlz.i implies srlz.d */ +} +EXPORT_SYMBOL(flush_tlb_range); + +void __devinit +ia64_tlb_init (void) +{ + ia64_ptce_info_t ptce_info; + unsigned long tr_pgbits; + long status; + + if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) { + printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld;" + "defaulting to architected purge page-sizes.\n", status); + purge.mask = 0x115557000UL; + } + purge.max_bits = ia64_fls(purge.mask); + + ia64_get_ptce(&ptce_info); + local_cpu_data->ptce_base = ptce_info.base; + local_cpu_data->ptce_count[0] = ptce_info.count[0]; + local_cpu_data->ptce_count[1] = ptce_info.count[1]; + local_cpu_data->ptce_stride[0] = ptce_info.stride[0]; + local_cpu_data->ptce_stride[1] = ptce_info.stride[1]; + + local_flush_tlb_all(); /* nuke left overs from bootstrapping... */ +} |