diff options
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 471 |
1 files changed, 281 insertions, 190 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9033f0859aa..bec88c81244 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2,6 +2,7 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -17,13 +18,19 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. + * * bind Only allocate memory on a specific set of nodes, * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -93,23 +100,10 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; -/* Check if all specified nodes are online */ -static int nodes_online(unsigned long *nodes) -{ - DECLARE_BITMAP(online2, MAX_NUMNODES); - - bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); - if (bitmap_empty(online2, MAX_NUMNODES)) - set_bit(0, online2); - if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) - return -EINVAL; - return 0; -} - /* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, unsigned long *nodes) +static int mpol_check_policy(int mode, nodemask_t *nodes) { - int empty = bitmap_empty(nodes, MAX_NUMNODES); + int empty = nodes_empty(*nodes); switch (mode) { case MPOL_DEFAULT: @@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes) return -EINVAL; break; } - return nodes_online(nodes); -} - -/* Copy a node mask from user space. */ -static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, - unsigned long maxnode, int mode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - bitmap_zero(nodes, MAX_NUMNODES); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes[nlongs-1] &= endmask; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes); - return mpol_check_policy(mode, nodes); + return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } - /* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(unsigned long *nodes) +static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; int num, max, nd; - max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); + max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for (nd = find_first_bit(nodes, MAX_NUMNODES); - nd < MAX_NUMNODES; - nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { + for_each_node_mask(nd, *nodes) { int k; for (k = MAX_NR_ZONES-1; k >= 0; k--) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes) policy_zone = k; } } - BUG_ON(num >= max); zl->zones[num] = NULL; return zl; } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) atomic_set(&policy->refcnt, 1); switch (mode) { case MPOL_INTERLEAVE: - bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + policy->v.nodes = *nodes; break; case MPOL_PREFERRED: - policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + policy->v.preferred_node = first_node(*nodes); if (policy->v.preferred_node >= MAX_NUMNODES) policy->v.preferred_node = -1; break; @@ -238,34 +180,33 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) } /* Ensure all existing pages follow the policy. */ -static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, unsigned long *nodes) +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, nodemask_t *nodes) { pte_t *orig_pte; pte_t *pte; + spinlock_t *ptl; - spin_lock(&mm->page_table_lock); - orig_pte = pte = pte_offset_map(pmd, addr); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - nid = pfn_to_nid(pfn); - if (!test_bit(nid, nodes)) + nid = page_to_nid(page); + if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(orig_pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(orig_pte, ptl); return addr != end; } -static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, unsigned long *nodes) +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, nodemask_t *nodes) { pmd_t *pmd; unsigned long next; @@ -275,14 +216,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(mm, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, unsigned long *nodes) +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, nodemask_t *nodes) { pud_t *pud; unsigned long next; @@ -292,24 +233,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(mm, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct mm_struct *mm, - unsigned long addr, unsigned long end, unsigned long *nodes) +static inline int check_pgd_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, nodemask_t *nodes) { pgd_t *pgd; unsigned long next; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(mm, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -318,7 +259,7 @@ static inline int check_pgd_range(struct mm_struct *mm, /* Step 1: check the range */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - unsigned long *nodes, unsigned long flags) + nodemask_t *nodes, unsigned long flags) { int err; struct vm_area_struct *first, *vma, *prev; @@ -338,8 +279,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma->vm_mm, - start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes); if (err) { first = ERR_PTR(err); break; @@ -393,17 +333,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -/* Change policy for a memory range */ -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +static int contextualize_policy(int mode, nodemask_t *nodes) +{ + if (!nodes) + return 0; + + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes->bits); + return mpol_check_policy(mode, nodes); +} + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; - DECLARE_BITMAP(nodes, MAX_NUMNODES); int err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -418,20 +366,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - - err = get_nodes(nodes, nmask, maxnode, mode); - if (err) - return err; - - new = mpol_new(mode, nodes); + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes[0]); + mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nodes, flags); + vma = check_range(mm, start, end, nmask, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -441,50 +386,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +long do_set_mempolicy(int mode, nodemask_t *nodes) { - int err; struct mempolicy *new; - DECLARE_BITMAP(nodes, MAX_NUMNODES); - if (mode < 0 || mode > MPOL_MAX) + if (contextualize_policy(mode, nodes)) return -EINVAL; - err = get_nodes(nodes, nmask, maxnode, mode); - if (err) - return err; new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; if (new && new->policy == MPOL_INTERLEAVE) - current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + current->il_next = first_node(new->v.nodes); return 0; } /* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { int i; - bitmap_zero(nodes, MAX_NUMNODES); + nodes_clear(*nodes); switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + *nodes); break; case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + *nodes = p->v.nodes; break; case MPOL_PREFERRED: /* or use current node instead of online map? */ if (p->v.preferred_node < 0) - bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); + *nodes = node_online_map; else - __set_bit(p->v.preferred_node, nodes); + node_set(p->v.preferred_node, *nodes); break; default: BUG(); @@ -504,37 +444,18 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) return err; } -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - void *nodes, unsigned nbytes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; -} - /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { - int err, pval; + int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; + cpuset_update_current_mems_allowed(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; - if (nmask != NULL && maxnode < MAX_NUMNODES) - return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -557,31 +478,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = lookup_node(mm, addr); if (err < 0) goto out; - pval = err; + *policy = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - pval = current->il_next; + *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - pval = pol->policy; + *policy = pol->policy; if (vma) { up_read(¤t->mm->mmap_sem); vma = NULL; } - if (policy && put_user(pval, policy)) - return -EFAULT; - err = 0; - if (nmask) { - DECLARE_BITMAP(nodes, MAX_NUMNODES); - get_zonemask(pol, nodes); - err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); - } + if (nmask) + get_zonemask(pol, nmask); out: if (vma) @@ -589,6 +504,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, return err; } +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + nodemask_t nodes; + int err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, &nodes, flags); +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + nodemask_t nodes; + + if (mode < 0 || mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, &nodes); +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_get_mempolicy(int __user *policy, @@ -649,15 +684,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, long err = 0; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); + nodemask_t bm; nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { - err = compat_get_bitmap(bm, nmask, nr_bits); + err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, bm, alloc_size); + err |= copy_to_user(nm, nodes_addr(bm), alloc_size); } if (err) @@ -676,7 +711,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -687,7 +722,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo } /* Return a zonelist representing a mempolicy */ -static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy) +static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) { int nd; @@ -700,7 +735,7 @@ static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempol case MPOL_BIND: /* Lower zones don't get a policy applied */ /* Careful: current->mems_allowed might have moved */ - if ((gfp & GFP_ZONEMASK) >= policy_zone) + if (gfp_zone(gfp) >= policy_zone) if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) return policy->v.zonelist; /*FALL THROUGH*/ @@ -712,7 +747,7 @@ static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempol nd = 0; BUG(); } - return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); + return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); } /* Do dynamic interleaving for a process */ @@ -722,10 +757,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - BUG_ON(nid >= MAX_NUMNODES); - next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) - next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + next = first_node(policy->v.nodes); me->il_next = next; return nid; } @@ -734,30 +768,28 @@ static unsigned interleave_nodes(struct mempolicy *policy) static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { - unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target = (unsigned)off % nnodes; int c; int nid = -1; c = 0; do { - nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + nid = next_node(nid, pol->v.nodes); c++; } while (c <= target); - BUG_ON(nid >= MAX_NUMNODES); - BUG_ON(!test_bit(nid, pol->v.nodes)); return nid; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid) +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) { struct zonelist *zl; struct page *page; - BUG_ON(!node_online(nid)); - zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; @@ -789,7 +821,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); @@ -799,8 +831,6 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l unsigned nid; if (vma) { unsigned long off; - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); off = vma->vm_pgoff; off += (addr - vma->vm_start) >> PAGE_SHIFT; nid = offset_il_node(pol, vma, off); @@ -832,7 +862,7 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ -struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order) +struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; @@ -878,7 +908,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_DEFAULT: return 1; case MPOL_INTERLEAVE: - return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { @@ -1117,7 +1147,7 @@ int mpol_set_shared_policy(struct shared_policy *info, PDprintk("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? npol->v.nodes[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1164,14 +1194,75 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. */ - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), - MAX_NUMNODES) < 0) + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default. - * Assumes fs == KERNEL_DS */ +/* Reset policy of current process to default */ void numa_default_policy(void) { - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); + do_set_mempolicy(MPOL_DEFAULT, NULL); +} + +/* Migrate a policy to a different set of nodes */ +static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, + const nodemask_t *new) +{ + nodemask_t tmp; + + if (!pol) + return; + + switch (pol->policy) { + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + nodes_remap(tmp, pol->v.nodes, *old, *new); + pol->v.nodes = tmp; + current->il_next = node_remap(current->il_next, *old, *new); + break; + case MPOL_PREFERRED: + pol->v.preferred_node = node_remap(pol->v.preferred_node, + *old, *new); + break; + case MPOL_BIND: { + nodemask_t nodes; + struct zone **z; + struct zonelist *zonelist; + + nodes_clear(nodes); + for (z = pol->v.zonelist->zones; *z; z++) + node_set((*z)->zone_pgdat->node_id, nodes); + nodes_remap(tmp, nodes, *old, *new); + nodes = tmp; + + zonelist = bind_zonelist(&nodes); + + /* If no mem, then zonelist is NULL and we keep old zonelist. + * If that old zonelist has no remaining mems_allowed nodes, + * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. + */ + + if (zonelist) { + /* Good - got mem - substitute new zonelist */ + kfree(pol->v.zonelist); + pol->v.zonelist = zonelist; + } + break; + } + default: + BUG(); + break; + } +} + +/* + * Someone moved this task to different nodes. Fixup mempolicies. + * + * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, + * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + */ +void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) +{ + rebind_policy(current->mempolicy, old, new); } |