From 6d556294d5b27fb12f18be7495af45b6156a409e Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Mon, 24 May 2010 14:31:59 -0700
Subject: mempolicy: remove redundant code

1. In function is_valid_nodemask(), the variable k is initialized to 0 in
   the following loop, so it no longer needs to be initialized to
   policy_zone.

2. (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) is already defined as
   MPOL_MODE_FLAGS in mempolicy.h.

Signed-off-by: Bob Liu
Acked-by: David Rientjes
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe..ad500f3b12b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -127,9 +127,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 {
 	int nd, k;
 
-	/* Check that there is something useful in this mask */
-	k = policy_zone;
-
 	for_each_node_mask(nd, *nodemask) {
 		struct zone *z;
 
@@ -145,7 +142,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+	return pol->flags & MPOL_MODE_FLAGS;
 }
 
 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
--
cgit v1.2.3-70-g09d2

From 6eb27e1fdf5781719a3d2e90e6c89fa012135c62 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Mon, 24 May 2010 14:32:00 -0700
Subject: mempolicy: remove case MPOL_INTERLEAVE from policy_zonelist()

In policy_zonelist(), the MPOL_INTERLEAVE mode should not occur, so fall
through to BUG() instead of breaking out to return.  I also fixed the
comment.

Signed-off-by: Bob Liu
Acked-by: David Rientjes
Cc: Andi Kleen
Cc: Lee Schermerhorn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ad500f3b12b..d97355b744a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1441,15 +1441,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 		/*
 		 * Normally, MPOL_BIND allocations are node-local within the
 		 * allowed nodemask. However, if __GFP_THISNODE is set and the
-		 * current node is part of the mask, we use the zonelist for
+		 * current node isn't part of the mask, we use the zonelist for
 		 * the first node in the mask instead.
 		 */
 		if (unlikely(gfp & __GFP_THISNODE) &&
 				unlikely(!node_isset(nd, policy->v.nodes)))
 			nd = first_node(policy->v.nodes);
 		break;
-	case MPOL_INTERLEAVE: /* should not happen */
-		break;
 	default:
 		BUG();
 	}
--
cgit v1.2.3-70-g09d2

From 1980050250fa052b1c24a19f9b3d82fae14d77f8 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Mon, 24 May 2010 14:32:01 -0700
Subject: mempolicy: remove redundant check

Lee's patch "mempolicy: use MPOL_PREFERRED for system-wide default policy"
has made MPOL_DEFAULT used only in the memory policy APIs, so there is no
need to check for it in __mpol_equal() either.

Also get rid of mpol_match_intent() and move its logic directly into
__mpol_equal().
Signed-off-by: Bob Liu
Acked-by: David Rientjes
Cc: Andi Kleen
Cc: Lee Schermerhorn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d97355b744a..ac5aeafaec9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1787,16 +1787,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
 	return tompol;
 }
 
-static int mpol_match_intent(const struct mempolicy *a,
-			     const struct mempolicy *b)
-{
-	if (a->flags != b->flags)
-		return 0;
-	if (!mpol_store_user_nodemask(a))
-		return 1;
-	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
-}
-
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
@@ -1804,8 +1794,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 0;
 	if (a->mode != b->mode)
 		return 0;
-	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+	if (a->flags != b->flags)
 		return 0;
+	if (mpol_store_user_nodemask(a))
+		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+			return 0;
+
 	switch (a->mode) {
 	case MPOL_BIND:
 		/* Fall through */
--
cgit v1.2.3-70-g09d2

From e17f74af351cce9a1bade7b33af179497fdf95cf Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Mon, 24 May 2010 14:32:02 -0700
Subject: mempolicy: don't call mpol_set_nodemask() when no_context

No need to call mpol_set_nodemask() when we have no context for the
mempolicy.  This can occur when we're parsing a tmpfs 'mpol' mount option.
Just save the raw nodemask in the mempolicy's w.user_nodemask member for
use when a tmpfs/shmem file is created.  mpol_shared_policy_init() will
"contextualize" the policy for the new file based on the creating task's
context.

Signed-off-by: Lee Schermerhorn
Cc: Hugh Dickins
Cc: Ravikiran Thirumalai
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ac5aeafaec9..0e1b293e405 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2239,7 +2239,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (IS_ERR(new))
 		goto out;
 
-	{
+	if (no_context) {
+		/* save for contextualization */
+		new->w.user_nodemask = nodes;
+	} else {
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
@@ -2255,10 +2258,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		}
 	}
 	err = 0;
-	if (no_context) {
-		/* save for contextualization */
-		new->w.user_nodemask = nodes;
-	}
 
 out:
 	/* Restore string for error message */
--
cgit v1.2.3-70-g09d2

From b4652e8429100ba5c3ddb49499faa1188c98c246 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Mon, 24 May 2010 14:32:03 -0700
Subject: mempolicy: lose unnecessary loop variable in mpol_parse_str()

We don't really need the extra variable 'i' in mpol_parse_str().  Its only
use is as the loop variable, which is then assigned to 'mode'.  Just use
'mode' directly, and lose the 'uninitialized_var()' macro.
Signed-off-by: Lee Schermerhorn
Cc: Hugh Dickins
Cc: Ravikiran Thirumalai
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e1b293e405..b4f1265df2d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2148,12 +2148,11 @@ static const char * const policy_types[] =
 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 {
 	struct mempolicy *new = NULL;
-	unsigned short uninitialized_var(mode);
+	unsigned short mode;
 	unsigned short uninitialized_var(mode_flags);
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int i;
 	int err = 1;
 
 	if (nodelist) {
@@ -2169,13 +2168,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		if (flags)
 			*flags++ = '\0';	/* terminate mode string */
 
-	for (i = 0; i <= MPOL_LOCAL; i++) {
-		if (!strcmp(str, policy_types[i])) {
-			mode = i;
+	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+		if (!strcmp(str, policy_types[mode])) {
 			break;
 		}
 	}
-	if (i > MPOL_LOCAL)
+	if (mode > MPOL_LOCAL)
 		goto out;
 
 	switch (mode) {
--
cgit v1.2.3-70-g09d2

From 345ace9c797030e77da8ff211b9502370b9d81ab Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Mon, 24 May 2010 14:32:04 -0700
Subject: mempolicy: rename policy_types and cleanup initialization

Rename 'policy_types[]' to 'policy_modes[]' to better match the array
contents.  Use designated initializer syntax for policy_modes[].

Signed-off-by: Lee Schermerhorn
Cc: Hugh Dickins
Cc: Ravikiran Thirumalai
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b4f1265df2d..ade57322fa2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2121,9 +2121,15 @@ void numa_default_policy(void)
  * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
  * Used only for mpol_parse_str() and mpol_to_str()
  */
-#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
-static const char * const policy_types[] =
-	{ "default", "prefer", "bind", "interleave", "local" };
+#define MPOL_LOCAL MPOL_MAX
+static const char * const policy_modes[] =
+{
+	[MPOL_DEFAULT]    = "default",
+	[MPOL_PREFERRED]  = "prefer",
+	[MPOL_BIND]       = "bind",
+	[MPOL_INTERLEAVE] = "interleave",
+	[MPOL_LOCAL]      = "local"
+};
 
 
 #ifdef CONFIG_TMPFS
@@ -2175,7 +2175,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			*flags++ = '\0';	/* terminate mode string */
 
 	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
-		if (!strcmp(str, policy_types[mode])) {
+		if (!strcmp(str, policy_modes[mode])) {
 			break;
 		}
 	}
@@ -2324,11 +2330,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 		BUG();
 	}
 
-	l = strlen(policy_types[mode]);
+	l = strlen(policy_modes[mode]);
 	if (buffer + maxlen < p + l + 1)
 		return -ENOSPC;
 
-	strcpy(p, policy_types[mode]);
+	strcpy(p, policy_modes[mode]);
 	p += l;
 
 	if (flags & MPOL_MODE_FLAGS) {
--
cgit v1.2.3-70-g09d2

From 15d77835ac48dbc2d4884376ea6a08b65b1c40ba Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Mon, 24 May 2010 14:32:04 -0700
Subject: mempolicy: factor mpol_shared_policy_init() return paths

Factor out the duplicate put/frees in mpol_shared_policy_init() into a
common return path.
Signed-off-by: Lee Schermerhorn
Cc: Hugh Dickins
Cc: Ravikiran Thirumalai
Cc: KOSAKI Motohiro
Cc: Christoph Lameter
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'mm/mempolicy.c')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ade57322fa2..0c73c8b814c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1995,26 +1995,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			return;
 		/* contextualize the tmpfs mount point mempolicy */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
-		if (IS_ERR(new)) {
-			mpol_put(mpol);	/* drop our ref on sb mpol */
-			NODEMASK_SCRATCH_FREE(scratch);
-			return;		/* no valid nodemask intersection */
-		}
+		if (IS_ERR(new))
+			goto put_free; /* no valid nodemask intersection */
 
 		task_lock(current);
 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
 		task_unlock(current);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
-		if (ret) {
-			NODEMASK_SCRATCH_FREE(scratch);
-			mpol_put(new);
-			return;
-		}
+		if (ret)
+			goto put_free;
 
 		/* Create pseudo-vma that contains just the policy */
 		memset(&pvma, 0, sizeof(struct vm_area_struct));
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_free:
 		mpol_put(new);			/* drop initial ref */
 		NODEMASK_SCRATCH_FREE(scratch);
 	}
--
cgit v1.2.3-70-g09d2

From 708c1bbc9d0c3e57f40501794d9b0eed29d10fce Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Mon, 24 May 2010 14:32:07 -0700
Subject: mempolicy: restructure rebinding-mempolicy functions

Nick Piggin reported that the allocator may see an empty nodemask when
changing cpuset's mems[1].  It happens only on kernels that do not do
atomic nodemask_t stores (MAX_NUMNODES > BITS_PER_LONG).

But I found that there is also a problem on kernels that can do atomic
nodemask_t stores.  The problem is that the allocator can't find a node to
alloc pages on when changing cpuset's mems, even though there is a lot of
free memory.  The reason is like this:

	(mpol: mempolicy)
	task1			task1's mpol		task2
	alloc page		1
	  alloc on node0? NO	1
				1			change mems from 1 to 0
				1			rebind task1's mpol
				0-1			  set new bits
				0			  clear disallowed bits
	  alloc on node1? NO	0
	  ... can't alloc page
	    goto oom

I can use the attached program to reproduce it with the following steps:

	# mkdir /dev/cpuset
	# mount -t cpuset cpuset /dev/cpuset
	# mkdir /dev/cpuset/1
	# echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus
	# echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems
	# echo $$ > /dev/cpuset/1/tasks
	# numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_cpus> &
	  <nr_cpus> = max(nr_cpus - 1, 1)
	# killall -s SIGUSR1 cpuset_mem_hog
	# ./change_mems.sh

Several hours later, an OOM will happen even though there is a lot of free
memory.

This patchset fixes the problem by expanding the nodes range first (setting
the newly allowed bits) and shrinking it lazily (clearing the newly
disallowed bits).  So we use a variable to tell the write-side task that a
read-side task is reading the nodemask, and the write-side task clears the
newly disallowed nodes after the read-side task ends its current memory
allocation.

This patch:

In order to fix "no node to alloc memory", when we want to update the
mempolicy and mems_allowed, we expand the set of nodes first (set all the
newly allowed nodes) and shrink the set of nodes lazily (clear the
disallowed nodes).  But the mempolicy's rebind functions may break the
expanding.
So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 15 ++++-- kernel/cpuset.c | 4 +- mm/mempolicy.c | 124 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 24 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1cc966cd3e5..7b9ef6bf45a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -23,6 +23,13 @@ enum { MPOL_MAX, /* always last member of enum */ }; +enum mpol_rebind_step { + MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ + MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ + MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ + MPOL_REBIND_NSTEP, +}; + /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -51,6 +58,7 @@ enum { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ +#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #ifdef __KERNEL__ @@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); @@ -308,7 +316,8 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new) + const nodemask_t *new, + enum mpol_rebind_step step) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a50c5f6e72..db0990ac3fa 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -953,8 +953,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed); - 
mpol_rebind_task(tsk, newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); tsk->mems_allowed = *newmems; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0c73c8b814c..8a993db8802 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -119,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -274,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -288,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -304,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -327,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -346,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -363,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -1745,6 +1817,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. 
*/ /* Slow path of a mempolicy duplicate */ @@ -1754,13 +1829,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } rcu_read_unlock(); - *new = *old; atomic_set(&new->refcnt, 1); return new; } -- cgit v1.2.3-70-g09d2 From c0ff7453bb5c7c98e0885fb94279f2571946f280 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:08 -0700 Subject: cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But in the way, the allocator may find that there is no node to alloc memory. The reason is that cpuset rebinds the task's mempolicy, it cleans the nodes which the allocater can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 43 +++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + kernel/cpuset.c | 58 +++++++++++++++++++++++++++++++++++++++++++------- kernel/exit.c | 2 ++ mm/filemap.c | 10 +++++++-- mm/hugetlb.c | 12 +++++++---- mm/mempolicy.c | 24 +++++++++++++++++---- mm/page_alloc.c | 6 +++++- mm/slab.c | 4 ++++ mm/slub.c | 6 +++++- mm/vmscan.c | 2 ++ 11 files changed, 148 insertions(+), 20 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a73454aec33..20b51cab659 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +/* + * reading current mems_allowed and mempolicy in the fastpath must protected + * by get_mems_allowed() + */ +static inline void get_mems_allowed(void) +{ + current->mems_allowed_change_disable++; + + /* + * ensure that reading mems_allowed and mempolicy happens after the + * update of ->mems_allowed_change_disable. + * + * the write-side task finds ->mems_allowed_change_disable is not 0, + * and knows the read-side task is reading mems_allowed or mempolicy, + * so it will clear old bits lazily. 
+ */ + smp_mb(); +} + +static inline void put_mems_allowed(void) +{ + /* + * ensure that reading mems_allowed and mempolicy before reducing + * mems_allowed_change_disable. + * + * the write-side task will know that the read-side task is still + * reading mems_allowed or mempolicy, don't clears old bits in the + * nodemask. + */ + smp_mb(); + --ACCESS_ONCE(current->mems_allowed_change_disable); +} + static inline void set_mems_allowed(nodemask_t nodemask) { + task_lock(current); current->mems_allowed = nodemask; + task_unlock(current); } #else /* !CONFIG_CPUSETS */ @@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } +static inline void get_mems_allowed(void) +{ +} + +static inline void put_mems_allowed(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b55e988988b..415b8f8a3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ + int mems_allowed_change_disable; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS diff --git a/kernel/cpuset.c b/kernel/cpuset.c index db0990ac3fa..61d6af7fa67 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones. - * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. 
+ */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p, cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, newmems); - task_lock(p); cpuset_change_task_nodemask(p, newmems); - task_unlock(p); NODEMASK_FREE(newmems); @@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } diff --git a/kernel/exit.c b/kernel/exit.c index eabca5a73a8..019a2843bf9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) diff --git a/mm/filemap.c b/mm/filemap.c index d6f4f073836..88d719665a2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c9e6bbf377..54d42b009db 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a993db8802..721b2b33803 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. 
+ * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } @@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 596180fedd3..f7da2a2934b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 50a73fca19c..02786e1a32d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); + 
get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_mem_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3302,6 +3305,7 @@ retry: } } } + put_mems_allowed(); return obj; } diff --git a/mm/slub.c b/mm/slub.c index e46e3129697..26f0cb9cc58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f..f2c367c9ec1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -1857,6 +1858,7 @@ out: mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); + put_mems_allowed(); return ret; } -- cgit v1.2.3-70-g09d2 From 6ec3a12712ac67ffa4b80d16e0767ffd2431a68d Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 24 May 2010 14:32:33 -0700 Subject: mm: consider the entire user address space during node migration Use mm->task_size instead of TASK_SIZE to ensure that the entire user address space is migrated. mm->task_size is independent of the calling task context. TASK SIZE may be dependant on the address space size of the calling process. Usage of TASK_SIZE can lead to partial address space migration if the calling process was 32 bit and the migrating process was 64 bit. Here is the test script used on 64 system with a 32 bit echo process: mount -t cgroup none /cgroup -o cpuset cd /cgroup mkdir 0 echo 1 > 0/cpuset.cpus echo 0 > 0/cpuset.mems echo 1 > 0/cpuset.memory_migrate mkdir 1 echo 1 > 1/cpuset.cpus echo 1 > 1/cpuset.mems echo 1 > 1/cpuset.memory_migrate echo $$ > 0/tasks 64_bit_process & pid=$! echo $pid > 1/tasks # This does not migrate all process pages without # this patch. If 64 bit echo is used or this patch is # applied, then the full address space of $pid is # migrated. 
To check memory migration, I watched: grep MemUsed /sys/devices/system/node/node*/meminfo Signed-off-by: Greg Thelen Acked-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 721b2b33803..75751012c55 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -928,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) -- cgit v1.2.3-70-g09d2 From 0cae3457b1a6e88f31020272bcfd90c178716053 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 May 2010 23:42:58 -0700 Subject: mempolicy: ERR_PTR dereference in mpol_shared_policy_init() The original code called mpol_put(new) while "new" was an ERR_PTR. Signed-off-by: Dan Carpenter Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 75751012c55..5d6fb339de0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2098,7 +2098,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) - goto put_free; /* no valid nodemask intersection */ + goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); @@ -2114,6 +2114,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) put_free: mpol_put(new); /* drop initial ref */ +free_scratch: NODEMASK_SCRATCH_FREE(scratch); } } -- cgit v1.2.3-70-g09d2
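
Editor's note: taken together, the series above establishes a read-side protocol for allocators. Any allocation fast path that reads current->mems_allowed or current->mempolicy without holding task_lock() is bracketed by the get_mems_allowed()/put_mems_allowed() pair added to include/linux/cpuset.h, so that cpuset_change_task_nodemask() only clears the newly disallowed nodes once no such read is in flight. The sketch below is illustrative only and is not part of any patch in this log: example_alloc() and its preferred_nid argument are made-up names for the example, while get_mems_allowed(), put_mems_allowed() and alloc_pages_exact_node() are taken from the diffs above, where the same pattern appears in __page_cache_alloc(), alloc_pages_current() and the slab/slub paths.

	/* Illustrative sketch only -- not from the applied patches. */
	#include <linux/cpuset.h>
	#include <linux/gfp.h>

	/* example_alloc() is a hypothetical caller, shown purely for illustration */
	static struct page *example_alloc(gfp_t gfp, int preferred_nid)
	{
		struct page *page;

		get_mems_allowed();	/* pin a consistent view of mems_allowed/mempolicy */
		page = alloc_pages_exact_node(preferred_nid, gfp, 0);
		put_mems_allowed();	/* writer may now clear newly disallowed nodes */

		return page;
	}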