> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> MAX_NR_ZONES - 1, nodemask) {
> nid = zone_to_nid(zone);
> @@ -1835,10 +1846,15 @@ __setup("default_hugepagesz=", hugetlb_default_setup);
> static unsigned int cpuset_mems_nr(unsigned int *array)
> {
> int node;
> - unsigned int nr = 0;
> + unsigned int nr;
> + unsigned long flags, seq;
>
> - for_each_node_mask(node, cpuset_current_mems_allowed)
> - nr += array[node];
> + do {
> + nr = 0;
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + for_each_node_mask(node, cpuset_current_mems_allowed)
> + nr += array[node];
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
>
> return nr;
> }
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index dd3f5c5..49abf11 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -187,8 +187,10 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
> * parameter with respect to the policy mode and flags. But, we need to
> * handle an empty nodemask with MPOL_PREFERRED here.
> *
> - * Must be called holding task's alloc_lock to protect task's mems_allowed
> - * and mempolicy. May also be called holding the mmap_semaphore for write.
> + * Must be called using
> + * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
> + * to protect task's mems_allowed and mempolicy. May also be called holding
> + * the mmap_semaphore for write.
> */
> static int mpol_set_nodemask(struct mempolicy *pol,
> const nodemask_t *nodes, struct nodemask_scratch *nsc)
> @@ -344,9 +346,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> * Wrapper for mpol_rebind_policy() that just requires task
> * pointer, and updates task mempolicy.
> *
> - * Called with task's alloc_lock held.
> + * Must be called under
> + * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
> + * to protect the task's mempolicy.
> */
> -
> void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
> {
> mpol_rebind_policy(tsk->mempolicy, new);
> @@ -644,6 +647,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> struct mempolicy *new, *old;
> struct mm_struct *mm = current->mm;
> NODEMASK_SCRATCH(scratch);
> + unsigned long irqflags;
> int ret;
>
> if (!scratch)
> @@ -662,10 +666,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> */
> if (mm)
> down_write(&mm->mmap_sem);
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, nodes, scratch);
> if (ret) {
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (mm)
> up_write(&mm->mmap_sem);
> mpol_put(new);
> @@ -677,7 +681,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> if (new && new->mode == MPOL_INTERLEAVE &&
> nodes_weight(new->v.nodes))
> current->il_next = first_node(new->v.nodes);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (mm)
> up_write(&mm->mmap_sem);
>
> @@ -691,7 +695,9 @@ out:
> /*
> * Return nodemask for policy for get_mempolicy() query
> *
> - * Called with task's alloc_lock held
> + * Must be called using mems_slowpath_lock_irqsave()/
> + * mems_slowpath_unlock_irqrestore() to
> + * protect it.
> */
> static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
> {
> @@ -736,6 +742,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> struct mempolicy *pol = current->mempolicy;
> + unsigned long irqflags;
>
> if (flags &
> ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
> @@ -745,9 +752,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
> return -EINVAL;
> *policy = 0; /* just so it's initialized */
> - task_lock(current);
> +
> + mems_slowpath_lock_irqsave(current, irqflags);
> *nmask = cpuset_current_mems_allowed;
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> return 0;
> }
>
> @@ -803,13 +811,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>
> err = 0;
> if (nmask) {
> + mems_slowpath_lock_irqsave(current, irqflags);
> if (mpol_store_user_nodemask(pol)) {
> *nmask = pol->w.user_nodemask;
> } else {
> - task_lock(current);
> get_policy_nodemask(pol, nmask);
> - task_unlock(current);
> }
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> }
>
> out:
> @@ -1008,6 +1016,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> struct mempolicy *new;
> unsigned long end;
> int err;
> + unsigned long irqflags;
> LIST_HEAD(pagelist);
>
> if (flags & ~(unsigned long)(MPOL_MF_STRICT |
> @@ -1055,9 +1064,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> NODEMASK_SCRATCH(scratch);
> if (scratch) {
> down_write(&mm->mmap_sem);
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> err = mpol_set_nodemask(new, nmask, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (err)
> up_write(&mm->mmap_sem);
> } else
> @@ -1408,8 +1417,10 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
> } else if (vma->vm_policy)
> pol = vma->vm_policy;
> }
> +
> if (!pol)
> pol = &default_policy;
> +
> return pol;
> }
>
> @@ -1475,7 +1486,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> * next slab entry.
> * @policy must be protected by freeing by the caller. If @policy is
> * the current task's mempolicy, this protection is implicit, as only the
> - * task can change it's policy. The system default policy requires no
> + * task can free its policy. The system default policy requires no
> * such protection.
> */
> unsigned slab_node(struct mempolicy *policy)
> @@ -1574,16 +1585,33 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
> nodemask_t **nodemask)
> {
> struct zonelist *zl;
> + struct mempolicy policy;
> + struct mempolicy *pol;
> + unsigned long seq, irqflag;
>
> *mpol = get_vma_policy(current, vma, addr);
> *nodemask = NULL; /* assume !MPOL_BIND */
>
> - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
> - zl = node_zonelist(interleave_nid(*mpol, vma, addr,
> + pol = *mpol;
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, irqflag);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current,
> + seq, irqflag));
> + pol = &policy;
> + }
> +
> + if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
> + zl = node_zonelist(interleave_nid(pol, vma, addr,
> huge_page_shift(hstate_vma(vma))), gfp_flags);
> } else {
> - zl = policy_zonelist(gfp_flags, *mpol);
> - if ((*mpol)->mode == MPOL_BIND)
> + zl = policy_zonelist(gfp_flags, pol);
> + if (pol->mode == MPOL_BIND)
> *nodemask = &(*mpol)->v.nodes;
> }
> return zl;
> @@ -1609,11 +1637,14 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
> {
> struct mempolicy *mempolicy;
> int nid;
> + unsigned long irqflags;
>
> if (!(mask && current->mempolicy))
> return false;
>
> + mems_slowpath_lock_irqsave(current, irqflags);
> mempolicy = current->mempolicy;
> +
> switch (mempolicy->mode) {
> case MPOL_PREFERRED:
> if (mempolicy->flags & MPOL_F_LOCAL)
> @@ -1633,6 +1664,8 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
> BUG();
> }
>
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> +
> return true;
> }
> #endif
> @@ -1722,7 +1755,22 @@ struct page *
> alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> {
> struct mempolicy *pol = get_vma_policy(current, vma, addr);
> + struct mempolicy policy;
> struct zonelist *zl;
> + struct page *page;
> + unsigned long seq, iflags;
> +
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, iflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
> + pol = &policy;
> + }
>
> if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
> unsigned nid;
> @@ -1736,15 +1784,16 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> /*
> * slow path: ref counted shared policy
> */
> - struct page *page = __alloc_pages_nodemask(gfp, 0,
> - zl, policy_nodemask(gfp, pol));
> + page = __alloc_pages_nodemask(gfp, 0, zl,
> + policy_nodemask(gfp, pol));
> __mpol_put(pol);
> return page;
> }
> /*
> * fast path: default or task policy
> */
> - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> + return page;
> }
>
> /**
> @@ -1761,26 +1810,37 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> * Allocate a page from the kernel page pool. When not in
> * interrupt context and apply the current process NUMA policy.
> * Returns NULL when no page can be allocated.
> - *
> - * Don't call cpuset_update_task_memory_state() unless
> - * 1) it's ok to take cpuset_sem (can WAIT), and
> - * 2) allocating for current task (not interrupt).
> */
> struct page *alloc_pages_current(gfp_t gfp, unsigned order)
> {
> struct mempolicy *pol = current->mempolicy;
> + struct mempolicy policy;
> + struct page *page;
> + unsigned long seq, irqflags;
> +
>
> if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> pol = &default_policy;
> -
> + else {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, irqflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current,
> + seq, irqflags));
> + pol = &policy;
> + }
> /*
> * No reference counting needed for current->mempolicy
> * nor system default_policy
> */
> if (pol->mode == MPOL_INTERLEAVE)
> - return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> - return __alloc_pages_nodemask(gfp, order,
> - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
> + page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
> + else
> + page = __alloc_pages_nodemask(gfp, order,
> + policy_zonelist(gfp, pol),
> + policy_nodemask(gfp, pol));
> +
> + return page;
> }
> EXPORT_SYMBOL(alloc_pages_current);
>
> @@ -2026,6 +2086,7 @@ restart:
> */
> void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
> {
> + unsigned long irqflags;
> int ret;
>
> sp->root = RB_ROOT; /* empty tree == default mempolicy */
> @@ -2043,9 +2104,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
> if (IS_ERR(new))
> goto put_free; /* no valid nodemask intersection */
>
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> mpol_put(mpol); /* drop our ref on sb mpol */
> if (ret)
> goto put_free;
> @@ -2200,6 +2261,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
> nodemask_t nodes;
> char *nodelist = strchr(str, ':');
> char *flags = strchr(str, '=');
> + unsigned long irqflags;
> int err = 1;
>
> if (nodelist) {
> @@ -2291,9 +2353,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
> int ret;
> NODEMASK_SCRATCH(scratch);
> if (scratch) {
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, &nodes, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> } else
> ret = -ENOMEM;
> NODEMASK_SCRATCH_FREE(scratch);
> @@ -2487,8 +2549,10 @@ int show_numa_map(struct seq_file *m, void *v)
> struct file *file = vma->vm_file;
> struct mm_struct *mm = vma->vm_mm;
> struct mempolicy *pol;
> + struct mempolicy policy;
> int n;
> char buffer[50];
> + unsigned long iflags, seq;
>
> if (!mm)
> return 0;
> @@ -2498,6 +2562,18 @@ int show_numa_map(struct seq_file *m, void *v)
> return 0;
>
> pol = get_vma_policy(priv->task, vma, vma->vm_start);
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, iflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
> + pol = &policy;
> + }
> +
> mpol_to_str(buffer, sizeof(buffer), pol, 0);
> mpol_cond_put(pol);
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 09f1572..b8f5acb 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3282,14 +3282,24 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
> static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
> {
> int nid_alloc, nid_here;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> if (in_interrupt() || (flags & __GFP_THISNODE))
> return NULL;
> +
> nid_alloc = nid_here = numa_node_id();
> if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
> nid_alloc = cpuset_mem_spread_node();
> - else if (current->mempolicy)
> - nid_alloc = slab_node(current->mempolicy);
> + else if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> +
> + nid_alloc = slab_node(&mpol);
> + }
> +
> if (nid_alloc != nid_here)
> return ____cache_alloc_node(cachep, flags, nid_alloc);
> return NULL;
> @@ -3312,11 +3322,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
> enum zone_type high_zoneidx = gfp_zone(flags);
> void *obj = NULL;
> int nid;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> if (flags & __GFP_THISNODE)
> return NULL;
>
> - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> + if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> + zonelist = node_zonelist(slab_node(&mpol), flags);
> + } else
> + zonelist = node_zonelist(slab_node(NULL), flags);
> +
> local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
>
> retry:
> diff --git a/mm/slub.c b/mm/slub.c
> index b364844..436c521 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1345,6 +1345,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
> struct zone *zone;
> enum zone_type high_zoneidx = gfp_zone(flags);
> struct page *page;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> /*
> * The defrag ratio allows a configuration of the tradeoffs between
> @@ -1368,7 +1370,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
> get_cycles() % 1024 > s->remote_node_defrag_ratio)
> return NULL;
>
> - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> + if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> + zonelist = node_zonelist(slab_node(&mpol), flags);
> + } else
> + zonelist = node_zonelist(slab_node(NULL), flags);
> +
> for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> struct kmem_cache_node *n;
>
> --
> 1.6.5.2
>
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>