All vma manipulation is somewhat protected by a down_read on
mmap_lock, so vma mempolicy is clear to obtain a reference.
But there is no locking in process context and have a mix
of reference counting and per-task requirements which is rather
subtle and easy to get wrong.
we would have get_vma_policy() always returning a reference
counted policy, except for static policy. For better performance,
we replace atomic_t ref with percpu_ref in mempolicy, which is
usually the performance bottleneck in hot path.
This series adjust the reference of mempolicy in process context,
which will be protected by RCU in read hot path. Besides,
task->mempolicy is also protected by task_lock(). Percpu_ref
is a good way to reduce cache line bouncing.
The mpol_get/put() can just increment or decrement the local
counter. Mpol_kill() must be called to initiate the destruction
of mempolicy. A mempolicy will be freed when the mpol_kill()
is called and the reference count decrese to zero.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Zhongkun He <[email protected]>
---
include/linux/mempolicy.h | 65 +++++++++++++++++++------------
include/uapi/linux/mempolicy.h | 2 +-
mm/mempolicy.c | 71 ++++++++++++++++++++++------------
3 files changed, 89 insertions(+), 49 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..9178b008eadf 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -28,12 +28,16 @@ struct mm_struct;
* of the current process.
*
* Locking policy for interleave:
- * In process context there is no locking because only the process accesses
- * its own state. All vma manipulation is somewhat protected by a down_read on
+ * percpu_ref is used to reduce cache line bouncing.
+ * In process context we should obtain a reference via mpol_get()
+ * protected by RCU.
+ * All vma manipulation is somewhat protected by a down_read on
* mmap_lock.
*
* Freeing policy:
- * Mempolicy objects are reference counted. A mempolicy will be freed when
+ * Mempolicy objects are reference counted. The mpol_get/put can just increment
+ * or decrement the local counter. Mpol_kill() must be called to initiate the
+ * destruction of mempolicy. A mempolicy will be freed when mpol_kill()/
* mpol_put() decrements the reference count to zero.
*
* Duplicating policy objects:
@@ -42,43 +46,36 @@ struct mm_struct;
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
- atomic_t refcnt;
- unsigned short mode; /* See MPOL_* above */
+ struct percpu_ref refcnt; /* reduce cache line bouncing */
+ unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
+ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
nodemask_t nodes; /* interleave/bind/perfer */
- int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
union {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
+ struct rcu_head rcu; /* used for freeing in an RCU-safe manner */
} w;
};
/*
- * Support for managing mempolicy data objects (clone, copy, destroy)
- * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
+ * Mempolicy pol need explicit unref after use, except for
+ * static policies(default_policy, preferred_node_policy).
*/
-
-extern void __mpol_put(struct mempolicy *pol);
-static inline void mpol_put(struct mempolicy *pol)
+static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
- if (pol)
- __mpol_put(pol);
+ return (pol && !(pol->flags & MPOL_F_STATIC));
}
/*
- * Does mempolicy pol need explicit unref after use?
- * Currently only needed for shared policies.
+ * Put a mpol reference obtained via mpol_get().
*/
-static inline int mpol_needs_cond_ref(struct mempolicy *pol)
-{
- return (pol && (pol->flags & MPOL_F_SHARED));
-}
-static inline void mpol_cond_put(struct mempolicy *pol)
+static inline void mpol_put(struct mempolicy *pol)
{
if (mpol_needs_cond_ref(pol))
- __mpol_put(pol);
+ percpu_ref_put(&pol->refcnt);
}
extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
@@ -91,12 +88,28 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
#define vma_policy(vma) ((vma)->vm_policy)
+/* Obtain a reference on the specified mpol */
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
- atomic_inc(&pol->refcnt);
+ percpu_ref_get(&pol->refcnt);
+}
+
+static inline bool mpol_tryget(struct mempolicy *pol)
+{
+ return pol && percpu_ref_tryget(&pol->refcnt);
}
+/*
+ * This function initiates destruction of mempolicy.
+ */
+static inline void mpol_kill(struct mempolicy *pol)
+{
+ if (pol)
+ percpu_ref_kill(&pol->refcnt);
+}
+
+
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
{
}
-static inline void mpol_cond_put(struct mempolicy *pol)
+static inline void mpol_get(struct mempolicy *pol)
{
}
-static inline void mpol_get(struct mempolicy *pol)
+static inline bool mpol_tryget(struct mempolicy *pol)
+{
+}
+
+static inline void mpol_kill(struct mempolicy *pol)
{
}
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..940e1a88a4da 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -60,7 +60,7 @@ enum {
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
-#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
+#define MPOL_F_STATIC (1 << 0) /* identify static policies(e.g default_policy) */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 61aa9aedb728..ee3e2ed5ef07 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -124,8 +124,8 @@ enum zone_type policy_zone = 0;
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
- .refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_LOCAL,
+ .flags = MPOL_F_STATIC,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -158,9 +158,32 @@ int numa_map_to_online_node(int node)
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+/* Obtain a reference on the specified task mempolicy */
+static mempolicy *get_task_mpol(struct task_struct *p)
+{
+ struct mempolicy *pol;
+
+ rcu_read_lock();
+ pol = rcu_dereference(p->mempolicy);
+
+ if (!pol || mpol_tryget(pol))
+ pol = NULL;
+ rcu_read_unlock();
+
+ return pol;
+}
+
+static void mpol_release(struct percpu_ref *ref)
+{
+ struct mempolicy *mpol = container_of(ref, struct mempolicy, refcnt);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(mpol, w.rcu);
+}
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
- struct mempolicy *pol = p->mempolicy;
+ struct mempolicy *pol = get_task_mpol(p);
int node;
if (pol)
@@ -296,7 +319,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
+
+ if (percpu_ref_init(&policy->refcnt, mpol_release, 0,
+ GFP_KERNEL)) {
+ kmem_cache_free(policy_cache, policy);
+ return ERR_PTR(-ENOMEM);
+ }
policy->mode = mode;
policy->flags = flags;
policy->home_node = NUMA_NO_NODE;
@@ -304,14 +332,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return policy;
}
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
- if (!atomic_dec_and_test(&p->refcnt))
- return;
- kmem_cache_free(policy_cache, p);
-}
-
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
@@ -1759,14 +1779,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
} else if (vma->vm_policy) {
pol = vma->vm_policy;
- /*
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
- * count on these policies which will be dropped by
- * mpol_cond_put() later
- */
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
+ /* vma policy is protected by mmap_lock. */
+ mpol_get(pol);
}
}
@@ -2423,7 +2437,13 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
- atomic_set(&new->refcnt, 1);
+
+ if (percpu_ref_init(&new->refcnt, mpol_release, 0,
+ GFP_KERNEL)) {
+ kmem_cache_free(policy_cache, new);
+ return ERR_PTR(-ENOMEM);
+ }
+
return new;
}
@@ -2687,7 +2707,6 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
kmem_cache_free(sn_cache, n);
return NULL;
}
- newpol->flags |= MPOL_F_SHARED;
sp_node_init(n, start, end, newpol);
return n;
@@ -2720,7 +2739,10 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
goto alloc_new;
*mpol_new = *n->policy;
- atomic_set(&mpol_new->refcnt, 1);
+ ret = percpu_ref_init(&mpol_new->refcnt,
+ mpol_release, 0, GFP_KERNEL);
+ if (ret)
+ goto err_out;
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
@@ -2756,7 +2778,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
- atomic_set(&mpol_new->refcnt, 1);
+
goto restart;
}
@@ -2917,7 +2939,8 @@ void __init numa_policy_init(void)
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
- .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .flags = MPOL_F_MOF | MPOL_F_MORON
+ | MPOL_F_STATIC,
.nodes = nodemask_of_node(nid),
};
}
--
2.25.1
mpol_kill() is used to initiate destruction of mempolicy.
so it is called to free the mempolicy. The pol_put() just
decrement the local counter.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Zhongkun He <[email protected]>
---
fs/hugetlbfs/inode.c | 2 +-
fs/proc/task_mmu.c | 3 +--
kernel/fork.c | 4 ++--
mm/hugetlb.c | 6 +++---
mm/mempolicy.c | 42 +++++++++++++++++++++---------------------
mm/mmap.c | 10 +++++-----
mm/shmem.c | 10 +++++-----
7 files changed, 38 insertions(+), 39 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..bad1b07f8653 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -93,7 +93,7 @@ static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
- mpol_cond_put(vma->vm_policy);
+ mpol_put(vma->vm_policy);
}
#else
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af0..24aac42428b3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -107,7 +107,6 @@ static void hold_task_mempolicy(struct proc_maps_private *priv)
task_lock(task);
priv->task_mempolicy = get_task_policy(task);
- mpol_get(priv->task_mempolicy);
task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
@@ -1949,7 +1948,7 @@ static int show_numa_map(struct seq_file *m, void *v)
pol = __get_vma_policy(vma, vma->vm_start);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
- mpol_cond_put(pol);
+ mpol_put(pol);
} else {
mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa38d..97ba127a1b89 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -712,7 +712,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
fail_nomem_mas_store:
unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
+ mpol_kill(vma_policy(tmp));
fail_nomem_policy:
vm_area_free(tmp);
fail_nomem:
@@ -2537,7 +2537,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
- mpol_put(p->mempolicy);
+ mpol_kill(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..277330f40818 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1246,7 +1246,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
h->resv_huge_pages--;
}
- mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
err:
@@ -2315,7 +2315,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
if (!page)
page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
- mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
}
@@ -2351,7 +2351,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
- mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ee3e2ed5ef07..f1857ebded46 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -795,11 +795,11 @@ static int vma_replace_policy(struct vm_area_struct *vma,
old = vma->vm_policy;
vma->vm_policy = new; /* protected by mmap_lock */
- mpol_put(old);
+ mpol_kill(old);
return 0;
err_out:
- mpol_put(new);
+ mpol_kill(new);
return err;
}
@@ -890,7 +890,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
- mpol_put(new);
+ mpol_kill(new);
goto out;
}
@@ -899,7 +899,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
if (new && new->mode == MPOL_INTERLEAVE)
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
- mpol_put(old);
+ mpol_kill(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
@@ -925,8 +925,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
*nodes = p->nodes;
break;
case MPOL_LOCAL:
- /* return empty node mask for local allocation */
- break;
+ /* return empty node mask for local allocation */killbreak;
default:
BUG();
}
@@ -1370,7 +1369,7 @@ static long do_mbind(unsigned long start, unsigned long len,
mmap_write_unlock(mm);
mpol_out:
- mpol_put(new);
+ mpol_kill(new);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_enable();
return err;
@@ -1566,7 +1565,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
new->home_node = home_node;
err = mbind_range(mm, vmstart, vmend, new);
- mpol_put(new);
+ mpol_kill(new);
if (err)
break;
}
@@ -1813,14 +1812,13 @@ static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
+ bool ret = false;
if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
-
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
- mpol_cond_put(pol);
+ mpol_put(pol);
return ret;
}
@@ -1828,8 +1826,9 @@ bool vma_policy_mof(struct vm_area_struct *vma)
pol = vma->vm_policy;
if (!pol)
pol = get_task_policy(current);
+ mpol_put(pol);
- return pol->flags & MPOL_F_MOF;
+ return ret;
}
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
@@ -2193,7 +2192,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
- mpol_cond_put(pol);
+ mpol_put(pol);
gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
if (page && order > 1)
@@ -2208,7 +2207,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
node = policy_node(gfp, pol, node);
gfp |= __GFP_COMP;
page = alloc_pages_preferred_many(gfp, order, node, pol);
- mpol_cond_put(pol);
+ mpol_put(pol);
if (page && order > 1)
prep_transhuge_page(page);
folio = (struct folio *)page;
@@ -2233,7 +2232,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
- mpol_cond_put(pol);
+ mpol_put(pol);
/*
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
@@ -2258,7 +2257,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
folio = __folio_alloc(gfp, order, preferred_nid, nmask);
- mpol_cond_put(pol);
+ mpol_put(pol);
out:
return folio;
}
@@ -2300,6 +2299,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
+ mpol_put(pol);
return page;
}
EXPORT_SYMBOL(alloc_pages);
@@ -2566,7 +2566,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
static void sp_free(struct sp_node *n)
{
- mpol_put(n->policy);
+ mpol_kill(n->policy);
kmem_cache_free(sn_cache, n);
}
@@ -2655,7 +2655,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
if (curnid != polnid)
ret = polnid;
out:
- mpol_cond_put(pol);
+ mpol_put(pol);
return ret;
}
@@ -2674,7 +2674,7 @@ void mpol_put_task_policy(struct task_struct *task)
pol = task->mempolicy;
task->mempolicy = NULL;
task_unlock(task);
- mpol_put(pol);
+ mpol_kill(pol);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
@@ -2763,7 +2763,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
err_out:
if (mpol_new)
- mpol_put(mpol_new);
+ mpol_kill(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
@@ -2823,7 +2823,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
put_new:
- mpol_put(new); /* drop initial ref */
+ mpol_kill(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
diff --git a/mm/mmap.c b/mm/mmap.c
index 2def55555e05..7bf785463499 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -140,7 +140,7 @@ static void remove_vma(struct vm_area_struct *vma)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
- mpol_put(vma_policy(vma));
+ mpol_kill(vma_policy(vma));
vm_area_free(vma);
}
@@ -595,7 +595,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
+ mpol_kill(vma_policy(next));
vm_area_free(next);
}
@@ -836,7 +836,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
+ mpol_kill(vma_policy(next));
if (remove_next != 2)
BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
@@ -2253,7 +2253,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
fput(new->vm_file);
unlink_anon_vmas(new);
out_free_mpol:
- mpol_put(vma_policy(new));
+ mpol_kill(vma_policy(new));
out_free_vma:
vm_area_free(new);
validate_mm_mt(mm);
@@ -3246,7 +3246,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unlink_anon_vmas(new_vma);
out_free_mempol:
- mpol_put(vma_policy(new_vma));
+ mpol_kill(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
diff --git a/mm/shmem.c b/mm/shmem.c
index c1d8b8a1aa3b..11e57d79c104 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1485,7 +1485,7 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
{
/* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(vma->vm_policy);
+ mpol_put(vma->vm_policy);
}
static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
@@ -3528,7 +3528,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_mpol:
if (IS_ENABLED(CONFIG_NUMA)) {
- mpol_put(ctx->mpol);
+ mpol_kill(ctx->mpol);
ctx->mpol = NULL;
if (mpol_parse_str(param->string, &ctx->mpol))
goto bad_value;
@@ -3666,7 +3666,7 @@ static int shmem_reconfigure(struct fs_context *fc)
ctx->mpol = NULL;
}
raw_spin_unlock(&sbinfo->stat_lock);
- mpol_put(mpol);
+ mpol_kill(mpol);
return 0;
out:
raw_spin_unlock(&sbinfo->stat_lock);
@@ -3730,7 +3730,7 @@ static void shmem_put_super(struct super_block *sb)
free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
- mpol_put(sbinfo->mpol);
+ mpol_kill(sbinfo->mpol);
kfree(sbinfo);
sb->s_fs_info = NULL;
}
@@ -3830,7 +3830,7 @@ static void shmem_free_fc(struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
if (ctx) {
- mpol_put(ctx->mpol);
+ mpol_kill(ctx->mpol);
kfree(ctx);
}
}
--
2.25.1
Hi Zhongkun,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Zhongkun-He/mm-replace-atomic_t-with-percpu_ref-in-mempolicy/20221205-001554
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20221204161432.2149375-1-hezhongkun.hzk%40bytedance.com
patch subject: [PATCH 0/3] mm: replace atomic_t with percpu_ref in mempolicy.
config: i386-randconfig-a013
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/ea5329aa9a0c20be63930f7c0fbb8aa238772851
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Zhongkun-He/mm-replace-atomic_t-with-percpu_ref-in-mempolicy/20221205-001554
git checkout ea5329aa9a0c20be63930f7c0fbb8aa238772851
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from drivers/char/mem.c:25:
In file included from include/linux/shmem_fs.h:7:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
}
^
1 error generated.
--
In file included from kernel/exit.c:38:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
}
^
kernel/exit.c:1839:13: warning: no previous prototype for function 'abort' [-Wmissing-prototypes]
__weak void abort(void)
^
kernel/exit.c:1839:8: note: declare 'static' if the function is not intended to be used outside of this translation unit
__weak void abort(void)
^
static
1 warning and 1 error generated.
--
In file included from mm/shmem.c:38:
In file included from include/linux/hugetlb.h:30:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
}
^
mm/shmem.c:1487:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
mpol_cond_put(vma->vm_policy);
^
2 errors generated.
--
In file included from mm/hugetlb.c:15:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
}
^
mm/hugetlb.c:1249:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
mpol_cond_put(mpol);
^
mm/hugetlb.c:2324:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
mpol_cond_put(mpol);
^
mm/hugetlb.c:2360:2: error: implicit declaration of function 'mpol_cond_put' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
mpol_cond_put(mpol);
^
4 errors generated.
--
In file included from kernel/sched/fair.c:44:
>> include/linux/mempolicy.h:219:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
}
^
kernel/sched/fair.c:5794:6: warning: no previous prototype for function 'init_cfs_bandwidth' [-Wmissing-prototypes]
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
^
kernel/sched/fair.c:5794:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
^
static
kernel/sched/fair.c:12118:6: warning: no previous prototype for function 'free_fair_sched_group' [-Wmissing-prototypes]
void free_fair_sched_group(struct task_group *tg) { }
^
kernel/sched/fair.c:12118:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void free_fair_sched_group(struct task_group *tg) { }
^
static
kernel/sched/fair.c:12120:5: warning: no previous prototype for function 'alloc_fair_sched_group' [-Wmissing-prototypes]
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
^
kernel/sched/fair.c:12120:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
^
static
kernel/sched/fair.c:12125:6: warning: no previous prototype for function 'online_fair_sched_group' [-Wmissing-prototypes]
void online_fair_sched_group(struct task_group *tg) { }
^
kernel/sched/fair.c:12125:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void online_fair_sched_group(struct task_group *tg) { }
^
static
kernel/sched/fair.c:12127:6: warning: no previous prototype for function 'unregister_fair_sched_group' [-Wmissing-prototypes]
void unregister_fair_sched_group(struct task_group *tg) { }
^
kernel/sched/fair.c:12127:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void unregister_fair_sched_group(struct task_group *tg) { }
^
static
5 warnings and 1 error generated.
vim +219 include/linux/mempolicy.h
216
217 static inline bool mpol_tryget(struct mempolicy *pol)
218 {
> 219 }
220
--
0-DAY CI Kernel Test Service
https://01.org/lkp
On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> All vma manipulation is somewhat protected by a down_read on
> mmap_lock, so vma mempolicy is clear to obtain a reference.
> But there is no locking in process context and have a mix
> of reference counting and per-task requirements which is rather
> subtle and easy to get wrong.
>
> we would have get_vma_policy() always returning a reference
> counted policy, except for static policy. For better performance,
> we replace atomic_t ref with percpu_ref in mempolicy, which is
> usually the performance bottleneck in hot path.
>
> This series adjust the reference of mempolicy in process context,
> which will be protected by RCU in read hot path. Besides,
> task->mempolicy is also protected by task_lock(). Percpu_ref
> is a good way to reduce cache line bouncing.
>
> The mpol_get/put() can just increment or decrement the local
> counter. Mpol_kill() must be called to initiate the destruction
> of mempolicy. A mempolicy will be freed when the mpol_kill()
> is called and the reference count decrese to zero.
This is really hard to follow. Without having the context from previous
discussions I would be completely lost. Please structure your cover
letter but also other patch in general in the form:
- what is the problem you would like to deal with
- want to introduce pidfd_set_mempolicy because XYZ
- what stands in the way
- mempolicy objects access constrains (reliance on operating in
the current context)
- reference counting needs to be unconditional
- why regular reference counting is not sufficient (performance)
- what is this patchset proposing
- per cpu reference counting
- how is it implemented
- how is the patch series structured
- make the reference counting unconditional
- special case static (never released) policies
- replace standard ref counting by per-cpu reference counting
- how has this been tested?
Makes sense?
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Zhongkun He <[email protected]>
> ---
> include/linux/mempolicy.h | 65 +++++++++++++++++++------------
> include/uapi/linux/mempolicy.h | 2 +-
> mm/mempolicy.c | 71 ++++++++++++++++++++++------------
> 3 files changed, 89 insertions(+), 49 deletions(-)
>
Is the following the cumulative diff?
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index d232de7cdc56..9178b008eadf 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -28,12 +28,16 @@ struct mm_struct;
> * of the current process.
> *
> * Locking policy for interleave:
> - * In process context there is no locking because only the process accesses
> - * its own state. All vma manipulation is somewhat protected by a down_read on
> + * percpu_ref is used to reduce cache line bouncing.
> + * In process context we should obtain a reference via mpol_get()
> + * protected by RCU.
> + * All vma manipulation is somewhat protected by a down_read on
> * mmap_lock.
> *
> * Freeing policy:
> - * Mempolicy objects are reference counted. A mempolicy will be freed when
> + * Mempolicy objects are reference counted. The mpol_get/put can just increment
> + * or decrement the local counter. Mpol_kill() must be called to initiate the
> + * destruction of mempolicy. A mempolicy will be freed when mpol_kill()/
> * mpol_put() decrements the reference count to zero.
> *
> * Duplicating policy objects:
> @@ -42,43 +46,36 @@ struct mm_struct;
> * to 1, representing the caller of mpol_dup().
> */
> struct mempolicy {
> - atomic_t refcnt;
> - unsigned short mode; /* See MPOL_* above */
> + struct percpu_ref refcnt; /* reduce cache line bouncing */
> + unsigned short mode; /* See MPOL_* above */
> unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
> + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
> nodemask_t nodes; /* interleave/bind/perfer */
> - int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>
> union {
> nodemask_t cpuset_mems_allowed; /* relative to these nodes */
> nodemask_t user_nodemask; /* nodemask passed by user */
> + struct rcu_head rcu; /* used for freeing in an RCU-safe manner */
> } w;
> };
>
> /*
> - * Support for managing mempolicy data objects (clone, copy, destroy)
> - * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
> + * Mempolicy pol need explicit unref after use, except for
> + * static policies(default_policy, preferred_node_policy).
> */
> -
> -extern void __mpol_put(struct mempolicy *pol);
> -static inline void mpol_put(struct mempolicy *pol)
> +static inline int mpol_needs_cond_ref(struct mempolicy *pol)
> {
> - if (pol)
> - __mpol_put(pol);
> + return (pol && !(pol->flags & MPOL_F_STATIC));
> }
>
> /*
> - * Does mempolicy pol need explicit unref after use?
> - * Currently only needed for shared policies.
> + * Put a mpol reference obtained via mpol_get().
> */
> -static inline int mpol_needs_cond_ref(struct mempolicy *pol)
> -{
> - return (pol && (pol->flags & MPOL_F_SHARED));
> -}
>
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_put(struct mempolicy *pol)
> {
> if (mpol_needs_cond_ref(pol))
> - __mpol_put(pol);
> + percpu_ref_put(&pol->refcnt);
> }
>
> extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
> @@ -91,12 +88,28 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
>
> #define vma_policy(vma) ((vma)->vm_policy)
>
> +/* Obtain a reference on the specified mpol */
> static inline void mpol_get(struct mempolicy *pol)
> {
> if (pol)
> - atomic_inc(&pol->refcnt);
> + percpu_ref_get(&pol->refcnt);
> +}
> +
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> + return pol && percpu_ref_tryget(&pol->refcnt);
> }
>
> +/*
> + * This function initiates destruction of mempolicy.
> + */
> +static inline void mpol_kill(struct mempolicy *pol)
> +{
> + if (pol)
> + percpu_ref_kill(&pol->refcnt);
> +}
> +
> +
> extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
> static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
> {
> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
> {
> }
>
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_get(struct mempolicy *pol)
> {
> }
>
> -static inline void mpol_get(struct mempolicy *pol)
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +}
> +
> +static inline void mpol_kill(struct mempolicy *pol)
> {
> }
>
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..940e1a88a4da 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -60,7 +60,7 @@ enum {
> * "mode flags". These flags are allocated from bit 0 up, as they
> * are never OR'ed into the mode in mempolicy API arguments.
> */
> -#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
> +#define MPOL_F_STATIC (1 << 0) /* identify static policies(e.g default_policy) */
> #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
> #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 61aa9aedb728..ee3e2ed5ef07 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -124,8 +124,8 @@ enum zone_type policy_zone = 0;
> * run-time system-wide default policy => local allocation
> */
> static struct mempolicy default_policy = {
> - .refcnt = ATOMIC_INIT(1), /* never free it */
> .mode = MPOL_LOCAL,
> + .flags = MPOL_F_STATIC,
> };
>
> static struct mempolicy preferred_node_policy[MAX_NUMNODES];
> @@ -158,9 +158,32 @@ int numa_map_to_online_node(int node)
> }
> EXPORT_SYMBOL_GPL(numa_map_to_online_node);
>
> +/* Obtain a reference on the specified task mempolicy */
> +static mempolicy *get_task_mpol(struct task_struct *p)
> +{
> + struct mempolicy *pol;
> +
> + rcu_read_lock();
> + pol = rcu_dereference(p->mempolicy);
> +
> + if (!pol || mpol_tryget(pol))
> + pol = NULL;
> + rcu_read_unlock();
> +
> + return pol;
> +}
> +
> +static void mpol_release(struct percpu_ref *ref)
> +{
> + struct mempolicy *mpol = container_of(ref, struct mempolicy, refcnt);
> +
> + percpu_ref_exit(ref);
> + kfree_rcu(mpol, w.rcu);
> +}
> +
> struct mempolicy *get_task_policy(struct task_struct *p)
> {
> - struct mempolicy *pol = p->mempolicy;
> + struct mempolicy *pol = get_task_mpol(p);
> int node;
>
> if (pol)
> @@ -296,7 +319,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
> policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> if (!policy)
> return ERR_PTR(-ENOMEM);
> - atomic_set(&policy->refcnt, 1);
> +
> + if (percpu_ref_init(&policy->refcnt, mpol_release, 0,
> + GFP_KERNEL)) {
> + kmem_cache_free(policy_cache, policy);
> + return ERR_PTR(-ENOMEM);
> + }
> policy->mode = mode;
> policy->flags = flags;
> policy->home_node = NUMA_NO_NODE;
> @@ -304,14 +332,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
> return policy;
> }
>
> -/* Slow path of a mpol destructor. */
> -void __mpol_put(struct mempolicy *p)
> -{
> - if (!atomic_dec_and_test(&p->refcnt))
> - return;
> - kmem_cache_free(policy_cache, p);
> -}
> -
> static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
> {
> }
> @@ -1759,14 +1779,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
> } else if (vma->vm_policy) {
> pol = vma->vm_policy;
>
> - /*
> - * shmem_alloc_page() passes MPOL_F_SHARED policy with
> - * a pseudo vma whose vma->vm_ops=NULL. Take a reference
> - * count on these policies which will be dropped by
> - * mpol_cond_put() later
> - */
> - if (mpol_needs_cond_ref(pol))
> - mpol_get(pol);
> + /* vma policy is protected by mmap_lock. */
> + mpol_get(pol);
> }
> }
>
> @@ -2423,7 +2437,13 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
> nodemask_t mems = cpuset_mems_allowed(current);
> mpol_rebind_policy(new, &mems);
> }
> - atomic_set(&new->refcnt, 1);
> +
> + if (percpu_ref_init(&new->refcnt, mpol_release, 0,
> + GFP_KERNEL)) {
> + kmem_cache_free(policy_cache, new);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> return new;
> }
>
> @@ -2687,7 +2707,6 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
> kmem_cache_free(sn_cache, n);
> return NULL;
> }
> - newpol->flags |= MPOL_F_SHARED;
> sp_node_init(n, start, end, newpol);
>
> return n;
> @@ -2720,7 +2739,10 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
> goto alloc_new;
>
> *mpol_new = *n->policy;
> - atomic_set(&mpol_new->refcnt, 1);
> + ret = percpu_ref_init(&mpol_new->refcnt,
> + mpol_release, 0, GFP_KERNEL);
> + if (ret)
> + goto err_out;
> sp_node_init(n_new, end, n->end, mpol_new);
> n->end = start;
> sp_insert(sp, n_new);
> @@ -2756,7 +2778,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
> mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> if (!mpol_new)
> goto err_out;
> - atomic_set(&mpol_new->refcnt, 1);
> +
> goto restart;
> }
>
> @@ -2917,7 +2939,8 @@ void __init numa_policy_init(void)
> preferred_node_policy[nid] = (struct mempolicy) {
> .refcnt = ATOMIC_INIT(1),
> .mode = MPOL_PREFERRED,
> - .flags = MPOL_F_MOF | MPOL_F_MORON,
> + .flags = MPOL_F_MOF | MPOL_F_MORON
> + | MPOL_F_STATIC,
> .nodes = nodemask_of_node(nid),
> };
> }
> --
> 2.25.1
--
Michal Hocko
SUSE Labs
On Fri 13-01-23 17:20:39, Michal Hocko wrote:
> On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> > All vma manipulation is somewhat protected by a down_read on
> > mmap_lock, so vma mempolicy is clear to obtain a reference.
> > But there is no locking in process context and have a mix
> > of reference counting and per-task requirements which is rather
> > subtle and easy to get wrong.
> >
> > we would have get_vma_policy() always returning a reference
> > counted policy, except for static policy. For better performance,
> > we replace atomic_t ref with percpu_ref in mempolicy, which is
> > usually the performance bottleneck in hot path.
> >
> > This series adjust the reference of mempolicy in process context,
> > which will be protected by RCU in read hot path. Besides,
> > task->mempolicy is also protected by task_lock(). Percpu_ref
> > is a good way to reduce cache line bouncing.
> >
> > The mpol_get/put() can just increment or decrement the local
> > counter. Mpol_kill() must be called to initiate the destruction
> > of mempolicy. A mempolicy will be freed when the mpol_kill()
> > is called and the reference count decrese to zero.
>
> This is really hard to follow. Without having the context from previous
> discussions I would be completely lost. Please structure your cover
> letter but also other patch in general in the form:
> - what is the problem you would like to deal with
> - want to introduce pidfd_set_mempolicy because XYZ
> - what stands in the way
> - mempolicy objects access constrains (reliance on operating in
> the current context)
> - reference counting needs to be unconditional
> - why regular reference counting is not sufficient (performance)
> - what is this patchset proposing
> - per cpu reference counting
> - how is it implemented
> - how is the patch series structured
> - make the reference counting unconditional
> - special case static (never released) policies
> - replace standard ref counting by per-cpu reference counting
- introduce pidfd_set_mempolicy
> - how has this been tested?
--
Michal Hocko
SUSE Labs
On Mon 05-12-22 00:14:29, Zhongkun He wrote:
[...]
> +/* Obtain a reference on the specified mpol */
> static inline void mpol_get(struct mempolicy *pol)
> {
> if (pol)
Shouldn't this be mpol_needs_cond_ref?
> - atomic_inc(&pol->refcnt);
> + percpu_ref_get(&pol->refcnt);
> +}
> +
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> + return pol && percpu_ref_tryget(&pol->refcnt);
> }
>
> +/*
> + * This function initiates destruction of mempolicy.
This is not a useful comment. It would be much more helpful to say when
this should be called.
> + */
> +static inline void mpol_kill(struct mempolicy *pol)
> +{
> + if (pol)
> + percpu_ref_kill(&pol->refcnt);
> +}
> +
> +
> extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
> static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
> {
> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
> {
> }
>
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_get(struct mempolicy *pol)
> {
> }
>
> -static inline void mpol_get(struct mempolicy *pol)
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +}
This should return false, right?
[...]
> +/* Obtain a reference on the specified task mempolicy */
Again, this is pretty much clear from the name. It would be more useful
to explain how the pointer can be used - e.g. needs to call mpol_put
or mpol_kill depending on the calling context.
> +static mempolicy *get_task_mpol(struct task_struct *p)
> +{
> + struct mempolicy *pol;
> +
> + rcu_read_lock();
> + pol = rcu_dereference(p->mempolicy);
> +
> + if (!pol || mpol_tryget(pol))
Shouldn't be !mpol_tryget?
> + pol = NULL;
> + rcu_read_unlock();
> +
> + return pol;
> +}
> +
I do not see any rcu_assign_pointer for the newly created policy so this
seems incomplete. Ditto no mpol_kill calls. I am unlikely to get into
follow up patches now. Please split up the work so that it is reviewable
more easily and then I can have a further look.
Thanks!
--
Michal Hocko
SUSE Labs
> On Fri 13-01-23 17:20:39, Michal Hocko wrote:
>>
>> This is really hard to follow. Without having the context from previous
>> discussions I would be completely lost. Please structure your cover
>> letter but also other patch in general in the form:
>> - what is the problem you would like to deal with
>> - want to introduce pidfd_set_mempolicy because XYZ
>> - what stands in the way
>> - mempolicy objects access constrains (reliance on operating in
>> the current context)
>> - reference counting needs to be unconditional
>> - why regular reference counting is not sufficient (performance)
>> - what is this patchset proposing
>> - per cpu reference counting
>> - how is it implemented
>> - how is the patch series structured
>> - make the reference counting unconditional
>> - special case static (never released) policies
>> - replace standard ref counting by per-cpu reference counting
> - introduce pidfd_set_mempolicy
>> - how has this been tested?
Hi Michal, thanks for your review and suggestions.
I will follow the advice above to structure the letter and
split the patches smaller on next version.
Thanks.
> On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> [...]
>> +/* Obtain a reference on the specified mpol */
>> static inline void mpol_get(struct mempolicy *pol)
>> {
>> if (pol)
>
> Shouldn't this be mpol_needs_cond_ref?
>
>> - atomic_inc(&pol->refcnt);
>> + percpu_ref_get(&pol->refcnt);
>> +}
>> +
>> +static inline bool mpol_tryget(struct mempolicy *pol)
>> +{
>> + return pol && percpu_ref_tryget(&pol->refcnt);
>> }
>>
>> +/*
>> + * This function initiates destruction of mempolicy.
>
> This is not a useful comment. It would be much more helpful to say when
> this should be called.
>
>> + */
>> +static inline void mpol_kill(struct mempolicy *pol)
>> +{
>> + if (pol)
>> + percpu_ref_kill(&pol->refcnt);
>> +}
>> +
>> +
>> extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
>> static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
>> {
>> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
>> {
>> }
>>
>> -static inline void mpol_cond_put(struct mempolicy *pol)
>> +static inline void mpol_get(struct mempolicy *pol)
>> {
>> }
>>
>> -static inline void mpol_get(struct mempolicy *pol)
>> +static inline bool mpol_tryget(struct mempolicy *pol)
>> +{
>> +}
>
> This should return false, right?
>
> [...]
>> +/* Obtain a reference on the specified task mempolicy */
>
> Again, this is pretty much clear from the name. It would be more useful
> to explain how the pointer can be used - e.g. needs to call mpol_put
> or mpol_kill depending on the calling context.
>
>> +static mempolicy *get_task_mpol(struct task_struct *p)
>> +{
>> + struct mempolicy *pol;
>> +
>> + rcu_read_lock();
>> + pol = rcu_dereference(p->mempolicy);
>> +
>> + if (!pol || mpol_tryget(pol))
>
> Shouldn't be !mpol_tryget?
>
>> + pol = NULL;
>> + rcu_read_unlock();
>> +
>> + return pol;
>> +}
>> +
>
> I do not see any rcu_assign_pointer for the newly created policy so this
> seems incomplete. Ditto no mpol_kill calls. I am unlikely to get into
> follow up patches now. Please split up the work so that it is reviewable
> more easily and then I can have a further look.
>
> Thanks!
Thanks for your review, some changes may be in other patch,i will
reorganize the patches according to the suggestions to make
things clear.
Thanks.