Here are some revised patches based on the latest
topic-sunrpc-thread-scheduling branch.
The refactoring is now a bit tidier.
Thanks,
NeilBrown
[PATCH 1/6] SUNRPC: move all of xprt handling into svc_handle_xprt()
[PATCH 2/6] SUNRPC: rename and refactor svc_get_next_xprt()
[PATCH 3/6] SUNRPC: integrate back-channel processing with svc_recv()
[PATCH 4/6] SUNRPC: change how svc threads are asked to exit.
[PATCH 5/6] SUNRPC: add list of idle threads
[PATCH 6/6] SUNRPC: discard SP_CONGESTED
svc_get_next_xprt() does a lot more than just get an xprt. It also
decides if it needs to sleep, depending not only on the availability of
xprts but also on the need to exit or handle external work.
So rename it to svc_rqst_wait_for_work() and only do the testing and
waiting. Move all the waiting-related code out of svc_recv() into the
new svc_rqst_wait_for_work().
Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
Previously svc_xprt_dequeue() would be called twice, once before waiting
and possibly once after. Now instead rqst_should_sleep() is called
twice: once to decide if waiting is needed, and once again after
setting the task state, to see if we might have missed a wakeup.
Signed-off-by: NeilBrown <[email protected]>
---
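(Not part of the patch: a minimal sketch of the lost-wakeup-safe sleep
pattern that svc_rqst_wait_for_work() now follows; should_sleep() here
stands in for rqst_should_sleep(). Re-checking the condition after
set_current_state() closes the race with a wakeup that arrives between
the first test and the state change.)

	if (should_sleep()) {
		set_current_state(TASK_IDLE);
		/* advertise this thread as idle/wakeable here */
		if (should_sleep())
			/* any wake_up_process() issued after
			 * set_current_state() left us TASK_RUNNING,
			 * so schedule() returns promptly
			 */
			schedule();
		else
			__set_current_state(TASK_RUNNING);
		/* withdraw the idle advertisement here */
	}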
net/sunrpc/svc_xprt.c | 91 ++++++++++++++++++++-----------------------
1 file changed, 43 insertions(+), 48 deletions(-)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 60759647fee4..f1d64ded89fb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -722,51 +722,33 @@ rqst_should_sleep(struct svc_rqst *rqstp)
return true;
}
-static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp)
+static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
{
- struct svc_pool *pool = rqstp->rq_pool;
-
- /* rq_xprt should be clear on entry */
- WARN_ON_ONCE(rqstp->rq_xprt);
+ struct svc_pool *pool = rqstp->rq_pool;
- rqstp->rq_xprt = svc_xprt_dequeue(pool);
- if (rqstp->rq_xprt)
- goto out_found;
-
- set_current_state(TASK_IDLE);
- smp_mb__before_atomic();
- clear_bit(SP_CONGESTED, &pool->sp_flags);
- clear_bit(RQ_BUSY, &rqstp->rq_flags);
- smp_mb__after_atomic();
-
- if (likely(rqst_should_sleep(rqstp)))
- schedule();
- else
- __set_current_state(TASK_RUNNING);
+ if (rqst_should_sleep(rqstp)) {
+ set_current_state(TASK_IDLE);
+ smp_mb__before_atomic();
+ clear_bit(SP_CONGESTED, &pool->sp_flags);
+ clear_bit(RQ_BUSY, &rqstp->rq_flags);
+ smp_mb__after_atomic();
+
+ /* Need to check should_sleep() again after
+ * setting task state in case a wakeup happened
+ * between testing and setting.
+ */
+ if (rqst_should_sleep(rqstp)) {
+ schedule();
+ } else {
+ __set_current_state(TASK_RUNNING);
+ cond_resched();
+ }
+ set_bit(RQ_BUSY, &rqstp->rq_flags);
+ smp_mb__after_atomic();
+ } else
+ cond_resched();
try_to_freeze();
-
- set_bit(RQ_BUSY, &rqstp->rq_flags);
- smp_mb__after_atomic();
- clear_bit(SP_TASK_PENDING, &pool->sp_flags);
- rqstp->rq_xprt = svc_xprt_dequeue(pool);
- if (rqstp->rq_xprt)
- goto out_found;
-
- if (kthread_should_stop())
- return NULL;
- return NULL;
-out_found:
- clear_bit(SP_TASK_PENDING, &pool->sp_flags);
- /* Normally we will wait up to 5 seconds for any required
- * cache information to be provided.
- */
- if (!test_bit(SP_CONGESTED, &pool->sp_flags))
- rqstp->rq_chandle.thread_wait = 5*HZ;
- else
- rqstp->rq_chandle.thread_wait = 1*HZ;
- trace_svc_xprt_dequeue(rqstp);
- return rqstp->rq_xprt;
}
static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -858,20 +840,33 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
*/
void svc_recv(struct svc_rqst *rqstp)
{
- struct svc_xprt *xprt = NULL;
+ struct svc_pool *pool = rqstp->rq_pool;
if (!svc_alloc_arg(rqstp))
return;
- try_to_freeze();
- cond_resched();
+ svc_rqst_wait_for_work(rqstp);
+
+ clear_bit(SP_TASK_PENDING, &pool->sp_flags);
+
if (kthread_should_stop())
- goto out;
+ return;
+
+ rqstp->rq_xprt = svc_xprt_dequeue(pool);
+ if (rqstp->rq_xprt) {
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
+ /* Normally we will wait up to 5 seconds for any required
+ * cache information to be provided.
+ */
+ if (!test_bit(SP_CONGESTED, &pool->sp_flags))
+ rqstp->rq_chandle.thread_wait = 5 * HZ;
+ else
+ rqstp->rq_chandle.thread_wait = 1 * HZ;
- xprt = svc_get_next_xprt(rqstp);
- if (xprt)
+ trace_svc_xprt_dequeue(rqstp);
svc_handle_xprt(rqstp, xprt);
-out:
+ }
}
EXPORT_SYMBOL_GPL(svc_recv);
--
2.40.1
Rather than searching a list of threads to find an idle one, having a
list of idle threads allows an idle thread to be found immediately.
This adds some spin_lock calls, which is not ideal, but as the hold
time is tiny it is still faster than searching a list. A future patch will
remove them using llist.h. This involves some subtlety and so is left
to a separate patch.
This removes the need for the RQ_BUSY flag. The rqst is "busy"
precisely when it is not on the "idle" list.
Signed-off-by: NeilBrown <[email protected]>
---
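(Illustrative only, not part of this patch: a rough sketch of the
llist.h conversion mentioned above, assuming rq_idle becomes a
struct llist_node and sp_idle_threads a struct llist_head. Note that
llist_del_first() callers must still be serialised against each other,
which is part of the subtlety left to the later patch.)

	/* thread going idle: lock-free push onto the idle stack */
	llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);

	/* waker: pop one idle thread and wake it */
	struct llist_node *ln = llist_del_first(&pool->sp_idle_threads);

	if (ln) {
		struct svc_rqst *r = llist_entry(ln, struct svc_rqst, rq_idle);

		wake_up_process(r->rq_task);
	}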
include/linux/sunrpc/svc.h | 25 ++++++++++++++++++++++++-
include/trace/events/sunrpc.h | 1 -
net/sunrpc/svc.c | 13 ++++++++-----
net/sunrpc/svc_xprt.c | 15 +++++++++++----
4 files changed, 43 insertions(+), 11 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 1ac6f74781aa..8b93af92dd53 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -37,6 +37,7 @@ struct svc_pool {
struct list_head sp_sockets; /* pending sockets */
unsigned int sp_nrthreads; /* # of threads in pool */
struct list_head sp_all_threads; /* all server threads */
+ struct list_head sp_idle_threads; /* idle server threads */
/* statistics on pool operation */
struct percpu_counter sp_messages_arrived;
@@ -186,6 +187,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
*/
struct svc_rqst {
struct list_head rq_all; /* all threads list */
+ struct list_head rq_idle; /* On the idle list */
struct rcu_head rq_rcu_head; /* for RCU deferred kfree */
struct svc_xprt * rq_xprt; /* transport ptr */
@@ -262,10 +264,31 @@ enum {
RQ_SPLICE_OK, /* turned off in gss privacy to prevent
* encrypting page cache pages */
RQ_VICTIM, /* Have agreed to shut down */
- RQ_BUSY, /* request is busy */
RQ_DATA, /* request has data */
};
+/**
+ * svc_thread_set_busy - mark a thread as busy
+ * @rqstp: the thread which is now busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline void svc_thread_set_busy(struct svc_rqst *rqstp)
+{
+ INIT_LIST_HEAD(&rqstp->rq_idle);
+}
+
+/**
+ * svc_thread_busy - check if a thread is busy
+ * @rqstp: the thread which might be busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline bool svc_thread_busy(struct svc_rqst *rqstp)
+{
+ return list_empty(&rqstp->rq_idle);
+}
+
#define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
/*
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 6beb38c1dcb5..337c90787fb1 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1677,7 +1677,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
svc_rqst_flag(DROPME) \
svc_rqst_flag(SPLICE_OK) \
svc_rqst_flag(VICTIM) \
- svc_rqst_flag(BUSY) \
svc_rqst_flag_end(DATA)
#undef svc_rqst_flag
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 1233d72714b9..dce433dea1bd 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -641,7 +641,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
folio_batch_init(&rqstp->rq_fbatch);
- __set_bit(RQ_BUSY, &rqstp->rq_flags);
+ svc_thread_set_busy(rqstp);
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
@@ -702,10 +702,13 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
struct svc_rqst *rqstp;
rcu_read_lock();
- list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
- if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
- continue;
-
+ spin_lock_bh(&pool->sp_lock);
+ rqstp = list_first_entry_or_null(&pool->sp_idle_threads,
+ struct svc_rqst, rq_idle);
+ if (rqstp)
+ list_del_init(&rqstp->rq_idle);
+ spin_unlock_bh(&pool->sp_lock);
+ if (rqstp) {
WRITE_ONCE(rqstp->rq_qtime, ktime_get());
wake_up_process(rqstp->rq_task);
rcu_read_unlock();
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 0a300ae6a7ed..e44efcc21b63 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -737,8 +737,9 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
set_current_state(TASK_IDLE);
smp_mb__before_atomic();
clear_bit(SP_CONGESTED, &pool->sp_flags);
- clear_bit(RQ_BUSY, &rqstp->rq_flags);
- smp_mb__after_atomic();
+ spin_lock_bh(&pool->sp_lock);
+ list_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+ spin_unlock_bh(&pool->sp_lock);
/* Need to check should_sleep() again after
* setting task state in case a wakeup happened
@@ -751,8 +752,14 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
cond_resched();
}
- set_bit(RQ_BUSY, &rqstp->rq_flags);
- smp_mb__after_atomic();
+ /* We *must* be removed from the list before we can continue.
+ * If we were woken, this is already done
+ */
+ if (!svc_thread_busy(rqstp)) {
+ spin_lock_bh(&pool->sp_lock);
+ list_del_init(&rqstp->rq_idle);
+ spin_unlock_bh(&pool->sp_lock);
+ }
} else
cond_resched();
try_to_freeze();
--
2.40.1
svc threads are currently stopped using kthread_stop(). This requires
identifying a specific thread. However we don't care which thread
stops, just as long as one does.
So instead, set a flag in the svc_pool to say that a thread needs to
die, and have each thread check this flag instead of calling
kthread_should_stop(). The first thread to find the flag set clears it
and moves towards shutting down.
This removes an explicit dependency on sp_all_threads which will make a
future patch simpler.
Signed-off-by: NeilBrown <[email protected]>
---
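(Not part of the patch: the victim handshake in outline, using only
calls the diff below introduces.)

	/* controller: ask one thread in 'pool' to exit, then wait */
	set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
	set_bit(SP_NEED_VICTIM, &pool->sp_flags);
	svc_pool_wake_idle_thread(pool);
	wait_on_bit(&pool->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE);

	/* each svc thread: the first to see SP_NEED_VICTIM claims it */
	if (test_and_clear_bit(SP_NEED_VICTIM, &rqstp->rq_pool->sp_flags))
		set_bit(RQ_VICTIM, &rqstp->rq_flags);
	if (test_bit(RQ_VICTIM, &rqstp->rq_flags))
		svc_exit_thread(rqstp);
	/* svc_exit_thread() finishes with
	 * clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags),
	 * releasing the waiting controller.
	 */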
fs/lockd/svc.c | 5 ++---
fs/lockd/svclock.c | 5 ++---
fs/nfs/callback.c | 2 +-
fs/nfsd/nfs4proc.c | 8 ++++---
fs/nfsd/nfssvc.c | 2 +-
include/linux/lockd/lockd.h | 2 +-
include/linux/sunrpc/svc.h | 22 ++++++++++++++++++-
net/sunrpc/svc.c | 43 ++++++++++++++++++-------------------
net/sunrpc/svc_xprt.c | 7 +++---
9 files changed, 57 insertions(+), 39 deletions(-)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6579948070a4..b441c706c2b8 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
#include <linux/uio.h>
#include <linux/smp.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/inetdevice.h>
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nlm_max_connections;
- nlmsvc_retry_blocked();
+ nlmsvc_retry_blocked(rqstp);
svc_recv(rqstp);
}
if (nlmsvc_ops)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..5fea06555f42 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -1020,13 +1019,13 @@ retry_deferred_block(struct nlm_block *block)
* be retransmitted.
*/
void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
{
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
spin_lock(&nlm_blocked_lock);
- while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+ while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
if (block->b_when == NLM_NEVER)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 42a0c2f1e785..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_should_stop())
+ while (!svc_thread_should_stop(rqstp))
svc_recv(rqstp);
svc_exit_thread(rqstp);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index aa4f21f92deb..669b16348571 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1340,7 +1340,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
* setup a work entry in the ssc delayed unmount list.
*/
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
- struct nfsd4_ssc_umount_item **nsui)
+ struct nfsd4_ssc_umount_item **nsui,
+ struct svc_rqst *rqstp)
{
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
@@ -1362,7 +1363,7 @@ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
spin_unlock(&nn->nfsd_ssc_lock);
/* allow 20secs for mount/unmount for now - revisit */
- if (kthread_should_stop() ||
+ if (svc_thread_should_stop(rqstp) ||
(schedule_timeout(20*HZ) == 0)) {
finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
@@ -1478,7 +1479,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
- status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
if (status)
goto out_free_devname;
if ((*nsui)->nsui_vfsmount)
@@ -1653,6 +1654,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
if (bytes_total == 0)
bytes_total = ULLONG_MAX;
do {
+ /* Only async copies can be stopped here */
if (kthread_should_stop())
break;
bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1582af33e204..062f51fe4dfb 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -957,7 +957,7 @@ nfsd(void *vrqstp)
/*
* The main request loop
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* Update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nn->max_connections;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0f016d69c996..9f565416d186 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -282,7 +282,7 @@ __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
struct nlm_host *, struct nlm_lock *,
struct nlm_lock *, struct nlm_cookie *);
__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *);
-void nlmsvc_retry_blocked(void);
+void nlmsvc_retry_blocked(struct svc_rqst *rqstp);
void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
nlm_host_match_fn_t match);
void nlmsvc_grant_reply(struct nlm_cookie *, __be32);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5e2d3b83999e..1ac6f74781aa 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,6 +50,8 @@ struct svc_pool {
enum {
SP_TASK_PENDING, /* still work to do even if no xprt is queued */
SP_CONGESTED, /* all threads are busy, none idle */
+ SP_NEED_VICTIM, /* One thread needs to agree to exit */
+ SP_VICTIM_REMAINS, /* One thread needs to actually exit */
};
@@ -259,7 +261,7 @@ enum {
RQ_DROPME, /* drop current reply */
RQ_SPLICE_OK, /* turned off in gss privacy to prevent
* encrypting page cache pages */
- RQ_VICTIM, /* about to be shut down */
+ RQ_VICTIM, /* Have agreed to shut down */
RQ_BUSY, /* request is busy */
RQ_DATA, /* request has data */
};
@@ -299,6 +301,24 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst)
return (struct sockaddr *) &rqst->rq_daddr;
}
+/**
+ * svc_thread_should_stop - check if this thread should stop
+ * @rqstp: the thread that might need to stop
+ *
+ * To stop an svc thread, the pool flags SP_NEED_VICTIM and SP_VICTIM_REMAINS
+ * are set. The first thread to see SP_NEED_VICTIM clears it, becoming
+ * the victim using this function. It should then promptly call
+ * svc_exit_thread() which completes the process, clearing SP_VICTIM_REMAINS
+ * so the task waiting for a thread to exit can wake and continue.
+ */
+static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
+{
+ if (test_and_clear_bit(SP_NEED_VICTIM, &rqstp->rq_pool->sp_flags))
+ set_bit(RQ_VICTIM, &rqstp->rq_flags);
+
+ return test_bit(RQ_VICTIM, &rqstp->rq_flags);
+}
+
struct svc_deferred_req {
u32 prot; /* protocol (UDP or TCP) */
struct svc_xprt *xprt;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 0c3272f1b3b6..1233d72714b9 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -725,19 +725,22 @@ svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools];
}
-static struct task_struct *
+static struct svc_pool *
svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
{
unsigned int i;
- struct task_struct *task = NULL;
if (pool != NULL) {
spin_lock_bh(&pool->sp_lock);
+ if (pool->sp_nrthreads)
+ goto found_pool;
+ spin_unlock_bh(&pool->sp_lock);
+ return NULL;
} else {
for (i = 0; i < serv->sv_nrpools; i++) {
pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
spin_lock_bh(&pool->sp_lock);
- if (!list_empty(&pool->sp_all_threads))
+ if (pool->sp_nrthreads)
goto found_pool;
spin_unlock_bh(&pool->sp_lock);
}
@@ -745,16 +748,10 @@ svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *stat
}
found_pool:
- if (!list_empty(&pool->sp_all_threads)) {
- struct svc_rqst *rqstp;
-
- rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
- set_bit(RQ_VICTIM, &rqstp->rq_flags);
- list_del_rcu(&rqstp->rq_all);
- task = rqstp->rq_task;
- }
+ set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+ set_bit(SP_NEED_VICTIM, &pool->sp_flags);
spin_unlock_bh(&pool->sp_lock);
- return task;
+ return pool;
}
static int
@@ -795,18 +792,16 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
static int
svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
- struct svc_rqst *rqstp;
- struct task_struct *task;
unsigned int state = serv->sv_nrthreads-1;
+ struct svc_pool *victim;
do {
- task = svc_pool_victim(serv, pool, &state);
- if (task == NULL)
+ victim = svc_pool_victim(serv, pool, &state);
+ if (!victim)
break;
- rqstp = kthread_data(task);
- /* Did we lose a race to svo_function threadfn? */
- if (kthread_stop(task) == -EINTR)
- svc_exit_thread(rqstp);
+ svc_pool_wake_idle_thread(victim);
+ wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS,
+ TASK_IDLE);
nrservs++;
} while (nrservs < 0);
return 0;
@@ -926,8 +921,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
spin_lock_bh(&pool->sp_lock);
pool->sp_nrthreads--;
- if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
- list_del_rcu(&rqstp->rq_all);
+ list_del_rcu(&rqstp->rq_all);
spin_unlock_bh(&pool->sp_lock);
spin_lock_bh(&serv->sv_lock);
@@ -938,6 +932,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
svc_rqst_free(rqstp);
svc_put(serv);
+ /* That svc_put() cannot be the last, because the thread
+ * waiting for SP_VICTIM_REMAINS to clear must hold
+ * a reference. So it is still safe to access pool.
+ */
+ clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
}
EXPORT_SYMBOL_GPL(svc_exit_thread);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 39582ac33cbd..0a300ae6a7ed 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -9,7 +9,6 @@
#include <linux/sched/mm.h>
#include <linux/errno.h>
#include <linux/freezer.h>
-#include <linux/kthread.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/sunrpc/addr.h>
@@ -675,7 +674,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
continue;
set_current_state(TASK_IDLE);
- if (kthread_should_stop()) {
+ if (svc_thread_should_stop(rqstp)) {
set_current_state(TASK_RUNNING);
return false;
}
@@ -713,7 +712,7 @@ rqst_should_sleep(struct svc_rqst *rqstp)
return false;
/* are we shutting down? */
- if (kthread_should_stop())
+ if (svc_thread_should_stop(rqstp))
return false;
/* are we freezing? */
@@ -857,7 +856,7 @@ void svc_recv(struct svc_rqst *rqstp)
clear_bit(SP_TASK_PENDING, &pool->sp_flags);
- if (kthread_should_stop())
+ if (svc_thread_should_stop(rqstp))
return;
rqstp->rq_xprt = svc_xprt_dequeue(pool);
--
2.40.1
We can tell if a pool is congested by checking if the idle list is
empty. We don't need a separate flag.
Signed-off-by: NeilBrown <[email protected]>
---
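(Not part of the patch: with SP_CONGESTED gone, "congested" is simply
"no idle threads". A hypothetical helper would make the equivalence
explicit; the patch open-codes the test in svc_recv().)

	static inline bool svc_pool_congested(const struct svc_pool *pool)
	{
		/* no thread is parked on the idle list */
		return list_empty(&pool->sp_idle_threads);
	}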
include/linux/sunrpc/svc.h | 1 -
net/sunrpc/svc.c | 1 -
net/sunrpc/svc_xprt.c | 4 +---
3 files changed, 1 insertion(+), 5 deletions(-)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 8b93af92dd53..5ea76d432015 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,7 +50,6 @@ struct svc_pool {
/* bits for sp_flags */
enum {
SP_TASK_PENDING, /* still work to do even if no xprt is queued */
- SP_CONGESTED, /* all threads are busy, none idle */
SP_NEED_VICTIM, /* One thread needs to agree to exit */
SP_VICTIM_REMAINS, /* One thread needs to actually exit */
};
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index dce433dea1bd..5df7b4353320 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -718,7 +718,6 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
}
rcu_read_unlock();
- set_bit(SP_CONGESTED, &pool->sp_flags);
}
EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e44efcc21b63..dd60e5810cdb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -735,8 +735,6 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
if (rqst_should_sleep(rqstp)) {
set_current_state(TASK_IDLE);
- smp_mb__before_atomic();
- clear_bit(SP_CONGESTED, &pool->sp_flags);
spin_lock_bh(&pool->sp_lock);
list_add(&rqstp->rq_idle, &pool->sp_idle_threads);
spin_unlock_bh(&pool->sp_lock);
@@ -873,7 +871,7 @@ void svc_recv(struct svc_rqst *rqstp)
/* Normally we will wait up to 5 seconds for any required
* cache information to be provided.
*/
- if (!test_bit(SP_CONGESTED, &pool->sp_flags))
+ if (!list_empty(&pool->sp_idle_threads))
rqstp->rq_chandle.thread_wait = 5 * HZ;
else
rqstp->rq_chandle.thread_wait = 1 * HZ;
--
2.40.1
svc_handle_xprt() does lots of things itself, but leaves some to the
caller - svc_recv(). This isn't elegant.
Move that code out of svc_recv() into svc_handle_xprt().
Move the calls to svc_xprt_release() out of svc_send() and svc_drop()
(the two possible final steps in svc_process()), and out of svc_recv() (in
the case where svc_process() wasn't called), into svc_handle_xprt().
Signed-off-by: NeilBrown <[email protected]>
---
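(Not part of the patch: the resulting control flow in outline.
svc_send() and svc_drop() no longer release the xprt; svc_handle_xprt()
now always ends by doing so itself.)

	svc_recv()
	  svc_get_next_xprt()
	  svc_handle_xprt()
	    xpo_recvfrom()
	    svc_process()		/* only if data was received */
	      svc_send() or svc_drop()	/* no svc_xprt_release() here */
	    svc_xprt_release()		/* reached on every path */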
net/sunrpc/svc_xprt.c | 53 ++++++++++++++++---------------------------
1 file changed, 20 insertions(+), 33 deletions(-)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4cfe9640df48..60759647fee4 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -785,7 +785,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt
svc_xprt_received(newxpt);
}
-static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
{
struct svc_serv *serv = rqstp->rq_server;
int len = 0;
@@ -826,11 +826,26 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+ if (len <= 0)
+ goto out;
+
+ trace_svc_xdr_recvfrom(&rqstp->rq_arg);
+
+ clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+ rqstp->rq_chandle.defer = svc_defer;
+
+ if (serv->sv_stats)
+ serv->sv_stats->netcnt++;
+ percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived);
+ rqstp->rq_stime = ktime_get();
+ svc_process(rqstp);
} else
svc_xprt_received(xprt);
out:
- return len;
+ rqstp->rq_res.len = 0;
+ svc_xprt_release(rqstp);
}
/**
@@ -844,11 +859,9 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
void svc_recv(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = NULL;
- struct svc_serv *serv = rqstp->rq_server;
- int len;
if (!svc_alloc_arg(rqstp))
- goto out;
+ return;
try_to_freeze();
cond_resched();
@@ -856,31 +869,9 @@ void svc_recv(struct svc_rqst *rqstp)
goto out;
xprt = svc_get_next_xprt(rqstp);
- if (!xprt)
- goto out;
-
- len = svc_handle_xprt(rqstp, xprt);
-
- /* No data, incomplete (TCP) read, or accept() */
- if (len <= 0)
- goto out_release;
-
- trace_svc_xdr_recvfrom(&rqstp->rq_arg);
-
- clear_bit(XPT_OLD, &xprt->xpt_flags);
-
- rqstp->rq_chandle.defer = svc_defer;
-
- if (serv->sv_stats)
- serv->sv_stats->netcnt++;
- percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived);
- rqstp->rq_stime = ktime_get();
- svc_process(rqstp);
+ if (xprt)
+ svc_handle_xprt(rqstp, xprt);
out:
- return;
-out_release:
- rqstp->rq_res.len = 0;
- svc_xprt_release(rqstp);
}
EXPORT_SYMBOL_GPL(svc_recv);
@@ -890,7 +881,6 @@ EXPORT_SYMBOL_GPL(svc_recv);
void svc_drop(struct svc_rqst *rqstp)
{
trace_svc_drop(rqstp);
- svc_xprt_release(rqstp);
}
EXPORT_SYMBOL_GPL(svc_drop);
@@ -906,8 +896,6 @@ void svc_send(struct svc_rqst *rqstp)
int status;
xprt = rqstp->rq_xprt;
- if (!xprt)
- return;
/* calculate over-all length */
xb = &rqstp->rq_res;
@@ -920,7 +908,6 @@ void svc_send(struct svc_rqst *rqstp)
status = xprt->xpt_ops->xpo_sendto(rqstp);
trace_svc_send(rqstp, status);
- svc_xprt_release(rqstp);
}
/*
--
2.40.1
Using svc_recv() for (NFSv4.1) back-channel handling means we have just
one mechanism for waking threads.
Also change kthread_freezable_should_stop() in nfs4_callback_svc() to
kthread_should_stop() as used elsewhere.
kthread_freezable_should_stop() effectively adds a try_to_freeze() call,
and svc_recv() already contains that at an appropriate place.
Signed-off-by: NeilBrown <[email protected]>
---
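(Not part of the patch: the new back-channel wake path in outline, as
implemented by the diff below.)

	/* producer, e.g. xprt_complete_bc_request(): queue the request,
	 * then wake one thread on the first (usually only) pool.
	 */
	spin_lock(&bc_serv->sv_cb_lock);
	list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
	spin_unlock(&bc_serv->sv_cb_lock);
	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);

	/* consumer: rqst_should_sleep() now returns false while
	 * sv_cb_list is non-empty, so svc_recv() stays awake, dequeues
	 * the rpc_rqst under sv_cb_lock, and hands it to
	 * bc_svc_process().
	 */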
fs/nfs/callback.c | 46 ++-----------------------------
include/linux/sunrpc/svc.h | 2 --
net/sunrpc/backchannel_rqst.c | 8 ++----
net/sunrpc/svc.c | 2 +-
net/sunrpc/svc_xprt.c | 32 +++++++++++++++++++++
net/sunrpc/xprtrdma/backchannel.c | 2 +-
6 files changed, 39 insertions(+), 53 deletions(-)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..42a0c2f1e785 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_freezable_should_stop(NULL))
+ while (!kthread_should_stop())
svc_recv(rqstp);
svc_exit_thread(rqstp);
@@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp)
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
- struct svc_rqst *rqstp = vrqstp;
- struct svc_serv *serv = rqstp->rq_server;
- struct rpc_rqst *req;
- int error;
- DEFINE_WAIT(wq);
-
- set_freezable();
-
- while (!kthread_freezable_should_stop(NULL)) {
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
- spin_lock_bh(&serv->sv_cb_lock);
- if (!list_empty(&serv->sv_cb_list)) {
- req = list_first_entry(&serv->sv_cb_list,
- struct rpc_rqst, rq_bc_list);
- list_del(&req->rq_bc_list);
- spin_unlock_bh(&serv->sv_cb_lock);
- finish_wait(&serv->sv_cb_waitq, &wq);
- dprintk("Invoking bc_svc_process()\n");
- error = bc_svc_process(serv, req, rqstp);
- dprintk("bc_svc_process() returned w/ error code= %d\n",
- error);
- } else {
- spin_unlock_bh(&serv->sv_cb_lock);
- if (!kthread_should_stop())
- schedule();
- finish_wait(&serv->sv_cb_waitq, &wq);
- }
- }
-
- svc_exit_thread(rqstp);
- return 0;
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
cb_info->users);
threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
- if (minorversion)
- threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
if (minorversion)
return ERR_PTR(-ENOTSUPP);
#endif
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dbf5b21feafe..5e2d3b83999e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -92,8 +92,6 @@ struct svc_serv {
* that arrive over the same
* connection */
spinlock_t sv_cb_lock; /* protects the svc_cb_list */
- wait_queue_head_t sv_cb_waitq; /* sleep here if there are no
- * entries in the svc_cb_list */
bool sv_bc_enabled; /* service uses backchannel */
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 65a6c6429a53..44b7c89a635f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -349,10 +349,8 @@ struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
}
/*
- * Add callback request to callback list. The callback
- * service sleeps on the sv_cb_waitq waiting for new
- * requests. Wake it up after adding enqueing the
- * request.
+ * Add callback request to callback list. Wake a thread
+ * on the first pool (usually the only pool) to handle it.
*/
void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
{
@@ -371,6 +369,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
xprt_get(xprt);
spin_lock(&bc_serv->sv_cb_lock);
list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
- wake_up(&bc_serv->sv_cb_waitq);
spin_unlock(&bc_serv->sv_cb_lock);
+ svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index dc21e6c732db..0c3272f1b3b6 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -440,7 +440,6 @@ __svc_init_bc(struct svc_serv *serv)
{
INIT_LIST_HEAD(&serv->sv_cb_list);
spin_lock_init(&serv->sv_cb_lock);
- init_waitqueue_head(&serv->sv_cb_waitq);
}
#else
static void
@@ -718,6 +717,7 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
set_bit(SP_CONGESTED, &pool->sp_flags);
}
+EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
static struct svc_pool *
svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f1d64ded89fb..39582ac33cbd 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -17,6 +17,7 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <trace/events/sunrpc.h>
@@ -719,6 +720,13 @@ rqst_should_sleep(struct svc_rqst *rqstp)
if (freezing(current))
return false;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (svc_is_backchannel(rqstp)) {
+ if (!list_empty(&rqstp->rq_server->sv_cb_list))
+ return false;
+ }
+#endif
+
return true;
}
@@ -867,6 +875,30 @@ void svc_recv(struct svc_rqst *rqstp)
trace_svc_xprt_dequeue(rqstp);
svc_handle_xprt(rqstp, xprt);
}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (svc_is_backchannel(rqstp)) {
+ struct svc_serv *serv = rqstp->rq_server;
+ struct rpc_rqst *req;
+
+ spin_lock_bh(&serv->sv_cb_lock);
+ req = list_first_entry_or_null(&serv->sv_cb_list,
+ struct rpc_rqst, rq_bc_list);
+ if (req) {
+ int error;
+
+ list_del(&req->rq_bc_list);
+ spin_unlock_bh(&serv->sv_cb_lock);
+
+ dprintk("Invoking bc_svc_process()\n");
+ error = bc_svc_process(rqstp->rq_server, req, rqstp);
+ dprintk("bc_svc_process() returned w/ error code= %d\n",
+ error);
+ return;
+ }
+ spin_unlock_bh(&serv->sv_cb_lock);
+ }
+#endif
}
EXPORT_SYMBOL_GPL(svc_recv);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index e4d84a13c566..bfc434ec52a7 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -267,7 +267,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
spin_unlock(&bc_serv->sv_cb_lock);
- wake_up(&bc_serv->sv_cb_waitq);
+ svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
r_xprt->rx_stats.bcall_count++;
return;
--
2.40.1
On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> svc_get_next_xprt() does a lot more than just get an xprt. It also
> decides if it needs to sleep, depending not only on the availability of
> xprts but also on the need to exit or handle external work.
>
> So rename it to svc_rqst_wait_for_work() and only do the testing and
> waiting. Move all the waiting-related code out of svc_recv() into the
> new svc_rqst_wait_for_work().
>
> Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
>
> Previously svc_xprt_dequeue() would be called twice, once before waiting
> and possibly once after. Now instead rqst_should_sleep() is called
> twice: once to decide if waiting is needed, and once again after
> setting the task state, to see if we might have missed a wakeup.
>
> Signed-off-by: NeilBrown <[email protected]>
I've tested and applied this one and the previous one to the thread
scheduling branch, with a few minor fix-ups. Apologies for how long
this took, I'm wrestling with a SATA/block bug on the v6.6 test
system that is being very sassy and hard to nail down.
I need to dive into the backchannel patch next. I'm trying to think
of how I want to test that one.
> ---
> [... diff snipped ...]
--
Chuck Lever
On Fri, 04 Aug 2023, Chuck Lever wrote:
> On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> > svc_get_next_xprt() does a lot more than just get an xprt. It also
> > decides if it needs to sleep, depending not only on the availability of
> > xprts but also on the need to exit or handle external work.
> >
> > So rename it to svc_rqst_wait_for_work() and only do the testing and
> > waiting. Move all the waiting-related code out of svc_recv() into the
> > new svc_rqst_wait_for_work().
> >
> > Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
> >
> > Previously svc_xprt_dequeue() would be called twice, once before waiting
> > and possibly once after. Now instead rqst_should_sleep() is called
> > twice: once to decide if waiting is needed, and once again after
> > setting the task state, to see if we might have missed a wakeup.
> >
> > Signed-off-by: NeilBrown <[email protected]>
>
> I've tested and applied this one and the previous one to the thread
> scheduling branch, with a few minor fix-ups. Apologies for how long
> this took, I'm wrestling with a SATA/block bug on the v6.6 test
> system that is being very sassy and hard to nail down.
I'm happy that we are making progress and the series is getting improved
along the way. Good luck with the block bug.
>
> I need to dive into the backchannel patch next. I'm trying to think
> of how I want to test that one.
>
I think lock-grant callbacks should be easiest to trigger.
However I just tried mounting the filesystem twice with nosharecache,
and then locking that same file on both mounts. I expected one to block
and hoped I would see the callback when the first lock was dropped.
However the second lock didn't block! That's a bug.
I haven't dug very deep yet, but I think the client is getting a
delegation for the first open (O_RDWR) so the server doesn't see the
lock.
Then when the lock arrives on the second mount, there is no conflicting
lock and the write delegation maybe isn't treated as a conflict?
I'll try to look more early next week.
NeilBrown
On Fri, 04 Aug 2023, NeilBrown wrote:
> On Fri, 04 Aug 2023, Chuck Lever wrote:
> > On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> > > svc_get_next_xprt() does a lot more than just get an xprt. It also
> > > decides if it needs to sleep, depending not only on the availability of
> > > xprts but also on the need to exit or handle external work.
> > >
> > > So rename it to svc_rqst_wait_for_work() and only do the testing and
> > > waiting. Move all the waiting-related code out of svc_recv() into the
> > > new svc_rqst_wait_for_work().
> > >
> > > Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
> > >
> > > Previously svc_xprt_dequeue() would be called twice, once before waiting
> > > and possibly once after. Now instead rqst_should_sleep() is called
> > > twice: once to decide if waiting is needed, and once again after
> > > setting the task state, to see if we might have missed a wakeup.
> > >
> > > Signed-off-by: NeilBrown <[email protected]>
> >
> > I've tested and applied this one and the previous one to the thread
> > scheduling branch, with a few minor fix-ups. Apologies for how long
> > this took, I'm wrestling with a SATA/block bug on the v6.6 test
> > system that is being very sassy and hard to nail down.
>
> I'm happy that we are making progress and the series is getting improved
> along the way. Good luck with the block bug.
>
> >
> > I need to dive into the backchannel patch next. I'm trying to think
> > of how I want to test that one.
> >
>
> I think lock-grant callbacks should be easiest to trigger.
> However I just tried mounting the filesystem twice with nosharecache,
> and then locking that same file on both mounts. I expected one to block
> and hoped I would see the callback when the first lock was dropped.
> However the second lock didn't block! That's a bug.
> I haven't dug very deep yet, but I think the client is getting a
> delegation for the first open (O_RDWR) so the server doesn't see the
> lock.
> Then when the lock arrives on the second mount, there is no conflicting
> lock and the write delegation maybe isn't treated as a conflict?
>
> I'll try to look more early next week.
The bug appears to be client-side.
When I mount the same filesystem twice using nosharecache the client
only creates a single session. One of the mounts opens the file and
gets a write delegation. The other mount opens the same file but this
doesn't trigger a delegation recall because the server, seeing the same
session, thinks it is the same client. But the client is caching the
two mounts separately and not sharing the delegation.
So when the second mount locks the file the server allows it, even
though the first mount thinks it holds a lock thanks to the delegation.
I think nosharecache needs to use a separate identity and create a
separate session.
I repeated the test using network namespaces to create genuinely
separate clients, so the server now sees two clients opening the same file
and trying to lock it. I now see both CB_RECALL and CB_NOTIFY_LOCK
callbacks being handled correctly.
NeilBrown
On Wed, Aug 02, 2023 at 05:34:40PM +1000, NeilBrown wrote:
> Using svc_recv() for (NFSv4.1) back-channel handling means we have just
> one mechanism for waking threads.
>
> Also change kthread_freezable_should_stop() in nfs4_callback_svc() to
> kthread_should_stop() as used elsewhere.
> kthread_freezable_should_stop() effectively adds a try_to_freeze() call,
> and svc_recv() already contains that at an appropriate place.
>
> Signed-off-by: NeilBrown <[email protected]>
This one needs review and ack from the client maintainers.
Anna/Trond, if you haven't been following along, we're working on
updating the server-side thread scheduler. This patch replaces the
ad hoc scheduler in the client's NFSv4 callback service so that all
kernel RPC services utilize the same thread scheduling mechanism.
The whole series so far appears in the topic-sunrpc-thread-scheduling
branch in the public nfsd repo:
https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
> ---
> [... diff snipped ...]
--
Chuck Lever
> On Aug 6, 2023, at 7:07 PM, NeilBrown <[email protected]> wrote:
>
> On Fri, 04 Aug 2023, NeilBrown wrote:
>> On Fri, 04 Aug 2023, Chuck Lever wrote:
>>> On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
>>>> svc_get_next_xprt() does a lot more than just get an xprt. It also
>>>> decides if it needs to sleep, depending not only on the availability of
>>>> xprts but also on the need to exit or handle external work.
>>>>
>>>> So rename it to svc_rqst_wait_for_work() and only do the testing and
>>>> waiting. Move all the waiting-related code out of svc_recv() into the
>>>> new svc_rqst_wait_for_work().
>>>>
>>>> Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
>>>>
>>>> Previously svc_xprt_dequeue() would be called twice, once before waiting
>>>> and possibly once after. Now instead rqst_should_sleep() is called
>>>> twice: once to decide if waiting is needed, and once again after
>>>> setting the task state, to see if we might have missed a wakeup.
>>>>
>>>> Signed-off-by: NeilBrown <[email protected]>
>>>
>>> I've tested and applied this one and the previous one to the thread
>>> scheduling branch, with a few minor fix-ups. Apologies for how long
>>> this took, I'm wrestling with a SATA/block bug on the v6.6 test
>>> system that is being very sassy and hard to nail down.
>>
>> I'm happy that we are making progress and the series is getting improved
>> along the way. Good luck with the block bug.
>>
>>>
>>> I need to dive into the backchannel patch next. I'm trying to think
>>> of how I want to test that one.
>>>
>>
>> I think lock-grant callbacks should be easiest to trigger.
>> However I just tried mounting the filesystem twice with nosharecache,
>> and then locking that same file on both mounts. I expected one to block
>> and hoped I would see the callback when the first lock was dropped.
>> However the second lock didn't block! That's a bug.
>> I haven't dug very deep yet, but I think the client is getting a
>> delegation for the first open (O_RDWR) so the server doesn't see the
>> lock.
>> Then when the lock arrives on the second mount, there is no conflicting
>> lock and the write delegation maybe isn't treated as a conflict?
>>
>> I'll try to look more early next week.
>
> The bug appears to be client-side.
> When I mount the same filesystem twice using nosharecache the client
> only creates a single session. One of the mounts opens the file and
> gets a write delegation. The other mount opens the same file but this
> doesn't trigger a delegation recall because the server, seeing the same
> session, thinks it is the same client. But the client is caching the
> two mounts separately and not sharing the delegation.
> So when the second mount locks the file the server allows it, even
> though the first mount thinks it holds a lock thanks to the delegation.
>
> I think nosharecache needs to use a separate identity and create a
> separate session.
>
> I repeated the test using network namespaces to create genuinely
> separate clients, so the server now sees two clients opening the same file
> and trying to lock it. I now see both CB_RECALL and CB_NOTIFY_LOCK
> callbacks being handled correctly.
Thanks for these results. I'll apply this one to the topic branch,
but I'd like to get client review/ack for it first.
Meanwhile, I've applied the rpc_status patches to nfsd-next, and
pulled in the first half of topic-sunrpc-thread-scheduling for
v6.6.
--
Chuck Lever