From: Greg Banks <gnb@melbourne.sgi.com>
Subject: [PATCH 010 of 11] knfsd: make pools numa aware
Date: Tue, 25 Jul 2006 15:16:58 +1000
Message-ID: <1153804618.21040.25.camel@hole.melbourne.sgi.com>
To: Neil Brown
Cc: Linux NFS Mailing List <nfs@lists.sourceforge.net>

knfsd: Actually implement multiple pools.  On NUMA machines, allocate
a svc_pool per NUMA node; on SMP a svc_pool per CPU; otherwise a
single global pool.  Enqueue sockets on the svc_pool corresponding to
the CPU on which the socket bh is run (i.e. the NIC interrupt CPU).
Threads have their cpu mask set to limit them to the CPUs in the
svc_pool that owns them.

This is the patch that allows an Altix to scale NFS traffic linearly
beyond 4 CPUs and 4 NICs.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
---
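
To illustrate the mapping the heuristic builds (this sketch is not
part of the patch): a small userspace model of mode 2 on a made-up
box with 2 nodes of 2 cpus each.  The cpu_to_node[] topology below
is invented for the example.

#include <stdio.h>

#define NR_CPUS		4
#define NR_NODES	2

/* made-up topology: which node each cpu lives on */
static const int cpu_to_node[NR_CPUS] = { 0, 0, 1, 1 };

static int node_to_pool[NR_NODES];

int main(void)
{
	int node, cpu, npools = 0;

	/* mode 2: one pool per numa node, as svc_pool_map_init() does */
	for (node = 0; node < NR_NODES; node++)
		node_to_pool[node] = npools++;

	/* what svc_pool_for_cpu() resolves for each interrupt cpu */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> node%d -> pool%d\n",
		       cpu, cpu_to_node[cpu],
		       node_to_pool[cpu_to_node[cpu]]);
	return 0;
}

On such a box a socket whose rx bh fires on cpu2 or cpu3 is enqueued
on pool 1, and only the nfsd threads whose cpu mask binds them to
node 1 will pick it up, which is what keeps the traffic node-local.
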
 include/linux/sunrpc/svc.h |   62 +++++++++++
 net/sunrpc/svc.c           |  184 +++++++++++++++++++++++++++++++++-
 net/sunrpc/svcsock.c       |    7 +
 3 files changed, 251 insertions(+), 2 deletions(-)

Index: linus-git/net/sunrpc/svc.c
===================================================================
--- linus-git.orig/net/sunrpc/svc.c	2006-07-24 22:16:36.157203063 +1000
+++ linus-git/net/sunrpc/svc.c	2006-07-24 22:54:13.557820093 +1000
@@ -4,6 +4,10 @@
  * High-level RPC service routines
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple thread pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
  */
 
 #include <linux/linkage.h>
@@ -24,6 +28,161 @@
 #define RPCDBG_FACILITY	RPCDBG_SVCDSP
 #define RPC_PARANOIA 1
 
+
+#if SVC_HAVE_MULTIPLE_POOLS
+
+struct svc_pool_map svc_pool_map = { .mode = -1, .init = 0 };
+
+/*
+ * Build the global map of cpus to pools and vice versa.
+ */
+static unsigned int
+svc_pool_map_init(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node;
+	unsigned int cpu;
+	unsigned int pidx = 0;
+	unsigned int maxpools;
+
+	if (m->init)
+		return m->npools;
+	m->init = 1;
+
+	if (m->mode < 0) {
+		/*
+		 * Detect best pool mapping mode heuristically.
+		 */
+		m->mode = 0;	/* default: one global pool */
+#ifdef CONFIG_NUMA
+		if (num_online_nodes() > 1) {
+			/*
+			 * Actually have multiple NUMA nodes,
+			 * so split pools on NUMA node boundaries
+			 */
+			m->mode = 2;
+		} else {
+			node = any_online_node(node_online_map);
+			if (nr_cpus_node(node) > 2) {
+				/*
+				 * Apparently we're running with CONFIG_NUMA
+				 * on non-NUMA hardware, e.g. with a generic
+				 * x86_64 kernel on Xeons.  In this case we
+				 * want to divide the pools on cpu boundaries.
+				 */
+				m->mode = 1;
+			}
+		}
+#else
+		if (num_online_cpus() > 1) {
+			/*
+			 * Plain SMP with multiple CPUs online.
+			 */
+			m->mode = 1;
+		}
+#endif
+	}
+
+	switch (m->mode) {
+	case 0:
+fallback:
+		m->mode = 0;
+		m->npools = 1;
+		printk("nfsd: initialising 1 global pool\n");
+		break;
+
+	case 1:
+		maxpools = num_possible_cpus();
+		m->cpu_to_pool = kcalloc(maxpools, sizeof(unsigned int),
+					 GFP_KERNEL);
+		if (!m->cpu_to_pool)
+			goto fallback;
+		m->pool_to_cpu = kcalloc(maxpools, sizeof(unsigned int),
+					 GFP_KERNEL);
+		if (!m->pool_to_cpu) {
+			kfree(m->cpu_to_pool);
+			goto fallback;
+		}
+		for_each_online_cpu(cpu) {
+			BUG_ON(pidx >= maxpools);
+			m->cpu_to_pool[cpu] = pidx;
+			m->pool_to_cpu[pidx] = cpu;
+			pidx++;
+		}
+		/* cpus brought online later all get mapped to pool0, sorry */
+		m->npools = pidx;
+
+		printk("nfsd: initialising %u pools, one per cpu\n",
+		       m->npools);
+		break;
+
+#ifdef CONFIG_NUMA
+	case 2:
+		maxpools = num_possible_nodes();
+		m->node_to_pool = kcalloc(maxpools, sizeof(unsigned int),
+					  GFP_KERNEL);
+		if (!m->node_to_pool)
+			goto fallback;
+		m->pool_to_node = kcalloc(maxpools, sizeof(unsigned int),
+					  GFP_KERNEL);
+		if (!m->pool_to_node) {
+			kfree(m->node_to_pool);
+			goto fallback;
+		}
+		for_each_node_with_cpus(node) {
+			/* some architectures (e.g. SN2) have cpuless nodes */
+			BUG_ON(pidx >= maxpools);
+			m->node_to_pool[node] = pidx;
+			m->pool_to_node[pidx] = node;
+			pidx++;
+		}
+		/* nodes brought online later all get mapped to pool0, sorry */
+		m->npools = pidx;
+
+		printk("nfsd: initialising %u pools, one per numa node\n",
+		       m->npools);
+		break;
+#endif /* CONFIG_NUMA */
+	}
+
+	return m->npools;
+}
+
+/*
+ * Set the current thread's cpus_allowed mask so that it
+ * will only run on cpus in the given pool.
+ *
+ * Returns 1 and fills in oldmask iff a cpumask was applied.
+ */
+static int
+svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node;
+	unsigned int cpu;
+
+	BUG_ON(!m->init);
+
+	switch (m->mode) {
+	default:
+	case 0:
+		return 0;
+	case 1:
+		cpu = m->pool_to_cpu[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
+		return 1;
+#ifdef CONFIG_NUMA
+	case 2:
+		node = m->pool_to_node[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, node_to_cpumask(node));
+		return 1;
+#endif /* CONFIG_NUMA */
+	}
+}
+
+#endif /* SVC_HAVE_MULTIPLE_POOLS */
+
 /*
  * Create an RPC service
  */
@@ -101,8 +260,13 @@ svc_create_pooled(struct svc_program *pr
 		   svc_thread_fn func, int sig, struct module *mod)
 {
 	struct svc_serv *serv;
+	unsigned int npools = 1;
 
-	serv = __svc_create(prog, bufsize, /*npools*/1);
+#if SVC_HAVE_MULTIPLE_POOLS
+	npools = svc_pool_map_init();
+#endif
+
+	serv = __svc_create(prog, bufsize, npools);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
@@ -202,12 +366,18 @@ svc_release_buffer(struct svc_rqst *rqst
 
 /*
  * Create a thread in the given pool.  Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
  */
 static int
 __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
 		    struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
 	int		error = -ENOMEM;
+#if SVC_HAVE_MULTIPLE_POOLS
+	int		have_oldmask = 0;
+	cpumask_t	oldmask;
+#endif
 
 	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
 	if (!rqstp)
@@ -227,7 +397,19 @@ __svc_create_thread(svc_thread_fn func,
 	spin_unlock_bh(&pool->sp_lock);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
+
+#if SVC_HAVE_MULTIPLE_POOLS
+	if (serv->sv_nrpools > 1)
+		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
+#endif
+
 	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
+
+#if SVC_HAVE_MULTIPLE_POOLS
+	if (have_oldmask)
+		set_cpus_allowed(current, oldmask);
+#endif
+
 	if (error < 0)
 		goto out_thread;
 	svc_sock_update_bufs(serv);

Index: linus-git/net/sunrpc/svcsock.c
===================================================================
--- linus-git.orig/net/sunrpc/svcsock.c	2006-07-24 20:44:46.911435470 +1000
+++ linus-git/net/sunrpc/svcsock.c	2006-07-24 22:45:23.263878219 +1000
@@ -150,8 +150,9 @@ static void
 svc_sock_enqueue(struct svc_sock *svsk)
 {
 	struct svc_serv	*serv = svsk->sk_server;
-	struct svc_pool *pool = &serv->sv_pools[0];
+	struct svc_pool *pool;
 	struct svc_rqst	*rqstp;
+	int cpu;
 
 	if (!(svsk->sk_flags &
 	      ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
@@ -158,6 +159,10 @@
 		return;
 	if (test_bit(SK_DEAD, &svsk->sk_flags))
 		return;
 
+	cpu = get_cpu();
+	pool = svc_pool_for_cpu(svsk->sk_server, cpu);
+	put_cpu();
+
 	spin_lock_bh(&pool->sp_lock);
 	if (!list_empty(&pool->sp_threads) &&

Index: linus-git/include/linux/sunrpc/svc.h
===================================================================
--- linus-git.orig/include/linux/sunrpc/svc.h	2006-07-24 22:16:36.041218126 +1000
+++ linus-git/include/linux/sunrpc/svc.h	2006-07-24 22:45:23.347867112 +1000
@@ -41,6 +41,39 @@ struct svc_pool {
 	struct list_head	sp_all_threads;	/* all server threads */
 } ____cacheline_aligned_in_smp;
 
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
+#define SVC_HAVE_MULTIPLE_POOLS 1
+#else
+#define SVC_HAVE_MULTIPLE_POOLS 0
+#endif
+
+#if SVC_HAVE_MULTIPLE_POOLS
+/*
+ * Global structure for mapping cpus to pools and vice versa.
+ * Set up once during sunrpc initialisation.
+ */
+struct svc_pool_map {
+	/*
+	 * Mode for mapping cpus to pools.
+	 *
+	 * -1 = automatic, choose one of the other modes at boot
+	 *  0 = no mapping, just a single global pool (legacy & UP mode)
+	 *  1 = one pool per cpu
+	 *  2 = one pool per numa node
+	 */
+	int mode;
+	int init;
+	unsigned int npools;
+	unsigned int *pool_to_cpu;
+	unsigned int *cpu_to_pool;
+#ifdef CONFIG_NUMA
+	unsigned int *node_to_pool;
+	unsigned int *pool_to_node;
+#endif /* CONFIG_NUMA */
+};
+#endif /* SVC_HAVE_MULTIPLE_POOLS */
+
+
 /*
  * RPC service.
  *
@@ -360,5 +393,34 @@ int		   svc_process(struct svc_serv *, s
 int		   svc_register(struct svc_serv *, int, unsigned short);
 void		   svc_wake_up(struct svc_serv *);
 void		   svc_reserve(struct svc_rqst *rqstp, int space);
+extern struct svc_pool_map svc_pool_map;
+
+
+static inline struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+#if SVC_HAVE_MULTIPLE_POOLS
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int pidx;
+
+	switch (m->mode) {
+	default:
+	case 0:
+		pidx = 0;
+		break;
+	case 1:
+		pidx = m->cpu_to_pool[cpu];
+		break;
+#ifdef CONFIG_NUMA
+	case 2:
+		pidx = m->node_to_pool[cpu_to_node(cpu)];
+		break;
+#endif /* CONFIG_NUMA */
+	}
+	return &serv->sv_pools[pidx % serv->sv_nrpools];
+#else
+	return &serv->sv_pools[0];
+#endif
+}
+
 #endif /* SUNRPC_SVC_H */

--
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.