The following patches change struct proto.memory_allocated and
proto.sockets_allocated to use per-CPU counters. This patchset also switches
the proto.inuse percpu variable to use alloc_percpu instead of NR_CPUS *
cacheline-size padding.
We saw a 5% improvement in ApacheBench requests per second with this
patchset, on a multi-NIC, 8-way 3.3 GHz IBM x460 Xeon server.
Patches follow.
Thanks,
Kiran
Add percpu_counter_mod_bh so that these counters can be used safely from
both softirq and process context.
Signed-off-by: Pravin B. Shelar <[email protected]>
Signed-off-by: Ravikiran G Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.16-rc1/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.16-rc1.orig/include/linux/percpu_counter.h 2006-01-25 11:53:56.000000000 -0800
+++ linux-2.6.16-rc1/include/linux/percpu_counter.h 2006-01-25 12:09:41.000000000 -0800
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/percpu.h>
+#include <linux/interrupt.h>
#ifdef CONFIG_SMP
@@ -102,4 +103,11 @@ static inline void percpu_counter_dec(st
percpu_counter_mod(fbc, -1);
}
+static inline void percpu_counter_mod_bh(struct percpu_counter *fbc, long amount)
+{
+ local_bh_disable();
+ percpu_counter_mod(fbc, amount);
+ local_bh_enable();
+}
+
#endif /* _LINUX_PERCPU_COUNTER_H */
Change struct proto's memory_allocated field from an atomic_t to a batching
per-CPU counter (percpu_counter). A batching counter is better than a
plain per-CPU counter, as this field is read often.
Signed-off-by: Pravin B. Shelar <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.16-rc1mm3/include/net/sock.h
===================================================================
--- linux-2.6.16-rc1mm3.orig/include/net/sock.h 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/include/net/sock.h 2006-01-25 16:56:59.000000000 -0800
@@ -48,6 +48,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h> /* struct sk_buff */
#include <linux/security.h>
+#include <linux/percpu_counter.h>
#include <linux/filter.h>
@@ -541,8 +542,9 @@ struct proto {
/* Memory pressure */
void (*enter_memory_pressure)(void);
- atomic_t *memory_allocated; /* Current allocated memory. */
+ struct percpu_counter *memory_allocated; /* Current allocated memory. */
atomic_t *sockets_allocated; /* Current number of sockets. */
+
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
Index: linux-2.6.16-rc1mm3/include/net/tcp.h
===================================================================
--- linux-2.6.16-rc1mm3.orig/include/net/tcp.h 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/include/net/tcp.h 2006-01-25 16:56:59.000000000 -0800
@@ -220,7 +220,7 @@ extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_abc;
-extern atomic_t tcp_memory_allocated;
+extern struct percpu_counter tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated;
extern int tcp_memory_pressure;
Index: linux-2.6.16-rc1mm3/net/core/sock.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/core/sock.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/core/sock.c 2006-01-25 16:56:59.000000000 -0800
@@ -1616,12 +1616,13 @@ static char proto_method_implemented(con
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
- seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
+ seq_printf(seq, "%-9s %4u %6d %6lu %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
- proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
+ proto->memory_allocated != NULL ?
+ percpu_counter_read(proto->memory_allocated) : -1,
proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
proto->max_header,
proto->slab == NULL ? "no" : "yes",
Index: linux-2.6.16-rc1mm3/net/core/stream.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/core/stream.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/core/stream.c 2006-01-25 16:56:59.000000000 -0800
@@ -17,6 +17,7 @@
#include <linux/signal.h>
#include <linux/tcp.h>
#include <linux/wait.h>
+#include <linux/percpu.h>
#include <net/sock.h>
/**
@@ -196,11 +197,11 @@ EXPORT_SYMBOL(sk_stream_error);
void __sk_stream_mem_reclaim(struct sock *sk)
{
if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) {
- atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM,
- sk->sk_prot->memory_allocated);
+ percpu_counter_mod_bh(sk->sk_prot->memory_allocated,
+ -(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM));
sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1;
if (*sk->sk_prot->memory_pressure &&
- (atomic_read(sk->sk_prot->memory_allocated) <
+ (percpu_counter_read(sk->sk_prot->memory_allocated) <
sk->sk_prot->sysctl_mem[0]))
*sk->sk_prot->memory_pressure = 0;
}
@@ -213,23 +214,26 @@ int sk_stream_mem_schedule(struct sock *
int amt = sk_stream_pages(size);
sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM;
- atomic_add(amt, sk->sk_prot->memory_allocated);
+ percpu_counter_mod_bh(sk->sk_prot->memory_allocated, amt);
/* Under limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) {
+ if (percpu_counter_read(sk->sk_prot->memory_allocated) <
+ sk->sk_prot->sysctl_mem[0]) {
if (*sk->sk_prot->memory_pressure)
*sk->sk_prot->memory_pressure = 0;
return 1;
}
/* Over hard limit. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) {
+ if (percpu_counter_read(sk->sk_prot->memory_allocated) >
+ sk->sk_prot->sysctl_mem[2]) {
sk->sk_prot->enter_memory_pressure();
goto suppress_allocation;
}
/* Under pressure. */
- if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1])
+ if (percpu_counter_read(sk->sk_prot->memory_allocated) >
+ sk->sk_prot->sysctl_mem[1])
sk->sk_prot->enter_memory_pressure();
if (kind) {
@@ -259,7 +263,7 @@ suppress_allocation:
/* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM;
- atomic_sub(amt, sk->sk_prot->memory_allocated);
+ percpu_counter_mod_bh(sk->sk_prot->memory_allocated, -amt);
return 0;
}
Index: linux-2.6.16-rc1mm3/net/decnet/af_decnet.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/decnet/af_decnet.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/decnet/af_decnet.c 2006-01-25 17:25:15.000000000 -0800
@@ -154,7 +154,7 @@ static const struct proto_ops dn_proto_o
static DEFINE_RWLOCK(dn_hash_lock);
static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
static struct hlist_head dn_wild_sk;
-static atomic_t decnet_memory_allocated;
+static struct percpu_counter decnet_memory_allocated;
static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags);
static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
@@ -2383,7 +2383,8 @@ static int __init decnet_init(void)
rc = proto_register(&dn_proto, 1);
if (rc != 0)
goto out;
-
+
+ percpu_counter_init(&decnet_memory_allocated);
dn_neigh_init();
dn_dev_init();
dn_route_init();
@@ -2424,6 +2425,7 @@ static void __exit decnet_exit(void)
proc_net_remove("decnet");
proto_unregister(&dn_proto);
+ percpu_counter_destroy(&decnet_memory_allocated);
}
module_exit(decnet_exit);
#endif
Index: linux-2.6.16-rc1mm3/net/ipv4/proc.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/ipv4/proc.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/ipv4/proc.c 2006-01-25 16:56:59.000000000 -0800
@@ -61,10 +61,10 @@ static int fold_prot_inuse(struct proto
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
socket_seq_show(seq);
- seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %lu\n",
fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
- atomic_read(&tcp_memory_allocated));
+ percpu_counter_read(&tcp_memory_allocated));
seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
Index: linux-2.6.16-rc1mm3/net/ipv4/tcp.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/ipv4/tcp.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/ipv4/tcp.c 2006-01-25 16:56:59.000000000 -0800
@@ -283,7 +283,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
-atomic_t tcp_memory_allocated; /* Current allocated memory. */
+struct percpu_counter tcp_memory_allocated; /* Current allocated memory. */
atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
EXPORT_SYMBOL(tcp_memory_allocated);
@@ -1593,7 +1593,7 @@ adjudge_to_death:
sk_stream_mem_reclaim(sk);
if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+ percpu_counter_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
printk(KERN_INFO "TCP: too many of orphaned "
"sockets\n");
@@ -2127,6 +2127,8 @@ void __init tcp_init(void)
"(established %d bind %d)\n",
tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
+ percpu_counter_init(&tcp_memory_allocated);
+
tcp_register_congestion_control(&tcp_reno);
}
Index: linux-2.6.16-rc1mm3/net/ipv4/tcp_input.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/ipv4/tcp_input.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/ipv4/tcp_input.c 2006-01-25 16:56:59.000000000 -0800
@@ -333,7 +333,7 @@ static void tcp_clamp_window(struct sock
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ percpu_counter_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
}
@@ -3508,7 +3508,7 @@ static int tcp_should_expand_sndbuf(stru
return 0;
/* If we are under soft global TCP memory pressure, do not expand. */
- if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+ if (percpu_counter_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
return 0;
/* If we filled the congestion window, do not expand. */
Index: linux-2.6.16-rc1mm3/net/ipv4/tcp_timer.c
===================================================================
--- linux-2.6.16-rc1mm3.orig/net/ipv4/tcp_timer.c 2006-01-25 16:25:16.000000000 -0800
+++ linux-2.6.16-rc1mm3/net/ipv4/tcp_timer.c 2006-01-25 16:56:59.000000000 -0800
@@ -80,7 +80,7 @@ static int tcp_out_of_resources(struct s
if (orphans >= sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+ percpu_counter_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
printk(KERN_INFO "Out of socket memory\n");
Change the atomic_t sockets_allocated member of struct proto to a
per-cpu counter.
Signed-off-by: Pravin B. Shelar <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.16-rc1/include/net/sock.h
===================================================================
--- linux-2.6.16-rc1.orig/include/net/sock.h 2006-01-24 14:37:45.000000000 -0800
+++ linux-2.6.16-rc1/include/net/sock.h 2006-01-24 17:35:34.000000000 -0800
@@ -543,7 +543,7 @@ struct proto {
/* Memory pressure */
void (*enter_memory_pressure)(void);
struct percpu_counter *memory_allocated; /* Current allocated memory. */
- atomic_t *sockets_allocated; /* Current number of sockets. */
+ int *sockets_allocated; /* Current number of sockets (percpu counter). */
/*
* Pressure flag: try to collapse.
@@ -579,6 +579,24 @@ struct proto {
} stats[NR_CPUS];
};
+static inline int read_sockets_allocated(struct proto *prot)
+{
+ int total = 0;
+ int cpu;
+ for_each_cpu(cpu)
+ total += *per_cpu_ptr(prot->sockets_allocated, cpu);
+ return total;
+}
+
+static inline void mod_sockets_allocated(int *sockets_allocated, int count)
+{
+ (*per_cpu_ptr(sockets_allocated, get_cpu())) += count;
+ put_cpu();
+}
+
+#define inc_sockets_allocated(c) mod_sockets_allocated(c, 1)
+#define dec_sockets_allocated(c) mod_sockets_allocated(c, -1)
+
extern int proto_register(struct proto *prot, int alloc_slab);
extern void proto_unregister(struct proto *prot);
Index: linux-2.6.16-rc1/include/net/tcp.h
===================================================================
--- linux-2.6.16-rc1.orig/include/net/tcp.h 2006-01-24 14:37:45.000000000 -0800
+++ linux-2.6.16-rc1/include/net/tcp.h 2006-01-24 17:05:34.000000000 -0800
@@ -221,7 +221,7 @@ extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_abc;
extern struct percpu_counter tcp_memory_allocated;
-extern atomic_t tcp_sockets_allocated;
+extern int *tcp_sockets_allocated;
extern int tcp_memory_pressure;
/*
Index: linux-2.6.16-rc1/net/core/sock.c
===================================================================
--- linux-2.6.16-rc1.orig/net/core/sock.c 2006-01-24 14:37:45.000000000 -0800
+++ linux-2.6.16-rc1/net/core/sock.c 2006-01-24 16:47:55.000000000 -0800
@@ -771,7 +771,7 @@ struct sock *sk_clone(const struct sock
newsk->sk_sleep = NULL;
if (newsk->sk_prot->sockets_allocated)
- atomic_inc(newsk->sk_prot->sockets_allocated);
+ inc_sockets_allocated(newsk->sk_prot->sockets_allocated);
}
out:
return newsk;
@@ -1620,7 +1620,7 @@ static void proto_seq_printf(struct seq_
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
- proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
+ proto->sockets_allocated != NULL ? read_sockets_allocated(proto) : -1,
proto->memory_allocated != NULL ?
percpu_counter_read(proto->memory_allocated) : -1,
proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
Index: linux-2.6.16-rc1/net/core/stream.c
===================================================================
--- linux-2.6.16-rc1.orig/net/core/stream.c 2006-01-24 14:37:45.000000000 -0800
+++ linux-2.6.16-rc1/net/core/stream.c 2006-01-24 16:20:22.000000000 -0800
@@ -243,7 +243,7 @@ int sk_stream_mem_schedule(struct sock *
return 1;
if (!*sk->sk_prot->memory_pressure ||
- sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) *
+ sk->sk_prot->sysctl_mem[2] > read_sockets_allocated(sk->sk_prot) *
sk_stream_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc))
Index: linux-2.6.16-rc1/net/ipv4/proc.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/proc.c 2006-01-24 14:37:45.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/proc.c 2006-01-24 16:20:22.000000000 -0800
@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %lu\n",
fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
- tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
+ tcp_death_row.tw_count, read_sockets_allocated(&tcp_prot),
percpu_counter_read(&tcp_memory_allocated));
seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
Index: linux-2.6.16-rc1/net/ipv4/tcp.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/tcp.c 2006-01-24 14:38:37.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/tcp.c 2006-01-24 17:39:25.000000000 -0800
@@ -284,7 +284,7 @@ EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
struct percpu_counter tcp_memory_allocated; /* Current allocated memory. */
-atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
+int *tcp_sockets_allocated; /* Current number of TCP sockets. */
EXPORT_SYMBOL(tcp_memory_allocated);
EXPORT_SYMBOL(tcp_sockets_allocated);
@@ -2055,6 +2055,12 @@ void __init tcp_init(void)
if (!tcp_hashinfo.bind_bucket_cachep)
panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+ tcp_sockets_allocated = alloc_percpu(int);
+ if (!tcp_sockets_allocated)
+ panic("tcp_init: Cannot alloc tcp_sockets_allocated");
+
+ tcp_prot.sockets_allocated = tcp_sockets_allocated;
+
/* Size and allocate the main established and bind bucket
* hash tables.
*
Index: linux-2.6.16-rc1/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/tcp_ipv4.c 2006-01-24 13:52:24.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/tcp_ipv4.c 2006-01-24 16:59:08.000000000 -0800
@@ -1272,7 +1272,7 @@ static int tcp_v4_init_sock(struct sock
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
- atomic_inc(&tcp_sockets_allocated);
+ inc_sockets_allocated(tcp_sockets_allocated);
return 0;
}
@@ -1306,7 +1306,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
- atomic_dec(&tcp_sockets_allocated);
+ dec_sockets_allocated(tcp_sockets_allocated);
return 0;
}
@@ -1814,7 +1814,6 @@ struct proto tcp_prot = {
.unhash = tcp_unhash,
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
- .sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
Index: linux-2.6.16-rc1/net/ipv6/tcp_ipv6.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv6/tcp_ipv6.c 2006-01-24 13:52:24.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv6/tcp_ipv6.c 2006-01-24 17:01:17.000000000 -0800
@@ -1373,7 +1373,7 @@ static int tcp_v6_init_sock(struct sock
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
- atomic_inc(&tcp_sockets_allocated);
+ inc_sockets_allocated(tcp_sockets_allocated);
return 0;
}
@@ -1571,7 +1571,6 @@ struct proto tcpv6_prot = {
.unhash = tcp_unhash,
.get_port = tcp_v6_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
- .sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.orphan_count = &tcp_orphan_count,
@@ -1604,6 +1603,7 @@ static struct inet_protosw tcpv6_protosw
void __init tcpv6_init(void)
{
int err;
+ tcpv6_prot.sockets_allocated = tcp_sockets_allocated;
/* register inet6 protocol */
if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0)
This patch uses alloc_percpu to allocate per-CPU memory for the proto->inuse
field. The inuse field is currently an array of NR_CPUS cacheline-sized
entries -- a big bloat on arches with large cachelines. It also marks some
frequently used protos __read_mostly.
Signed-off-by: Pravin B. Shelar <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.16-rc1/include/net/sock.h
===================================================================
--- linux-2.6.16-rc1.orig/include/net/sock.h 2006-01-25 11:55:46.000000000 -0800
+++ linux-2.6.16-rc1/include/net/sock.h 2006-01-25 11:55:48.000000000 -0800
@@ -573,10 +573,7 @@ struct proto {
#ifdef SOCK_REFCNT_DEBUG
atomic_t socks;
#endif
- struct {
- int inuse;
- u8 __pad[SMP_CACHE_BYTES - sizeof(int)];
- } stats[NR_CPUS];
+ int *inuse;
};
static inline int read_sockets_allocated(struct proto *prot)
@@ -628,12 +625,12 @@ static inline void sk_refcnt_debug_relea
/* Called with local bh disabled */
static __inline__ void sock_prot_inc_use(struct proto *prot)
{
- prot->stats[smp_processor_id()].inuse++;
+ (*per_cpu_ptr(prot->inuse, smp_processor_id())) += 1;
}
static __inline__ void sock_prot_dec_use(struct proto *prot)
{
- prot->stats[smp_processor_id()].inuse--;
+ (*per_cpu_ptr(prot->inuse, smp_processor_id())) -= 1;
}
/* With per-bucket locks this operation is not-atomic, so that
Index: linux-2.6.16-rc1/net/core/sock.c
===================================================================
--- linux-2.6.16-rc1.orig/net/core/sock.c 2006-01-25 11:55:46.000000000 -0800
+++ linux-2.6.16-rc1/net/core/sock.c 2006-01-25 11:57:29.000000000 -0800
@@ -1508,12 +1508,21 @@ int proto_register(struct proto *prot, i
}
}
+ prot->inuse = alloc_percpu(int);
+ if (prot->inuse == NULL) {
+ if (alloc_slab)
+ goto out_free_timewait_sock_slab_name_cache;
+ else
+ goto out;
+ }
write_lock(&proto_list_lock);
list_add(&prot->node, &proto_list);
write_unlock(&proto_list_lock);
rc = 0;
out:
return rc;
+out_free_timewait_sock_slab_name_cache:
+ kmem_cache_destroy(prot->twsk_prot->twsk_slab);
out_free_timewait_sock_slab_name:
kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
@@ -1537,6 +1546,11 @@ void proto_unregister(struct proto *prot
list_del(&prot->node);
write_unlock(&proto_list_lock);
+ if (prot->inuse != NULL) {
+ free_percpu(prot->inuse);
+ prot->inuse = NULL;
+ }
+
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
Index: linux-2.6.16-rc1/net/ipv4/proc.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/proc.c 2006-01-25 11:55:46.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/proc.c 2006-01-25 11:55:48.000000000 -0800
@@ -50,7 +50,7 @@ static int fold_prot_inuse(struct proto
int cpu;
for_each_cpu(cpu)
- res += proto->stats[cpu].inuse;
+ res += (*per_cpu_ptr(proto->inuse, cpu));
return res;
}
Index: linux-2.6.16-rc1/net/ipv4/raw.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/raw.c 2006-01-25 11:43:42.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/raw.c 2006-01-25 11:55:48.000000000 -0800
@@ -718,7 +718,7 @@ static int raw_ioctl(struct sock *sk, in
}
}
-struct proto raw_prot = {
+struct proto raw_prot __read_mostly = {
.name = "RAW",
.owner = THIS_MODULE,
.close = raw_close,
Index: linux-2.6.16-rc1/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/tcp_ipv4.c 2006-01-25 11:55:46.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/tcp_ipv4.c 2006-01-25 11:55:48.000000000 -0800
@@ -1794,7 +1794,7 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-struct proto tcp_prot = {
+struct proto tcp_prot __read_mostly = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
Index: linux-2.6.16-rc1/net/ipv4/udp.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv4/udp.c 2006-01-25 11:43:42.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv4/udp.c 2006-01-25 11:55:48.000000000 -0800
@@ -1340,7 +1340,7 @@ unsigned int udp_poll(struct file *file,
}
-struct proto udp_prot = {
+struct proto udp_prot __read_mostly = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_close,
Index: linux-2.6.16-rc1/net/ipv6/proc.c
===================================================================
--- linux-2.6.16-rc1.orig/net/ipv6/proc.c 2006-01-25 11:43:42.000000000 -0800
+++ linux-2.6.16-rc1/net/ipv6/proc.c 2006-01-25 11:55:48.000000000 -0800
@@ -39,7 +39,7 @@ static int fold_prot_inuse(struct proto
int cpu;
for_each_cpu(cpu)
- res += proto->stats[cpu].inuse;
+ res += (*per_cpu_ptr(proto->inuse, cpu));
return res;
}
Ravikiran G Thirumalai wrote:
> Change the atomic_t sockets_allocated member of struct proto to a
> per-cpu counter.
>
> Signed-off-by: Pravin B. Shelar <[email protected]>
> Signed-off-by: Ravikiran Thirumalai <[email protected]>
> Signed-off-by: Shai Fultheim <[email protected]>
>
Hi Ravikiran
If I read this patch correctly, I think there is a scalability problem.
On a big SMP machine, read_sockets_allocated() is going to be a real killer.
Say we have 128 Opteron CPUs in a box.
You'll need to bring in 128 cache lines (plus 8*128 bytes to read the 128
pointers inside the percpu structure).
I think a solution a la percpu_counter is preferable, or even better a
dedicated per-CPU counter with threshold management (see mm/swap.c, function
vm_acct_memory(), to see how vm_committed_space is updated without hurting SMP
scalability too badly).
Thank you
Eric
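For reference, the threshold scheme Eric points to works roughly like this; the
following is a paraphrase of the 2.6-era vm_acct_memory() in mm/swap.c (names
taken from that file), not an exact copy:

#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	/* only touch the shared global once the local delta is large */
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}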
Ravikiran G Thirumalai wrote:
> Change struct proto->memory_allocated to a batching per-CPU counter
> (percpu_counter) from an atomic_t. A batching counter is better than a
> plain per-CPU counter as this field is read often.
>
> Signed-off-by: Pravin B. Shelar <[email protected]>
> Signed-off-by: Ravikiran Thirumalai <[email protected]>
> Signed-off-by: Shai Fultheim <[email protected]>
>
Hello Ravikiran
I like this patch, but I'm not sure the current percpu_counter fits the need.
percpu_counter_read() can return a value that is off by +/- FBC_BATCH*NR_CPUS,
i.e. 2*(NR_CPUS^2) or 4*(NR_CPUS^2).
If NR_CPUS = 128, the 'error' from percpu_counter_read() is +/- 32768.
Is that acceptable?
Thank you
Eric
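For context, the bound follows from the FBC_BATCH definition of that era (the
same one the percpu_counter patch at the end of this thread replaces):

#if NR_CPUS >= 16
#define FBC_BATCH	(NR_CPUS*2)
#else
#define FBC_BATCH	(NR_CPUS*4)
#endif

/*
 * Each CPU's local delta can grow to just under FBC_BATCH before being
 * folded into the shared count, so percpu_counter_read() may be off by
 * up to NR_CPUS * FBC_BATCH = 2*NR_CPUS^2 (4*NR_CPUS^2 below 16 CPUs).
 * With NR_CPUS = 128: 128 * 256 = 32768.
 */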
On Fri, Jan 27, 2006 at 09:53:53AM +0100, Eric Dumazet wrote:
> Ravikiran G Thirumalai wrote:
> >Change the atomic_t sockets_allocated member of struct proto to a
> >per-cpu counter.
> >
> >Signed-off-by: Pravin B. Shelar <[email protected]>
> >Signed-off-by: Ravikiran Thirumalai <[email protected]>
> >Signed-off-by: Shai Fultheim <[email protected]>
> >
> Hi Ravikiran
>
> If I correctly read this patch, I think there is a scalability problem.
>
> On a big SMP machine, read_sockets_allocated() is going to be a real killer.
>
> Say we have 128 Opterons CPUS in a box.
read_sockets_allocated is invoked when /proc/net/protocols is read,
which can be assumed to be infrequent.
At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only under
certain conditions, under memory pressure -- on a large CPU count machine,
you'd have large memory, and I don't think read_sockets_allocated would get
called often. It did not, at least, on our 8-CPU/16G box. So this should be OK,
I think.
There are no 128-CPU Opteron boxes yet, AFAIK ;).
Thanks,
Kiran
Ravikiran G Thirumalai <[email protected]> wrote:
>
> On Fri, Jan 27, 2006 at 09:53:53AM +0100, Eric Dumazet wrote:
> > Ravikiran G Thirumalai wrote:
> > >Change the atomic_t sockets_allocated member of struct proto to a
> > >per-cpu counter.
> > >
> > >Signed-off-by: Pravin B. Shelar <[email protected]>
> > >Signed-off-by: Ravikiran Thirumalai <[email protected]>
> > >Signed-off-by: Shai Fultheim <[email protected]>
> > >
> > Hi Ravikiran
> >
> > If I correctly read this patch, I think there is a scalability problem.
> >
> > On a big SMP machine, read_sockets_allocated() is going to be a real killer.
> >
> > Say we have 128 Opterons CPUS in a box.
>
> read_sockets_allocated is being invoked when when /proc/net/protocols is read,
> which can be assumed as not frequent.
> At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
> certain conditions, under memory pressure -- on a large CPU count machine,
> you'd have large memory, and I don't think read_sockets_allocated would get
> called often. It did not atleast on our 8cpu/16G box. So this should be OK
> I think.
That being said, the percpu_counters aren't a terribly successful concept
and probably do need a revisit due to the high inaccuracy at high CPU
counts. It might be better to do some generic version of vm_acct_memory()
instead.
If the benchmarks say that we need to. If we cannot observe any problems
in testing of existing code and if we can't demonstrate any benefit from
the patched code then one option is to go off and do something else ;)
Andrew Morton wrote:
> Ravikiran G Thirumalai <[email protected]> wrote:
>> On Fri, Jan 27, 2006 at 09:53:53AM +0100, Eric Dumazet wrote:
>>> Ravikiran G Thirumalai wrote:
>>>> Change the atomic_t sockets_allocated member of struct proto to a
>>>> per-cpu counter.
>>>>
>>>> Signed-off-by: Pravin B. Shelar <[email protected]>
>>>> Signed-off-by: Ravikiran Thirumalai <[email protected]>
>>>> Signed-off-by: Shai Fultheim <[email protected]>
>>>>
>>> Hi Ravikiran
>>>
>>> If I correctly read this patch, I think there is a scalability problem.
>>>
>>> On a big SMP machine, read_sockets_allocated() is going to be a real killer.
>>>
>>> Say we have 128 Opterons CPUS in a box.
>> read_sockets_allocated is being invoked when when /proc/net/protocols is read,
>> which can be assumed as not frequent.
>> At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
>> certain conditions, under memory pressure -- on a large CPU count machine,
>> you'd have large memory, and I don't think read_sockets_allocated would get
>> called often. It did not atleast on our 8cpu/16G box. So this should be OK
>> I think.
>
> That being said, the percpu_counters aren't a terribly successful concept
> and probably do need a revisit due to the high inaccuracy at high CPU
> counts. It might be better to do some generic version of vm_acct_memory()
> instead.
There are several issues here:
alloc_percpu()'s current implementation is a waste of RAM (because it uses
slab allocations, which have a minimum size of 32 bytes).
Currently we cannot use per_cpu(&some_object, cpu), so a generic version of
vm_acct_memory() would need a rework of percpu.h, and maybe this is not
possible on every platform?
#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
-->
#define per_cpu_name(var) per_cpu__##var
#define per_cpu_addr(var) &per_cpu_name(var)
#define per_cpu(var, cpu) (*RELOC_HIDE(per_cpu_addr(var), __per_cpu_offset[cpu])
But this could render TLS migration difficult...
Eric
On Fri, Jan 27, 2006 at 12:16:02PM -0800, Andrew Morton wrote:
> Ravikiran G Thirumalai <[email protected]> wrote:
> >
> > which can be assumed as not frequent.
> > At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
> > certain conditions, under memory pressure -- on a large CPU count machine,
> > you'd have large memory, and I don't think read_sockets_allocated would get
> > called often. It did not atleast on our 8cpu/16G box. So this should be OK
> > I think.
>
> That being said, the percpu_counters aren't a terribly successful concept
> and probably do need a revisit due to the high inaccuracy at high CPU
> counts. It might be better to do some generic version of vm_acct_memory()
> instead.
AFAICS vm_acct_memory is no better. The deviation at large CPU counts is the
same as percpu_counters -- (NR_CPUS * NR_CPUS * 2) ...
>
> If the benchmarks say that we need to. If we cannot observe any problems
> in testing of existing code and if we can't demonstrate any benefit from
> the patched code then one option is to go off and do something else ;)
We first tried plain per-CPU counters for memory_allocated, found that reads
of memory_allocated were causing cacheline transfers, and then
switched over to batching. So batching reads is useful. To avoid
inaccuracy, we can maybe change percpu_counter_init to:
void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
The per-CPU batching limit would then be maxdev/num_possible_cpus(). One would
use batching counters only when both reads and writes are frequent. With
the above scheme, we would fetch cachelines from other CPUs on reads
only at large CPU counts, which is no worse than the global
counter alternative, but it would still be beneficial on smaller machines,
without sacrificing a pre-set deviation.
Comments?
Thanks,
Kiran
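A rough sketch of what such an interface could look like (the field names and
the lack of error handling here are illustrative only; the actual patch posted
later in this thread differs in detail):

struct percpu_counter {
	atomic_long_t count;
	long percpu_batch;	/* per-CPU fold threshold derived from maxdev */
	long *counters;
};

static inline void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
{
	atomic_long_set(&fbc->count, 0);
	/* keep the total drift within +/- maxdev across all possible CPUs */
	fbc->percpu_batch = maxdev / num_possible_cpus();
	fbc->counters = alloc_percpu(long);
}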
On Fri, Jan 27, 2006 at 11:30:23PM +0100, Eric Dumazet wrote:
>
> There are several issues here :
>
> alloc_percpu() current implementation is a a waste of ram. (because it uses
> slab allocations that have a minimum size of 32 bytes)
Oh there was a solution for that :).
http://lwn.net/Articles/119532/
I can quickly revive it if there is interest.
Thanks,
Kiran
Ravikiran G Thirumalai wrote:
> On Fri, Jan 27, 2006 at 12:16:02PM -0800, Andrew Morton wrote:
>> Ravikiran G Thirumalai <[email protected]> wrote:
>>> which can be assumed as not frequent.
>>> At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
>>> certain conditions, under memory pressure -- on a large CPU count machine,
>>> you'd have large memory, and I don't think read_sockets_allocated would get
>>> called often. It did not atleast on our 8cpu/16G box. So this should be OK
>>> I think.
>> That being said, the percpu_counters aren't a terribly successful concept
>> and probably do need a revisit due to the high inaccuracy at high CPU
>> counts. It might be better to do some generic version of vm_acct_memory()
>> instead.
>
> AFAICS vm_acct_memory is no better. The deviation on large cpu counts is the
> same as percpu_counters -- (NR_CPUS * NR_CPUS * 2) ...
Ah... yes, you are right, I read min(16, NR_CPUS*2)
I wonder if it is not a typo... I mean, I understand that the more CPUs you
have, the less desirable updates to a central atomic_t become, but a quadratic
offset seems too much...
Eric
Ravikiran G Thirumalai <[email protected]> wrote:
>
> On Fri, Jan 27, 2006 at 12:16:02PM -0800, Andrew Morton wrote:
> > Ravikiran G Thirumalai <[email protected]> wrote:
> > >
> > > which can be assumed as not frequent.
> > > At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
> > > certain conditions, under memory pressure -- on a large CPU count machine,
> > > you'd have large memory, and I don't think read_sockets_allocated would get
> > > called often. It did not atleast on our 8cpu/16G box. So this should be OK
> > > I think.
> >
> > That being said, the percpu_counters aren't a terribly successful concept
> > and probably do need a revisit due to the high inaccuracy at high CPU
> > counts. It might be better to do some generic version of vm_acct_memory()
> > instead.
>
> AFAICS vm_acct_memory is no better. The deviation on large cpu counts is the
> same as percpu_counters -- (NR_CPUS * NR_CPUS * 2) ...
I suppose so. Except vm_acct_memory() has
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
But if we were to perform similar tuning to percpu_counter, yes, they're
pretty similar.
Oh, and because vm_acct_memory() is counting a singleton object, it can use
DEFINE_PER_CPU rather than alloc_percpu(), so it saves on a bit of kmalloc
overhead.
> >
> > If the benchmarks say that we need to. If we cannot observe any problems
> > in testing of existing code and if we can't demonstrate any benefit from
> > the patched code then one option is to go off and do something else ;)
>
> We first tried plain per-CPU counters for memory_allocated, found that reads
> on memory_allocated was causing cacheline transfers, and then
> switched over to batching. So batching reads is useful. To avoid
> inaccuracy, we can maybe change percpu_counter_init to:
>
> void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
>
> the percpu batching limit would then be maxdev/num_possible_cpus. One would
> use batching counters only when both reads and writes are frequent. With
> the above scheme, we would go fetch cachelines from other cpus for read
> often only on large cpu counts, which is not any worse than the global
> counter alternative, but it would still be beneficial on smaller machines,
> without sacrificing a pre-set deviation.
>
> Comments?
Sounds sane.
Andrew Morton <[email protected]> wrote:
>
> Oh, and because vm_acct_memory() is counting a singleton object, it can use
> DEFINE_PER_CPU rather than alloc_percpu(), so it saves on a bit of kmalloc
> overhead.
Actually, I don't think that's true. We're allocating a sizeof(long) with
kmalloc_node(), so there shouldn't be memory wastage.
Eric Dumazet <[email protected]> wrote:
>
> Ravikiran G Thirumalai wrote:
> > On Fri, Jan 27, 2006 at 12:16:02PM -0800, Andrew Morton wrote:
> >> Ravikiran G Thirumalai <[email protected]> wrote:
> >>> which can be assumed as not frequent.
> >>> At sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
> >>> certain conditions, under memory pressure -- on a large CPU count machine,
> >>> you'd have large memory, and I don't think read_sockets_allocated would get
> >>> called often. It did not atleast on our 8cpu/16G box. So this should be OK
> >>> I think.
> >> That being said, the percpu_counters aren't a terribly successful concept
> >> and probably do need a revisit due to the high inaccuracy at high CPU
> >> counts. It might be better to do some generic version of vm_acct_memory()
> >> instead.
> >
> > AFAICS vm_acct_memory is no better. The deviation on large cpu counts is the
> > same as percpu_counters -- (NR_CPUS * NR_CPUS * 2) ...
>
> Ah... yes you are right, I read min(16, NR_CPUS*2)
So did I ;)
> I wonder if it is not a typo... I mean, I understand the more cpus you have,
> the less updates on central atomic_t is desirable, but a quadratic offset
> seems too much...
I'm not sure whether it was a mistake or if I intended it and didn't do the
sums on accuracy :(
An advantage of retaining a spinlock in percpu_counter is that if accuracy
is needed at a low rate (say, /proc reading) we can take the lock and then
go spill each CPU's local count into the main one. It would need to be a
very low rate though. Or we make the cpu-local counters atomic too.
Certainly it's sensible to delegate the tuning to the creator of the
percpu_counter, but it'll be a difficult thing to get right.
Ravikiran G Thirumalai wrote:
> On Fri, Jan 27, 2006 at 11:30:23PM +0100, Eric Dumazet wrote:
>> There are several issues here :
>>
>> alloc_percpu() current implementation is a a waste of ram. (because it uses
>> slab allocations that have a minimum size of 32 bytes)
>
> Oh there was a solution for that :).
>
> http://lwn.net/Articles/119532/
>
> I can quickly revive it if there is interest.
>
Well, nice work! :)
Maybe a little bit complicated if the expected percpu space is 50 KB per CPU?
Why not use a boot-time allocated percpu area (as done today in
setup_per_cpu_areas()), but instead of reserving extra space for modules'
percpu data, be able to serve alloc_percpu() from this reserved area (i.e. no
kmalloced data anymore), keeping your
#define per_cpu_ptr(ptr, cpu) ((__typeof__(ptr)) \
(RELOC_HIDE(ptr, PCPU_BLKSIZE * cpu)))
Some code from kernel/module.c could be reworked to serve both as an allocator
when a module's percpu data must be relocated (insmod)/freed (rmmod), and to
serve alloc_percpu() for 'dynamic allocations'.
Eric
On Fri, Jan 27, 2006 at 03:08:47PM -0800, Andrew Morton wrote:
> Andrew Morton <[email protected]> wrote:
> >
> > Oh, and because vm_acct_memory() is counting a singleton object, it can use
> > DEFINE_PER_CPU rather than alloc_percpu(), so it saves on a bit of kmalloc
> > overhead.
>
> Actually, I don't think that's true. we're allocating a sizeof(long) with
> kmalloc_node() so there shouldn't be memory wastage.
Oh yeah, there is. Each dynamic per-CPU object would have been at least
(NR_CPUS * sizeof (void *) + num_cpus_possible * cacheline_size).
Now kmalloc_node will fall back on size-32 for the allocation of a long, so
replace the cacheline_size above with 32 -- which then means dynamic per-CPU
data are no longer on a cacheline boundary (most modern CPUs have
64-byte/128-byte cache lines), which means per-CPU data could end up falsely
shared...
Kiran
Ravikiran G Thirumalai <[email protected]> wrote:
>
> On Fri, Jan 27, 2006 at 03:08:47PM -0800, Andrew Morton wrote:
> > Andrew Morton <[email protected]> wrote:
> > >
> > > Oh, and because vm_acct_memory() is counting a singleton object, it can use
> > > DEFINE_PER_CPU rather than alloc_percpu(), so it saves on a bit of kmalloc
> > > overhead.
> >
> > Actually, I don't think that's true. we're allocating a sizeof(long) with
> > kmalloc_node() so there shouldn't be memory wastage.
>
> Oh yeah there is. Each dynamic per-cpu object would have been atleast
> (NR_CPUS * sizeof (void *) + num_cpus_possible * cacheline_size ).
> Now kmalloc_node will fall back on size-32 for allocation of long, so
> replace the cacheline_size above with 32 -- which then means dynamic per-cpu
> data are not on a cacheline boundary anymore (most modern cpus have 64byte/128
> byte cache lines) which means per-cpu data could end up false shared....
>
OK. But isn't the core of the problem the fact that __alloc_percpu() is
using kmalloc_node() rather than a (new, as-yet-unimplemented)
kmalloc_cpu()? kmalloc_cpu() wouldn't need the L1 cache alignment.
It might be worth creating just a small number of per-cpu slabs (4-byte,
8-byte). A kmalloc_cpu() would just need a per-cpu array of
kmem_cache_t*'s and it'd internally use kmalloc_node(cpu_to_node), no?
Or we could just give __alloc_percpu() a custom, hand-rolled,
not-cacheline-padded sizeof(long) slab per CPU and use that if (size ==
sizeof(long)). Or something.
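A hypothetical sketch of the kmalloc_cpu() idea; the kmalloc_cpu()/
kmalloc_cpu_init() names and the cache array below are made up for
illustration, not an existing kernel API:

/*
 * One small, non-cacheline-aligned cache per CPU, so objects handed out
 * for different CPUs never share a slab.  Real code would want distinct
 * cache names and error handling.
 */
static kmem_cache_t *cpu_long_cache[NR_CPUS];

static void __init kmalloc_cpu_init(void)
{
	int cpu;

	for_each_cpu(cpu)
		cpu_long_cache[cpu] = kmem_cache_create("kmalloc_cpu-long",
					sizeof(long), 0, 0, NULL, NULL);
}

static void *kmalloc_cpu(size_t size, gfp_t flags, int cpu)
{
	BUG_ON(size > sizeof(long));	/* only the sizeof(long) case here */
	return kmem_cache_alloc_node(cpu_long_cache[cpu], flags,
				     cpu_to_node(cpu));
}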
struct percpu_counter {
	atomic_long_t count;
	atomic_long_t *counters;
};

#ifdef CONFIG_SMP
void percpu_counter_mod(struct percpu_counter *fbc, long amount)
{
	long old, new;
	atomic_long_t *pcount;

	pcount = per_cpu_ptr(fbc->counters, get_cpu());
start:
	old = atomic_long_read(pcount);
	new = old + amount;
	if (new >= FBC_BATCH || new <= -FBC_BATCH) {
		if (unlikely(atomic_long_cmpxchg(pcount, old, 0) != old))
			goto start;
		atomic_long_add(new, &fbc->count);
	} else
		atomic_long_add(amount, pcount);
	put_cpu();
}
EXPORT_SYMBOL(percpu_counter_mod);

long percpu_counter_read_accurate(struct percpu_counter *fbc)
{
	long res = 0;
	int cpu;
	atomic_long_t *pcount;

	for_each_cpu(cpu) {
		pcount = per_cpu_ptr(fbc->counters, cpu);
		/* dont dirty cache line if not necessary */
		if (atomic_long_read(pcount))
			res += atomic_long_xchg(pcount, 0);
	}
	return res;
}
EXPORT_SYMBOL(percpu_counter_read_accurate);
#endif /* CONFIG_SMP */
Eric Dumazet wrote:
> Andrew Morton wrote:
>> Eric Dumazet <[email protected]> wrote:
>>> Ravikiran G Thirumalai wrote:
>>>> On Fri, Jan 27, 2006 at 12:16:02PM -0800, Andrew Morton wrote:
>>>>> Ravikiran G Thirumalai <[email protected]> wrote:
>>>>>> which can be assumed as not frequent. At
>>>>>> sk_stream_mem_schedule(), read_sockets_allocated() is invoked only
>>>>>> certain conditions, under memory pressure -- on a large CPU count
>>>>>> machine, you'd have large memory, and I don't think
>>>>>> read_sockets_allocated would get called often. It did not atleast
>>>>>> on our 8cpu/16G box. So this should be OK I think.
>>>>> That being said, the percpu_counters aren't a terribly successful
>>>>> concept
>>>>> and probably do need a revisit due to the high inaccuracy at high CPU
>>>>> counts. It might be better to do some generic version of
>>>>> vm_acct_memory()
>>>>> instead.
>>>> AFAICS vm_acct_memory is no better. The deviation on large cpu
>>>> counts is the same as percpu_counters -- (NR_CPUS * NR_CPUS * 2) ...
>>> Ah... yes you are right, I read min(16, NR_CPUS*2)
>>
>> So did I ;)
>>
>>> I wonder if it is not a typo... I mean, I understand the more cpus
>>> you have, the less updates on central atomic_t is desirable, but a
>>> quadratic offset seems too much...
>>
>> I'm not sure whether it was a mistake or if I intended it and didn't
>> do the
>> sums on accuracy :(
>>
>> An advantage of retaining a spinlock in percpu_counter is that if
>> accuracy
>> is needed at a low rate (say, /proc reading) we can take the lock and
>> then
>> go spill each CPU's local count into the main one. It would need to be a
>> very low rate though. Or we make the cpu-local counters atomic too.
>
> We might use atomic_long_t only (and no spinlocks)
> Something like this ?
>
>
> ------------------------------------------------------------------------
>
> struct percpu_counter {
> atomic_long_t count;
> atomic_long_t *counters;
> };
>
> #ifdef CONFIG_SMP
> void percpu_counter_mod(struct percpu_counter *fbc, long amount)
> {
> long old, new;
> atomic_long_t *pcount;
>
> pcount = per_cpu_ptr(fbc->counters, get_cpu());
> start:
> old = atomic_long_read(pcount);
> new = old + amount;
> if (new >= FBC_BATCH || new <= -FBC_BATCH) {
> if (unlikely(atomic_long_cmpxchg(pcount, old, 0) != old))
> goto start;
> atomic_long_add(new, &fbc->count);
> } else
> atomic_long_add(amount, pcount);
>
> put_cpu();
> }
> EXPORT_SYMBOL(percpu_counter_mod);
>
> long percpu_counter_read_accurate(struct percpu_counter *fbc)
> {
> long res = 0;
> int cpu;
> atomic_long_t *pcount;
>
> for_each_cpu(cpu) {
> pcount = per_cpu_ptr(fbc->counters, cpu);
> /* dont dirty cache line if not necessary */
> if (atomic_long_read(pcount))
> res += atomic_long_xchg(pcount, 0);
> }
atomic_long_add(res, &fbc->count);
res = atomic_long_read(&fbc->count);
> return res;
> }
> EXPORT_SYMBOL(percpu_counter_read_accurate);
> #endif /* CONFIG_SMP */
>
On Sat, Jan 28, 2006 at 12:21:07AM +0100, Eric Dumazet wrote:
> Ravikiran G Thirumalai wrote:
> >On Fri, Jan 27, 2006 at 11:30:23PM +0100, Eric Dumazet wrote:
>
> Why not use a boot time allocated percpu area (as done today in
> setup_per_cpu_areas()), but instead of reserving extra space for module's
> percpu data, being able to serve alloc_percpu() from this reserved area (ie
> no kmalloced data anymore), and keeping your
At that time ia64 placed a limit on the max size of the per-CPU area
(PERCPU_ENOUGH_ROOM). I think that limit is still there, but hopefully
64K per CPU should be enough for static + dynamic + modules?
Let me do an allyesconfig on my box and verify.
Thanks,
Kiran
Eric Dumazet <[email protected]> wrote:
>
> >
> > An advantage of retaining a spinlock in percpu_counter is that if accuracy
> > is needed at a low rate (say, /proc reading) we can take the lock and then
> > go spill each CPU's local count into the main one. It would need to be a
> > very low rate though. Or we make the cpu-local counters atomic too.
>
> We might use atomic_long_t only (and no spinlocks)
Yup, that's it.
> Something like this ?
>
It'd be a lot neater if we had atomic_long_xchg().
--- a/include/asm-generic/atomic.h 2006-01-28 02:59:49.000000000 +0100
+++ b/include/asm-generic/atomic.h 2006-01-28 02:57:36.000000000 +0100
@@ -66,6 +66,18 @@
atomic64_sub(i, v);
}
+static inline long atomic_long_xchg(atomic_long_t *l, long val)
+{
+ atomic64_t *v = (atomic64_t *)l;
+ return atomic64_xchg(v, val);
+}
+
+static inline long atomic_long_cmpxchg(atomic_long_t *l, long old, long new)
+{
+ atomic64_t *v = (atomic64_t *)l;
+ return atomic64_cmpxchg(v, old, new);
+}
+
#else
typedef atomic_t atomic_long_t;
@@ -113,5 +125,17 @@
atomic_sub(i, v);
}
+static inline long atomic_long_xchg(atomic_long_t *l, long val)
+{
+ atomic_t *v = (atomic_t *)l;
+ return atomic_xchg(v, val);
+}
+
+static inline long atomic_long_cmpxchg(atomic_long_t *l, long old, long new)
+{
+ atomic_t *v = (atomic_t *)l;
+ return atomic_cmpxchg(v, old, new);
+}
+
#endif
#endif
Eric Dumazet <[email protected]> wrote:
>
> [PATCH] Add atomic_long_xchg() and atomic_long_cmpxchg() wrappers
>
> ...
>
> +static inline long atomic_long_xchg(atomic_long_t *l, long val)
> +{
> + atomic64_t *v = (atomic64_t *)l;
> + return atomic64_xchg(v, val);
All we need now is some implementations of atomic64_xchg() ;)
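One way to provide atomic64_xchg() on 64-bit architectures that already have
atomic64_cmpxchg() would be a cmpxchg loop; a sketch only, not an actual kernel
patch, and it assumes atomic64_cmpxchg() is available on the architecture:

static inline long atomic64_xchg(atomic64_t *v, long new)
{
	long old;

	/* retry until our swap wins over concurrent updates */
	do {
		old = atomic64_read(v);
	} while (atomic64_cmpxchg(v, old, new) != old);
	return old;
}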
On Sat, Jan 28, 2006 at 01:35:03AM +0100, Eric Dumazet wrote:
> Eric Dumazet wrote:
> >Andrew Morton wrote:
> >>Eric Dumazet <[email protected]> wrote:
> >
> >#ifdef CONFIG_SMP
> >void percpu_counter_mod(struct percpu_counter *fbc, long amount)
> >{
> > long old, new;
> > atomic_long_t *pcount;
> >
> > pcount = per_cpu_ptr(fbc->counters, get_cpu());
> >start:
> > old = atomic_long_read(pcount);
> > new = old + amount;
> > if (new >= FBC_BATCH || new <= -FBC_BATCH) {
> > if (unlikely(atomic_long_cmpxchg(pcount, old, 0) != old))
> > goto start;
> > atomic_long_add(new, &fbc->count);
> > } else
> > atomic_long_add(amount, pcount);
> >
> > put_cpu();
> >}
> >EXPORT_SYMBOL(percpu_counter_mod);
> >
> >long percpu_counter_read_accurate(struct percpu_counter *fbc)
> >{
> > long res = 0;
> > int cpu;
> > atomic_long_t *pcount;
> >
> > for_each_cpu(cpu) {
> > pcount = per_cpu_ptr(fbc->counters, cpu);
> > /* dont dirty cache line if not necessary */
> > if (atomic_long_read(pcount))
> > res += atomic_long_xchg(pcount, 0);
---------------------------> (A)
> > }
>
> atomic_long_add(res, &fbc->count);
---------------------------> (B)
> res = atomic_long_read(&fbc->count);
>
> > return res;
> >}
The read is still theoretically FBC_BATCH * NR_CPUS inaccurate, no?
What happens when other CPUs update their local counters at (A) and (B)?
(I am hoping we don't need percpu_counter_read_accurate anywhere yet and
this is just a demo ;). I certainly don't want to touch all CPUs on the read
path / add a cmpxchg to the write path for the proto counters that started
this discussion.)
Thanks,
Kiran
struct percpu_counter {
	atomic_long_t count;
	atomic_long_t *counters;
};

#ifdef CONFIG_SMP
void percpu_counter_mod(struct percpu_counter *fbc, long amount)
{
	long new;
	atomic_long_t *pcount;

	pcount = per_cpu_ptr(fbc->counters, get_cpu());
	new = atomic_long_read(pcount) + amount;
	if (new >= FBC_BATCH || new <= -FBC_BATCH) {
		new = atomic_long_xchg(pcount, 0) + amount;
		if (new)
			atomic_long_add(new, &fbc->count);
	} else
		atomic_long_add(amount, pcount);
	put_cpu();
}
EXPORT_SYMBOL(percpu_counter_mod);

long percpu_counter_read_accurate(struct percpu_counter *fbc)
{
	long res = 0;
	int cpu;
	atomic_long_t *pcount;

	for_each_cpu(cpu) {
		pcount = per_cpu_ptr(fbc->counters, cpu);
		/* dont dirty cache line if not necessary */
		if (atomic_long_read(pcount))
			res += atomic_long_xchg(pcount, 0);
	}
	atomic_long_add(res, &fbc->count);
	return atomic_long_read(&fbc->count);
}
EXPORT_SYMBOL(percpu_counter_read_accurate);
#endif /* CONFIG_SMP */
On Sat, Jan 28, 2006 at 01:28:20AM +0100, Eric Dumazet wrote:
> We might use atomic_long_t only (and no spinlocks)
> Something like this ?
Erk, complex and slow... Try using local_t instead, which is substantially
cheaper on the P4 as it doesn't use the lock prefix and so doesn't act as a
memory barrier. See asm/local.h.
-ben
Benjamin LaHaise <[email protected]> wrote:
>
> On Sat, Jan 28, 2006 at 01:28:20AM +0100, Eric Dumazet wrote:
> > We might use atomic_long_t only (and no spinlocks)
> > Something like this ?
>
> Erk, complex and slow... Try using local_t instead, which is substantially
> cheaper on the P4 as it doesn't use the lock prefix and act as a memory
> barrier. See asm/local.h.
>
local_t isn't much use until we get rid of asm-generic/local.h. Bloaty,
racy with nested interrupts.
On Sat, Jan 28, 2006 at 04:55:49PM -0800, Andrew Morton wrote:
> local_t isn't much use until we get rid of asm-generic/local.h. Bloaty,
> racy with nested interrupts.
The overuse of atomics is horrific in what is being proposed. All the
major architectures except powerpc (i386, x86-64, ia64, and sparc64)
implement local_t. It would make far more sense to push the last few
stragglers (which mostly seem to be uniprocessor) into writing the
appropriate implementations. Perhaps it's time to add a #error in
asm-generic/local.h?
-ben
--
"Ladies and gentlemen, I'm sorry to interrupt, but the police are here
and they've asked us to stop the party." Don't Email: <[email protected]>.
Benjamin LaHaise <[email protected]> wrote:
>
> On Sat, Jan 28, 2006 at 04:55:49PM -0800, Andrew Morton wrote:
> > local_t isn't much use until we get rid of asm-generic/local.h. Bloaty,
> > racy with nested interrupts.
>
> The overuse of atomics is horrific in what is being proposed.
Well yeah, it wasn't really a very serious proposal. There's some
significant work yet to be done on this stuff.
> Perhaps it's time to add a #error in asm-generic/local.h?
heh, or #warning "you suck" or something.
On Sat, Jan 28, 2006 at 08:19:44PM -0500, Benjamin LaHaise wrote:
> The overuse of atomics is horrific in what is being proposed. All the
> major architectures except powerpc (i386, x86-64, ia64, and sparc64)
> implement local_t. It would make far more sense to push the last few
> stragglers (which mostly seem to be uniprocessor) into writing the
> appropriate implementations. Perhaps it's time to add a #error in
> asm-generic/local.h?
>
Surely asm-generic/local.h could now be reimplemented using atomic_long_t
to kill that aberration that is the BITS_PER_LONG != 32 case currently...?
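Roughly what such a reimplementation could look like; a sketch only, the real
asm-generic header would need the full set of operations:

/*
 * local_t as a thin wrapper around atomic_long_t: always correct, while
 * architectures with cheaper primitives can still provide their own.
 */
typedef struct {
	atomic_long_t a;
} local_t;

#define local_read(l)		atomic_long_read(&(l)->a)
#define local_set(l, i)		atomic_long_set(&(l)->a, (i))
#define local_inc(l)		atomic_long_inc(&(l)->a)
#define local_dec(l)		atomic_long_dec(&(l)->a)
#define local_add(i, l)		atomic_long_add((i), &(l)->a)
#define local_sub(i, l)		atomic_long_sub((i), &(l)->a)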
[adding linux-arch]
On Sunday 29 January 2006 01:55, Andrew Morton wrote:
> Benjamin LaHaise <[email protected]> wrote:
> > On Sat, Jan 28, 2006 at 01:28:20AM +0100, Eric Dumazet wrote:
> > > We might use atomic_long_t only (and no spinlocks)
> > > Something like this ?
> >
> > Erk, complex and slow... Try using local_t instead, which is
> > substantially cheaper on the P4 as it doesn't use the lock prefix and act
> > as a memory barrier. See asm/local.h.
>
> local_t isn't much use until we get rid of asm-generic/local.h. Bloaty,
> racy with nested interrupts.
It is just implemented wrong. It should use
local_irq_save()/local_irq_restore() instead. But my bigger problem
with local_t is the few architectures (IA64, PPC64) that implement it
with atomic_t. This means we can't replace local statistics counters
with local_t, because it would be a regression for them. I haven't done the
benchmarks yet, but I suspect both IA64 and PPC64 really should
just turn off interrupts.
-Andi
Benjamin LaHaise wrote:
> On Sat, Jan 28, 2006 at 01:28:20AM +0100, Eric Dumazet wrote:
>> We might use atomic_long_t only (and no spinlocks)
>> Something like this ?
>
> Erk, complex and slow... Try using local_t instead, which is substantially
> cheaper on the P4 as it doesn't use the lock prefix and act as a memory
> barrier. See asm/local.h.
>
Well, I think that might be doable, maybe with RCU magic?
1) local_t is not that nice on all archs.
2) The consolidation phase (summing all the CPUs' local offsets into the
central counter) might be more difficult to do (we would need something like
two counters per CPU, and an index that can be changed by the CPU that wants a
consolidation (still 'expensive')):
struct cpu_offset {
	local_t offset[2];
};

struct percpu_counter {
	atomic_long_t count;
	unsigned int offidx;
	spinlock_t lock; /* to guard offidx changes */
	struct cpu_offset *counters;
};

void percpu_counter_mod(struct percpu_counter *fbc, long amount)
{
	long val;
	struct cpu_offset *cp;
	local_t *l;

	cp = per_cpu_ptr(fbc->counters, get_cpu());
	l = &cp->offset[fbc->offidx];
	local_add(amount, l);
	val = local_read(l);
	if (val >= FBC_BATCH || val <= -FBC_BATCH) {
		local_set(l, 0);
		atomic_long_add(val, &fbc->count);
	}
	put_cpu();
}

long percpu_counter_read_accurate(struct percpu_counter *fbc)
{
	long res = 0, val;
	int cpu, idx;
	struct cpu_offset *cp;
	local_t *l;

	spin_lock(&fbc->lock);
	idx = fbc->offidx;
	fbc->offidx ^= 1;
	mb();
	/*
	 * FIXME :
	 * must 'wait' until other cpus no longer touch their old local_t
	 */
	for_each_cpu(cpu) {
		cp = per_cpu_ptr(fbc->counters, cpu);
		l = &cp->offset[idx];
		val = local_read(l);
		/* dont dirty alien cache line if not necessary */
		if (val)
			local_set(l, 0);
		res += val;
	}
	spin_unlock(&fbc->lock);
	atomic_long_add(res, &fbc->count);
	return atomic_long_read(&fbc->count);
}
3) Are locked ops really that expensive if done on a cache line that is mostly
in exclusive state in the CPU cache?
Thank you
Eric
On Sun, Jan 29, 2006 at 07:54:09AM +0100, Eric Dumazet wrote:
> Well, I think that might be doable, maybe RCU magic ?
>
> 1) local_t are not that nice on all archs.
It is for the users that matter, and the hooks are there if someone finds
it to be a performance problem.
> 2) The consolidation phase (summing all the cpus local offset to
> consolidate the central counter) might be more difficult to do (we would
> need kind of 2 counters per cpu, and a index that can be changed by the cpu
> that wants a consolidation (still 'expensive'))
For the vast majority of these sorts of statistics counters, we don't
need 100% accurate counts. And I think it should be possible to choose
between a lightweight implementation and the expensive implementation.
On a chip like the Core Duo the cost of bouncing between the two cores
is minimal, so all the extra code and data is a waste.
> 3) Are the locked ops so expensive if done on a cache line that is mostly
> in exclusive state in cpu cache ?
Yes. What happens on the P4 is that it forces outstanding memory
transactions in the reorder buffer to be flushed so that the memory barrier
semantics of the lock prefix are observed. This can take a long time as
there can be over a hundred instructions in flight.
-ben
--
"Ladies and gentlemen, I'm sorry to interrupt, but the police are here
and they've asked us to stop the party." Don't Email: <[email protected]>.
On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> Ravikiran G Thirumalai <[email protected]> wrote:
>
>
> > >
> > > If the benchmarks say that we need to. If we cannot observe any problems
> > > in testing of existing code and if we can't demonstrate any benefit from
> > > the patched code then one option is to go off and do something else ;)
> >
> > We first tried plain per-CPU counters for memory_allocated, found that reads
> > on memory_allocated were causing cacheline transfers, and then
> > switched over to batching. So batching reads is useful. To avoid
> > inaccuracy, we can maybe change percpu_counter_init to:
> >
> > void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
> >
> > the percpu batching limit would then be maxdev/num_possible_cpus. One would
> > use batching counters only when both reads and writes are frequent. With
> > the above scheme, we would go fetch cachelines from other cpus for read
> > often only on large cpu counts, which is not any worse than the global
> > counter alternative, but it would still be beneficial on smaller machines,
> > without sacrificing a pre-set deviation.
> >
> > Comments?
>
> Sounds sane.
>
Here's an implementation which delegates tuning of the batching to the user.
We don't really need local_t at all, since percpu_counter_mod is not safe
against interrupts and softirqs as it is. If we have a counter which could be
modified from both process context and irq/bh context, we just have to use a
wrapper like percpu_counter_mod_bh, which disables and re-enables bottom
halves around the update. Reads on the counters are safe as they are atomic
reads (atomic_long_read()), and the cpu-local variables are only ever accessed
by their own cpu.
(PS: the maxerr for ext2/ext3 is just a guesstimate)
Comments?
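As a usage sketch (hypothetical call sites, not part of the patch below), a
counter that is updated from both process and softirq context would look
roughly like this:
#include <linux/percpu_counter.h>
static struct percpu_counter example_memory_allocated;
void example_init(void)
{
	/* tolerate a read deviation of up to ~1024 */
	percpu_counter_init(&example_memory_allocated, 1024);
}
/* process context, e.g. a sendmsg() path: keep bhs out of the update */
void example_charge(long amount)
{
	percpu_counter_mod_bh(&example_memory_allocated, amount);
}
/* softirq context, e.g. a protocol receive path: plain mod is enough */
void example_uncharge(long amount)
{
	percpu_counter_mod(&example_memory_allocated, -amount);
}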
Index: linux-2.6.16-rc1mm4/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.16-rc1mm4.orig/include/linux/percpu_counter.h 2006-02-02 11:18:54.000000000 -0800
+++ linux-2.6.16-rc1mm4/include/linux/percpu_counter.h 2006-02-02 18:29:46.000000000 -0800
@@ -16,24 +16,32 @@
struct percpu_counter {
atomic_long_t count;
+ int percpu_batch;
long *counters;
};
-#if NR_CPUS >= 16
-#define FBC_BATCH (NR_CPUS*2)
-#else
-#define FBC_BATCH (NR_CPUS*4)
-#endif
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+/*
+ * Choose maxerr carefully. maxerr/num_possible_cpus indicates per-cpu batching
+ * Set maximum tolerance for better performance on large systems.
+ */
+static inline void percpu_counter_init(struct percpu_counter *fbc,
+ unsigned int maxerr)
{
atomic_long_set(&fbc->count, 0);
- fbc->counters = alloc_percpu(long);
+ fbc->percpu_batch = maxerr/num_possible_cpus();
+ if (fbc->percpu_batch) {
+ fbc->counters = alloc_percpu(long);
+ if (!fbc->counters)
+ fbc->percpu_batch = 0;
+ }
+
}
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
- free_percpu(fbc->counters);
+ if (fbc->percpu_batch)
+ free_percpu(fbc->counters);
}
void percpu_counter_mod(struct percpu_counter *fbc, long amount);
@@ -63,7 +71,8 @@ struct percpu_counter {
long count;
};
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+static inline void percpu_counter_init(struct percpu_counter *fbc,
+ unsigned int maxerr)
{
fbc->count = 0;
}
Index: linux-2.6.16-rc1mm4/mm/swap.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/mm/swap.c 2006-01-29 20:20:20.000000000 -0800
+++ linux-2.6.16-rc1mm4/mm/swap.c 2006-02-02 18:36:21.000000000 -0800
@@ -470,13 +470,20 @@ static int cpu_swap_callback(struct noti
#ifdef CONFIG_SMP
void percpu_counter_mod(struct percpu_counter *fbc, long amount)
{
- long count;
long *pcount;
- int cpu = get_cpu();
+ long count;
+ int cpu;
+ /* Slow mode */
+ if (unlikely(!fbc->percpu_batch)) {
+ atomic_long_add(amount, &fbc->count);
+ return;
+ }
+
+ cpu = get_cpu();
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
- if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+ if (count >= fbc->percpu_batch || count <= -fbc->percpu_batch) {
atomic_long_add(count, &fbc->count);
count = 0;
}
Index: linux-2.6.16-rc1mm4/fs/ext2/super.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/fs/ext2/super.c 2006-02-02 18:30:28.000000000 -0800
+++ linux-2.6.16-rc1mm4/fs/ext2/super.c 2006-02-02 18:36:39.000000000 -0800
@@ -610,6 +610,7 @@ static int ext2_fill_super(struct super_
int db_count;
int i, j;
__le32 features;
+ int maxerr;
sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -835,9 +836,14 @@ static int ext2_fill_super(struct super_
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- percpu_counter_init(&sbi->s_freeblocks_counter);
- percpu_counter_init(&sbi->s_freeinodes_counter);
- percpu_counter_init(&sbi->s_dirs_counter);
+
+ if (num_possible_cpus() <= 16 )
+ maxerr = 256;
+ else
+ maxerr = 1024;
+ percpu_counter_init(&sbi->s_freeblocks_counter, maxerr);
+ percpu_counter_init(&sbi->s_freeinodes_counter, maxerr);
+ percpu_counter_init(&sbi->s_dirs_counter, maxerr);
bgl_lock_init(&sbi->s_blockgroup_lock);
sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
GFP_KERNEL);
Index: linux-2.6.16-rc1mm4/fs/ext3/super.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/fs/ext3/super.c 2006-02-02 18:30:28.000000000 -0800
+++ linux-2.6.16-rc1mm4/fs/ext3/super.c 2006-02-02 18:38:10.000000000 -0800
@@ -1353,6 +1353,7 @@ static int ext3_fill_super (struct super
int i;
int needs_recovery;
__le32 features;
+ int maxerr;
sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -1578,9 +1579,14 @@ static int ext3_fill_super (struct super
goto failed_mount;
}
- percpu_counter_init(&sbi->s_freeblocks_counter);
- percpu_counter_init(&sbi->s_freeinodes_counter);
- percpu_counter_init(&sbi->s_dirs_counter);
+ if (num_possible_cpus() <= 16)
+ maxerr = 256;
+ else
+ maxerr = 1024;
+
+ percpu_counter_init(&sbi->s_freeblocks_counter, maxerr);
+ percpu_counter_init(&sbi->s_freeinodes_counter, maxerr);
+ percpu_counter_init(&sbi->s_dirs_counter, maxerr);
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
Ravikiran G Thirumalai <[email protected]> wrote:
>
> On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> > Ravikiran G Thirumalai <[email protected]> wrote:
> >
> >
> > > >
> > > > If the benchmarks say that we need to. If we cannot observe any problems
> > > > in testing of existing code and if we can't demonstrate any benefit from
> > > > the patched code then one option is to go off and do something else ;)
> > >
> > > We first tried plain per-CPU counters for memory_allocated, found that reads
> > > on memory_allocated were causing cacheline transfers, and then
> > > switched over to batching. So batching reads is useful. To avoid
> > > inaccuracy, we can maybe change percpu_counter_init to:
> > >
> > > void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
> > >
> > > the percpu batching limit would then be maxdev/num_possible_cpus. One would
> > > use batching counters only when both reads and writes are frequent. With
> > > the above scheme, we would go fetch cachelines from other cpus for read
> > > often only on large cpu counts, which is not any worse than the global
> > > counter alternative, but it would still be beneficial on smaller machines,
> > > without sacrificing a pre-set deviation.
> > >
> > > Comments?
> >
> > Sounds sane.
> >
>
> Here's an implementation which delegates tuning of the batching to the user.
> We don't really need local_t at all, since percpu_counter_mod is not safe
> against interrupts and softirqs as it is. If we have a counter which could be
> modified from both process context and irq/bh context, we just have to use a
> wrapper like percpu_counter_mod_bh, which disables and re-enables bottom
> halves around the update. Reads on the counters are safe as they are atomic
> reads (atomic_long_read()), and the cpu-local variables are only ever accessed
> by their own cpu.
>
> (PS: the maxerr for ext2/ext3 is just a guesstimate)
Well that's the problem. We need to choose production-quality values for
use in there.
> Comments?
Using num_possible_cpus() in that header file is just asking for build
errors. Probably best to uninline the function rather than adding the
needed include of cpumask.h.
On Thu, Feb 02, 2006 at 07:16:00PM -0800, Andrew Morton wrote:
> Ravikiran G Thirumalai <[email protected]> wrote:
> >
> > On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> > Here's an implementation which delegates tuning of the batching to the user.
> > We don't really need local_t at all, since percpu_counter_mod is not safe
> > against interrupts and softirqs as it is. If we have a counter which could
> > be modified from both process context and irq/bh context, we just have to
> > use a wrapper like percpu_counter_mod_bh, which disables and re-enables
> > bottom halves around the update. Reads on the counters are safe as they are
> > atomic reads (atomic_long_read()), and the cpu-local variables are only ever
> > accessed by their own cpu.
> >
> > (PS: the maxerr for ext2/ext3 is just a guesstimate)
>
> Well that's the problem. We need to choose production-quality values for
> use in there.
The guesstimate was loosely based on keeping the per-cpu batch at 8 or more
on reasonably sized systems, while not letting maxerr grow too big. I guess
machines with cpu counts above 128 don't use ext3 :). And if they do, they
can tune the counters with a higher maxerr. I guess it might be a bit ugly
on the user side with all the num_possible_cpus() checks, but is there a
better alternative?
(I plan to test the counter values for ext2 and ext3 on a 16 way box, and
change these if they turn out to be not so good)
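To put numbers on that (my arithmetic, not from the mail):
/*
 * maxerr = 256,  16 possible cpus:
 *	percpu_batch = 256 / 16  = 16
 *	worst-case read error < 16 cpus * 16 = 256  (bounded by maxerr)
 *
 * maxerr = 1024, 128 possible cpus:
 *	percpu_batch = 1024 / 128 = 8	(the per-cpu batch of 8 above)
 *
 * maxerr = 1024, 2048 possible cpus:
 *	percpu_batch = 0  ->  slow mode, every mod hits the atomic
 */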
>
> > Comments?
>
> Using num_possible_cpus() in that header file is just asking for build
> errors. Probably best to uninline the function rather than adding the
> needed include of cpumask.h.
Yup,
Here it is.
Change the percpu_counter interface so that the user can specify the maximum
tolerable deviation.
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Index: linux-2.6.16-rc1mm4/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.16-rc1mm4.orig/include/linux/percpu_counter.h 2006-02-02 11:18:54.000000000 -0800
+++ linux-2.6.16-rc1mm4/include/linux/percpu_counter.h 2006-02-03 11:04:05.000000000 -0800
@@ -16,24 +16,20 @@
struct percpu_counter {
atomic_long_t count;
+ int percpu_batch;
long *counters;
};
-#if NR_CPUS >= 16
-#define FBC_BATCH (NR_CPUS*2)
-#else
-#define FBC_BATCH (NR_CPUS*4)
-#endif
-
-static inline void percpu_counter_init(struct percpu_counter *fbc)
-{
- atomic_long_set(&fbc->count, 0);
- fbc->counters = alloc_percpu(long);
-}
+/*
+ * Choose maxerr carefully. maxerr/num_possible_cpus indicates per-cpu
+ * batching. Set maximum tolerance for better performance on large systems.
+ */
+void percpu_counter_init(struct percpu_counter *fbc, unsigned int maxerr);
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
- free_percpu(fbc->counters);
+ if (fbc->percpu_batch)
+ free_percpu(fbc->counters);
}
void percpu_counter_mod(struct percpu_counter *fbc, long amount);
@@ -63,7 +59,8 @@ struct percpu_counter {
long count;
};
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+static inline void percpu_counter_init(struct percpu_counter *fbc,
+ unsigned int maxerr)
{
fbc->count = 0;
}
Index: linux-2.6.16-rc1mm4/mm/swap.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/mm/swap.c 2006-01-29 20:20:20.000000000 -0800
+++ linux-2.6.16-rc1mm4/mm/swap.c 2006-02-03 11:02:05.000000000 -0800
@@ -468,15 +468,39 @@ static int cpu_swap_callback(struct noti
#endif /* CONFIG_SMP */
#ifdef CONFIG_SMP
+
+/*
+ * Choose maxerr carefully. maxerr/num_possible_cpus indicates per-cpu
+ * batching. Set maximum tolerance for better performance on large systems.
+ */
+void percpu_counter_init(struct percpu_counter *fbc, unsigned int maxerr)
+{
+ atomic_long_set(&fbc->count, 0);
+ fbc->percpu_batch = maxerr/num_possible_cpus();
+ if (fbc->percpu_batch) {
+ fbc->counters = alloc_percpu(long);
+ if (!fbc->counters)
+ fbc->percpu_batch = 0;
+ }
+
+}
+
void percpu_counter_mod(struct percpu_counter *fbc, long amount)
{
- long count;
long *pcount;
- int cpu = get_cpu();
+ long count;
+ int cpu;
+ /* Slow mode */
+ if (unlikely(!fbc->percpu_batch)) {
+ atomic_long_add(amount, &fbc->count);
+ return;
+ }
+
+ cpu = get_cpu();
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
- if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+ if (count >= fbc->percpu_batch || count <= -fbc->percpu_batch) {
atomic_long_add(count, &fbc->count);
count = 0;
}
@@ -484,6 +508,7 @@ void percpu_counter_mod(struct percpu_co
put_cpu();
}
EXPORT_SYMBOL(percpu_counter_mod);
+EXPORT_SYMBOL(percpu_counter_init);
#endif
/*
Index: linux-2.6.16-rc1mm4/fs/ext2/super.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/fs/ext2/super.c 2006-02-03 11:03:54.000000000 -0800
+++ linux-2.6.16-rc1mm4/fs/ext2/super.c 2006-02-03 11:04:10.000000000 -0800
@@ -610,6 +610,7 @@ static int ext2_fill_super(struct super_
int db_count;
int i, j;
__le32 features;
+ int maxerr;
sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -835,9 +836,14 @@ static int ext2_fill_super(struct super_
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- percpu_counter_init(&sbi->s_freeblocks_counter);
- percpu_counter_init(&sbi->s_freeinodes_counter);
- percpu_counter_init(&sbi->s_dirs_counter);
+
+ if (num_possible_cpus() <= 16 )
+ maxerr = 256;
+ else
+ maxerr = 1024;
+ percpu_counter_init(&sbi->s_freeblocks_counter, maxerr);
+ percpu_counter_init(&sbi->s_freeinodes_counter, maxerr);
+ percpu_counter_init(&sbi->s_dirs_counter, maxerr);
bgl_lock_init(&sbi->s_blockgroup_lock);
sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
GFP_KERNEL);
Index: linux-2.6.16-rc1mm4/fs/ext3/super.c
===================================================================
--- linux-2.6.16-rc1mm4.orig/fs/ext3/super.c 2006-02-03 11:03:54.000000000 -0800
+++ linux-2.6.16-rc1mm4/fs/ext3/super.c 2006-02-03 11:04:10.000000000 -0800
@@ -1353,6 +1353,7 @@ static int ext3_fill_super (struct super
int i;
int needs_recovery;
__le32 features;
+ int maxerr;
sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -1578,9 +1579,14 @@ static int ext3_fill_super (struct super
goto failed_mount;
}
- percpu_counter_init(&sbi->s_freeblocks_counter);
- percpu_counter_init(&sbi->s_freeinodes_counter);
- percpu_counter_init(&sbi->s_dirs_counter);
+ if (num_possible_cpus() <= 16)
+ maxerr = 256;
+ else
+ maxerr = 1024;
+
+ percpu_counter_init(&sbi->s_freeblocks_counter, maxerr);
+ percpu_counter_init(&sbi->s_freeinodes_counter, maxerr);
+ percpu_counter_init(&sbi->s_dirs_counter, maxerr);
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
Ravikiran G Thirumalai <[email protected]> wrote:
>
> On Thu, Feb 02, 2006 at 07:16:00PM -0800, Andrew Morton wrote:
> > Ravikiran G Thirumalai <[email protected]> wrote:
> > >
> > > On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> > > Here's an implementation which delegates tuning of the batching to the
> > > user. We don't really need local_t at all, since percpu_counter_mod is not
> > > safe against interrupts and softirqs as it is. If we have a counter which
> > > could be modified from both process context and irq/bh context, we just
> > > have to use a wrapper like percpu_counter_mod_bh, which disables and
> > > re-enables bottom halves around the update. Reads on the counters are safe
> > > as they are atomic reads (atomic_long_read()), and the cpu-local variables
> > > are only ever accessed by their own cpu.
> > >
> > > (PS: the maxerr for ext2/ext3 is just a guesstimate)
> >
> > Well that's the problem. We need to choose production-quality values for
> > use in there.
>
> The guesstimate was loosely based on keeping the per-cpu batch at 8 or more
> on reasonably sized systems, while not letting maxerr grow too big. I guess
> machines with cpu counts above 128 don't use ext3 :). And if they do, they
> can tune the counters with a higher maxerr. I guess it might be a bit ugly
> on the user side with all the num_possible_cpus() checks, but is there a
> better alternative?
>
> (I plan to test the counter values for ext2 and ext3 on a 16 way box, and
> change these if they turn out to be not so good)
OK, thanks. Frankly I think I went overboard on the scalability thing when
adding percpu counters to ext2 and ext3. I suspect they're not providing
significant benefit over a per-sb spinlock and a ulong.
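(For comparison, the per-sb spinlock scheme above would be something like the
sketch below; the struct and function names are made up, not existing ext2/ext3
code.)
#include <linux/spinlock.h>
struct sb_counters {
	spinlock_t lock;
	unsigned long free_blocks;
	unsigned long free_inodes;
	unsigned long dirs;
};
static inline void sb_counters_add_free_blocks(struct sb_counters *c, long n)
{
	spin_lock(&c->lock);
	c->free_blocks += n;
	spin_unlock(&c->lock);
}
static inline unsigned long sb_counters_free_blocks(struct sb_counters *c)
{
	/* a single word read does not need the lock */
	return c->free_blocks;
}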
> >
> > > Comments?
> >
> > Using num_possible_cpus() in that header file is just asking for build
> > errors. Probably best to uninline the function rather than adding the
> > needed include of cpumask.h.
>
> Yup,
>
> Here it is.
>
> Change the percpu_counter interface so that the user can specify the maximum
> tolerable deviation.
OK, thanks. I need to sit down and a) remember why we're even discussing
this and b) see what we've merged thus far and work out what it all does ;)