Hi Andrew,
This is the kmalloc_percpu patch. I have another patch which
tests the allocator if you want to see that too. This is the precursor
to per-cpu stuff in modules, but also allows other space gains for
structures which currently embed per-cpu arrays (eg. your fuzzy counters).
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: kmalloc_percpu to use same percpu operators
Author: Rusty Russell
Status: Tested on 2.5.68-bk11
D: By overallocating the per-cpu data at boot, we can make quite an
D: efficient allocator, and then use it to support per-cpu data in
D: modules (next patch).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/asm-generic/percpu.h .29880-linux-2.5.69.updated/include/asm-generic/percpu.h
--- .29880-linux-2.5.69/include/asm-generic/percpu.h 2003-01-02 12:32:47.000000000 +1100
+++ .29880-linux-2.5.69.updated/include/asm-generic/percpu.h 2003-05-05 17:36:25.000000000 +1000
@@ -2,37 +2,11 @@
#define _ASM_GENERIC_PERCPU_H_
#include <linux/compiler.h>
-#define __GENERIC_PER_CPU
+/* Some archs may want to keep __per_cpu_offset for this CPU in a register,
+ or do their own allocation. */
#ifdef CONFIG_SMP
-
-extern unsigned long __per_cpu_offset[NR_CPUS];
-
-/* Separate out the type, so (int[3], foo) works. */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
- __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-
-#else /* ! SMP */
-
-/* Can't define per-cpu variables in modules. Sorry --RR */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
- __typeof__(type) name##__per_cpu
-#endif
-
-#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu)
-#define __get_cpu_var(var) var##__per_cpu
-
-#endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
-
+#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
+#define __NEED_SETUP_PER_CPU_AREAS
+#endif /* SMP */
#endif /* _ASM_GENERIC_PERCPU_H_ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/linux/genhd.h .29880-linux-2.5.69.updated/include/linux/genhd.h
--- .29880-linux-2.5.69/include/linux/genhd.h 2003-05-05 12:37:12.000000000 +1000
+++ .29880-linux-2.5.69.updated/include/linux/genhd.h 2003-05-05 17:36:25.000000000 +1000
@@ -160,10 +160,9 @@ static inline void disk_stat_set_all(str
#ifdef CONFIG_SMP
static inline int init_disk_stats(struct gendisk *disk)
{
- disk->dkstats = kmalloc_percpu(sizeof (struct disk_stats), GFP_KERNEL);
+ disk->dkstats = kmalloc_percpu(struct disk_stats);
if (!disk->dkstats)
return 0;
- disk_stat_set_all(disk, 0);
return 1;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/linux/percpu.h .29880-linux-2.5.69.updated/include/linux/percpu.h
--- .29880-linux-2.5.69/include/linux/percpu.h 2003-02-07 19:20:01.000000000 +1100
+++ .29880-linux-2.5.69.updated/include/linux/percpu.h 2003-05-05 17:36:25.000000000 +1000
@@ -1,71 +1,85 @@
#ifndef __LINUX_PERCPU_H
#define __LINUX_PERCPU_H
-#include <linux/spinlock.h> /* For preempt_disable() */
-#include <linux/slab.h> /* For kmalloc_percpu() */
+#include <linux/preempt.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc() */
+#include <linux/cache.h>
+#include <linux/string.h>
+#include <asm/bug.h>
#include <asm/percpu.h>
-/* Must be an lvalue. */
+/* Total pool for percpu data (for each CPU). */
+#ifndef PERCPU_POOL_SIZE
+#define PERCPU_POOL_SIZE 32768
+#endif
+
+/* For variables declared with DECLARE_PER_CPU()/DEFINE_PER_CPU(). */
#define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
#define put_cpu_var(var) preempt_enable()
+/* Also, per_cpu(var, cpu) to get another cpu's value. */
+
+/* For ptrs allocated with kmalloc_percpu */
+#define get_cpu_ptr(ptr) ({ preempt_disable(); __get_cpu_ptr(ptr); })
+#define put_cpu_ptr(ptr) preempt_enable()
+/* Also, per_cpu_ptr(ptr, cpu) to get another cpu's value. */
#ifdef CONFIG_SMP
-struct percpu_data {
- void *ptrs[NR_CPUS];
- void *blkp;
-};
+/* __alloc_percpu zeros memory for every cpu, as a convenience. */
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void kfree_percpu(const void *);
-/*
- * Use this to get to a cpu's version of the per-cpu object allocated using
- * kmalloc_percpu. If you want to get "this cpu's version", maybe you want
- * to use get_cpu_ptr...
- */
-#define per_cpu_ptr(ptr, cpu) \
-({ \
- struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
- (__typeof__(ptr))__p->ptrs[(cpu)]; \
-})
+extern unsigned long __per_cpu_offset[NR_CPUS];
-extern void *kmalloc_percpu(size_t size, int flags);
-extern void kfree_percpu(const void *);
-extern void kmalloc_percpu_init(void);
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
-#else /* CONFIG_SMP */
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define per_cpu_ptr(ptr, cpu) ((__typeof__(ptr))(RELOC_HIDE(ptr, __per_cpu_offset[cpu])))
-#define per_cpu_ptr(ptr, cpu) (ptr)
+extern void setup_per_cpu_areas(void);
+#else /* !CONFIG_SMP */
-static inline void *kmalloc_percpu(size_t size, int flags)
+/* Can't define per-cpu variables in modules. Sorry --RR */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __typeof__(type) name##__per_cpu
+#endif
+
+#define per_cpu(var, cpu) ((void)(cpu), var##__per_cpu)
+#define __get_cpu_var(var) var##__per_cpu
+#define per_cpu_ptr(ptr, cpu) ((void)(cpu), (ptr))
+#define __get_cpu_ptr(ptr) (ptr)
+
+static inline void *__alloc_percpu(size_t size, size_t align)
{
- return(kmalloc(size, flags));
+ void *ret;
+ /* kmalloc always cacheline aligns. */
+ BUG_ON(align > SMP_CACHE_BYTES);
+ BUG_ON(size > PERCPU_POOL_SIZE/2);
+ ret = kmalloc(size, GFP_KERNEL);
+ if (ret)
+ memset(ret, 0, size);
+ return ret;
}
static inline void kfree_percpu(const void *ptr)
{
kfree(ptr);
}
-static inline void kmalloc_percpu_init(void) { }
+static inline void setup_per_cpu_areas(void) { }
#endif /* CONFIG_SMP */
-/*
- * Use these with kmalloc_percpu. If
- * 1. You want to operate on memory allocated by kmalloc_percpu (dereference
- * and read/modify/write) AND
- * 2. You want "this cpu's version" of the object AND
- * 3. You want to do this safely since:
- * a. On multiprocessors, you don't want to switch between cpus after
- * you've read the current processor id due to preemption -- this would
- * take away the implicit advantage to not have any kind of traditional
- * serialization for per-cpu data
- * b. On uniprocessors, you don't want another kernel thread messing
- * up with the same per-cpu data due to preemption
- *
- * So, Use get_cpu_ptr to disable preemption and get pointer to the
- * local cpu version of the per-cpu object. Use put_cpu_ptr to enable
- * preemption. Operations on per-cpu data between get_ and put_ is
- * then considered to be safe. And ofcourse, "Thou shalt not sleep between
- * get_cpu_ptr and put_cpu_ptr"
- */
-#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
-#define put_cpu_ptr(ptr) put_cpu()
+/* Simple wrapper for the common case. Zeros memory. */
+#define kmalloc_percpu(type) \
+ ((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
#endif /* __LINUX_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/net/ipv6.h .29880-linux-2.5.69.updated/include/net/ipv6.h
--- .29880-linux-2.5.69/include/net/ipv6.h 2003-05-05 12:37:12.000000000 +1000
+++ .29880-linux-2.5.69.updated/include/net/ipv6.h 2003-05-05 17:36:25.000000000 +1000
@@ -145,7 +145,7 @@ extern atomic_t inet6_sock_nr;
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);
-int snmp6_mib_init(void *ptr[2], size_t mibsize);
+int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign);
void snmp6_mib_free(void *ptr[2]);
struct ip6_ra_chain
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/init/main.c .29880-linux-2.5.69.updated/init/main.c
--- .29880-linux-2.5.69/init/main.c 2003-05-05 12:37:13.000000000 +1000
+++ .29880-linux-2.5.69.updated/init/main.c 2003-05-05 17:36:25.000000000 +1000
@@ -301,35 +301,10 @@ static void __init smp_init(void)
#define smp_init() do { } while (0)
#endif
-static inline void setup_per_cpu_areas(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
#else
-#ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
-
-static void __init setup_per_cpu_areas(void)
-{
- unsigned long size, i;
- char *ptr;
- /* Created by linker magic */
- extern char __per_cpu_start[], __per_cpu_end[];
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
- if (!size)
- return;
-
- ptr = alloc_bootmem(size * NR_CPUS);
-
- for (i = 0; i < NR_CPUS; i++, ptr += size) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, size);
- }
-}
-#endif /* !__GENERIC_PER_CPU */
-
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/kernel/ksyms.c .29880-linux-2.5.69.updated/kernel/ksyms.c
--- .29880-linux-2.5.69/kernel/ksyms.c 2003-05-05 12:37:13.000000000 +1000
+++ .29880-linux-2.5.69.updated/kernel/ksyms.c 2003-05-05 17:36:25.000000000 +1000
@@ -99,7 +99,7 @@ EXPORT_SYMBOL(remove_shrinker);
EXPORT_SYMBOL(kmalloc);
EXPORT_SYMBOL(kfree);
#ifdef CONFIG_SMP
-EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
EXPORT_SYMBOL(kfree_percpu);
EXPORT_SYMBOL(percpu_counter_mod);
#endif
@@ -607,7 +607,7 @@ EXPORT_SYMBOL(init_thread_union);
EXPORT_SYMBOL(tasklist_lock);
EXPORT_SYMBOL(find_task_by_pid);
EXPORT_SYMBOL(next_thread);
-#if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
+#if defined(CONFIG_SMP)
EXPORT_SYMBOL(__per_cpu_offset);
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/Makefile .29880-linux-2.5.69.updated/mm/Makefile
--- .29880-linux-2.5.69/mm/Makefile 2003-02-11 14:26:20.000000000 +1100
+++ .29880-linux-2.5.69.updated/mm/Makefile 2003-05-05 17:36:25.000000000 +1000
@@ -12,3 +12,4 @@ obj-y := bootmem.o filemap.o mempool.o
slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SMP) += percpu.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/percpu.c .29880-linux-2.5.69.updated/mm/percpu.c
--- .29880-linux-2.5.69/mm/percpu.c 1970-01-01 10:00:00.000000000 +1000
+++ .29880-linux-2.5.69.updated/mm/percpu.c 2003-05-05 17:36:26.000000000 +1000
@@ -0,0 +1,204 @@
+/*
+ * Dynamic per-cpu allocation.
+ * Originally by Dipankar Sarma <[email protected]>
+ * This version (C) 2003 Rusty Russell, IBM Corporation.
+ */
+
+/* Simple allocator: we don't stress it hard, but do want it
+ fairly space-efficient. */
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/semaphore.h>
+
+static DECLARE_MUTEX(pcpu_lock);
+
+struct pcpu_block
+{
+ /* Number of blocks used and allocated. */
+ unsigned short num_used, num_allocated;
+
+ /* Size of each block. -ve means used. */
+ int size[0];
+};
+static struct pcpu_block *pcpu; /* = NULL */
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+/* Splits a block into two. Reallocs pcpu if necessary. */
+static int split_block(unsigned int i, unsigned short size)
+{
+ /* Reallocation required? */
+ if (pcpu->num_used + 1 > pcpu->num_allocated) {
+ struct pcpu_block *new;
+
+ new = kmalloc(sizeof(*pcpu)
+ + sizeof(pcpu->size[0]) * pcpu->num_allocated*2,
+ GFP_KERNEL);
+ if (!new)
+ return 0;
+ new->num_used = pcpu->num_used;
+ new->num_allocated = pcpu->num_allocated * 2;
+ memcpy(new->size, pcpu->size,
+ sizeof(pcpu->size[0])*pcpu->num_used);
+ kfree(pcpu);
+ pcpu = new;
+ }
+
+ /* Insert a new subblock */
+ memmove(&pcpu->size[i+1], &pcpu->size[i],
+ sizeof(pcpu->size[0]) * (pcpu->num_used - i));
+ pcpu->num_used++;
+
+ pcpu->size[i+1] -= size;
+ pcpu->size[i] = size;
+ return 1;
+}
+
+static inline unsigned int abs(int val)
+{
+ if (val < 0)
+ return -val;
+ return val;
+}
+
+static inline void zero_all(void *pcpuptr, unsigned int size)
+{
+	unsigned int i;
+
+ for (i = 0; i < NR_CPUS; i++)
+ memset(per_cpu_ptr(pcpuptr, i), 0, size);
+}
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+ unsigned long extra;
+ unsigned int i;
+ void *ptr;
+
+ BUG_ON(align > SMP_CACHE_BYTES);
+ BUG_ON(size > PERCPU_POOL_SIZE/2);
+
+ down(&pcpu_lock);
+ ptr = __per_cpu_start;
+ for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+ /* Extra for alignment requirement. */
+ extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+
+ /* Allocated or not large enough? */
+ if (pcpu->size[i] < 0 || pcpu->size[i] < extra + size)
+ continue;
+
+ /* Transfer extra to previous block. */
+ if (pcpu->size[i-1] < 0)
+ pcpu->size[i-1] -= extra;
+ else
+ pcpu->size[i-1] += extra;
+ pcpu->size[i] -= extra;
+ ptr += extra;
+
+ /* Split block if warranted */
+ if (pcpu->size[i] - size > sizeof(unsigned long))
+ if (!split_block(i, size))
+ break;
+
+ /* Mark allocated */
+ pcpu->size[i] = -pcpu->size[i];
+ zero_all(ptr, size);
+ goto out;
+ }
+ ptr = NULL;
+ out:
+ up(&pcpu_lock);
+ return ptr;
+}
+
+void kfree_percpu(const void *freeme)
+{
+ unsigned int i;
+ void *ptr = __per_cpu_start;
+
+ down(&pcpu_lock);
+ for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+ if (ptr == freeme) {
+ /* Double free? */
+ BUG_ON(pcpu->size[i] > 0);
+ /* Block 0 is for non-dynamic per-cpu data. */
+ BUG_ON(i == 0);
+ pcpu->size[i] = -pcpu->size[i];
+ goto merge;
+ }
+ }
+ BUG();
+
+ merge:
+ /* Merge with previous? */
+ if (pcpu->size[i-1] >= 0) {
+ pcpu->size[i-1] += pcpu->size[i];
+ pcpu->num_used--;
+ memmove(&pcpu->size[i], &pcpu->size[i+1],
+ (pcpu->num_used - i) * sizeof(pcpu->size[0]));
+ i--;
+ }
+ /* Merge with next? */
+ if (i+1 < pcpu->num_used && pcpu->size[i+1] >= 0) {
+ pcpu->size[i] += pcpu->size[i+1];
+ pcpu->num_used--;
+ memmove(&pcpu->size[i+1], &pcpu->size[i+2],
+ (pcpu->num_used - (i+1)) * sizeof(pcpu->size[0]));
+ }
+
+ /* There's always one block: the core kernel one. */
+ BUG_ON(pcpu->num_used == 0);
+ up(&pcpu_lock);
+}
+
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+
+#define PERCPU_INIT_BLOCKS 4
+
+#ifdef __NEED_SETUP_PER_CPU_AREAS
+/* Generic version: allocates for all NR_CPUs. */
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long i;
+ void *ptr;
+
+ ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
+
+ /* Don't panic yet, they won't see it */
+ if (__per_cpu_end - __per_cpu_start > PERCPU_POOL_SIZE)
+ return;
+
+ for (i = 0; i < NR_CPUS; i++, ptr += PERCPU_POOL_SIZE) {
+ __per_cpu_offset[i] = ptr - (void *)__per_cpu_start;
+ /* Copy section for each CPU (we discard the original) */
+ memcpy(__per_cpu_start + __per_cpu_offset[i],
+ __per_cpu_start,
+ __per_cpu_end - __per_cpu_start);
+ }
+}
+#endif
+
+static int init_alloc_percpu(void)
+{
+ printk("Per-cpu data: %Zu of %u bytes\n",
+ __per_cpu_end - __per_cpu_start, PERCPU_POOL_SIZE);
+
+ if (__per_cpu_end - __per_cpu_start > PERCPU_POOL_SIZE)
+ panic("Too much per-cpu data.\n");
+
+ pcpu = kmalloc(sizeof(*pcpu)+sizeof(pcpu->size[0])*PERCPU_INIT_BLOCKS,
+ GFP_KERNEL);
+ pcpu->num_allocated = PERCPU_INIT_BLOCKS;
+ pcpu->num_used = 2;
+ pcpu->size[0] = -(__per_cpu_end - __per_cpu_start);
+ pcpu->size[1] = PERCPU_POOL_SIZE-(__per_cpu_end - __per_cpu_start);
+
+ return 0;
+}
+
+__initcall(init_alloc_percpu);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/slab.c .29880-linux-2.5.69.updated/mm/slab.c
--- .29880-linux-2.5.69/mm/slab.c 2003-04-20 18:05:16.000000000 +1000
+++ .29880-linux-2.5.69.updated/mm/slab.c 2003-05-05 17:36:25.000000000 +1000
@@ -1902,54 +1902,6 @@ void * kmalloc (size_t size, int flags)
return NULL;
}
-#ifdef CONFIG_SMP
-/**
- * kmalloc_percpu - allocate one copy of the object for every present
- * cpu in the system.
- * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
- * macros only.
- *
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user. May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
- */
-void *
-kmalloc_percpu(size_t size, int flags)
-{
- int i;
- struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
- if (!pdata)
- return NULL;
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- pdata->ptrs[i] = kmalloc(size, flags);
- if (!pdata->ptrs[i])
- goto unwind_oom;
- }
-
- /* Catch derefs w/o wrappers */
- return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
- while (--i >= 0) {
- if (!cpu_possible(i))
- continue;
- kfree(pdata->ptrs[i]);
- }
- kfree(pdata);
- return NULL;
-}
-#endif
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
@@ -1988,28 +1940,6 @@ void kfree (const void *objp)
local_irq_restore(flags);
}
-#ifdef CONFIG_SMP
-/**
- * kfree_percpu - free previously allocated percpu memory
- * @objp: pointer returned by kmalloc_percpu.
- *
- * Don't free memory not originally allocated by kmalloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-kfree_percpu(const void *objp)
-{
- int i;
- struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- kfree(p->ptrs[i]);
- }
-}
-#endif
-
unsigned int kmem_cache_size(kmem_cache_t *cachep)
{
unsigned int objlen = cachep->objsize;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv4/af_inet.c .29880-linux-2.5.69.updated/net/ipv4/af_inet.c
--- .29880-linux-2.5.69/net/ipv4/af_inet.c 2003-05-05 12:37:14.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv4/af_inet.c 2003-05-05 17:36:25.000000000 +1000
@@ -1061,54 +1061,21 @@ static struct inet_protocol icmp_protoco
static int __init init_ipv4_mibs(void)
{
- int i;
-
- net_statistics[0] =
- kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
- net_statistics[1] =
- kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
- ip_statistics[0] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
- ip_statistics[1] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
- icmp_statistics[0] =
- kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
- icmp_statistics[1] =
- kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
- tcp_statistics[0] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
- tcp_statistics[1] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
- udp_statistics[0] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
- udp_statistics[1] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
+ net_statistics[0] = kmalloc_percpu(struct linux_mib);
+ net_statistics[1] = kmalloc_percpu(struct linux_mib);
+ ip_statistics[0] = kmalloc_percpu(struct ip_mib);
+ ip_statistics[1] = kmalloc_percpu(struct ip_mib);
+ icmp_statistics[0] = kmalloc_percpu(struct icmp_mib);
+ icmp_statistics[1] = kmalloc_percpu(struct icmp_mib);
+ tcp_statistics[0] = kmalloc_percpu(struct tcp_mib);
+ tcp_statistics[1] = kmalloc_percpu(struct tcp_mib);
+ udp_statistics[0] = kmalloc_percpu(struct udp_mib);
+ udp_statistics[1] = kmalloc_percpu(struct udp_mib);
if (!
(net_statistics[0] && net_statistics[1] && ip_statistics[0]
&& ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
&& udp_statistics[0] && udp_statistics[1]))
return -ENOMEM;
-
- /* Set all the per cpu copies of the mibs to zero */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(net_statistics[0], i), 0,
- sizeof (struct linux_mib));
- memset(per_cpu_ptr(net_statistics[1], i), 0,
- sizeof (struct linux_mib));
- memset(per_cpu_ptr(ip_statistics[0], i), 0,
- sizeof (struct ip_mib));
- memset(per_cpu_ptr(ip_statistics[1], i), 0,
- sizeof (struct ip_mib));
- memset(per_cpu_ptr(icmp_statistics[0], i), 0,
- sizeof (struct icmp_mib));
- memset(per_cpu_ptr(icmp_statistics[1], i), 0,
- sizeof (struct icmp_mib));
- memset(per_cpu_ptr(tcp_statistics[0], i), 0,
- sizeof (struct tcp_mib));
- memset(per_cpu_ptr(tcp_statistics[1], i), 0,
- sizeof (struct tcp_mib));
- memset(per_cpu_ptr(udp_statistics[0], i), 0,
- sizeof (struct udp_mib));
- memset(per_cpu_ptr(udp_statistics[1], i), 0,
- sizeof (struct udp_mib));
- }
- }
-
(void) tcp_mib_init();
return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv4/route.c .29880-linux-2.5.69.updated/net/ipv4/route.c
--- .29880-linux-2.5.69/net/ipv4/route.c 2003-05-05 12:37:14.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv4/route.c 2003-05-05 17:36:25.000000000 +1000
@@ -2689,17 +2689,9 @@ int __init ip_rt_init(void)
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
- rt_cache_stat = kmalloc_percpu(sizeof (struct rt_cache_stat),
- GFP_KERNEL);
+ rt_cache_stat = kmalloc_percpu(struct rt_cache_stat);
if (!rt_cache_stat)
goto out_enomem1;
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(rt_cache_stat, i), 0,
- sizeof (struct rt_cache_stat));
- }
- }
-
devinet_init();
ip_fib_init();
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv6/af_inet6.c .29880-linux-2.5.69.updated/net/ipv6/af_inet6.c
--- .29880-linux-2.5.69/net/ipv6/af_inet6.c 2003-05-05 12:37:15.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv6/af_inet6.c 2003-05-05 17:36:25.000000000 +1000
@@ -633,28 +633,20 @@ inet6_unregister_protosw(struct inet_pro
}
int
-snmp6_mib_init(void *ptr[2], size_t mibsize)
+snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
{
int i;
if (ptr == NULL)
return -EINVAL;
- ptr[0] = kmalloc_percpu(mibsize, GFP_KERNEL);
+ ptr[0] = __alloc_percpu(mibsize, mibalign);
if (!ptr[0])
goto err0;
- ptr[1] = kmalloc_percpu(mibsize, GFP_KERNEL);
+ ptr[1] = __alloc_percpu(mibsize, mibalign);
if (!ptr[1])
goto err1;
-
- /* Zero percpu version of the mibs */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(ptr[0], i), 0, mibsize);
- memset(per_cpu_ptr(ptr[1], i), 0, mibsize);
- }
- }
return 0;
err1:
@@ -676,11 +668,14 @@ snmp6_mib_free(void *ptr[2])
static int __init init_ipv6_mibs(void)
{
- if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib)) < 0)
+ if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib),
+ __alignof__(struct ipv6_mib)) < 0)
goto err_ip_mib;
- if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib)) < 0)
+ if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
+ __alignof__(struct icmpv6_mib)) < 0)
goto err_icmp_mib;
- if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
+ if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udp_mib;
return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv6/proc.c .29880-linux-2.5.69.updated/net/ipv6/proc.c
--- .29880-linux-2.5.69/net/ipv6/proc.c 2003-05-05 12:37:15.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv6/proc.c 2003-05-05 17:36:25.000000000 +1000
@@ -226,7 +226,7 @@ int snmp6_register_dev(struct inet6_dev
if (!idev || !idev->dev)
return -EINVAL;
- if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib)) < 0)
+ if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0)
goto err_icmp;
#ifdef CONFIG_PROC_FS
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/sctp/protocol.c .29880-linux-2.5.69.updated/net/sctp/protocol.c
--- .29880-linux-2.5.69/net/sctp/protocol.c 2003-05-05 12:37:16.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/sctp/protocol.c 2003-05-05 17:36:26.000000000 +1000
@@ -855,26 +855,14 @@ static int __init init_sctp_mibs(void)
{
int i;
- sctp_statistics[0] = kmalloc_percpu(sizeof (struct sctp_mib),
- GFP_KERNEL);
+ sctp_statistics[0] = kmalloc_percpu(struct sctp_mib);
if (!sctp_statistics[0])
return -ENOMEM;
- sctp_statistics[1] = kmalloc_percpu(sizeof (struct sctp_mib),
- GFP_KERNEL);
+ sctp_statistics[1] = kmalloc_percpu(struct sctp_mib);
if (!sctp_statistics[1]) {
kfree_percpu(sctp_statistics[0]);
return -ENOMEM;
}
-
- /* Zero all percpu versions of the mibs */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(sctp_statistics[0], i), 0,
- sizeof (struct sctp_mib));
- memset(per_cpu_ptr(sctp_statistics[1], i), 0,
- sizeof (struct sctp_mib));
- }
- }
return 0;
}
Rusty Russell <[email protected]> wrote:
>
> This is the kmalloc_percpu patch.
How does it work? What restrictions does it have, and
what compromises were made?
+#define PERCPU_POOL_SIZE 32768
What's this?
The current implementation of kmalloc_per_cpu() turned out to be fairly
disappointing because of the number of derefs which were necessary to get at
the data in fastpaths. How does this implementation compare?
Thanks.
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> >
> > This is the kmalloc_percpu patch.
>
> How does it work? What restrictions does it have, and
> what compromises were made?
>
> +#define PERCPU_POOL_SIZE 32768
>
> What's this?
OK. It has a size restriction: PERCPU_POOL_SIZE is the maximum total
kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever. This is the
main downside. It's allocated at boot.
The __alloc_percpu allocator is extremely space-efficient, because it
does not insist on cache-line aligning everything: the overhead of
__alloc_percpu(SIZE) is sizeof(int), plus SIZE bytes (rounded up to
alignment requirements) taken from the per-cpu pool.
The allocator is fairly slow: allocations are not expected to be thrown
around like candy.
> The current implementation of kmalloc_per_cpu() turned out to be fairly
> disappointing because of the number of derefs which were necessary to get at
> the data in fastpaths. How does this implementation compare?
It uses the same method as the static ones, so it's a single addition
of __per_cpu_offset (assuming the arch doesn't override the implementation).
This is a requirement for modules to use them (which was my aim: the
other side effects are cream).
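For example, the fastpath for a dynamically allocated object looks like
this (an illustrative sketch only -- "struct foo" is made up, not code
from the patch):

        struct foo { unsigned long count; };
        struct foo *f, *mine;

        f = kmalloc_percpu(struct foo);  /* zeroed on every cpu */
        if (f) {
                /* one add: f + __per_cpu_offset[smp_processor_id()] */
                mine = get_cpu_ptr(f);
                mine->count++;
                put_cpu_ptr(f);
        }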
Hope that clarifies,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell <[email protected]> wrote:
>
> > +#define PERCPU_POOL_SIZE 32768
> >
> > What's this?
>
> OK. It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever. This is the
> main downside. It's allocated at boot.
And is subject to fragmentation.
Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
slab if it runs out?
That way, PERCPU_POOL_SIZE only needs to be sized for non-modular static
percpu data, which sounds more robust.
On Mon, 2003-05-05 at 18:52, Andrew Morton wrote:
> Rusty Russell <[email protected]> wrote:
> > OK. It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever. This is the
> > main downside. It's allocated at boot.
>
> And is subject to fragmentation.
>
> Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
> slab if it runs out?
No, then you go back to things requiring multiple levels of
dereferencing. It's hard to realloc() because you have to
freeze the whole kernel to do that properly, and that is not
simple at all.
I think the fixed size pool is perfectly reasonable.
--
David S. Miller <[email protected]>
In message <[email protected]> you write:
> On Mon, 2003-05-05 at 18:52, Andrew Morton wrote:
> > Rusty Russell <[email protected]> wrote:
> > > OK. It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever. This is the
> > > main downside. It's allocated at boot.
> >
> > And is subject to fragmentation.
> >
> > Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
> > slab if it runs out?
>
> No, then you go back to things requiring multiple levels of
> dereferencing.
Actually, you can; my previous patch did this. But then all CPUs have
to be one contiguous allocation; modern big-SMP machines are
non-uniform, so you don't want this.
http://www.kernel.org/pub/linux/kernel/people/rusty/patches/Misc/kmalloc_percpu-orig.patch.gz
> I think the fixed size pool is perfectly reasonable.
Yes. It's a tradeoff. I think it's worth it at the moment (although
I'll add a limited printk to __alloc_percpu if it fails).
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> >
> > > +#define PERCPU_POOL_SIZE 32768
> > >
> > > What's this?
> >
> > OK. It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever. This is the
> > main downside. It's allocated at boot.
>
> And is subject to fragmentation.
Absolutely. However, we're looking at an allocation at module insert,
and maybe at each mount (if used for your fuzzy counters). Until we
see this in practice, I don't think complicating the allocator is
worth it.
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell <[email protected]> wrote:
>
> > I think the fixed size pool is perfectly reasonable.
>
> Yes. It's a tradeoff. I think it's worth it at the moment (although
> I'll add a limited printk to __alloc_percpu if it fails).
>
It's OK as long as nobody uses the feature! Once it starts to be commonly
used (say, in driver ->open() methods) then we'll get into the same problems
as with vmalloc exhaustion, vmalloc fragmentation, large physically-contig
allocations, etc.
Ho-hum. Can the magical constant become a __setup thing?
From: Andrew Morton <[email protected]>
Date: Mon, 5 May 2003 21:28:16 -0700
   It's OK as long as nobody uses the feature!
I think this is closer to say, allocation of kmap types,
than it is to vmalloc() et al. (as you suggest).
   Ho-hum. Can the magical constant become a __setup thing?
Remember that there are physical limitations, for example
on ia64, as to how big this thing can be. So whatever any
of us think about physical limitations, we have to deal with
them anyways :-)
I think, firstly, that we should define that this isn't something you
should be doing after module_init() (ie. your ->open() example, that's
ridiculous). Ideas on how to
enforce this are welcome.
Next, we can calculate how much per-cpu space all the modules
need. And because we can do that, we can preallocate slots
if we wanted to, in order to deal with whatever theoretical
fragmentation problems you think exist (much like how Jakub Jelinek's
prelinking works).
I personally don't know how smart it is to let random modules use
kmalloc_percpu() with impunity. But apparently someone thinks
there is some value in that.
From: Rusty Russell <[email protected]>
Date: Tue, 06 May 2003 14:08:27 +1000
   In message <[email protected]> you write:
   > I think the fixed size pool is perfectly reasonable.
   Yes. It's a tradeoff. I think it's worth it at the moment (although
   I'll add a limited printk to __alloc_percpu if it fails).
I think you should BUG() if a module calls kmalloc_percpu() outside
of mod->init(), this is actually implementable.
Andrew's example with some module doing kmalloc_percpu() inside
of fops->open() is just ridiculous.
"David S. Miller" <[email protected]> wrote:
>
> I think you should BUG() if a module calls kmalloc_percpu() outside
> of mod->init(), this is actually implementable.
>
> Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just ridiculous.
crap. Modules deal with per-device and per-mount objects. If a module
cannot use kmalloc_per_cpu on behalf of the primary object which it manages
then the facility is simply not useful to modules.
The static DEFINE_PER_CPU allocation works OK in core kernel because core
kernel does _not_ use per-instance objects. But modules do.
A case in point, which Rusty has twice mentioned, is the three per-mount
fuzzy counters in the ext2 superblock. And lo, ext2 cannot use the code in
this patch, because people want to scale to 4000 mounts.
In those very rare cases where a module wants allocation to be performed at
module_init()-time (presumably global stats counters), they can use
DEFINE_PER_CPU, so we should just not export kmalloc_per_cpu() to modules at
all.
On Mon, May 05, 2003 at 08:46:58AM +0000, Andrew Morton wrote:
> Rusty Russell <[email protected]> wrote:
> >
> > This is the kmalloc_percpu patch.
>
> How does it work? What restrictions does it have, and
> what compromises were made?
>
> +#define PERCPU_POOL_SIZE 32768
>
> What's this?
>
>
> The current implementation of kmalloc_per_cpu() turned out to be fairly
> disappointing because of the number of derefs which were necessary to get at
> the data in fastpaths. How does this implementation compare?
>
Andrew,
Here is a comparison of kmalloc_percpu techniques as I see it,
Current Implementation:
1. Two dereferences to get to the per-cpu data
2. Allocates for cpu_possible cpus only, and can deal with sparse cpu nos
Rusty's Implementation
1. One extra memory reference (__per_cpu_offset)
2. allocates for NR_CPUS and probably breaks with sparse cpu nos?
3. Let you do per-cpu data in modules
4. fragmentation
The simpler patch I mailed you sometime back,
1. Minimal dereference overhead, offsets to per-cpu data calculated at
compile time
2. allocates for NR_CPUS and has problems with sparse cpu nos
3. Very Simple.
My guess is that, performance-wise, Rusty's implementation and the simpler
implementation of kmalloc_percpu will be comparable. (I'll run some
tests to compare them and post them later). I am including the
simpler kmalloc_percpu patch which I'd mailed to you earlier.
Thanks,
Kiran
diff -ruN -X dontdiff linux-2.5.65/include/linux/percpu.h kmalloc-new-2.5.65/include/linux/percpu.h
--- linux-2.5.65/include/linux/percpu.h Tue Mar 18 03:14:43 2003
+++ kmalloc-new-2.5.65/include/linux/percpu.h Wed Mar 19 17:18:59 2003
@@ -9,22 +9,14 @@
#define put_cpu_var(var) preempt_enable()
#ifdef CONFIG_SMP
-
-struct percpu_data {
- void *ptrs[NR_CPUS];
- void *blkp;
-};
-
/*
* Use this to get to a cpu's version of the per-cpu object allocated using
* kmalloc_percpu. If you want to get "this cpu's version", maybe you want
* to use get_cpu_ptr...
*/
#define per_cpu_ptr(ptr, cpu) \
-({ \
- struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
- (__typeof__(ptr))__p->ptrs[(cpu)]; \
-})
+ ((__typeof__(ptr)) \
+ (RELOC_HIDE(ptr, ALIGN(sizeof (*ptr), SMP_CACHE_BYTES)*cpu)))
extern void *kmalloc_percpu(size_t size, int flags);
extern void kfree_percpu(const void *);
diff -ruN -X dontdiff linux-2.5.65/mm/slab.c kmalloc-new-2.5.65/mm/slab.c
--- linux-2.5.65/mm/slab.c Tue Mar 18 03:14:38 2003
+++ kmalloc-new-2.5.65/mm/slab.c Wed Mar 19 16:32:33 2003
@@ -1951,31 +1951,7 @@
void *
kmalloc_percpu(size_t size, int flags)
{
- int i;
- struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
- if (!pdata)
- return NULL;
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- pdata->ptrs[i] = kmalloc(size, flags);
- if (!pdata->ptrs[i])
- goto unwind_oom;
- }
-
- /* Catch derefs w/o wrappers */
- return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
- while (--i >= 0) {
- if (!cpu_possible(i))
- continue;
- kfree(pdata->ptrs[i]);
- }
- kfree(pdata);
- return NULL;
+ return kmalloc(ALIGN(size, SMP_CACHE_BYTES)*NR_CPUS, flags);
}
#endif
@@ -2028,14 +2004,7 @@
void
kfree_percpu(const void *objp)
{
- int i;
- struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- kfree(p->ptrs[i]);
- }
+ kfree(objp);
}
#endif
On Mon, May 05, 2003 at 08:40:02PM -0700, David S. Miller wrote:
> I think you should BUG() if a module calls kmalloc_percpu() outside
> of mod->init(), this is actually implementable.
>
> Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just ridiculous.
The disk stats are already per-cpu. So, what happens when you offline/online
a disk? How do you allocate per-cpu memory during that?
Thanks
Dipankar
From: Andrew Morton <[email protected]>
Date: Mon, 5 May 2003 22:02:50 -0700
"David S. Miller" <[email protected]> wrote:
> Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just rediculious.
crap. Modules deal with per-device and per-mount objects. If a module
cannot use kmalloc_per_cpu on behalf of the primary object which it manages
then the facility is simply not useful to modules.
Ok then.
Please address the ia64 concerns then :-) It probably means we
have to stay with the dereferencing stuff... at which point you
might as well use normal kmalloc() and smp_processor_id() indexing
inside of modules.
"David S. Miller" <[email protected]> wrote:
>
>
> Please address the ia64 concerns then :-) It probably means we
> have to stay with the dereferencing stuff... at which point you
> might as well use normal kmalloc() and smp_processor_id() indexing
> inside of modules.
I think so. So we'd end up with:
- DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
pool.
- DEFINE_PER_CPU in modules uses the 32k pool as well (core kernel does the
allocation).
- kmalloc_per_cpu() is unavailable to modules (it ain't exported).
AFAICT the only thing which will break is sctp, which needs a trivial
conversion to DEFINE_PER_CPU.
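Something like this, at a guess (sketch only, untested):

        /* instead of "struct sctp_mib *sctp_statistics[2]" filled in by
           kmalloc_percpu(), make the array itself per-cpu: */
        DEFINE_PER_CPU(struct sctp_mib [2], sctp_statistics);

        /* accesses then use per_cpu(sctp_statistics, cpu)[idx] or
           __get_cpu_var(sctp_statistics)[idx] rather than per_cpu_ptr() */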
Andrew Morton <[email protected]> wrote:
>
> - DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
> pool.
except sizeof(struct disk_stats)=44, so we run out of percpu space at 744
disks.
Can't think of anything very clever there, except to go and un-percpuify the
disk stats. I think that's best, really - disk requests only come in at 100
to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
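Something along these lines, roughly (sketch only - the type, field and
helper names here are invented, not the real genhd code):

        struct disk_counters {
                spinlock_t      lock;
                unsigned int    reads, writes, in_flight;
        };

        static inline void disk_count_done(struct disk_counters *c, int rw)
        {
                spin_lock(&c->lock);
                if (rw)
                        c->writes++;
                else
                        c->reads++;
                c->in_flight--;
                spin_unlock(&c->lock);
        }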
From: Andrew Morton <[email protected]>
Date: Mon, 5 May 2003 22:48:15 -0700
   I think so. So we'd end up with:
   - DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
   pool.
   - DEFINE_PER_CPU in modules uses the 32k pool as well (core kernel does the
   allocation).
   - kmalloc_per_cpu() is unavailable to modules (it ain't exported).
   AFAICT the only thing which will break is sctp, which needs a trivial
   conversion to DEFINE_PER_CPU.
Your grep is faulty, we're using kmalloc_percpu() in ipv6 for per-cpu
and per-device icmp stats.
Your solution doesn't work in that case. Also ipv4 will have the same
problems if we make that modular at some point.
I also don't see how this fits in for your ext2 fuzzy counter stuff.
It isn't a "module" for most people, I can't even remember if I've
ever built ext2 non-statically. :-) It almost appears as if you
are suggesting kmalloc_percpu() is not usable at all.
So there we have it, there are a total of 3 users of kmalloc_percpu()
(ipv4/ipv6/diskstats) so let's decide if it's going to continue to
live longer or not before there are any more. :-)
From: Andrew Morton <[email protected]>
Date: Mon, 5 May 2003 23:42:48 -0700
   Can't think of anything very clever there, except to go and un-percpuify the
   disk stats. I think that's best, really - disk requests only come in at 100
   to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
Use some spinlock we already have to be holding during the
counter bumps.
Frankly, these things don't need to be 100% accurate. Using
a new spinlock or an atomic_t for this seems ridiculous.
"David S. Miller" <[email protected]> wrote:
>
> Your grep is faulty, we're using kmalloc_percpu() in ipv6 for per-cpu
> and per-device icmp stats.
My grep is fine. It's the brain which failed ;)
> You solution doesn't work in that case. Also ipv4 will have the same
> problems if we make that modular at some point.
>
> I also don't see how this fits in for your ext2 fuzzy counter stuff.
Well it doesn't fit. With the proposals which we are discussing here, ext2
(and, we hope, soon ext3) would continue to use foo[NR_CPUS].
> It isn't a "module" for most people, I can't even remember if I've
> ever built ext2 non-statically. :-) It almost appears as if you
> are suggesting kmalloc_percpu() is not usable at all.
kmalloc_per_cpu() is useful at present. But with Rusty's patch it becomes
less useful.
Aside: I rather think that 4000 filesystems isn't sane. 4000 disks is, but I
find it hard to believe that people will have a separate fs on each one...
> So there we have it, there are a total of 3 users of kmalloc_percpu()
> (ipv4/ipv6/diskstats) so let's decide if it's going to continue to
> live longer or not before there are any more. :-)
How about we leave kmalloc_per_cpu as-is (it uses kmalloc()), and only apply
Rusty's new code to DEFINE_PER_CPU?
"David S. Miller" <[email protected]> wrote:
>
> From: Andrew Morton <[email protected]>
> Date: Mon, 5 May 2003 23:42:48 -0700
>
> Can't think of anything very clever there, except to go and un-percpuify the
> disk stats. I think that's best, really - disk requests only come in at 100
> to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
>
> Use some spinlock we already have to be holding during the
> counter bumps.
Last time we looked at that, q->lock was already held in almost all the right
places so yes, that'd work.
> Frankly, these things don't need to be 100% accurate. Using
> a new spinlock or an atomic_t for this seems ridiculous.
The disk_stats structure has an "in flight" member. If we don't have proper
locking around that, disks will appear to have -3 requests in flight for all
time, which would look a tad odd.
From: Andrew Morton <[email protected]>
Date: Mon, 5 May 2003 23:55:49 -0700
   How about we leave kmalloc_per_cpu as-is (it uses kmalloc()), and
   only apply Rusty's new code to DEFINE_PER_CPU?
I propose to make it use kmalloc() all the time.
It simply doesn't make sense to use a pool given what you've
shown me. If we've decided that any limit whatsoever is bad,
why impose any limit at all? Smells like bad design frankly.
Normal DEFINE_PER_CPU() doesn't need a pool, therefore we don't need
a pool for anything.
Make kmalloc_per_cpu() merely a convenience macro, made up of existing
non-percpu primitives.
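i.e. roughly what Kiran posted earlier in this thread (sketch, untested;
the _flat names are only to make clear this isn't the existing interface):

        #define kmalloc_percpu_flat(size, flags) \
                kmalloc(ALIGN((size), SMP_CACHE_BYTES) * NR_CPUS, (flags))

        #define per_cpu_ptr_flat(ptr, cpu) \
                ((__typeof__(ptr))((char *)(ptr) + \
                        ALIGN(sizeof(*(ptr)), SMP_CACHE_BYTES) * (cpu)))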
On Mon, May 05, 2003 at 11:55:49PM -0700, Andrew Morton wrote:
> > I also don't see how this fits in for your ext2 fuzzy counter stuff.
>
> Well it doesn't fit. With the proposals which we are discussing here, ext2
> (and, we hope, soon ext3) would continue to use foo[NR_CPUS].
And that is not what we want to do in a NUMA box.
Thanks
Dipankar
From: Andrew Morton <[email protected]>
Date: Tue, 6 May 2003 00:22:29 -0700
"David S. Miller" <[email protected]> wrote:
>
> Make kmalloc_per_cpu() merely a convenience macro, made up of existing
> non-percpu primitives.
I think we're agreeing here.
As just pointed out by dipankar the only issue is NUMA...
so it has to be something more sophisticated than simply
kmalloc()[smp_processor_id];
"David S. Miller" <[email protected]> wrote:
>
> Make kmalloc_per_cpu() merely a convenience macro, made up of existing
> non-percpu primitives.
I think we're agreeing here.
The current kmalloc_percpu() is a wrapper around kmalloc. That seems OK to
me.
What we _do_ want to solve is the problem that DEFINE_PER_CPU() does not work
in modules. Rusty's patch (reworked to not alter kmalloc_percpu) would suit
that requirement.
(kiran has a new version of kmalloc_percpu() which may be faster than the
current one, but for the purposes of this discussion it's equivalent).
On Mon, May 05 2003, Andrew Morton wrote:
> "David S. Miller" <[email protected]> wrote:
> >
> > From: Andrew Morton <[email protected]>
> > Date: Mon, 5 May 2003 23:42:48 -0700
> >
> > Can't think of anything very clever there, except to go and un-percpuify the
> > disk stats. I think that's best, really - disk requests only come in at 100
> > to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
> >
> > Use some spinlock we already have to be holding during the
> > counter bumps.
>
> Last time we looked at that, q->lock was already held in almost all the right
> places so yes, that'd work.
As far as I can see, the queue lock _is_ held in all the right spots. At
least where it matters, adding new samples.
> > Frankly, these things don't need to be 100% accurate. Using
> > a new spinlock or an atomic_t for this seems ridiculous.
>
> The disk_stats structure has an "in flight" member. If we don't have proper
> locking around that, disks will appear to have -3 requests in flight for all
> time, which would look a tad odd.
So check for < 0 in flight? I totally agree with davem here.
--
Jens Axboe
"David S. Miller" <[email protected]> wrote:
>
> As just pointed out by dipankar the only issue is NUMA...
> so it has to be something more sophisticated than simply
> kmalloc()[smp_processor_id];
The proposed patch doesn't do anything about that either.
+ ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
So yes, we need an api which could be extended to use node-affine memory at
some time in the future. I think we have that.
In message <[email protected]> you write:
> "David S. Miller" <[email protected]> wrote:
> >
> > I think you should BUG() if a module calls kmalloc_percpu() outside
> > of mod->init(), this is actually implementable.
> >
> > Andrew's example with some module doing kmalloc_percpu() inside
> > of fops->open() is just ridiculous.
>
> crap. Modules deal with per-device and per-mount objects. If a module
> cannot use kmalloc_per_cpu on behalf of the primary object which it manages
> then the facility is simply not useful to modules.
No, the allocator is dog-slow. If you want to use it on open, you
need a new allocator, not one that does a linear search for blocks.
> A case in point, which Rusty has twice mentioned, is the three per-mount
> fuzzy counters in the ext2 superblock. And lo, ext2 cannot use the code in
> this patch, because people want to scale to 4000 mounts.
Well, 4000 will fit on 32-bit archs, easily. Yes, it's a limit.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> > It isn't a "module" for most people, I can't even remember if I've
> > ever built ext2 non-statically. :-) It almost appears as if you
> > are suggesting kmalloc_percpu() is not usable at all.
>
> kmalloc_per_cpu() is useful at present. But with Rusty's patch it becomes
> less useful.
It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
is fast, space-efficient and numa-aware, since the code is needed
anyway.
How about a compromise like the following (scaled with mem)?
Untested, but you get the idea...
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Name: kmalloc_percpu to use same percpu operators
Author: Rusty Russell
Status: Untested
D: By overallocating the per-cpu data at boot, we can make quite an
D: efficient allocator, and then use it to support per-cpu data in
D: modules (next patch).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/asm-generic/percpu.h .26261-linux-2.5.69.updated/include/asm-generic/percpu.h
--- .26261-linux-2.5.69/include/asm-generic/percpu.h 2003-01-02 12:32:47.000000000 +1100
+++ .26261-linux-2.5.69.updated/include/asm-generic/percpu.h 2003-05-06 18:14:22.000000000 +1000
@@ -2,37 +2,11 @@
#define _ASM_GENERIC_PERCPU_H_
#include <linux/compiler.h>
-#define __GENERIC_PER_CPU
+/* Some archs may want to keep __per_cpu_offset for this CPU in a register,
+ or do their own allocation. */
#ifdef CONFIG_SMP
-
-extern unsigned long __per_cpu_offset[NR_CPUS];
-
-/* Separate out the type, so (int[3], foo) works. */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
- __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-
-#else /* ! SMP */
-
-/* Can't define per-cpu variables in modules. Sorry --RR */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
- __typeof__(type) name##__per_cpu
-#endif
-
-#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu)
-#define __get_cpu_var(var) var##__per_cpu
-
-#endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
-
+#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
+#define __NEED_SETUP_PER_CPU_AREAS
+#endif /* SMP */
#endif /* _ASM_GENERIC_PERCPU_H_ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/linux/genhd.h .26261-linux-2.5.69.updated/include/linux/genhd.h
--- .26261-linux-2.5.69/include/linux/genhd.h 2003-05-05 12:37:12.000000000 +1000
+++ .26261-linux-2.5.69.updated/include/linux/genhd.h 2003-05-06 18:14:22.000000000 +1000
@@ -160,10 +160,9 @@ static inline void disk_stat_set_all(str
#ifdef CONFIG_SMP
static inline int init_disk_stats(struct gendisk *disk)
{
- disk->dkstats = kmalloc_percpu(sizeof (struct disk_stats), GFP_KERNEL);
+ disk->dkstats = kmalloc_percpu(struct disk_stats);
if (!disk->dkstats)
return 0;
- disk_stat_set_all(disk, 0);
return 1;
}
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/linux/percpu.h .26261-linux-2.5.69.updated/include/linux/percpu.h
--- .26261-linux-2.5.69/include/linux/percpu.h 2003-02-07 19:20:01.000000000 +1100
+++ .26261-linux-2.5.69.updated/include/linux/percpu.h 2003-05-06 18:26:53.000000000 +1000
@@ -1,71 +1,79 @@
#ifndef __LINUX_PERCPU_H
#define __LINUX_PERCPU_H
-#include <linux/spinlock.h> /* For preempt_disable() */
-#include <linux/slab.h> /* For kmalloc_percpu() */
+#include <linux/preempt.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc() */
+#include <linux/cache.h>
+#include <linux/string.h>
+#include <asm/bug.h>
#include <asm/percpu.h>
-/* Must be an lvalue. */
+/* For variables declared with DECLARE_PER_CPU()/DEFINE_PER_CPU(). */
#define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
#define put_cpu_var(var) preempt_enable()
+/* Also, per_cpu(var, cpu) to get another cpu's value. */
+
+/* For ptrs allocated with kmalloc_percpu */
+#define get_cpu_ptr(ptr) ({ preempt_disable(); __get_cpu_ptr(ptr); })
+#define put_cpu_ptr(ptr) preempt_enable()
+/* Also, per_cpu_ptr(ptr, cpu) to get another cpu's value. */
#ifdef CONFIG_SMP
-struct percpu_data {
- void *ptrs[NR_CPUS];
- void *blkp;
-};
+/* __alloc_percpu zeros memory for every cpu, as a convenience. */
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void kfree_percpu(const void *);
-/*
- * Use this to get to a cpu's version of the per-cpu object allocated using
- * kmalloc_percpu. If you want to get "this cpu's version", maybe you want
- * to use get_cpu_ptr...
- */
-#define per_cpu_ptr(ptr, cpu) \
-({ \
- struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
- (__typeof__(ptr))__p->ptrs[(cpu)]; \
-})
+extern unsigned long __per_cpu_offset[NR_CPUS];
-extern void *kmalloc_percpu(size_t size, int flags);
-extern void kfree_percpu(const void *);
-extern void kmalloc_percpu_init(void);
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
-#else /* CONFIG_SMP */
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define per_cpu_ptr(ptr, cpu) ((__typeof__(ptr))(RELOC_HIDE(ptr, __per_cpu_offset[cpu])))
-#define per_cpu_ptr(ptr, cpu) (ptr)
+extern void setup_per_cpu_areas(void);
+#else /* !CONFIG_SMP */
-static inline void *kmalloc_percpu(size_t size, int flags)
+/* Can't define per-cpu variables in modules. Sorry --RR */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __typeof__(type) name##__per_cpu
+#endif
+
+#define per_cpu(var, cpu) ((void)(cpu), var##__per_cpu)
+#define __get_cpu_var(var) var##__per_cpu
+#define per_cpu_ptr(ptr, cpu) ((void)(cpu), (ptr))
+#define __get_cpu_ptr(ptr) (ptr)
+
+static inline void *__alloc_percpu(size_t size, size_t align)
{
- return(kmalloc(size, flags));
+ void *ret;
+ /* kmalloc always cacheline aligns. */
+ BUG_ON(align > SMP_CACHE_BYTES);
+ ret = kmalloc(size, GFP_KERNEL);
+ if (ret)
+ memset(ret, 0, size);
+ return ret;
}
static inline void kfree_percpu(const void *ptr)
{
kfree(ptr);
}
-static inline void kmalloc_percpu_init(void) { }
+static inline void setup_per_cpu_areas(void) { }
#endif /* CONFIG_SMP */
-/*
- * Use these with kmalloc_percpu. If
- * 1. You want to operate on memory allocated by kmalloc_percpu (dereference
- * and read/modify/write) AND
- * 2. You want "this cpu's version" of the object AND
- * 3. You want to do this safely since:
- * a. On multiprocessors, you don't want to switch between cpus after
- * you've read the current processor id due to preemption -- this would
- * take away the implicit advantage to not have any kind of traditional
- * serialization for per-cpu data
- * b. On uniprocessors, you don't want another kernel thread messing
- * up with the same per-cpu data due to preemption
- *
- * So, Use get_cpu_ptr to disable preemption and get pointer to the
- * local cpu version of the per-cpu object. Use put_cpu_ptr to enable
- * preemption. Operations on per-cpu data between get_ and put_ is
- * then considered to be safe. And ofcourse, "Thou shalt not sleep between
- * get_cpu_ptr and put_cpu_ptr"
- */
-#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
-#define put_cpu_ptr(ptr) put_cpu()
+/* Simple wrapper for the common case. Zeros memory. */
+#define kmalloc_percpu(type) \
+ ((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
#endif /* __LINUX_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/net/ipv6.h .26261-linux-2.5.69.updated/include/net/ipv6.h
--- .26261-linux-2.5.69/include/net/ipv6.h 2003-05-05 12:37:12.000000000 +1000
+++ .26261-linux-2.5.69.updated/include/net/ipv6.h 2003-05-06 18:14:22.000000000 +1000
@@ -145,7 +145,7 @@ extern atomic_t inet6_sock_nr;
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);
-int snmp6_mib_init(void *ptr[2], size_t mibsize);
+int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign);
void snmp6_mib_free(void *ptr[2]);
struct ip6_ra_chain
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/init/main.c .26261-linux-2.5.69.updated/init/main.c
--- .26261-linux-2.5.69/init/main.c 2003-05-05 12:37:13.000000000 +1000
+++ .26261-linux-2.5.69.updated/init/main.c 2003-05-06 18:14:22.000000000 +1000
@@ -301,35 +301,10 @@ static void __init smp_init(void)
#define smp_init() do { } while (0)
#endif
-static inline void setup_per_cpu_areas(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
#else
-#ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
-
-static void __init setup_per_cpu_areas(void)
-{
- unsigned long size, i;
- char *ptr;
- /* Created by linker magic */
- extern char __per_cpu_start[], __per_cpu_end[];
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
- if (!size)
- return;
-
- ptr = alloc_bootmem(size * NR_CPUS);
-
- for (i = 0; i < NR_CPUS; i++, ptr += size) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, size);
- }
-}
-#endif /* !__GENERIC_PER_CPU */
-
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/kernel/ksyms.c .26261-linux-2.5.69.updated/kernel/ksyms.c
--- .26261-linux-2.5.69/kernel/ksyms.c 2003-05-05 12:37:13.000000000 +1000
+++ .26261-linux-2.5.69.updated/kernel/ksyms.c 2003-05-06 18:14:22.000000000 +1000
@@ -99,7 +99,7 @@ EXPORT_SYMBOL(remove_shrinker);
EXPORT_SYMBOL(kmalloc);
EXPORT_SYMBOL(kfree);
#ifdef CONFIG_SMP
-EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
EXPORT_SYMBOL(kfree_percpu);
EXPORT_SYMBOL(percpu_counter_mod);
#endif
@@ -607,7 +607,7 @@ EXPORT_SYMBOL(init_thread_union);
EXPORT_SYMBOL(tasklist_lock);
EXPORT_SYMBOL(find_task_by_pid);
EXPORT_SYMBOL(next_thread);
-#if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
+#if defined(CONFIG_SMP)
EXPORT_SYMBOL(__per_cpu_offset);
#endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/Makefile .26261-linux-2.5.69.updated/mm/Makefile
--- .26261-linux-2.5.69/mm/Makefile 2003-02-11 14:26:20.000000000 +1100
+++ .26261-linux-2.5.69.updated/mm/Makefile 2003-05-06 18:14:22.000000000 +1000
@@ -12,3 +12,4 @@ obj-y := bootmem.o filemap.o mempool.o
slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SMP) += percpu.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/percpu.c .26261-linux-2.5.69.updated/mm/percpu.c
--- .26261-linux-2.5.69/mm/percpu.c 1970-01-01 10:00:00.000000000 +1000
+++ .26261-linux-2.5.69.updated/mm/percpu.c 2003-05-06 18:26:34.000000000 +1000
@@ -0,0 +1,217 @@
+/*
+ * Dynamic per-cpu allocation.
+ * Originally by Dipankar Sarma <[email protected]>
+ * This version (C) 2003 Rusty Russell, IBM Corporation.
+ */
+
+/* Simple allocator: we don't stress it hard, but do want it
+ fairly space-efficient. */
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/semaphore.h>
+
+static DECLARE_MUTEX(pcpu_lock);
+
+struct pcpu_block
+{
+ /* Number of blocks used and allocated. */
+ unsigned short num_used, num_allocated;
+
+ /* Size of each block. -ve means used. */
+ int size[0];
+};
+static struct pcpu_block *pcpu; /* = NULL */
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+/* Splits a block into two. Reallocs pcpu if necessary. */
+static int split_block(unsigned int i, unsigned short size)
+{
+ /* Reallocation required? */
+ if (pcpu->num_used + 1 > pcpu->num_allocated) {
+ struct pcpu_block *new;
+
+ new = kmalloc(sizeof(*pcpu)
+ + sizeof(pcpu->size[0]) * pcpu->num_allocated*2,
+ GFP_KERNEL);
+ if (!new)
+ return 0;
+ new->num_used = pcpu->num_used;
+ new->num_allocated = pcpu->num_allocated * 2;
+ memcpy(new->size, pcpu->size,
+ sizeof(pcpu->size[0])*pcpu->num_used);
+ kfree(pcpu);
+ pcpu = new;
+ }
+
+ /* Insert a new subblock */
+ memmove(&pcpu->size[i+1], &pcpu->size[i],
+ sizeof(pcpu->size[0]) * (pcpu->num_used - i));
+ pcpu->num_used++;
+
+ pcpu->size[i+1] -= size;
+ pcpu->size[i] = size;
+ return 1;
+}
+
+static inline unsigned int abs(int val)
+{
+ if (val < 0)
+ return -val;
+ return val;
+}
+
+static inline void zero_all(void *pcpuptr, unsigned int size)
+{
+	unsigned int i;
+
+ for (i = 0; i < NR_CPUS; i++)
+ memset(per_cpu_ptr(pcpuptr, i), 0, size);
+}
+
+static unsigned long pool_size;
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+ unsigned long extra;
+ unsigned int i;
+ void *ptr;
+
+ BUG_ON(align > SMP_CACHE_BYTES);
+ BUG_ON(size > pool_size/2);
+
+ down(&pcpu_lock);
+ ptr = __per_cpu_start;
+ for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+ /* Extra for alignment requirement. */
+ extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+
+ /* Allocated or not large enough? */
+ if (pcpu->size[i] < 0 || pcpu->size[i] < extra + size)
+ continue;
+
+ /* Transfer extra to previous block. */
+ if (pcpu->size[i-1] < 0)
+ pcpu->size[i-1] -= extra;
+ else
+ pcpu->size[i-1] += extra;
+ pcpu->size[i] -= extra;
+ ptr += extra;
+
+ /* Split block if warranted */
+ if (pcpu->size[i] - size > sizeof(unsigned long))
+ if (!split_block(i, size))
+ break;
+
+ /* Mark allocated */
+ pcpu->size[i] = -pcpu->size[i];
+ zero_all(ptr, size);
+ goto out;
+ }
+ ptr = NULL;
+ out:
+ up(&pcpu_lock);
+ return ptr;
+}
+
+void kfree_percpu(const void *freeme)
+{
+ unsigned int i;
+ void *ptr = __per_cpu_start;
+
+ down(&pcpu_lock);
+ for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+ if (ptr == freeme) {
+ /* Double free? */
+ BUG_ON(pcpu->size[i] > 0);
+ /* Block 0 is for non-dynamic per-cpu data. */
+ BUG_ON(i == 0);
+ pcpu->size[i] = -pcpu->size[i];
+ goto merge;
+ }
+ }
+ BUG();
+
+ merge:
+ /* Merge with previous? */
+ if (pcpu->size[i-1] >= 0) {
+ pcpu->size[i-1] += pcpu->size[i];
+ pcpu->num_used--;
+ memmove(&pcpu->size[i], &pcpu->size[i+1],
+ (pcpu->num_used - i) * sizeof(pcpu->size[0]));
+ i--;
+ }
+ /* Merge with next? */
+ if (i+1 < pcpu->num_used && pcpu->size[i+1] >= 0) {
+ pcpu->size[i] += pcpu->size[i+1];
+ pcpu->num_used--;
+ memmove(&pcpu->size[i+1], &pcpu->size[i+2],
+ (pcpu->num_used - (i+1)) * sizeof(pcpu->size[0]));
+ }
+
+ /* There's always one block: the core kernel one. */
+ BUG_ON(pcpu->num_used == 0);
+ up(&pcpu_lock);
+}
+
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+
+#define PERCPU_INIT_BLOCKS 4
+
+#ifdef __NEED_SETUP_PER_CPU_AREAS
+/* Generic version: allocates for all NR_CPUs. */
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long i;
+ void *ptr;
+
+ /* Leave at least 16k for __alloc_percpu */
+ pool_size = ALIGN(__per_cpu_end - __per_cpu_start + 16384,
+ SMP_CACHE_BYTES);
+ /* Plenty of memory? 1GB = 64k per-cpu. */
+ pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
+ (long long)pool_size);
+#ifdef PERCPU_POOL_MAX
+ if (pool_size > PERCPU_POOL_MAX)
+ pool_size = PERCPU_POOL_MAX;
+#endif
+
+ ptr = alloc_bootmem(pool_size * NR_CPUS);
+
+ /* Don't panic yet, they won't see it */
+ if (__per_cpu_end - __per_cpu_start > pool_size)
+ return;
+
+ for (i = 0; i < NR_CPUS; i++, ptr += pool_size) {
+ __per_cpu_offset[i] = ptr - (void *)__per_cpu_start;
+ /* Copy section for each CPU (we discard the original) */
+ memcpy(__per_cpu_start + __per_cpu_offset[i],
+ __per_cpu_start,
+ __per_cpu_end - __per_cpu_start);
+ }
+}
+#endif
+
+static int init_alloc_percpu(void)
+{
+	printk("Per-cpu data: %Zu of %lu bytes\n",
+ __per_cpu_end - __per_cpu_start, pool_size);
+
+ if (__per_cpu_end - __per_cpu_start > pool_size)
+ panic("Too much per-cpu data.\n");
+
+ pcpu = kmalloc(sizeof(*pcpu)+sizeof(pcpu->size[0])*PERCPU_INIT_BLOCKS,
+ GFP_KERNEL);
+ pcpu->num_allocated = PERCPU_INIT_BLOCKS;
+ pcpu->num_used = 2;
+ pcpu->size[0] = -(__per_cpu_end - __per_cpu_start);
+ pcpu->size[1] = pool_size-(__per_cpu_end - __per_cpu_start);
+
+ return 0;
+}
+
+__initcall(init_alloc_percpu);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/slab.c .26261-linux-2.5.69.updated/mm/slab.c
--- .26261-linux-2.5.69/mm/slab.c 2003-04-20 18:05:16.000000000 +1000
+++ .26261-linux-2.5.69.updated/mm/slab.c 2003-05-06 18:14:22.000000000 +1000
@@ -1902,54 +1902,6 @@ void * kmalloc (size_t size, int flags)
return NULL;
}
-#ifdef CONFIG_SMP
-/**
- * kmalloc_percpu - allocate one copy of the object for every present
- * cpu in the system.
- * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
- * macros only.
- *
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user. May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
- */
-void *
-kmalloc_percpu(size_t size, int flags)
-{
- int i;
- struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
- if (!pdata)
- return NULL;
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- pdata->ptrs[i] = kmalloc(size, flags);
- if (!pdata->ptrs[i])
- goto unwind_oom;
- }
-
- /* Catch derefs w/o wrappers */
- return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
- while (--i >= 0) {
- if (!cpu_possible(i))
- continue;
- kfree(pdata->ptrs[i]);
- }
- kfree(pdata);
- return NULL;
-}
-#endif
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
@@ -1988,28 +1940,6 @@ void kfree (const void *objp)
local_irq_restore(flags);
}
-#ifdef CONFIG_SMP
-/**
- * kfree_percpu - free previously allocated percpu memory
- * @objp: pointer returned by kmalloc_percpu.
- *
- * Don't free memory not originally allocated by kmalloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-kfree_percpu(const void *objp)
-{
- int i;
- struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- kfree(p->ptrs[i]);
- }
-}
-#endif
-
unsigned int kmem_cache_size(kmem_cache_t *cachep)
{
unsigned int objlen = cachep->objsize;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv4/af_inet.c .26261-linux-2.5.69.updated/net/ipv4/af_inet.c
--- .26261-linux-2.5.69/net/ipv4/af_inet.c 2003-05-05 12:37:14.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv4/af_inet.c 2003-05-06 18:14:22.000000000 +1000
@@ -1061,54 +1061,21 @@ static struct inet_protocol icmp_protoco
static int __init init_ipv4_mibs(void)
{
- int i;
-
- net_statistics[0] =
- kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
- net_statistics[1] =
- kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
- ip_statistics[0] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
- ip_statistics[1] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
- icmp_statistics[0] =
- kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
- icmp_statistics[1] =
- kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
- tcp_statistics[0] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
- tcp_statistics[1] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
- udp_statistics[0] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
- udp_statistics[1] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
+ net_statistics[0] = kmalloc_percpu(struct linux_mib);
+ net_statistics[1] = kmalloc_percpu(struct linux_mib);
+ ip_statistics[0] = kmalloc_percpu(struct ip_mib);
+ ip_statistics[1] = kmalloc_percpu(struct ip_mib);
+ icmp_statistics[0] = kmalloc_percpu(struct icmp_mib);
+ icmp_statistics[1] = kmalloc_percpu(struct icmp_mib);
+ tcp_statistics[0] = kmalloc_percpu(struct tcp_mib);
+ tcp_statistics[1] = kmalloc_percpu(struct tcp_mib);
+ udp_statistics[0] = kmalloc_percpu(struct udp_mib);
+ udp_statistics[1] = kmalloc_percpu(struct udp_mib);
if (!
(net_statistics[0] && net_statistics[1] && ip_statistics[0]
&& ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
&& udp_statistics[0] && udp_statistics[1]))
return -ENOMEM;
-
- /* Set all the per cpu copies of the mibs to zero */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(net_statistics[0], i), 0,
- sizeof (struct linux_mib));
- memset(per_cpu_ptr(net_statistics[1], i), 0,
- sizeof (struct linux_mib));
- memset(per_cpu_ptr(ip_statistics[0], i), 0,
- sizeof (struct ip_mib));
- memset(per_cpu_ptr(ip_statistics[1], i), 0,
- sizeof (struct ip_mib));
- memset(per_cpu_ptr(icmp_statistics[0], i), 0,
- sizeof (struct icmp_mib));
- memset(per_cpu_ptr(icmp_statistics[1], i), 0,
- sizeof (struct icmp_mib));
- memset(per_cpu_ptr(tcp_statistics[0], i), 0,
- sizeof (struct tcp_mib));
- memset(per_cpu_ptr(tcp_statistics[1], i), 0,
- sizeof (struct tcp_mib));
- memset(per_cpu_ptr(udp_statistics[0], i), 0,
- sizeof (struct udp_mib));
- memset(per_cpu_ptr(udp_statistics[1], i), 0,
- sizeof (struct udp_mib));
- }
- }
-
(void) tcp_mib_init();
return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv4/route.c .26261-linux-2.5.69.updated/net/ipv4/route.c
--- .26261-linux-2.5.69/net/ipv4/route.c 2003-05-05 12:37:14.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv4/route.c 2003-05-06 18:14:22.000000000 +1000
@@ -2689,17 +2689,9 @@ int __init ip_rt_init(void)
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
- rt_cache_stat = kmalloc_percpu(sizeof (struct rt_cache_stat),
- GFP_KERNEL);
+ rt_cache_stat = kmalloc_percpu(struct rt_cache_stat);
if (!rt_cache_stat)
goto out_enomem1;
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(rt_cache_stat, i), 0,
- sizeof (struct rt_cache_stat));
- }
- }
-
devinet_init();
ip_fib_init();
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv6/af_inet6.c .26261-linux-2.5.69.updated/net/ipv6/af_inet6.c
--- .26261-linux-2.5.69/net/ipv6/af_inet6.c 2003-05-05 12:37:15.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv6/af_inet6.c 2003-05-06 18:14:22.000000000 +1000
@@ -633,28 +633,20 @@ inet6_unregister_protosw(struct inet_pro
}
int
-snmp6_mib_init(void *ptr[2], size_t mibsize)
+snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
{
int i;
if (ptr == NULL)
return -EINVAL;
- ptr[0] = kmalloc_percpu(mibsize, GFP_KERNEL);
+ ptr[0] = __alloc_percpu(mibsize, mibalign);
if (!ptr[0])
goto err0;
- ptr[1] = kmalloc_percpu(mibsize, GFP_KERNEL);
+ ptr[1] = __alloc_percpu(mibsize, mibalign);
if (!ptr[1])
goto err1;
-
- /* Zero percpu version of the mibs */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(ptr[0], i), 0, mibsize);
- memset(per_cpu_ptr(ptr[1], i), 0, mibsize);
- }
- }
return 0;
err1:
@@ -676,11 +668,14 @@ snmp6_mib_free(void *ptr[2])
static int __init init_ipv6_mibs(void)
{
- if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib)) < 0)
+ if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib),
+ __alignof__(struct ipv6_mib)) < 0)
goto err_ip_mib;
- if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib)) < 0)
+ if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
+ __alignof__(struct icmpv6_mib)) < 0)
goto err_icmp_mib;
- if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
+ if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udp_mib;
return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv6/proc.c .26261-linux-2.5.69.updated/net/ipv6/proc.c
--- .26261-linux-2.5.69/net/ipv6/proc.c 2003-05-05 12:37:15.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv6/proc.c 2003-05-06 18:14:22.000000000 +1000
@@ -226,7 +226,7 @@ int snmp6_register_dev(struct inet6_dev
if (!idev || !idev->dev)
return -EINVAL;
- if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib)) < 0)
+ if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0)
goto err_icmp;
#ifdef CONFIG_PROC_FS
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/sctp/protocol.c .26261-linux-2.5.69.updated/net/sctp/protocol.c
--- .26261-linux-2.5.69/net/sctp/protocol.c 2003-05-05 12:37:16.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/sctp/protocol.c 2003-05-06 18:14:23.000000000 +1000
@@ -855,26 +855,14 @@ static int __init init_sctp_mibs(void)
{
int i;
- sctp_statistics[0] = kmalloc_percpu(sizeof (struct sctp_mib),
- GFP_KERNEL);
+ sctp_statistics[0] = kmalloc_percpu(struct sctp_mib);
if (!sctp_statistics[0])
return -ENOMEM;
- sctp_statistics[1] = kmalloc_percpu(sizeof (struct sctp_mib),
- GFP_KERNEL);
+ sctp_statistics[1] = kmalloc_percpu(struct sctp_mib);
if (!sctp_statistics[1]) {
kfree_percpu(sctp_statistics[0]);
return -ENOMEM;
}
-
- /* Zero all percpu versions of the mibs */
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_possible(i)) {
- memset(per_cpu_ptr(sctp_statistics[0], i), 0,
- sizeof (struct sctp_mib));
- memset(per_cpu_ptr(sctp_statistics[1], i), 0,
- sizeof (struct sctp_mib));
- }
- }
return 0;
}
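To make the interface change concrete, here is a hedged sketch of a converted
caller, modelled on the conversions above (route.c, protocol.c, genhd.h).
Only kmalloc_percpu(), per_cpu_ptr() and kfree_percpu() come from the patch;
the "foo" struct, field and function names are invented for illustration, and
the pre-zeroing relies on the "Zeros memory" wrapper in percpu.h above.

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/init.h>

struct foo_stats {
	unsigned long events;
};

static struct foo_stats *foo_stats;	/* handle: one pre-zeroed copy per CPU */

static int __init foo_init(void)
{
	foo_stats = kmalloc_percpu(struct foo_stats);
	if (!foo_stats)
		return -ENOMEM;
	return 0;
}

static void foo_count_event(void)
{
	/* get_cpu()/put_cpu() keep us on one CPU while touching its copy. */
	struct foo_stats *stats = per_cpu_ptr(foo_stats, get_cpu());

	stats->events++;
	put_cpu();
}

static void foo_cleanup(void)
{
	kfree_percpu(foo_stats);
}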
In message <[email protected]> you write:
> On Mon, May 05, 2003 at 08:46:58AM +0000, Andrew Morton wrote:
> Andrew,
> Here is a comparision of kmalloc_percpu techniques as I see it,
>
> Current Implementation:
> 1. Two dereferences to get to the per-cpu data
> 2. Allocates for cpu_possible cpus only, and can deal with sparse cpu nos
>
> Rusty's Implementation
> 1. One extra memory reference (__per_cpu_offset)
> 2. allocates for NR_CPUS and probably breaks with sparse cpu nos?
> 3. Let you do per-cpu data in modules
> 4. fragmentation
And #3 is, in fact, the one I care about. The extra memory reference
is already probably cache-hot (all static per-cpu accesses use it), and might be
in a register on some archs.
Doesn't break with sparse CPU #s, but yes, it is inefficient.
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
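For readers following the comparison above, here is a small, self-contained
user-space model (plain C, not kernel code) of the two addressing schemes:
scheme A mimics the existing slab-based kmalloc_percpu() (a pointer table,
hence two dereferences), while scheme B mimics the patch's approach of one
replicated pool plus per-cpu offsets (one offset load and addition). Every
name is invented for the model, and alignment, freeing, error handling and
the kernel's pointer-complement trick are deliberately omitted.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS   4
#define POOL_SIZE 4096

/* Scheme A: a table of per-cpu pointers.  Reaching "my" copy costs two
 * dereferences: the table entry, then the object itself. */
struct percpu_data {
	void *ptrs[NR_CPUS];
};

static void *old_alloc_percpu(size_t size)
{
	int i;
	struct percpu_data *pdata = malloc(sizeof(*pdata));

	for (i = 0; i < NR_CPUS; i++)
		pdata->ptrs[i] = calloc(1, size);
	return pdata;
}
#define old_per_cpu_ptr(ptr, cpu) (((struct percpu_data *)(ptr))->ptrs[cpu])

/* Scheme B: every CPU owns a copy of one pool; an object is an offset into
 * it.  Reaching "my" copy is the CPU-0 address plus this CPU's pool offset:
 * one offset load, one add, one dereference. */
static char *pool[NR_CPUS];
static unsigned long pool_offset[NR_CPUS];
static size_t pool_used;

static void setup_pools(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		pool[i] = calloc(1, POOL_SIZE);
		pool_offset[i] = (unsigned long)pool[i] - (unsigned long)pool[0];
	}
}

static void *new_alloc_percpu(size_t size)
{
	void *ret = pool[0] + pool_used;	/* the CPU-0 address is the handle */

	pool_used += size;
	return ret;
}
#define new_per_cpu_ptr(ptr, cpu) \
	((void *)((char *)(ptr) + pool_offset[cpu]))

int main(void)
{
	long *a = old_alloc_percpu(sizeof(long));
	long *b;

	setup_pools();
	b = new_alloc_percpu(sizeof(long));

	*(long *)old_per_cpu_ptr(a, 2) = 42;	/* table lookup, then object */
	*(long *)new_per_cpu_ptr(b, 2) = 42;	/* single offset addition */

	printf("cpu 2 copies: old %ld, new %ld\n",
	       *(long *)old_per_cpu_ptr(a, 2),
	       *(long *)new_per_cpu_ptr(b, 2));
	return 0;
}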
"David S. Miller" <[email protected]> wrote:
>> As just pointed out by dipankar the only issue is NUMA...
>> so it has to be something more sophisticated than simply
>> kmalloc()[smp_processor_id];
On Tue, May 06, 2003 at 12:34:12AM -0700, Andrew Morton wrote:
> The proposed patch doesn't do anything about that either.
> + ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
> So yes, we need an api which could be extended to use node-affine memory at
> some time in the future. I think we have that.
IIRC that can be overridden; I wrote something to do node-local per-cpu
areas for i386 for some prior incarnations of per-cpu stuff and even
posted it, and this looks like it bootstraps at the same time (before
free_area_init_core() that is) and has the same override hook.
-- wli
Rusty Russell <[email protected]> wrote:
>
> It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
> is fast, space-efficient and numa-aware, since the code is needed
> anyway.
I don't believe that kmalloc_percpu() itself needs to be fast, as you say.
The code is _not_ NUMA-aware. Is it?
> How about a compromise like the following (scaled with mem)?
> Untested, but you get the idea...
> + /* Plenty of memory? 1GB = 64k per-cpu. */
> + pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
> + (long long)pool_size);
On 64GB 32-way that's 128MB of lowmem. Unpopular. I'd settle for a __setup
thingy here, and a printk when the memory runs out.
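(For reference, the arithmetic behind those figures: the proposed formula
reserves memory/16384 bytes of pool per CPU, so 1GB of RAM gives
1GB/16384 = 64KB per CPU, while 64GB gives 4MB per CPU, and 32 CPUs times
4MB is the 128MB of boot-allocated low memory mentioned above.)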
btw, what's wrong with leaving kmalloc_percpu() as-is, and only using this
allocator for DEFINE_PER_CPU()?
On Tue, May 06, 2003 at 06:03:15PM +1000, Rusty Russell wrote:
> In message <[email protected]> you write:
> ..
> Doesn't break with sparse CPU #s, but yes, it is inefficient.
>
If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32-bit folks
won't like it), and if you, say, change CONFIG_NR_CPUS to 2
and we have cpuid 4 on a 2-way, you break, right? If we have to address these
issues at all, why can't we use the simpler kmalloc_percpu patch
which I posted in the morning and avoid so much complexity and arch
dependency?
Thanks,
Kiran
On Tue, May 06, 2003 at 03:04:11PM +0530, Ravikiran G Thirumalai wrote:
> > Doesn't break with sparse CPU #s, but yes, it is inefficient.
> >
>
> If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32-bit folks
> won't like it), and if you, say, change CONFIG_NR_CPUS to 2
> and we have cpuid 4 on a 2-way, you break, right? If we have to address these
> issues at all, why can't we use the simpler kmalloc_percpu patch
> which I posted in the morning and avoid so much complexity and arch
> dependency?
We can have something like that for !CONFIG_NUMA and a NUMA-aware
allocator with additional dereferencing cost for CONFIG_NUMA.
Hopefully gains from numa-awareness will more than offset
dereferencing costs.
Thanks
Dipankar
On Tue, 2003-05-06 at 01:03, Rusty Russell wrote:
> And #3 is, in fact, the one I care about. The extra memory reference
> is already probably cache-hot (all static per-cpu accesses use it), and might be
> in a register on some archs.
>
> Doesn't break with sparse CPU #s, but yes, it is inefficient.
Maybe we should treat this stuff like architecture ABIs that
have a "small data" section?
Before elaborating, let me state that if we are going to use this
for things like disk stats and per-interface ipv6 ICMP stats, a
fixed size pool is absolutely unacceptable. People are configuring up
thousands upon thousands of VLAN net devices today, and if IPV6 is
enabled each of those will get a per-cpu ICMP stats block allocated.
And as Andrew said, there can be thousands of block devices.
Therefore, for these per-cpu applications there must be no limits built
into the mechanism.
Now, what I propose is that kmalloc_percpu() keeps its current
implementation. Then we have a kmalloc_percpu_small() that must only
be invoked during module init and we limit the size to some reasonable
amount. This kmalloc_percpu_small() uses the 32K pool or whatever, as
does DECLARE_PER_CPU in a module.
How about this?
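A hedged sketch of the split being proposed here, purely to make its shape
concrete: kmalloc_percpu() keeps the existing unlimited implementation, while
a separate init-time-only allocator hands out space from the fixed boot pool.
The kmalloc_percpu_small() name is from the proposal above; the prototypes,
the alignment parameter and the exact limits are assumptions, not an agreed
interface.

/* Sketch only -- not an agreed interface. */

/* Unlimited, kmalloc-backed, one object per possible CPU; keeps today's
 * implementation and signature.  Usable any time, any number of objects. */
void *kmalloc_percpu(size_t size, int flags);
void kfree_percpu(const void *ptr);

/* Backed by the fixed boot-time pool (the 32K or whatever mentioned above);
 * callable only from module/driver init, small objects only.  DEFINE_PER_CPU
 * in modules would draw from the same pool. */
void *kmalloc_percpu_small(size_t size, size_t align);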
--
David S. Miller <[email protected]>
>> As just pointed out by dipankar the only issue is NUMA...
>> so it has to be something more sophisticated than simply
>> kmalloc()[smp_processor_id];
>
> The proposed patch doesn't do anything about that either.
>
> + ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
>
> So yes, we need an api which could be extended to use node-affine memory at
> some time in the future. I think we have that.
You can just call alloc_bootmem_node for each CPU instead. It doesn't
work on i386 at the moment (well, it'll work but come out of node 0),
but it probably ought to.
M.
--On Tuesday, May 06, 2003 18:28:24 +1000 Rusty Russell <[email protected]> wrote:
> + pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
> + (long long)pool_size);
I like the idea of scaling it with the machine, but can you scale this off
lowmem instead? Something like max_low_pfn or similar.
Thanks,
M.
On Mon, 2003-05-05 at 23:57, Andrew Morton wrote:
> The disk_stats structure has an "in flight" member. If we don't have proper
> locking around that, disks will appear to have -3 requests in flight for all
> time, which would look a tad odd.
For what it's worth, the disk stats patch that vendors have been
shipping forever with 2.4 (and that Christoph pushed into 2.4.20) has
this problem, so at least people will be used to it.
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> >
> > It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
> > is fast, space-efficient and numa-aware, since the code is needed
> > anyway.
>
> I don't believe that kmalloc_percpu() itself needs to be fast, as you say.
>
> The code is _not_ NUMA-aware. Is it?
No, it's arch-overridable. You know me, I don't mess with
arch-specific stuff. 8)
> On 64GB 32-way that's 128MB of lowmem. Unpopular. I'd settle for a __setup
> thingy here, and a printk when the memory runs out.
Currently __setup is run far too late. Fortunately Bartlomiej has
been working on moving __setup earlier, for other reasons (two-stage
parameter parsing). In this case, it'd need to be before arch setup,
hmm...
> btw, what's wrong with leaving kmalloc_percpu() as-is, and only using this
> allocator for DEFINE_PER_CPU()?
I figured that since the allocator is going to be there anyway, it
made sense to express kmalloc_percpu() in those terms. If you think
the price is too high, I can respect that.
The more people use the __per_cpu_offset instead of
smp_processor_id(), the cheaper it gets, even to the stage where some
archs might want to express smp_processor_id() in terms of
__per_cpu_offset, rather than vice-versa. Given that it's also more
space efficient (each var doesn't take one cacheline) too, I'd
recommend __alloc_percpu() of eg. single ints for long-lived objects
where I wouldn't recommend the current kmalloc_percpu().
Paul Mackerras points out that we could get the numa-aware allocation
plus "one big alloc" properties by playing with page mappings: reserve
1MB of virtual address, and map more pages as required. I didn't
think that we'd need that yet, though.
So, numerous options, and you're smarter than me, so you can decide 8)
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> On Tue, May 06, 2003 at 06:03:15PM +1000, Rusty Russell wrote:
> > In message <[email protected]> you write:
> > ..
> > Doesn't break with sparse CPU #s, but yes, it is inefficient.
> >
>
> If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32
> bit folks won't like it) and if you say change CONFIG_NR_CPUS to 2,
> and we have cpuid 4 on a 2 way you break right?
You misunderstand, I think: on *all* archs, smp_processor_id() <
NR_CPUS is an axiom. Always has been.
The generic solution involved cpu_possible(), but it's too early in
boot for that: attempts to make it a requirement would have to change
numerous archs. Not that I'm against that change...
Of course, an arch can override this default allocator, and may have
more info with which to do optimal allocation.
> If we have to address these issues at all, why can't we use the
> simpler kmalloc_percpu patch which I posted in the morning and avoid
> so much complexity and arch dependency?
Because we still need the __alloc_percpu for DECLARE_PER_CPU support
in modules 8(
Hope that clarifies,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> Paul Mackerras points out that we could get the numa-aware allocation
> plus "one big alloc" properties by playing with page mappings: reserve
> 1MB of virtual address, and map more pages as required. I didn't
> think that we'd need that yet, though.
This is somewhat painful to do (though possible) on i386. The cost of
task migration would increase at the very least.
-- wli
William Lee Irwin III writes:
> On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> > Paul Mackerras points out that we could get the numa-aware allocation
> > plus "one big alloc" properties by playing with page mappings: reserve
> > 1MB of virtual address, and map more pages as required. I didn't
> > think that we'd need that yet, though.
>
> This is somewhat painful to do (though possible) on i386. The cost of
> task migration would increase at the very least.
Explain? We're not talking about having the same address mapped
differently on different CPUs, we're just talking about something
which is the equivalent of a sparsely-instantiated vmalloc.
Paul.
In message <[email protected]> you write:
> On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> > Paul Mackerras points out that we could get the numa-aware allocation
> > plus "one big alloc" properties by playing with page mappings: reserve
> > 1MB of virtual address, and map more pages as required. I didn't
> > think that we'd need that yet, though.
>
> This is somewhat painful to do (though possible) on i386. The cost of
> task migration would increase at the very least.
You misunderstand I think. All cpus would have the same mappings.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
William Lee Irwin III writes:
>> This is somewhat painful to do (though possible) on i386. The cost of
>> task migration would increase at the very least.
On Wed, May 07, 2003 at 02:03:46PM +1000, Paul Mackerras wrote:
> Explain? We're not talking about having the same address mapped
> differently on different CPUs, we're just talking about something
> which is the equivalent of a sparsely-instantiated vmalloc.
Same address mapped differently on different cpus is what I thought
you meant. It does make sense, and besides, it only really matters
when the thing is being switched in, so I think it's not such a big
deal. e.g. mark per-thread mm context with the cpu it was prepped for,
if they don't match at load-time then reset the kernel pmd's pgd entry
in the per-thread pgd at the top level. x86 blows away the TLB at the
drop of a hat anyway, and it wouldn't even introduce an extra whole-TLB
flush. That said, I'm not terribly interested in hacking all that up
even though the proper hooks to make it pure arch code appear to exist.
The vmallocspace bit is easier, though the virtualspace reservation
could get uncomfortably large depending on how much is crammed in there.
That can go node-local also. I guess it has some runtime arithmetic
overhead vs. the per-cpu TLB entries in exchange for less complex code.
-- wli
William Lee Irwin III writes:
> Same address mapped differently on different cpus is what I thought
> you meant. It does make sense, and besides, it only really matters
> when the thing is being switched in, so I think it's not such a big
> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
> if they don't match at load-time then reset the kernel pmd's pgd entry
> in the per-thread pgd at the top level. x86 blows away the TLB at the
Having to have a pgdir per thread would be a bit sucky, wouldn't it?
On PPCs with the hash-table based MMU, if we wanted to do different
mappings of the same address on different CPUs, we would have to have
a separate hash table for each CPU, which would chew up a lot of
memory. On PPC64 machines with logical partitioning, I don't think
the hypervisor would let you have a separate hash table for each CPU.
On the flip side, PPC can afford a register to point to a per-cpu data
area more easily than x86 can.
> The vmallocspace bit is easier, though the virtualspace reservation
> could get uncomfortably large depending on how much is crammed in there.
> That can go node-local also. I guess it has some runtime arithmetic
> overhead vs. the per-cpu TLB entries in exchange for less complex code.
I was thinking of something like 64kB per cpu times 32 cpus = 2MB.
Anyway, 32-bit machines with > 8 cpus are a pretty rare corner case.
On 64-bit machines we have enough virtual space to give each cpu
gigabytes of per-cpu data if we want to.
Paul.
William Lee Irwin III writes:
>> Same address mapped differently on different cpus is what I thought
>> you meant. It does make sense, and besides, it only really matters
>> when the thing is being switched in, so I think it's not such a big
>> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
>> if they don't match at load-time then reset the kernel pmd's pgd entry
>> in the per-thread pgd at the top level. x86 blows away the TLB at the
On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> Having to have a pgdir per thread would be a bit sucky, wouldn't it?
Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
faults (which are already handled properly for non-PAE vmallocspace).
There might be other reasons to do it, like reducing the virtualspace
overhead of the atomic kmap area, but it's not really time yet.
On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> On PPCs with the hash-table based MMU, if we wanted to do different
> mappings of the same address on different CPUs, we would have to have
> a separate hash table for each CPU, which would chew up a lot of
> memory. On PPC64 machines with logical partitioning, I don't think
> the hypervisor would let you have a separate hash table for each CPU.
> On the flip side, PPC can afford a register to point to a per-cpu data
> area more easily than x86 can.
Well, presumably it'd have to be abstracted so the mechanism isn't
exposed to core code if ever done. The arch code insulation appears to
be there to keep one going, though not necessarily accessors. Probably
the only reason to seriously think about it is that the arithmetic
shows up as a disincentive on the register-starved FPOS's I'm stuck on.
William Lee Irwin III writes:
>> The vmallocspace bit is easier, though the virtualspace reservation
>> could get uncomfortably large depending on how much is crammed in there.
>> That can go node-local also. I guess it has some runtime arithmetic
>> overhead vs. the per-cpu TLB entries in exchange for less complex code.
On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> I was thinking of something like 64kB per cpu times 32 cpus = 2MB.
> Anyway, 32-bit machines with > 8 cpus are a pretty rare corner case.
> On 64-bit machines we have enough virtual space to give each cpu
> gigabytes of per-cpu data if we want to.
2MB vmallocspace is doable; it'd need to be bigger or per-something
besides cpus to hurt.
-- wli
Rusty Russell <[email protected]> wrote:
>
> I figured that since the allocator is going to be there anyway, it
> made sense to express kmalloc_percpu() in those terms. If you think
> the price is too high, I can respect that.
I suggest that we use your new mechanism to fix DEFINE_PER_CPU() in modules,
and leave kmalloc_percpu() as it is for now.
On Mon, May 05, 2003 at 08:17:07AM +0000, Rusty Russell wrote:
> Hi Andrew,
>
> This is the kmalloc_percpu patch. I have another patch which
> tests the allocator if you want to see that to. This is the precursor
> to per-cpu stuff in modules, but also allows other space gains for
> structures which currently embed per-cpu arrays (eg. your fuzzy counters).
>
I tried to run a test to compare this implementation, but got an oops.
Here is the oops and the patch I was trying...
btw, why the change from kmalloc_percpu(size) to kmalloc_percpu(type)?
You do kmalloc(sizeof (long)) for the usual kmalloc, but
kmalloc_percpu(long) for percpu data...looks strange no?
Thanks,
Kiran
diff -ruN -X dontdiff linux-2.5.69/mm/swap.c rusty-1-2.5.69/mm/swap.c
--- linux-2.5.69/mm/swap.c Mon May 5 05:23:13 2003
+++ rusty-1-2.5.69/mm/swap.c Wed May 7 10:01:00 2003
@@ -356,14 +356,14 @@
*/
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
-static DEFINE_PER_CPU(long, committed_space) = 0;
+static long *committed_space;
void vm_acct_memory(long pages)
{
long *local;
preempt_disable();
- local = &__get_cpu_var(committed_space);
+ local = per_cpu_ptr(committed_space, smp_processor_id());
*local += pages;
if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
atomic_add(*local, &vm_committed_space);
@@ -371,6 +371,12 @@
}
preempt_enable();
}
+
+static int init_committed_space(void)
+{
+ committed_space = kmalloc_percpu(long);
+ return committed_space;
+}
#endif
@@ -390,4 +396,6 @@
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+ if (!init_committed_space)
+ panic("swap_setup\n");
}
Unable to handle kernel paging request at virtual address 0115d040
printing eip:
c01392ae
*pde = 1fccf001
*pte = 00000000
Oops: 0000 [#1]
CPU: 0
EIP: 0060:[<c01392ae>] Not tainted
EFLAGS: 00010202
EIP is at vm_acct_memory+0x1e/0x60
eax: 00000000 ebx: dfcca4e0 ecx: 0115d040 edx: 00000001
esi: 00000001 edi: dff7e000 ebp: dff7fe3c esp: dff7fc7c
ds: 007b es: 007b ss: 0068
Process init (pid: 1, threadinfo=dff7e000 task=dff7c040)
Stack: c013fa8f 00000001 dffe4120 00000000 dfcd8720 dff7fe3c dfcca4e0 dfcd3dc0
c0155966 00000001 dff7e000 bffe0000 dff7e000 dff7fe3c c02ba305 00000000
c0171013 dff7fe3c 00000000 00000000 00000000 ffffffff 00000003 00000000
Call Trace:
[<c013fa8f>] vm_enough_memory+0xf/0xc0
[<c0155966>] setup_arg_pages+0x96/0x190
[<c0171013>] load_elf_binary+0x4c3/0x9f0
[<c01376ea>] cache_grow+0x1fa/0x2d0
[<c0134ab9>] __alloc_pages+0x89/0x2f0
[<c01556c8>] copy_strings+0x238/0x250
[<c0170b50>] load_elf_binary+0x0/0x9f0
[<c0156975>] search_binary_handler+0xa5/0x1f0
[<c0155707>] copy_strings_kernel+0x27/0x30
[<c0156c3f>] do_execve+0x17f/0x200
[<c015821e>] getname+0x5e/0xa0
[<c01078f0>] sys_execve+0x30/0x70
[<c0109087>] syscall_call+0x7/0xb
[<c01051b1>] init+0x151/0x1d0
[<c0105060>] init+0x0/0x1d0
[<c01070f5>] kernel_thread_helper+0x5/0x10
Code: 8b 01 01 c2 89 11 83 fa 40 7f 09 89 d0 83 f8 c0 7d 11 eb 02
<0>Kernel panic: Attempted to kill init!
>>> Same address mapped differently on different cpus is what I thought
>>> you meant. It does make sense, and besides, it only really matters
>>> when the thing is being switched in, so I think it's not such a big
>>> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
>>> if they don't match at load-time then reset the kernel pmd's pgd entry
>>> in the per-thread pgd at the top level. x86 blows away the TLB at the
>
> On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
>> Having to have a pgdir per thread would be a bit sucky, wouldn't it?
>
> Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
> hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
> Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
> faults (which are already handled properly for non-PAE vmallocspace).
> There might be other reasons to do it, like reducing the virtualspace
> overhead of the atomic kmap area, but it's not really time yet.
Even if the space isn't a problem, having a full TLB flush on thread
context switch sounds sucky. Per-node is sufficient for most things,
and is much cheaper (update on cross-node migrate).
M.
At some point in the past, my attribution was stripped from:
>> Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
>> hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
>> Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
>> faults (which are already handled properly for non-PAE vmallocspace).
>> There might be other reasons to do it, like reducing the virtualspace
>> overhead of the atomic kmap area, but it's not really time yet.
On Tue, May 06, 2003 at 09:10:24PM -0700, Martin J. Bligh wrote:
> Even if the space isn't a problem, having a full TLB flush on thread
> context switch sounds sucky. Per-node is sufficient for most things,
> and is much cheaper (update on cross-node migrate).
The scheme described requires no TLB flushing or pgd switching when
switching between threads in the same mm. Providing a true per-thread
mapping, like one for the stack, might require an invlpg or two at each
switch, but that wasn't mentioned here.
-- wli
On Tue, May 06, 2003 at 09:25:00AM +0200, Jens Axboe wrote:
> On Mon, May 05 2003, Andrew Morton wrote:
> > "David S. Miller" <[email protected]> wrote:
> > The disk_stats structure has an "in flight" member. If we don't have proper
> > locking around that, disks will appear to have -3 requests in flight for all
> > time, which would look a tad odd.
>
> So check for < 0 in flight? I totally agree with davem here.
If the disk_stats structure will never be accurate, it will show
nonsense values. If it shows nonsense values, the values have no
meaning anymore and we could remove them altogether.
If some additions/subtractions will come in later it will not
matter, but if the value we add to or subtract from is already
wrong, then the error will really propagate too much.
Has somebody analyzed this? Can we tell userspace a maximum error
value? Is the maximum error value acceptable?
If the above questions are all answered with "no", then I (and
several sysadmins) would prefer to just rip out the stats or make
them a compile-time option and be exact, if we really want them.
What about that?
Most people just want them per system to measure their
IO bandwidth and they want the spots (=per disk stats), just
to analyze the problematic cases.
For the real performance, they could always compile out the per
disk stats, once they are satisfied with the overall bandwidth.
Regards
Ingo Oeser
In message <[email protected]> you write:
> I tried to run a test to compare this implementation, but got an oops.
> Here is the oops and the patch I was trying...
> + if (!init_committed_space)
init_committed_space is a function. You meant to call it 8)
> btw, why the change from kmalloc_percpu(size) to kmalloc_percpu(type)?
> You do kmalloc(sizeof (long)) for the usual kmalloc, but
> kmalloc_percpu(long) for percpu data...looks strange no?
Yes, I'd probably want to change the name, if Andrew had agreed to the
concept. But the type is very convenient, because you want to know
the alignment (kmalloc doesn't care, it just pads to cacheline).
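To spell that out: because the macro takes a type, both the size and the
alignment fall out of it, and the caller gets back a correctly typed pointer.
The definition below is the one from the patch; the tcp_mib line is just an
example of use.

/* From the patch: the type yields the size, the alignment and the cast. */
#define kmalloc_percpu(type) \
	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))

struct tcp_mib *mib = kmalloc_percpu(struct tcp_mib);

/* A size-based call, by contrast, cannot know the alignment and would hand
 * back an untyped pointer:
 *	mib = kmalloc_percpu(sizeof(struct tcp_mib), GFP_KERNEL);
 */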
Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Rusty Russell <[email protected]> wrote:
> >
> > I figured that since the allocator is going to be there anyway, it
> > made sense to express kmalloc_percpu() in those terms. If you think
> > the price is too high, I can respect that.
>
> I suggest that we use your new mechanism to fix DEFINE_PER_CPU() in modules,
> and leave kmalloc_percpu() as it is for now.
Sure, we can revisit it later if it makes sense. After all, there's
supposed to be this freeze thing...
I'll prepare a new, minimal patch.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Wed, May 07, 2003 at 04:16:06PM +1000, Rusty Russell wrote:
> In message <[email protected]> you write:
> > I tried to run a test to compare this implementation, but got an oops.
> > Here is the oops and the patch I was trying...
>
> > + if (!init_committed_space)
>
> init_committed_space is a function. You meant to call it 8)
Yeah :(.. sorry about that, I'd actually tested an earlier patch of
yours (one you sent to Dipankar) and I'd got an oops in
__alloc_percpu (same application, but I'd actually called
init_committed_space earlier). So when I got an oops again
with the newer patch, I just mailed it.
But still, even after actually calling init_committed_space
with the newer patch (The first one which you mailed to Andrew in this
thread) I get an oops at __alloc_percpu.
Here's the oops report... it looks like the
struct pcpu_block *pcpu pointer is NULL...
Thanks,
Kiran
Unable to handle kernel NULL pointer dereference at virtual address 00000000
printing eip:
c014a5de
*pde = 00104001
*pte = 00000000
Oops: 0000 [#1]
CPU: 0
EIP: 0060:[<c014a5de>] Not tainted
EFLAGS: 00010246
EIP is at __alloc_percpu+0x4e/0x190
eax: c02f9cbc ebx: c03a8328 ecx: 00000000 edx: 00000000
esi: 00000001 edi: 00000000 ebp: 00000000 esp: dff7ff94
ds: 007b es: 007b ss: 0068
Process swapper (pid: 1, threadinfo=dff7e000 task=dff7c040)
Stack: c01070f0 00000003 c03cfe20 c03ad080 c03a8328 00000001 00000020 00000000
c01392f9 00000004 00000004 c036ccda c036cd26 c03a8328 c03607eb dff7e000
00000000 c01050c2 00000020 c0105060 00000000 00000000 00000000 c01070f5
Call Trace:
[<c01070f0>] kernel_thread_helper+0x0/0x10
[<c01392f9>] init_committed_space+0x9/0x11
[<c036ccda>] swap_setup+0x1a/0x30
[<c036cd26>] kswapd_init+0x6/0x40
[<c03607eb>] do_initcalls+0x3b/0x90
[<c01050c2>] init+0x62/0x1d0
[<c0105060>] init+0x0/0x1d0
[<c01070f5>] kernel_thread_helper+0x5/0x10
Code: 66 83 39 00 0f 84 08 01 00 00 c7 44 24 04 fc ff ff ff 31 f6
<0>Kernel panic: Attempted to kill init!
In message <[email protected]> you write:
> But still, even after actually calling init_committed_space
> with the newer patch (The first one which you mailed to Andrew in this
> thread) I get an oops at __alloc_percpu.
>
> Here's the oops report...looks like the
> struct pcpu_block *pcpu has NULL...
Damn initialization order.
Change:
__initcall(init_alloc_percpu);
to
core_initcall(init_alloc_percpu);
Anyway, patch has been scrapped, at least for now.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.