2007-02-10 11:50:18

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [1/25] x86_64: Add __copy_from_user_nocache


This does the user copies in fs write() into the page cache with write
combining (non-temporal stores). This pushes the destination out of the CPU's
cache, but allows higher bandwidth in some cases.

The theory is that the page cache data is usually not touched by the
CPU again, so it's better not to pollute the cache with it. It is also a little
faster.
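
As a rough illustration, a write() path could use the new primitive like
this (a minimal sketch; the helper name is hypothetical, not part of this
patch):

#include <asm/uaccess.h>

/*
 * Sketch: copy the write() payload into a freshly allocated page cache
 * page.  The data will be written back to disk and is unlikely to be
 * read again by the CPU soon, so bypass the cache on the store side.
 * Returns the number of uncopied bytes, 0 on success.
 */
static int copy_into_pagecache(void *kaddr, const char __user *buf,
			       unsigned size)
{
	/* may sleep; zeroes the destination tail on a fault */
	return __copy_from_user_nocache(kaddr, buf, size);
}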

Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/kernel/x8664_ksyms.c | 1
arch/x86_64/lib/Makefile | 2
arch/x86_64/lib/copy_user_nocache.S | 217 ++++++++++++++++++++++++++++++++++++
include/asm-x86_64/uaccess.h | 14 ++
4 files changed, 233 insertions(+), 1 deletion(-)

Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
Index: linux/arch/x86_64/lib/copy_user_nocache.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force the destination out of the cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag: when 1, zero the destination tail on an exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx /* save zero flag */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+
+ xorl %eax,%eax /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movnti %r11,(%rdi)
+.Ld2: movnti %r8,1*8(%rdi)
+.Ld3: movnti %r9,2*8(%rdi)
+.Ld4: movnti %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movnti %r11,4*8(%rdi)
+.Ld6: movnti %r8,5*8(%rdi)
+.Ld7: movnti %r9,6*8(%rdi)
+.Ld8: movnti %r10,7*8(%rdi)
+
+ dec %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movnti %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE %rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag set? */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
Index: linux/include/asm-x86_64/uaccess.h
===================================================================
--- linux.orig/include/asm-x86_64/uaccess.h
+++ linux/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst
return copy_user_generic((__force void *)dst, src, size);
}

+#define ARCH_HAS_NOCACHE_UACCESS 1
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest);
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+{
+ might_sleep();
+ return __copy_user_nocache(dst, (__force void *)src, size, 1);
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
+{
+ return __copy_user_nocache(dst, (__force void *)src, size, 0);
+}
+
#endif /* __X86_64_UACCESS_H */
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);

EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(__copy_from_user_inatomic);


2007-02-10 11:50:22

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation dynamic


From: Amul Shah <[email protected]>

Remove the statically allocated memory-to-NUMA-node hash map in favor of a
dynamically allocated, cache-aligned memory-to-node hash map.

This patch has the nice side effect that it allows the hash map to grow
for systems with large amounts of memory (256GB - 1TB) that suffer from
having a small PCI space tacked onto the boot node (which is somewhere
between 192MB and 512MB on the ES7000).
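
For reference, the lookup this table serves stays a single shift plus a
byte load; a simplified sketch (mirroring the phys_to_nid() change below,
debug checks dropped):

/*
 * memnode_shift is the position of the lowest set bit across all node
 * start/end addresses, so each (addr >> memnode_shift) slot maps to
 * exactly one node.  Example: two nodes [0,4GB) and [4GB,8GB) give
 * shift = 32 and a 3-byte map, which fits in the 48-byte embedded_map.
 */
static inline int node_of(unsigned long addr)
{
	return memnodemap[addr >> memnode_shift];
}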

Signed-off-by: Amul Shah <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Rohit Seth <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
Updated patch to fix a bug that Andi Kleen found for platforms that
don't support NUMA (or "numa=off").

---
arch/x86_64/kernel/e820.c | 7 ++++
arch/x86_64/kernel/setup.c | 5 ++
arch/x86_64/mm/numa.c | 74 ++++++++++++++++++++++++++++++++++++++------
include/asm-x86_64/e820.h | 1
include/asm-x86_64/mmzone.h | 13 ++++---
5 files changed, 85 insertions(+), 15 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long
return 1;
}

+#ifdef CONFIG_NUMA
+ /* NUMA memory to node map */
+ if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+ *addrp = nodemap_addr + nodemap_size;
+ return 1;
+ }
+#endif
/* XXX ramdisk image here? */
return 0;
}
Index: linux/arch/x86_64/kernel/setup.c
===================================================================
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p)
/* reserve ebda region */
if (ebda_addr)
reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+ /* reserve nodemap region */
+ if (nodemap_addr)
+ reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif

#ifdef CONFIG_SMP
/*
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_A
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;


/*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnod
int res = -1;
unsigned long addr, end;

- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+ memset(memnodemap, 0xff, memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != 0xff)
return -1;
memnodemap[addr >> shift] = i;
- addr += (1UL << shift);
+ addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}

-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
+{
+ unsigned long pad, pad_addr;
+
+ memnodemap = memnode.embedded_map;
+ if (memnodemapsize <= 48) {
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+ }
+
+ pad = L1_CACHE_BYTES - 1;
+ pad_addr = 0x8000;
+ nodemap_size = pad + memnodemapsize;
+ nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+ nodemap_size);
+ if (nodemap_addr == -1UL) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ nodemap_addr = nodemap_size = 0;
+ return -1;
+ }
+ pad_addr = (nodemap_addr + pad) & ~pad;
+ memnodemap = phys_to_virt(pad_addr);
+
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
{
- int shift = 20;
+ int i;
+ unsigned long start, end;
+ unsigned long bitfield = 0, memtop = 0;

- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
+ for (i = 0; i < numnodes; i++) {
+ start = nodes[i].start;
+ end = nodes[i].end;
+ if (start >= end)
+ continue;
+ bitfield |= start | end;
+ if (end > memtop)
+ memtop = end;
+ }
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ memnodemapsize = (memtop >> i)+1;
+ return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+ int shift;

+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if (allocate_cachealigned_memnodemap())
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);

@@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned l
end_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
memnode_shift = 63;
+ memnodemap = memnode.embedded_map;
memnodemap[0] = 0;
nodes_clear(node_online_map);
node_set_online(0);
Index: linux/include/asm-x86_64/e820.h
===================================================================
--- linux.orig/include/asm-x86_64/e820.h
+++ linux/include/asm-x86_64/e820.h
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
extern struct e820map e820;

extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
#endif/*!__ASSEMBLY__*/

#endif/*__E820_HEADER*/
Index: linux/include/asm-x86_64/mmzone.h
===================================================================
--- linux.orig/include/asm-x86_64/mmzone.h
+++ linux/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@

#include <asm/smp.h>

-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
/* Simple perfect hash to map physical addresses to node numbers */
struct memnode {
int shift;
- u8 map[NODEMAPSIZE];
-} ____cacheline_aligned;
+ unsigned int mapsize;
+ u8 *map;
+ u8 embedded_map[64-16];
+} ____cacheline_aligned; /* total size = 64 bytes */
extern struct memnode memnode;
#define memnode_shift memnode.shift
#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize

extern struct pglist_data *node_data[];

static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
{
unsigned nid;
- VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+ VIRTUAL_BUG_ON(!memnodemap);
+ VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
nid = memnodemap[addr >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
return nid;

2007-02-10 11:50:57

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [25/25] i386: arch/i386/kernel/e820.c should #include <asm/setup.h>


From: Adrian Bunk <[email protected]>

Every file should #include the headers containing the prototypes for
its global functions.

Signed-off-by: Adrian Bunk <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/i386/kernel/e820.c | 1 +
1 file changed, 1 insertion(+)

Index: linux/arch/i386/kernel/e820.c
===================================================================
--- linux.orig/arch/i386/kernel/e820.c
+++ linux/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
+#include <asm/setup.h>

#ifdef CONFIG_EFI
int efi_enabled = 0;

2007-02-10 11:51:00

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure


From: "Bryan O'Sullivan" <[email protected]>

This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any work
with it. Since the source data is read-once, write-never from the CPU's
perspective, caching the data at those addresses can only evict potentially
useful data.

We provide an x86_64 implementation that uses SSE non-temporal loads, and a
generic version that falls back to plain memcpy.

Implementors for other arches should not use cache-bypassing stores to the
destination, as in most cases, the destination is accessed almost immediately
after a copy finishes.
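
As a usage illustration, a hypothetical driver receive path might look
like this (the function and buffer names are made up for the example):

#include <linux/string.h>	/* memcpy_uncached_read() */

/*
 * Sketch: the device has DMAed a frame into dma_buf and the CPU will
 * read it exactly once while copying it out.  Bypass the cache on the
 * load side; on architectures without an implementation this degrades
 * to a plain memcpy().
 */
static void rx_copy_frame(void *dst, const void *dma_buf, size_t len)
{
	memcpy_uncached_read(dst, dma_buf, len);
}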

[[email protected]: add module export]
[[email protected]: remove an ARCH_HAS_foo]
Signed-off-by: Bryan O'Sullivan <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Roland Dreier <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/kernel/x8664_ksyms.c | 2
arch/x86_64/lib/Makefile | 1
arch/x86_64/lib/memcpy_uncached_read.S | 142 +++++++++++++++++++++++++++++++++
include/asm-x86_64/string.h | 2
include/linux/string.h | 3
5 files changed, 150 insertions(+)

Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -8,6 +8,7 @@
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include <asm/string.h>

EXPORT_SYMBOL(kernel_thread);

@@ -54,6 +55,7 @@ extern void * __memcpy(void *,const void
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memcpy_uncached_read);

EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
+lib-y += memcpy_uncached_read.o
Index: linux/arch/x86_64/lib/memcpy_uncached_read.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/memcpy_uncached_read.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+ .text
+ .p2align 4,,15
+ /* rdi destination, rsi source, rdx count */
+ .globl memcpy_uncached_read
+ .type memcpy_uncached_read, @function
+memcpy_uncached_read:
+ movq %rdi, %rax
+.L5:
+ cmpq $15, %rdx
+ ja .L34
+.L3:
+ cmpl $8, %edx /* rdx is 0..15 */
+ jbe .L9
+.L6:
+ testb $8, %dl /* rdx is 3,5,6,7,9..15 */
+ je .L13
+ movq (%rsi), %rcx
+ addq $8, %rsi
+ movq %rcx, (%rdi)
+ addq $8, %rdi
+.L13:
+ testb $4, %dl
+ je .L15
+ movl (%rsi), %ecx
+ addq $4, %rsi
+ movl %ecx, (%rdi)
+ addq $4, %rdi
+.L15:
+ testb $2, %dl
+ je .L17
+ movzwl (%rsi), %ecx
+ addq $2, %rsi
+ movw %cx, (%rdi)
+ addq $2, %rdi
+.L17:
+ testb $1, %dl
+ je .L33
+.L1:
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+.L33:
+ ret
+.L34:
+ cmpq $63, %rdx /* rdx is > 15 */
+ ja .L64
+ movl $16, %ecx /* rdx is 16..63 */
+.L25:
+ movq 8(%rsi), %r8
+ movq (%rsi), %r9
+ addq %rcx, %rsi
+ movq %r8, 8(%rdi)
+ movq %r9, (%rdi)
+ addq %rcx, %rdi
+ subq %rcx, %rdx
+ cmpl %edx, %ecx /* is rdx >= 16? */
+ jbe .L25
+ jmp .L3 /* rdx is 0..15 */
+ .p2align 4,,7
+.L64:
+ movl $64, %ecx
+.L42:
+ prefetchnta 128(%rsi)
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq %rcx, %rdx
+ movq %r8, (%rdi)
+ movq 32(%rsi), %r8
+ movq %r9, 8(%rdi)
+ movq 40(%rsi), %r9
+ movq %r10, 16(%rdi)
+ movq 48(%rsi), %r10
+ movq %r11, 24(%rdi)
+ movq 56(%rsi), %r11
+ addq %rcx, %rsi
+ movq %r8, 32(%rdi)
+ movq %r9, 40(%rdi)
+ movq %r10, 48(%rdi)
+ movq %r11, 56(%rdi)
+ addq %rcx, %rdi
+ cmpq %rdx, %rcx /* is rdx >= 64? */
+ jbe .L42
+ sfence
+ orl %edx, %edx
+ je .L33
+ jmp .L5
+.L9:
+ jmp *.L12(,%rdx,8) /* rdx is 0..8 */
+ .section .rodata
+ .align 8
+ .align 4
+.L12:
+ .quad .L33
+ .quad .L1
+ .quad .L2
+ .quad .L6
+ .quad .L4
+ .quad .L6
+ .quad .L6
+ .quad .L6
+ .quad .L8
+ .text
+.L2:
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ ret
+.L4:
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ ret
+.L8:
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ ret
Index: linux/include/asm-x86_64/string.h
===================================================================
--- linux.orig/include/asm-x86_64/string.h
+++ linux/include/asm-x86_64/string.h
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
__ret = __builtin_memcpy((dst),(src),__len); \
__ret; })

+extern void *memcpy_uncached_read(void *to, const void *from, size_t len);
+#define memcpy_uncached_read memcpy_uncached_read

#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
Index: linux/include/linux/string.h
===================================================================
--- linux.orig/include/linux/string.h
+++ linux/include/linux/string.h
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
+#ifndef memcpy_uncached_read
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif

2007-02-10 11:51:45

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [9/25] x86_64: always use physical delivery mode on > 8 CPUs


From: Ingo Molnar <[email protected]>

Remove clustered APIC mode. There's little point in using clustered APIC
mode: broadcasting is limited to within the cluster only, and chipsets have
bugs in this area as well. So default to physical APIC mode when the CPU
count is large, and to logical APIC mode when the CPU count is 8 or
smaller.

(This patch only removes the use of genapic_cluster and cleans up the
resulting genapic.c file; removal of all remaining traces of clustered
mode will be done by another patch.)

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/kernel/genapic.c | 71 +++++++++----------------------------------
include/asm-x86_64/genapic.h | 4 +-
2 files changed, 18 insertions(+), 57 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -11,26 +11,24 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
+#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
-#include <linux/module.h>

#include <asm/smp.h>
#include <asm/ipi.h>

-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif

/* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
+ = { [0 ... NR_CPUS-1] = BAD_APICID };
EXPORT_SYMBOL(x86_cpu_to_apicid);
-u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };

-extern struct genapic apic_cluster;
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
+u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };

struct genapic __read_mostly *genapic = &apic_flat;

@@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic =
*/
void __init clustered_apic_check(void)
{
- int i;
- u8 clusters, max_cluster;
+ unsigned int i, max_apic = 0;
u8 id;
- u8 cluster_cnt[NUM_APIC_CLUSTERS];
- int max_apic = 0;

#ifdef CONFIG_ACPI
/*
- * Some x86_64 machines use physical APIC mode regardless of how many
- * procs/clusters are present (x86_64 ES7000 is an example).
+ * Quirk: some x86_64 machines can only use physical APIC mode
+ * regardless of how many processors are present (x86_64 ES7000
+ * is an example).
*/
- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
- if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
- genapic = &apic_cluster;
- goto print;
- }
+ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+ genapic = &apic_physflat;
#endif

- memset(cluster_cnt, 0, sizeof(cluster_cnt));
for (i = 0; i < NR_CPUS; i++) {
id = bios_cpu_apicid[i];
if (id == BAD_APICID)
continue;
if (id > max_apic)
max_apic = id;
- cluster_cnt[APIC_CLUSTERID(id)]++;
}

- /*
- * Don't use clustered mode on AMD platforms, default
- * to flat logical mode.
- */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- /*
- * Switch to physical flat mode if more than 8 APICs
- * (In the case of 8 CPUs APIC ID goes from 0 to 7):
- */
- if (max_apic >= 8)
- genapic = &apic_physflat;
- goto print;
- }
-
- clusters = 0;
- max_cluster = 0;
-
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (cluster_cnt[i] > 0) {
- ++clusters;
- if (cluster_cnt[i] > max_cluster)
- max_cluster = cluster_cnt[i];
- }
- }
-
- /*
- * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
- * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
- * else physical mode.
- * (We don't use lowest priority delivery + HW APIC IRQ steering, so
- * can ignore the clustered logical case and go straight to physical.)
- */
- if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+ if (max_apic < 8)
genapic = &apic_flat;
else
- genapic = &apic_cluster;
+ genapic = &apic_physflat;

-print:
printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
}

-/* Same for both flat and clustered. */
+/* Same for both flat and physical. */

void send_IPI_self(int vector)
{
Index: linux/include/asm-x86_64/genapic.h
===================================================================
--- linux.orig/include/asm-x86_64/genapic.h
+++ linux/include/asm-x86_64/genapic.h
@@ -29,7 +29,9 @@ struct genapic {
unsigned int (*phys_pkg_id)(int index_msb);
};

-
extern struct genapic *genapic;

+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+
#endif

2007-02-10 11:51:45

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels


From: Ingo Molnar <[email protected]>

Default to physical mode on hotplug CPU kernels. Further simplify and clean up
the APIC initialization code.

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/kernel/acpi/boot.c | 2 +-
arch/i386/kernel/mpparse.c | 2 +-
arch/x86_64/kernel/genapic.c | 16 +++-------------
arch/x86_64/kernel/mpparse.c | 2 +-
include/asm-i386/genapic.h | 4 ++--
include/asm-i386/mach-bigsmp/mach_apic.h | 2 +-
include/asm-i386/mach-default/mach_apic.h | 2 +-
include/asm-i386/mach-es7000/mach_apic.h | 2 +-
include/asm-i386/mach-generic/mach_apic.h | 2 +-
include/asm-i386/mach-numaq/mach_apic.h | 2 +-
include/asm-i386/mach-summit/mach_apic.h | 2 +-
include/asm-i386/mach-visws/mach_apic.h | 2 +-
include/asm-x86_64/apic.h | 2 +-
13 files changed, 16 insertions(+), 26 deletions(-)

Index: linux/arch/i386/kernel/acpi/boot.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/boot.c
+++ linux/arch/i386/kernel/acpi/boot.c
@@ -890,7 +890,7 @@ static void __init acpi_process_madt(voi
acpi_ioapic = 1;

smp_found_config = 1;
- clustered_apic_check();
+ setup_apic_routing();
}
}
if (error == -EINVAL) {
Index: linux/arch/i386/kernel/mpparse.c
===================================================================
--- linux.orig/arch/i386/kernel/mpparse.c
+++ linux/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp
}
++mpc_record;
}
- clustered_apic_check();
+ setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "SMP mptable: no processors registered!\n");
return num_processors;
Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic =
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
-void __init clustered_apic_check(void)
+void __init setup_apic_routing(void)
{
- unsigned int i, max_apic = 0;
- u8 id;
-
#ifdef CONFIG_ACPI
/*
* Quirk: some x86_64 machines can only use physical APIC mode
@@ -49,17 +46,10 @@ void __init clustered_apic_check(void)
if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
genapic = &apic_physflat;
+ else
#endif

- for (i = 0; i < NR_CPUS; i++) {
- id = bios_cpu_apicid[i];
- if (id == BAD_APICID)
- continue;
- if (id > max_apic)
- max_apic = id;
- }
-
- if (max_apic < 8)
+ if (cpus_weight(cpu_possible_map) <= 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
Index: linux/arch/x86_64/kernel/mpparse.c
===================================================================
--- linux.orig/arch/x86_64/kernel/mpparse.c
+++ linux/arch/x86_64/kernel/mpparse.c
@@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp
}
}
}
- clustered_apic_check();
+ setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
Index: linux/include/asm-i386/genapic.h
===================================================================
--- linux.orig/include/asm-i386/genapic.h
+++ linux/include/asm-i386/genapic.h
@@ -36,7 +36,7 @@ struct genapic {
void (*init_apic_ldr)(void);
physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);

- void (*clustered_apic_check)(void);
+ void (*setup_apic_routing)(void);
int (*multi_timer_check)(int apic, int irq);
int (*apicid_to_node)(int logical_apicid);
int (*cpu_to_logical_apicid)(int cpu);
@@ -99,7 +99,7 @@ struct genapic {
APICFUNC(check_apicid_present) \
APICFUNC(init_apic_ldr) \
APICFUNC(ioapic_phys_id_map) \
- APICFUNC(clustered_apic_check) \
+ APICFUNC(setup_apic_routing) \
APICFUNC(multi_timer_check) \
APICFUNC(apicid_to_node) \
APICFUNC(cpu_to_logical_apicid) \
Index: linux/include/asm-i386/mach-bigsmp/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-bigsmp/mach_apic.h
+++ linux/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -71,7 +71,7 @@ static inline void init_apic_ldr(void)
apic_write_around(APIC_LDR, val);
}

-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
"Physflat", nr_ioapics);
Index: linux/include/asm-i386/mach-default/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/mach_apic.h
+++ linux/include/asm-i386/mach-default/mach_apic.h
@@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_
return phys_map;
}

-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
"Flat", nr_ioapics);
Index: linux/include/asm-i386/mach-es7000/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-es7000/mach_apic.h
+++ linux/include/asm-i386/mach-es7000/mach_apic.h
@@ -81,7 +81,7 @@ static inline void enable_apic_mode(void
}

extern int apic_version [MAX_APICS];
-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
int apic = bios_cpu_apicid[smp_processor_id()];
printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
Index: linux/include/asm-i386/mach-generic/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-generic/mach_apic.h
+++ linux/include/asm-i386/mach-generic/mach_apic.h
@@ -13,7 +13,7 @@
#define apic_id_registered (genapic->apic_id_registered)
#define init_apic_ldr (genapic->init_apic_ldr)
#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
-#define clustered_apic_check (genapic->clustered_apic_check)
+#define setup_apic_routing (genapic->setup_apic_routing)
#define multi_timer_check (genapic->multi_timer_check)
#define apicid_to_node (genapic->apicid_to_node)
#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
Index: linux/include/asm-i386/mach-numaq/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-numaq/mach_apic.h
+++ linux/include/asm-i386/mach-numaq/mach_apic.h
@@ -34,7 +34,7 @@ static inline void init_apic_ldr(void)
/* Already done in NUMA-Q firmware */
}

-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
"NUMA-Q", nr_ioapics);
Index: linux/include/asm-i386/mach-summit/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-summit/mach_apic.h
+++ linux/include/asm-i386/mach-summit/mach_apic.h
@@ -80,7 +80,7 @@ static inline int apic_id_registered(voi
return 1;
}

-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
nr_ioapics);
Index: linux/include/asm-i386/mach-visws/mach_apic.h
===================================================================
--- linux.orig/include/asm-i386/mach-visws/mach_apic.h
+++ linux/include/asm-i386/mach-visws/mach_apic.h
@@ -47,7 +47,7 @@ static inline void summit_check(char *oe
{
}

-static inline void clustered_apic_check(void)
+static inline void setup_apic_routing(void)
{
}

Index: linux/include/asm-x86_64/apic.h
===================================================================
--- linux.orig/include/asm-x86_64/apic.h
+++ linux/include/asm-x86_64/apic.h
@@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
-extern void clustered_apic_check(void);
+extern void setup_apic_routing(void);

extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
unsigned char msg_type, unsigned char mask);

2007-02-10 11:51:48

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thread


From: Jeff Dike <[email protected]>

Kernel-mode traps on x86_64 can pollute the trap information for a previous
userspace trap for which the signal has not yet been delivered to the
process.

do_trap and do_general_protection set task->thread.error_code and .trapno
for kernel traps. If a kernel-mode trap arrives between the arrival of a
userspace trap and the delivery of the associated SIGSEGV to the process,
the process will get the kernel trap information in its sigcontext.
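
An illustrative interleaving of the race (simplified):

1. user space takes a page fault; the fault handler sets
   thread.trap_no = 14 and queues a SIGSEGV for the process
2. before that signal is delivered, the kernel itself takes a GP
   fault; do_general_protection() overwrites thread.trap_no = 13
3. the pending SIGSEGV is now delivered with sigcontext.trapno == 13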

This causes UML process segfaults, as the trapno that the UML kernel sees
is 13, rather than the 14 for normal page faults. So, the UML kernel
passes the SIGSEGV along to its process.

I don't claim to fully understand the problem. On the one hand, a check in
do_general_protection for a pending SIGSEGV turned up nothing. On the
other hand, this patch fixed the UML process segfault problem.

The patch below moves the setting of error_code and trapno so that it
happens only in the case of userspace faults. As a side-effect, this
should speed up kernel-mode fault handling a tiny bit.

I looked at i386, and there is a similar situation. In this case, there is
duplicate code setting task->thread.error_code and trapno. I deleted one,
leaving the copy that runs in the case of a userspace fault.

Signed-off-by: Jeff Dike <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/kernel/traps.c | 8 +++-----
arch/x86_64/kernel/traps.c | 12 ++++++------
2 files changed, 9 insertions(+), 11 deletions(-)

Index: linux/arch/i386/kernel/traps.c
===================================================================
--- linux.orig/arch/i386/kernel/traps.c
+++ linux/arch/i386/kernel/traps.c
@@ -474,8 +474,6 @@ static void __kprobes do_trap(int trapnr
siginfo_t *info)
{
struct task_struct *tsk = current;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;

if (regs->eflags & VM_MASK) {
if (vm86)
@@ -487,6 +485,9 @@ static void __kprobes do_trap(int trapnr
goto kernel_trap;

trap_signal: {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
if (info)
force_sig_info(signr, info, tsk);
else
@@ -601,9 +602,6 @@ fastcall void __kprobes do_general_prote
}
put_cpu();

- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
-
if (regs->eflags & VM_MASK)
goto gp_in_vm86;

Index: linux/arch/x86_64/kernel/traps.c
===================================================================
--- linux.orig/arch/x86_64/kernel/traps.c
+++ linux/arch/x86_64/kernel/traps.c
@@ -581,10 +581,10 @@ static void __kprobes do_trap(int trapnr
{
struct task_struct *tsk = current;

- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
-
if (user_mode(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
if (exception_trace && unhandled_signal(tsk, signr))
printk(KERN_INFO
"%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
@@ -682,10 +682,10 @@ asmlinkage void __kprobes do_general_pro

conditional_sti(regs);

- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
-
if (user_mode(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+
if (exception_trace && unhandled_signal(tsk, SIGSEGV))
printk(KERN_INFO
"%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",

2007-02-10 11:52:28

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [6/25] x86_64: revert x86_64-mm-add-genapic_force


From: Andrew Morton <[email protected]>

This is obsoleted by Ingo's new genapic patches.

Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/x86_64/kernel/genapic.c | 9 +--------
include/asm-x86_64/genapic.h | 2 +-
2 files changed, 2 insertions(+), 9 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -33,7 +33,7 @@ extern struct genapic apic_flat;
extern struct genapic apic_physflat;

struct genapic *genapic = &apic_flat;
-struct genapic *genapic_force;
+

/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,13 +46,6 @@ void __init clustered_apic_check(void)
u8 cluster_cnt[NUM_APIC_CLUSTERS];
int max_apic = 0;

- /* genapic selection can be forced because of certain quirks.
- */
- if (genapic_force) {
- genapic = genapic_force;
- goto print;
- }
-
#if defined(CONFIG_ACPI)
/*
* Some x86_64 machines use physical APIC mode regardless of how many
Index: linux/include/asm-x86_64/genapic.h
===================================================================
--- linux.orig/include/asm-x86_64/genapic.h
+++ linux/include/asm-x86_64/genapic.h
@@ -30,6 +30,6 @@ struct genapic {
};


-extern struct genapic *genapic, *genapic_force, apic_flat;
+extern struct genapic *genapic;

#endif

2007-02-10 11:52:28

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [7/25] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525


From: Andrew Morton <[email protected]>

Obsoleted by Ingo's genapic stuff.

Cc: Ingo Molnar <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/i386/kernel/acpi/earlyquirk.c | 21 ----------------
arch/i386/kernel/quirks.c | 46 ++++++++-----------------------------
arch/i386/kernel/smpboot.c | 7 -----
arch/x86_64/kernel/early-quirks.c | 13 ----------
arch/x86_64/kernel/smpboot.c | 8 ------
include/asm-i386/genapic.h | 2 -
include/asm-i386/irq.h | 2 -
include/asm-x86_64/proto.h | 1
8 files changed, 12 insertions(+), 88 deletions(-)

Index: linux/arch/i386/kernel/acpi/earlyquirk.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/earlyquirk.c
+++ linux/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
#include <asm/pci-direct.h>
#include <asm/acpi.h>
#include <asm/apic.h>
-#include <asm/irq.h>

#ifdef CONFIG_ACPI

@@ -50,24 +49,6 @@ static int __init check_bridge(int vendo
return 0;
}

-static void check_intel(void)
-{
- u16 vendor, device;
-
- vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
-
- if (vendor != PCI_VENDOR_ID_INTEL)
- return;
-
- device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-#ifdef CONFIG_SMP
- if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7525_MCH)
- quirk_intel_irqbalance();
-#endif
-}
-
void __init check_acpi_pci(void)
{
int num, slot, func;
@@ -79,8 +60,6 @@ void __init check_acpi_pci(void)
if (!early_pci_allowed())
return;

- check_intel();
-
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
for (slot = 0; slot < 32; slot++) {
Index: linux/arch/i386/kernel/quirks.c
===================================================================
--- linux.orig/arch/i386/kernel/quirks.c
+++ linux/arch/i386/kernel/quirks.c
@@ -3,23 +3,10 @@
*/
#include <linux/pci.h>
#include <linux/irq.h>
-#include <asm/pci-direct.h>
-#include <asm/genapic.h>
-#include <asm/cpu.h>

#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
-{
-#ifdef CONFIG_X86_64
- if (genapic != &apic_flat)
- panic("APIC mode must be flat on this system\n");
-#elif defined(CONFIG_X86_GENERICARCH)
- if (genapic != &apic_default)
- panic("APIC mode must be default(flat) on this system. Use apic=default\n");
-#endif
-}

-void __init quirk_intel_irqbalance(void)
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
{
u8 config, rev;
u32 word;
@@ -29,18 +16,18 @@ void __init quirk_intel_irqbalance(void)
* based platforms.
* Disable SW irqbalance/affinity on those platforms.
*/
- rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
if (rev > 0x9)
return;

printk(KERN_INFO "Intel E7520/7320/7525 detected.");

- /* enable access to config space */
- config = read_pci_config_byte(0, 0, 0, 0xf4);
- write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
+ /* enable access to config space*/
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);

/* read xTPR register */
- word = read_pci_config_16(0, 0, 0x40, 0x4c);
+ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);

if (!(word & (1 << 13))) {
printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -51,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
#endif
-#ifdef CONFIG_HOTPLUG_CPU
- printk(KERN_INFO "Disabling cpu hotplug control\n");
- enable_cpu_hotplug = 0;
-#endif
-#ifdef CONFIG_X86_64
- /* force the genapic selection to flat mode so that
- * interrupts can be redirected to more than one CPU.
- */
- genapic_force = &apic_flat;
-#endif
}

- /* put back the original value for config space */
+ /* put back the original value for config space*/
if (!(config & 0x2))
- write_pci_config_byte(0, 0, 0, 0xf4, config);
+ pci_write_config_byte(dev, 0xf4, config);
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
-
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
#endif
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -58,7 +58,6 @@
#include <asm/arch_hooks.h>
#include <asm/nmi.h>
#include <asm/pda.h>
-#include <asm/genapic.h>

#include <mach_apic.h>
#include <mach_wakecpu.h>
@@ -1466,12 +1465,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
-
-#ifdef CONFIG_X86_GENERICARCH
- if (num_online_cpus() > 8 && genapic == &apic_default)
- panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
-
return 0;
}

Index: linux/arch/x86_64/kernel/early-quirks.c
===================================================================
--- linux.orig/arch/x86_64/kernel/early-quirks.c
+++ linux/arch/x86_64/kernel/early-quirks.c
@@ -76,18 +76,6 @@ static void ati_bugs(void)
}
}

-static void intel_bugs(void)
-{
- u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-
-#ifdef CONFIG_SMP
- if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7525_MCH)
- quirk_intel_irqbalance();
-#endif
-}
-
struct chipset {
u16 vendor;
void (*f)(void);
@@ -97,7 +85,6 @@ static struct chipset early_qrk[] = {
{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
{ PCI_VENDOR_ID_VIA, via_bugs },
{ PCI_VENDOR_ID_ATI, ati_bugs },
- { PCI_VENDOR_ID_INTEL, intel_bugs},
{}
};

Index: linux/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smpboot.c
+++ linux/arch/x86_64/kernel/smpboot.c
@@ -60,7 +60,6 @@
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/numa.h>
-#include <asm/genapic.h>

/* Number of siblings per CPU package */
int smp_num_siblings = 1;
@@ -1170,13 +1169,6 @@ int __cpuinit __cpu_up(unsigned int cpu)

while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
-
- if (num_online_cpus() > 8 && genapic == &apic_flat) {
- printk(KERN_WARNING
- "flat APIC routing can't be used with > 8 cpus\n");
- BUG();
- }
-
err = 0;

return err;
Index: linux/include/asm-i386/genapic.h
===================================================================
--- linux.orig/include/asm-i386/genapic.h
+++ linux/include/asm-i386/genapic.h
@@ -122,6 +122,6 @@ struct genapic {
APICFUNC(phys_pkg_id) \
}

-extern struct genapic *genapic, apic_default;
+extern struct genapic *genapic;

#endif
Index: linux/include/asm-i386/irq.h
===================================================================
--- linux.orig/include/asm-i386/irq.h
+++ linux/include/asm-i386/irq.h
@@ -37,8 +37,6 @@ static __inline__ int irq_canonicalize(i
extern int irqbalance_disable(char *str);
#endif

-extern void quirk_intel_irqbalance(void);
-
#ifdef CONFIG_HOTPLUG_CPU
extern void fixup_irqs(cpumask_t map);
#endif
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -87,7 +87,6 @@ extern void syscall32_cpu_init(void);
extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);

extern void early_quirks(void);
-extern void quirk_intel_irqbalance(void);
extern void check_efer(void);

extern int unhandled_signal(struct task_struct *tsk, int sig);

2007-02-10 11:53:00

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [19/25] x86: Reject broken MCFG tables on Asus etc


From: OGAWA Hirofumi <[email protected]>

This rejects broken MCFG tables on Asus etc., as suggested by Arjan and
Andi.

Signed-off-by: OGAWA Hirofumi <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/i386/pci/mmconfig-shared.c | 24 ++++++++++++++++++-
arch/i386/pci/mmconfig.c | 9 -------
arch/x86_64/pci/mmconfig.c | 50 +++++++++++-----------------------------
3 files changed, 37 insertions(+), 46 deletions(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -197,6 +197,26 @@ static __init void pci_mmcfg_insert_reso
}
}

+static void __init pci_mmcfg_reject_broken(void)
+{
+ typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[0];
+
+ /*
+ * Handle more broken MCFG tables on Asus etc.
+ * They only contain a single entry for bus 0-0.
+ */
+ if (pci_mmcfg_config_num == 1 &&
+ cfg->pci_segment == 0 &&
+ (cfg->start_bus_number | cfg->end_bus_number) == 0) {
+ kfree(pci_mmcfg_config);
+ pci_mmcfg_config = NULL;
+ pci_mmcfg_config_num = 0;
+
+ printk(KERN_ERR "PCI: start and end of bus number is 0. "
+ "Rejected as broken MCFG.");
+ }
+}
+
void __init pci_mmcfg_init(int type)
{
int known_bridge = 0;
@@ -207,8 +227,10 @@ void __init pci_mmcfg_init(int type)
if (type == 1 && pci_mmcfg_check_hostbridge())
known_bridge = 1;

- if (!known_bridge)
+ if (!known_bridge) {
acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+ pci_mmcfg_reject_broken();
+ }

if ((pci_mmcfg_config_num == 0) ||
(pci_mmcfg_config == NULL) ||
Index: linux/arch/i386/pci/mmconfig.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig.c
+++ linux/arch/i386/pci/mmconfig.c
@@ -47,15 +47,6 @@ static u32 get_base_addr(unsigned int se
return cfg->address;
}

- /* Handle more broken MCFG tables on Asus etc.
- They only contain a single entry for bus 0-0. Assume
- this applies to all busses. */
- cfg = &pci_mmcfg_config[0];
- if (pci_mmcfg_config_num == 1 &&
- cfg->pci_segment == 0 &&
- (cfg->start_bus_number | cfg->end_bus_number) == 0)
- return cfg->address;
-
/* Fall back to type 0 */
return 0;
}
Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -28,39 +28,6 @@ struct mmcfg_virt {
};
static struct mmcfg_virt *pci_mmcfg_virt;

-static inline int mcfg_broken(void)
-{
- struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0];
-
- /* Handle more broken MCFG tables on Asus etc.
- They only contain a single entry for bus 0-0. Assume
- this applies to all busses. */
- if (pci_mmcfg_config_num == 1 &&
- cfg->pci_segment_group_number == 0 &&
- (cfg->start_bus_number | cfg->end_bus_number) == 0)
- return 1;
- return 0;
-}
-
-static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
-{
- void __iomem *addr;
- u32 size;
-
- if (mcfg_broken())
- size = 256 << 20;
- else
- size = (cfg->end_bus_number + 1) << 20;
-
- addr = ioremap_nocache(cfg->base_address, size);
- if (addr) {
- printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n",
- cfg->base_address,
- cfg->base_address + size - 1);
- }
- return addr;
-}
-
static char __iomem *get_virt(unsigned int seg, unsigned bus)
{
int cfg_num = -1;
@@ -78,9 +45,6 @@ static char __iomem *get_virt(unsigned i
return pci_mmcfg_virt[cfg_num].virt;
}

- if (mcfg_broken())
- return pci_mmcfg_virt[0].virt;
-
/* Fall back to type 0 */
return NULL;
}
@@ -160,6 +124,20 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};

+static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+{
+ void __iomem *addr;
+ u32 size;
+
+ size = (cfg->end_bus_number + 1) << 20;
+ addr = ioremap_nocache(cfg->address, size);
+ if (addr) {
+ printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
+ cfg->address, cfg->address + size - 1);
+ }
+ return addr;
+}
+
int __init pci_mmcfg_arch_init(void)
{
int i;

2007-02-10 11:53:04

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [17/25] i386: Reserve resources but only when we're sure about them.


From: Olivier Galibert <[email protected]>

Put back the resource reservation as per
4c6e052adfe285ede5884e4e8c4d33af33932c13, but use it *only* when the range(s)
come from a chipset probe instead of the BIOS.

Signed-off-by: Olivier Galibert <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/pci/mmconfig-shared.c | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -166,6 +166,37 @@ static int __init pci_mmcfg_check_hostbr
return name != NULL;
}

+static __init void pci_mmcfg_insert_resources(void)
+{
+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+ int i;
+ struct resource *res;
+ char *names;
+ unsigned num_buses;
+
+ res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
+ pci_mmcfg_config_num, GFP_KERNEL);
+
+ if (!res) {
+ printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+ return;
+ }
+
+ names = (void *)&res[pci_mmcfg_config_num];
+ for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
+ num_buses = pci_mmcfg_config[i].end_bus_number -
+ pci_mmcfg_config[i].start_bus_number + 1;
+ res->name = names;
+ snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
+ pci_mmcfg_config[i].pci_segment);
+ res->start = pci_mmcfg_config[i].address;
+ res->end = res->start + (num_buses << 20) - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ names += PCI_MMCFG_RESOURCE_NAME_LEN;
+ }
+}
+
void __init pci_mmcfg_init(int type)
{
int known_bridge = 0;
@@ -199,6 +230,8 @@ void __init pci_mmcfg_init(int type)
if (pci_mmcfg_arch_init()) {
if (type == 1)
unreachable_devices();
+ if (known_bridge)
+ pci_mmcfg_insert_resources();
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
}
}

2007-02-10 11:53:02

by Andi Kleen

Subject: [PATCH 2.6.21 review I] [3/25] i386: Convert i386 PDA code to use %fs


From: Jeremy Fitzhardinge <[email protected]>

Convert the PDA code to use %fs rather than %gs as the segment for
per-processor data. This is because some processors show a small but
measurable performance gain for reloading a NULL segment selector (as %fs
generally is in user-space) versus a non-NULL one (as %gs generally is).

On modern processors the difference is very small, perhaps undetectable.
Some old AMD "K6 3D+" processors are noticeably slower when %fs is used
rather than %gs; I have no idea why this might be, but I think they're
sufficiently rare that it doesn't matter much.
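
For context, the PDA accessors compile down to a single segment-prefixed
memory access; a sketch of the pattern (cf. the pda_from_op() machinery
in include/asm-i386/pda.h), now using %fs:

#include <linux/stddef.h>	/* offsetof */
#include <asm/pda.h>		/* struct i386_pda */

/* Sketch: read the current task pointer from this CPU's PDA. */
static inline struct task_struct *pda_current(void)
{
	struct task_struct *t;
	asm("movl %%fs:%c1, %0"
	    : "=r" (t)
	    : "i" (offsetof(struct i386_pda, pcurrent)));
	return t;
}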

This patch also fixes the math emulator, which had not been adjusted to
match the changed struct pt_regs.

[[email protected]: fixit with gdb]
[[email protected]: Fix KVM too]

Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Ian Campbell <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Cc: Andi Kleen <[email protected]>
Acked-by: Zachary Amsden <[email protected]>
Cc: Eric Dumazet <[email protected]>
Signed-off-by: Frederik Deweerdt <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/kernel/asm-offsets.c | 2 +-
arch/i386/kernel/cpu/common.c | 14 +++++++-------
arch/i386/kernel/entry.S | 32 ++++++++++++++++----------------
arch/i386/kernel/head.S | 6 +++---
arch/i386/kernel/kprobes.c | 4 ++--
arch/i386/kernel/process.c | 24 +++++++++++-------------
arch/i386/kernel/ptrace.c | 16 ++++++++--------
arch/i386/kernel/signal.c | 10 +++++-----
arch/i386/kernel/traps.c | 7 ++++---
arch/i386/kernel/vm86.c | 33 +++++++++++++++++----------------
arch/i386/math-emu/get_address.c | 14 +++++---------
drivers/kvm/vmx.c | 12 ++++++------
include/asm-i386/elf.h | 4 ++--
include/asm-i386/mmu_context.h | 2 +-
include/asm-i386/pda.h | 12 ++++++------
include/asm-i386/processor.h | 6 +++---
include/asm-i386/ptrace.h | 4 ++--
17 files changed, 99 insertions(+), 103 deletions(-)

Index: linux/arch/i386/kernel/asm-offsets.c
===================================================================
--- linux.orig/arch/i386/kernel/asm-offsets.c
+++ linux/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
OFFSET(PT_EAX, pt_regs, eax);
OFFSET(PT_DS, pt_regs, xds);
OFFSET(PT_ES, pt_regs, xes);
- OFFSET(PT_GS, pt_regs, xgs);
+ OFFSET(PT_FS, pt_regs, xfs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
OFFSET(PT_EIP, pt_regs, eip);
OFFSET(PT_CS, pt_regs, xcs);
Index: linux/arch/i386/kernel/cpu/common.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/common.c
+++ linux/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
- regs->xgs = __KERNEL_PDA;
+ regs->xfs = __KERNEL_PDA;
return regs;
}

@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
.pcurrent = &init_task,
};

-static inline void set_kernel_gs(void)
+static inline void set_kernel_fs(void)
{
- /* Set %gs for this CPU's PDA. Memory clobber is to create a
+ /* Set %fs for this CPU's PDA. Memory clobber is to create a
barrier with respect to any PDA operations, so the compiler
doesn't move any before here. */
- asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
}

/* Initialize the CPU's GDT and PDA. The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
the boot CPU, this will transition from the boot gdt+pda to
the real ones). */
load_gdt(cpu_gdt_descr);
- set_kernel_gs();
+ set_kernel_fs();
}

/* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu,
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif

- /* Clear %fs. */
- asm volatile ("mov %0, %%fs" : : "r" (0));
+ /* Clear %gs. */
+ asm volatile ("mov %0, %%gs" : : "r" (0));

/* Clear all 6 debug registers: */
set_debugreg(0, 0);
Index: linux/arch/i386/kernel/entry.S
===================================================================
--- linux.orig/arch/i386/kernel/entry.S
+++ linux/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
- * 24(%esp) - %gs
+ * 24(%esp) - %fs
* 28(%esp) - orig_eax
* 2C(%esp) - %eip
* 30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK = 0x00020000

#define SAVE_ALL \
cld; \
- pushl %gs; \
+ pushl %fs; \
CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET gs, 0;*/\
+ /*CFI_REL_OFFSET fs, 0;*/\
pushl %es; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK = 0x00020000
movl %edx, %ds; \
movl %edx, %es; \
movl $(__KERNEL_PDA), %edx; \
- movl %edx, %gs
+ movl %edx, %fs

#define RESTORE_INT_REGS \
popl %ebx; \
@@ -166,9 +166,9 @@ VM_MASK = 0x00020000
2: popl %es; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE es;*/\
-3: popl %gs; \
+3: popl %fs; \
CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE gs;*/\
+ /*CFI_RESTORE fs;*/\
.pushsection .fixup,"ax"; \
4: movl $0,(%esp); \
jmp 1b; \
@@ -349,11 +349,11 @@ sysenter_past_esp:
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
-1: mov PT_GS(%esp), %gs
+1: mov PT_FS(%esp), %fs
ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
.pushsection .fixup,"ax"
-2: movl $0,PT_GS(%esp)
+2: movl $0,PT_FS(%esp)
jmp 1b
.section __ex_table,"a"
.align 4
@@ -550,7 +550,7 @@ syscall_badsys:

#define FIXUP_ESPFIX_STACK \
/* since we are on a wrong stack, we cant make it a C code :( */ \
- movl %gs:PDA_cpu, %ebx; \
+ movl %fs:PDA_cpu, %ebx; \
PER_CPU(cpu_gdt_descr, %ebx); \
movl GDS_address(%ebx), %ebx; \
GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -632,7 +632,7 @@ KPROBE_ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %gs's slot on the stack */
+ /* the function address is in %fs's slot on the stack */
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -661,20 +661,20 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %gs
+ pushl %fs
CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET gs, 0*/
+ /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PDA), %ecx
- movl %ecx, %gs
+ movl %ecx, %fs
UNWIND_ESPFIX_STACK
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
/*CFI_REGISTER es, ecx*/
- movl PT_GS(%esp), %edi # get the function address
+ movl PT_FS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, ES*/
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
Index: linux/arch/i386/kernel/head.S
===================================================================
--- linux.orig/arch/i386/kernel/head.S
+++ linux/arch/i386/kernel/head.S
@@ -319,12 +319,12 @@ is386: movl $2,%ecx # set MP
movl %eax,%ds
movl %eax,%es

- xorl %eax,%eax # Clear FS and LDT
- movl %eax,%fs
+ xorl %eax,%eax # Clear GS and LDT
+ movl %eax,%gs
lldt %ax

movl $(__KERNEL_PDA),%eax
- mov %eax,%gs
+ mov %eax,%fs

cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
Index: linux/arch/i386/kernel/kprobes.c
===================================================================
--- linux.orig/arch/i386/kernel/kprobes.c
+++ linux/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
" pushf\n"
/* skip cs, eip, orig_eax */
" subl $12, %esp\n"
- " pushl %gs\n"
+ " pushl %fs\n"
" pushl %ds\n"
" pushl %es\n"
" pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
" popl %edi\n"
" popl %ebp\n"
" popl %eax\n"
- /* skip eip, orig_eax, es, ds, gs */
+ /* skip eip, orig_eax, es, ds, fs */
" addl $20, %esp\n"
" popf\n"
" ret\n");
Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -308,8 +308,8 @@ void show_regs(struct pt_regs * regs)
regs->eax,regs->ebx,regs->ecx,regs->edx);
printk("ESI: %08lx EDI: %08lx EBP: %08lx",
regs->esi, regs->edi, regs->ebp);
- printk(" DS: %04x ES: %04x GS: %04x\n",
- 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
+ printk(" DS: %04x ES: %04x FS: %04x\n",
+ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);

cr0 = read_cr0();
cr2 = read_cr2();
@@ -340,7 +340,7 @@ int kernel_thread(int (*fn)(void *), voi

regs.xds = __USER_DS;
regs.xes = __USER_DS;
- regs.xgs = __KERNEL_PDA;
+ regs.xfs = __KERNEL_PDA;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +425,7 @@ int copy_thread(int nr, unsigned long cl

p->thread.eip = (unsigned long) ret_from_fork;

- savesegment(fs,p->thread.fs);
+ savesegment(gs,p->thread.gs);

tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +501,8 @@ void dump_thread(struct pt_regs * regs,
dump->regs.eax = regs->eax;
dump->regs.ds = regs->xds;
dump->regs.es = regs->xes;
- savesegment(fs,dump->regs.fs);
- dump->regs.gs = regs->xgs;
+ dump->regs.fs = regs->xfs;
+ savesegment(gs,dump->regs.gs);
dump->regs.orig_eax = regs->orig_eax;
dump->regs.eip = regs->eip;
dump->regs.cs = regs->xcs;
@@ -653,7 +653,7 @@ struct task_struct fastcall * __switch_t
load_esp0(tss, next);

/*
- * Save away %fs. No need to save %gs, as it was saved on the
+ * Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
@@ -662,7 +662,7 @@ struct task_struct fastcall * __switch_t
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(fs, prev->fs);
+ savesegment(gs, prev->gs);

/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -670,12 +670,10 @@ struct task_struct fastcall * __switch_t
load_TLS(next, cpu);

/*
- * Restore %fs if needed.
- *
- * Glibc normally makes %fs be zero.
+ * Restore %gs if needed (which is common)
*/
- if (unlikely(prev->fs | next->fs))
- loadsegment(fs, next->fs);
+ if (prev->gs | next->gs)
+ loadsegment(gs, next->gs);

write_pda(pcurrent, next_p);

Index: linux/arch/i386/kernel/ptrace.c
===================================================================
--- linux.orig/arch/i386/kernel/ptrace.c
+++ linux/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *ch
unsigned long regno, unsigned long value)
{
switch (regno >> 2) {
- case FS:
+ case GS:
if (value && (value & 3) != 3)
return -EIO;
- child->thread.fs = value;
+ child->thread.gs = value;
return 0;
case DS:
case ES:
- case GS:
+ case FS:
if (value && (value & 3) != 3)
return -EIO;
value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *ch
value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
break;
}
- if (regno > ES*4)
+ if (regno > FS*4)
regno -= 1*4;
put_stack_long(child, regno, value);
return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_
unsigned long retval = ~0UL;

switch (regno >> 2) {
- case FS:
- retval = child->thread.fs;
+ case GS:
+ retval = child->thread.gs;
break;
case DS:
case ES:
- case GS:
+ case FS:
case SS:
case CS:
retval = 0xffff;
/* fall through */
default:
- if (regno > ES*4)
+ if (regno > FS*4)
regno -= 1*4;
retval &= get_stack_long(child, regno);
}
Index: linux/arch/i386/kernel/signal.c
===================================================================
--- linux.orig/arch/i386/kernel/signal.c
+++ linux/arch/i386/kernel/signal.c
@@ -128,8 +128,8 @@ restore_sigcontext(struct pt_regs *regs,
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)

- COPY_SEG(gs);
- GET_SEG(fs);
+ GET_SEG(gs);
+ COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
COPY(edi);
@@ -244,9 +244,9 @@ setup_sigcontext(struct sigcontext __use
{
int tmp, err = 0;

- err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
- savesegment(fs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+ err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+ savesegment(gs, tmp);
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);

err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
Index: linux/arch/i386/kernel/traps.c
===================================================================
--- linux.orig/arch/i386/kernel/traps.c
+++ linux/arch/i386/kernel/traps.c
@@ -291,10 +291,11 @@ void show_registers(struct pt_regs *regs
int i;
int in_kernel = 1;
unsigned long esp;
- unsigned short ss;
+ unsigned short ss, gs;

esp = (unsigned long) (&regs->esp);
savesegment(ss, ss);
+ savesegment(gs, gs);
if (user_mode_vm(regs)) {
in_kernel = 0;
esp = regs->esp;
@@ -313,8 +314,8 @@ void show_registers(struct pt_regs *regs
regs->eax, regs->ebx, regs->ecx, regs->edx);
printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
regs->esi, regs->edi, regs->ebp, esp);
- printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
- regs->xds & 0xffff, regs->xes & 0xffff, ss);
+ printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
+ regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
TASK_COMM_LEN, current->comm, current->pid,
current_thread_info(), current, current->thread_info);
Index: linux/arch/i386/kernel/vm86.c
===================================================================
--- linux.orig/arch/i386/kernel/vm86.c
+++ linux/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct
{
int ret = 0;

- /* kernel_vm86_regs is missing xfs, so copy everything up to
- (but not including) xgs, and then rest after xgs. */
- ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
- ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+ /* kernel_vm86_regs is missing xgs, so copy everything up to
+ (but not including) orig_eax, and then the rest from orig_eax onwards. */
+ ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.xgs));
+ offsetof(struct kernel_vm86_regs, pt.orig_eax));

return ret;
}
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(stru
{
int ret = 0;

- ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
- ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+ /* copy eax-xfs inclusive */
+ ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ /* copy orig_eax-__gsh+extra */
+ ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.xgs) +
+ offsetof(struct kernel_vm86_regs, pt.orig_eax) +
extra);
-
return ret;
}

@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state

ret = KVM86->regs32;

- loadsegment(fs, current->thread.saved_fs);
- ret->xgs = current->thread.saved_gs;
+ ret->xfs = current->thread.saved_fs;
+ loadsegment(gs, current->thread.saved_gs);

return ret;
}
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm
*/
info->regs.pt.xds = 0;
info->regs.pt.xes = 0;
- info->regs.pt.xgs = 0;
+ info->regs.pt.xfs = 0;

-/* we are clearing fs later just before "jmp resume_userspace",
+/* we are clearing gs later just before "jmp resume_userspace",
* because it is not saved/restored.
*/

@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm
*/
info->regs32->eax = 0;
tsk->thread.saved_esp0 = tsk->thread.esp0;
- savesegment(fs, tsk->thread.saved_fs);
- tsk->thread.saved_gs = info->regs32->xgs;
+ tsk->thread.saved_fs = info->regs32->xfs;
+ savesegment(gs, tsk->thread.saved_gs);

tss = &per_cpu(init_tss, get_cpu());
tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
- "mov %2, %%fs\n\t"
+ "mov %2, %%gs\n\t"
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
Index: linux/arch/i386/math-emu/get_address.c
===================================================================
--- linux.orig/arch/i386/math-emu/get_address.c
+++ linux/arch/i386/math-emu/get_address.c
@@ -56,15 +56,14 @@ static int reg_offset_vm86[] = {
#define VM86_REG_(x) (*(unsigned short *) \
(reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))

-/* These are dummy, fs and gs are not saved on the stack. */
-#define ___FS ___ds
+/* This is a dummy; gs is not saved on the stack. */
#define ___GS ___ds

static int reg_offset_pm[] = {
offsetof(struct info,___cs),
offsetof(struct info,___ds),
offsetof(struct info,___es),
- offsetof(struct info,___FS),
+ offsetof(struct info,___fs),
offsetof(struct info,___GS),
offsetof(struct info,___ss),
offsetof(struct info,___ds)
@@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm,

switch ( segment )
{
- /* fs and gs aren't used by the kernel, so they still have their
- user-space values. */
- case PREFIX_FS_-1:
- /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
- savesegment(fs, addr->selector);
- break;
+ /* gs isn't used by the kernel, so it still has its
+ user-space value. */
case PREFIX_GS_-1:
+ /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
savesegment(gs, addr->selector);
break;
default:
Index: linux/include/asm-i386/elf.h
===================================================================
--- linux.orig/include/asm-i386/elf.h
+++ linux/include/asm-i386/elf.h
@@ -90,8 +90,8 @@ typedef struct user_fxsr_struct elf_fpxr
pr_reg[6] = regs->eax; \
pr_reg[7] = regs->xds; \
pr_reg[8] = regs->xes; \
- savesegment(fs,pr_reg[9]); \
- pr_reg[10] = regs->xgs; \
+ pr_reg[9] = regs->xfs; \
+ savesegment(gs,pr_reg[10]); \
pr_reg[11] = regs->orig_eax; \
pr_reg[12] = regs->eip; \
pr_reg[13] = regs->xcs; \
Index: linux/include/asm-i386/mmu_context.h
===================================================================
--- linux.orig/include/asm-i386/mmu_context.h
+++ linux/include/asm-i386/mmu_context.h
@@ -63,7 +63,7 @@ static inline void switch_mm(struct mm_s
}

#define deactivate_mm(tsk, mm) \
- asm("movl %0,%%fs": :"r" (0));
+ asm("movl %0,%%gs": :"r" (0));

#define activate_mm(prev, next) \
switch_mm((prev),(next),NULL)
Index: linux/include/asm-i386/pda.h
===================================================================
--- linux.orig/include/asm-i386/pda.h
+++ linux/include/asm-i386/pda.h
@@ -39,19 +39,19 @@ extern struct i386_pda _proxy_pda;
if (0) { T__ tmp__; tmp__ = (val); } \
switch (sizeof(_proxy_pda.field)) { \
case 1: \
- asm(op "b %1,%%gs:%c2" \
+ asm(op "b %1,%%fs:%c2" \
: "+m" (_proxy_pda.field) \
:"ri" ((T__)val), \
"i"(pda_offset(field))); \
break; \
case 2: \
- asm(op "w %1,%%gs:%c2" \
+ asm(op "w %1,%%fs:%c2" \
: "+m" (_proxy_pda.field) \
:"ri" ((T__)val), \
"i"(pda_offset(field))); \
break; \
case 4: \
- asm(op "l %1,%%gs:%c2" \
+ asm(op "l %1,%%fs:%c2" \
: "+m" (_proxy_pda.field) \
:"ri" ((T__)val), \
"i"(pda_offset(field))); \
@@ -65,19 +65,19 @@ extern struct i386_pda _proxy_pda;
typeof(_proxy_pda.field) ret__; \
switch (sizeof(_proxy_pda.field)) { \
case 1: \
- asm(op "b %%gs:%c1,%0" \
+ asm(op "b %%fs:%c1,%0" \
: "=r" (ret__) \
: "i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
break; \
case 2: \
- asm(op "w %%gs:%c1,%0" \
+ asm(op "w %%fs:%c1,%0" \
: "=r" (ret__) \
: "i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
break; \
case 4: \
- asm(op "l %%gs:%c1,%0" \
+ asm(op "l %%fs:%c1,%0" \
: "=r" (ret__) \
: "i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -424,7 +424,7 @@ struct thread_struct {
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
- .gs = __KERNEL_PDA, \
+ .fs = __KERNEL_PDA, \
}

/*
@@ -442,8 +442,8 @@ struct thread_struct {
}

#define start_thread(regs, new_eip, new_esp) do { \
- __asm__("movl %0,%%fs": :"r" (0)); \
- regs->xgs = 0; \
+ __asm__("movl %0,%%gs": :"r" (0)); \
+ regs->xfs = 0; \
set_fs(USER_DS); \
regs->xds = __USER_DS; \
regs->xes = __USER_DS; \
Index: linux/include/asm-i386/ptrace.h
===================================================================
--- linux.orig/include/asm-i386/ptrace.h
+++ linux/include/asm-i386/ptrace.h
@@ -16,8 +16,8 @@ struct pt_regs {
long eax;
int xds;
int xes;
- /* int xfs; */
- int xgs;
+ int xfs;
+ /* int xgs; */
long orig_eax;
long eip;
int xcs;
Index: linux/drivers/kvm/vmx.c
===================================================================
--- linux.orig/drivers/kvm/vmx.c
+++ linux/drivers/kvm/vmx.c
@@ -1863,12 +1863,6 @@ again:
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
#endif

- /*
- * Profile KVM exit RIPs:
- */
- if (unlikely(prof_on == KVM_PROFILING))
- profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
-
kvm_run->exit_type = 0;
if (fail) {
kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
@@ -1891,6 +1885,12 @@ again:

reload_tss();
}
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING))
+ profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
+
vcpu->launched = 1;
kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
r = kvm_handle_exit(kvm_run, vcpu);

2007-02-10 11:53:03

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [20/25] x86_64: get rid of ARCH_HAVE_XTIME_LOCK


From: Eric Dumazet <[email protected]>

ARCH_HAVE_XTIME_LOCK is used by the x86_64 arch. This arch needs to place a
read only copy of xtime_lock into vsyscall page. This read only copy is
named __xtime_lock, and xtime_lock is defined in
arch/x86_64/kernel/vmlinux.lds.S as an alias. So the declaration of
xtime_lock in kernel/timer.c was guarded by ARCH_HAVE_XTIME_LOCK define,
defined to true on x86_64.

We can get the same result with __attribute__((weak)) in the declaration; the
linker should do the job.
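
As a userspace-sized reminder of the mechanism (a sketch, not the
kernel code; on x86_64 the strong definition effectively comes from the
vmlinux.lds.S alias mentioned above):

/* default.c: generic code supplies a weak default definition. */
int which_lock __attribute__((weak)) = 0;

/* override.c: a strong definition of the same symbol simply wins;
 * the linker silently prefers it over the weak one, no #ifdef needed.
 * (The name which_lock is made up for illustration.) */
int which_lock = 1;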

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

include/asm-x86_64/vsyscall.h | 5 -----
include/linux/time.h | 2 +-
kernel/timer.c | 4 +---
3 files changed, 2 insertions(+), 9 deletions(-)

Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h
+++ linux/include/asm-x86_64/vsyscall.h
@@ -56,11 +56,6 @@ extern struct vxtime_data vxtime;
extern int vgetcpu_mode;
extern struct timezone sys_tz;
extern int sysctl_vsyscall;
-extern seqlock_t xtime_lock;
-
-extern int sysctl_vsyscall;
-
-#define ARCH_HAVE_XTIME_LOCK 1

#endif /* __KERNEL__ */

Index: linux/include/linux/time.h
===================================================================
--- linux.orig/include/linux/time.h
+++ linux/include/linux/time.h
@@ -90,7 +90,7 @@ static inline struct timespec timespec_s

extern struct timespec xtime;
extern struct timespec wall_to_monotonic;
-extern seqlock_t xtime_lock;
+extern seqlock_t xtime_lock __attribute__((weak));

void timekeeping_init(void);

Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -1162,11 +1162,9 @@ static inline void calc_load(unsigned lo
* This read-write spinlock protects us from races in SMP while
* playing with xtime and avenrun.
*/
-#ifndef ARCH_HAVE_XTIME_LOCK
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);

EXPORT_SYMBOL(xtime_lock);
-#endif

/*
* This function runs timers and the timer-tq in bottom half context.

2007-02-10 11:58:17

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [15/25] i386: Only call unreachable_devices() when type 1 is available.


From: Olivier Galibert <[email protected]>

unreachable_devices() compares the results of PCI configuration accesses
through type 1 and mmconfig, so it should be called only if type 1 actually
works in the first place.

Signed-off-by: Olivier Galibert <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/pci/mmconfig-shared.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -80,7 +80,8 @@ void __init pci_mmcfg_init(int type)
}

if (pci_mmcfg_arch_init()) {
- unreachable_devices();
+ if (type == 1)
+ unreachable_devices();
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
}
}

2007-02-10 11:58:17

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [16/25] i386: Detect and support the E7520 and the 945G/GZ/P/PL


From: Olivier Galibert <[email protected]>

It seems that the only way to reliably support mmconfig in the presence of
funky biosen is to detect the hostbridge and read where the window is mapped
from its registers. Do that for the E7520 and the 945G/GZ/P/PL for a start.
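
To make the window decoding below concrete, a worked example for the
945 path with a made-up register value:

/* Hypothetical PCIEXBAR as read from config offset 0x48: */
unsigned int pciexbar = 0xe0000001; /* bit 0: enabled; bits 2:1 = 00 */
unsigned int mask = 0xf0000000U;    /* size bits 00 -> 256MB window */
unsigned int len  = 0x10000000U;
unsigned int base = pciexbar & mask;          /* 0xe0000000 */
/* (pciexbar & mask) & 0x0fffffff == 0, so the 256MB alignment check
 * passes; end_bus_number = (len >> 20) - 1 = 255, i.e. 1MB of config
 * space for each of busses 0-255. */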

Signed-off-by: Olivier Galibert <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/pci/mmconfig-shared.c | 121 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 119 insertions(+), 2 deletions(-)

Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig-shared.c
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -3,6 +3,7 @@
* MMCONFIG - common code between i386 and x86-64.
*
* This code does:
+ * - known chipset handling
* - ACPI decoding and validation
*
* Per-architecture code takes care of the mappings and accesses
@@ -55,12 +56,128 @@ static __init void unreachable_devices(v
}
}

+static __init const char *pci_mmcfg_e7520(void)
+{
+ u32 win;
+ pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
+
+ pci_mmcfg_config_num = 1;
+ pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+ if (!pci_mmcfg_config)
+ return NULL;
+ pci_mmcfg_config[0].address = (win & 0xf000) << 16;
+ pci_mmcfg_config[0].pci_segment = 0;
+ pci_mmcfg_config[0].start_bus_number = 0;
+ pci_mmcfg_config[0].end_bus_number = 255;
+
+ return "Intel Corporation E7520 Memory Controller Hub";
+}
+
+static __init const char *pci_mmcfg_intel_945(void)
+{
+ u32 pciexbar, mask = 0, len = 0;
+
+ pci_mmcfg_config_num = 1;
+
+ pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
+
+ /* Enable bit */
+ if (!(pciexbar & 1))
+ pci_mmcfg_config_num = 0;
+
+ /* Size bits */
+ switch ((pciexbar >> 1) & 3) {
+ case 0:
+ mask = 0xf0000000U;
+ len = 0x10000000U;
+ break;
+ case 1:
+ mask = 0xf8000000U;
+ len = 0x08000000U;
+ break;
+ case 2:
+ mask = 0xfc000000U;
+ len = 0x04000000U;
+ break;
+ default:
+ pci_mmcfg_config_num = 0;
+ }
+
+ /* Errata #2, things break when not aligned on a 256Mb boundary */
+ /* Can only happen in 64M/128M mode */
+
+ if ((pciexbar & mask) & 0x0fffffffU)
+ pci_mmcfg_config_num = 0;
+
+ if (pci_mmcfg_config_num) {
+ pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
+ if (!pci_mmcfg_config)
+ return NULL;
+ pci_mmcfg_config[0].address = pciexbar & mask;
+ pci_mmcfg_config[0].pci_segment = 0;
+ pci_mmcfg_config[0].start_bus_number = 0;
+ pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
+ }
+
+ return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
+}
+
+struct pci_mmcfg_hostbridge_probe {
+ u32 vendor;
+ u32 device;
+ const char *(*probe)(void);
+};
+
+static __initdata struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] = {
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
+ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
+};
+
+static int __init pci_mmcfg_check_hostbridge(void)
+{
+ u32 l;
+ u16 vendor, device;
+ int i;
+ const char *name;
+
+ pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
+ vendor = l & 0xffff;
+ device = (l >> 16) & 0xffff;
+
+ pci_mmcfg_config_num = 0;
+ pci_mmcfg_config = NULL;
+ name = NULL;
+
+ for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++)
+ if ((pci_mmcfg_probes[i].vendor == PCI_ANY_ID ||
+ pci_mmcfg_probes[i].vendor == vendor) &&
+ (pci_mmcfg_probes[i].device == PCI_ANY_ID ||
+ pci_mmcfg_probes[i].device == device))
+ name = pci_mmcfg_probes[i].probe();
+
+ if (name) {
+ if (pci_mmcfg_config_num)
+ printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", name);
+ else
+ printk(KERN_INFO "PCI: Found %s without MMCONFIG support.\n",
+ name);
+ }
+
+ return name != NULL;
+}
+
void __init pci_mmcfg_init(int type)
{
+ int known_bridge = 0;
+
if ((pci_probe & PCI_PROBE_MMCONF) == 0)
return;

- acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+ if (type == 1 && pci_mmcfg_check_hostbridge())
+ known_bridge = 1;
+
+ if (!known_bridge)
+ acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);

if ((pci_mmcfg_config_num == 0) ||
(pci_mmcfg_config == NULL) ||
@@ -69,7 +186,7 @@ void __init pci_mmcfg_init(int type)

/* Only do this check when type 1 works. If it doesn't work
assume we run on a Mac and always use MCFG */
- if (type == 1 &&
+ if (type == 1 && !known_bridge &&
!e820_all_mapped(pci_mmcfg_config[0].address,
pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
E820_RESERVED)) {

2007-02-10 11:58:18

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [14/25] x86: Share what's shareable.


From: Olivier Galibert <[email protected]>

The i386 and x86-64 PCI mmconfig code have a lot in common, so share
what's shareable between the two.

Signed-off-by: Olivier Galibert <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/pci/Makefile | 2
arch/i386/pci/mmconfig-shared.c | 86 ++++++++++++++++++++++++++++++++++++++++
arch/i386/pci/mmconfig.c | 74 +---------------------------------
arch/i386/pci/pci.h | 6 ++
arch/x86_64/pci/Makefile | 3 -
arch/x86_64/pci/mmconfig.c | 76 +++++------------------------------
6 files changed, 111 insertions(+), 136 deletions(-)

Index: linux/arch/i386/pci/Makefile
===================================================================
--- linux.orig/arch/i386/pci/Makefile
+++ linux/arch/i386/pci/Makefile
@@ -1,7 +1,7 @@
obj-y := i386.o init.o

obj-$(CONFIG_PCI_BIOS) += pcbios.o
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
obj-$(CONFIG_PCI_DIRECT) += direct.o

pci-y := fixup.o
Index: linux/arch/i386/pci/mmconfig-shared.c
===================================================================
--- /dev/null
+++ linux/arch/i386/pci/mmconfig-shared.c
@@ -0,0 +1,86 @@
+/*
+ * mmconfig-shared.c - Low-level direct PCI config space access via
+ * MMCONFIG - common code between i386 and x86-64.
+ *
+ * This code does:
+ * - ACPI decoding and validation
+ *
+ * Per-architecture code takes care of the mappings and accesses
+ * themselves.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/bitmap.h>
+#include <asm/e820.h>
+
+#include "pci.h"
+
+/* aperture is up to 256MB but BIOS may reserve less */
+#define MMCONFIG_APER_MIN (2 * 1024*1024)
+#define MMCONFIG_APER_MAX (256 * 1024*1024)
+
+/* Verify the first 16 busses. We assume that systems with more busses
+ get MCFG right. */
+#define PCI_MMCFG_MAX_CHECK_BUS 16
+
+DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+/* K8 systems have some devices (typically in the builtin northbridge)
+ that are only accessible using type1
+ Normally this can be expressed in the MCFG by not listing them
+ and assigning suitable _SEGs, but this isn't implemented in some BIOS.
+ Instead try to discover all devices on bus 0 that are unreachable using MM
+ and fallback for them. */
+static __init void unreachable_devices(void)
+{
+ int i, k;
+ /* Use the max bus number from ACPI here? */
+ for (k = 0; k < PCI_MMCFG_MAX_CHECK_BUS; k++) {
+ for (i = 0; i < 32; i++) {
+ u32 val1, val2;
+
+ pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+ if (val1 == 0xffffffff)
+ continue;
+
+ raw_pci_ops->read(0, k, PCI_DEVFN(i, 0), 0, 4, &val2);
+ if (val1 != val2) {
+ set_bit(i + 32*k, pci_mmcfg_fallback_slots);
+ printk(KERN_NOTICE "PCI: No mmconfig possible"
+ " on device %02x:%02x\n", k, i);
+ }
+ }
+ }
+}
+
+void __init pci_mmcfg_init(int type)
+{
+ if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+ return;
+
+ acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+
+ if ((pci_mmcfg_config_num == 0) ||
+ (pci_mmcfg_config == NULL) ||
+ (pci_mmcfg_config[0].address == 0))
+ return;
+
+ /* Only do this check when type 1 works. If it doesn't work
+ assume we run on a Mac and always use MCFG */
+ if (type == 1 &&
+ !e820_all_mapped(pci_mmcfg_config[0].address,
+ pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
+ E820_RESERVED)) {
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not E820-reserved\n",
+ pci_mmcfg_config[0].address);
+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+ return;
+ }
+
+ if (pci_mmcfg_arch_init()) {
+ unreachable_devices();
+ pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ }
+}
Index: linux/arch/i386/pci/mmconfig.c
===================================================================
--- linux.orig/arch/i386/pci/mmconfig.c
+++ linux/arch/i386/pci/mmconfig.c
@@ -15,21 +15,13 @@
#include <asm/e820.h>
#include "pci.h"

-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN (2 * 1024*1024)
-#define MMCONFIG_APER_MAX (256 * 1024*1024)
-
/* Assume systems with more busses have correct MCFG */
-#define MAX_CHECK_BUS 16
-
#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))

/* The base address of the last MMCONFIG device accessed */
static u32 mmcfg_last_accessed_device;
static int mmcfg_last_accessed_cpu;

-static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
-
/*
* Functions for accessing PCI configuration space with MMCONFIG accesses
*/
@@ -38,8 +30,8 @@ static u32 get_base_addr(unsigned int se
int cfg_num = -1;
struct acpi_mcfg_allocation *cfg;

- if (seg == 0 && bus < MAX_CHECK_BUS &&
- test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
+ if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+ test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
return 0;

while (1) {
@@ -158,67 +150,9 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};

-/* K8 systems have some devices (typically in the builtin northbridge)
- that are only accessible using type1
- Normally this can be expressed in the MCFG by not listing them
- and assigning suitable _SEGs, but this isn't implemented in some BIOS.
- Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them. */
-static __init void unreachable_devices(void)
-{
- int i, k;
- unsigned long flags;
-
- for (k = 0; k < MAX_CHECK_BUS; k++) {
- for (i = 0; i < 32; i++) {
- u32 val1;
- u32 addr;
-
- pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
-
- /* Locking probably not needed, but safer */
- spin_lock_irqsave(&pci_config_lock, flags);
- addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
- if (addr != 0)
- pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
- if (addr == 0 ||
- readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
- set_bit(i + 32*k, fallback_slots);
- printk(KERN_NOTICE
- "PCI: No mmconfig possible on %x:%x\n", k, i);
- }
- spin_unlock_irqrestore(&pci_config_lock, flags);
- }
- }
-}
-
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_init(void)
{
- if ((pci_probe & PCI_PROBE_MMCONF) == 0)
- return;
-
- acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
- return;
-
- /* Only do this check when type 1 works. If it doesn't work
- assume we run on a Mac and always use MCFG */
- if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
- pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
- E820_RESERVED)) {
- printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
- (unsigned long)pci_mmcfg_config[0].address);
- printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
- return;
- }
-
printk(KERN_INFO "PCI: Using MMCONFIG\n");
raw_pci_ops = &pci_mmcfg;
- pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-
- unreachable_devices();
+ return 1;
}
Index: linux/arch/i386/pci/pci.h
===================================================================
--- linux.orig/arch/i386/pci/pci.h
+++ linux/arch/i386/pci/pci.h
@@ -94,3 +94,9 @@ extern void pci_pcbios_init(void);
extern void pci_mmcfg_init(int type);
extern void pcibios_sort(void);

+/* pci-mmconfig.c */
+
+#define PCI_MMCFG_MAX_CHECK_BUS 16
+extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+
+extern int pci_mmcfg_arch_init(void);
Index: linux/arch/x86_64/pci/Makefile
===================================================================
--- linux.orig/arch/x86_64/pci/Makefile
+++ linux/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y += fixup.o init.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o common.o early.o
# mmconfig has a 64bit special
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o

obj-$(CONFIG_NUMA) += k8-bus.o

@@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o
i386-y += ../../i386/pci/i386.o
init-y += ../../i386/pci/init.o
early-y += ../../i386/pci/early.o
+mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o
Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -19,9 +19,7 @@

/* Verify the first 16 busses. We assume that systems with more busses
get MCFG right. */
-#define MAX_CHECK_BUS 16
-
-static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
+#define PCI_MMCFG_MAX_CHECK_BUS 16

/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
@@ -63,8 +61,8 @@ static char __iomem *get_virt(unsigned i
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
char __iomem *addr;
- if (seg == 0 && bus < MAX_CHECK_BUS &&
- test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+ test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
return NULL;
addr = get_virt(seg, bus);
if (!addr)
@@ -135,63 +133,16 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};

-/* K8 systems have some devices (typically in the builtin northbridge)
- that are only accessible using type1
- Normally this can be expressed in the MCFG by not listing them
- and assigning suitable _SEGs, but this isn't implemented in some BIOS.
- Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them. */
-static __init void unreachable_devices(void)
-{
- int i, k;
- /* Use the max bus number from ACPI here? */
- for (k = 0; k < MAX_CHECK_BUS; k++) {
- for (i = 0; i < 32; i++) {
- u32 val1;
- char __iomem *addr;
-
- pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
- addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
- if (addr == NULL|| readl(addr) != val1) {
- set_bit(i + 32*k, fallback_slots);
- printk(KERN_NOTICE "PCI: No mmconfig possible"
- " on device %02x:%02x\n", k, i);
- }
- }
- }
-}
-
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_init(void)
{
int i;
-
- if ((pci_probe & PCI_PROBE_MMCONF) == 0)
- return;
-
- acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
- return;
-
- /* Only do this check when type 1 works. If it doesn't work
- assume we run on a Mac and always use MCFG */
- if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
- pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
- E820_RESERVED)) {
- printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
- (unsigned long)pci_mmcfg_config[0].address);
- printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
- return;
- }
-
- pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
+ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
+ pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {
printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
- return;
+ return 0;
}
+
for (i = 0; i < pci_mmcfg_config_num; ++i) {
pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
@@ -200,14 +151,11 @@ void __init pci_mmcfg_init(int type)
printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
"segment %d\n",
pci_mmcfg_config[i].pci_segment);
- return;
+ return 0;
}
- printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n",
- (unsigned long)pci_mmcfg_config[i].address);
+ printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n",
+ pci_mmcfg_config[i].address);
}
-
- unreachable_devices();
-
raw_pci_ops = &pci_mmcfg;
- pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ return 1;
}

2007-02-10 11:59:18

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address

Andi Kleen wrote:
> From: OGAWA Hirofumi <[email protected]>
>
> Current mmconfig has some problems of remapped range.


eh wasn't there a patch that just ignored the MCFG for the broken
system instead?

2007-02-10 12:00:34

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [23/25] x86_64: improved iommu documentation


From: Karsten Weiss <[email protected]>

- add SWIOTLB config help text
- mention Documentation/x86_64/boot-options.txt in
Documentation/kernel-parameters.txt
- remove the duplication of the iommu kernel parameter documentation.
- Better explanation of some of the iommu kernel parameter options.
- "32MB<<order" instead of "32MB^order".
- Mention the default "order" value.
- list the four existing PCI-DMA mapping implementations of arch x86_64
- group the iommu= option keywords by PCI-DMA mapping implementation.
- Distinguish iommu= option keywords from number arguments.
- Explain the meaning of DAC and SAC.
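
(As a worked example of the corrected notation: "iommu=memaper=2"
requests an aperture of 32MB<<2 = 128MB, while the default order=1
gives 64MB.)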

Signed-off-by: Karsten Weiss <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Acked-by: Muli Ben-Yehuda <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

Documentation/kernel-parameters.txt | 3
Documentation/x86_64/boot-options.txt | 107 +++++++++++++++++++++++-----------
arch/x86_64/Kconfig | 10 ++-
arch/x86_64/kernel/pci-dma.c | 28 +-------
4 files changed, 89 insertions(+), 59 deletions(-)

Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -104,6 +104,9 @@ loader, and have no meaning to the kerne
Do not modify the syntax of boot loader parameters without extreme
need or coordination with <Documentation/i386/boot.txt>.

+There are also arch-specific kernel-parameters not documented here.
+See for example <Documentation/x86_64/boot-options.txt>.
+
Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will
be entered as an environment variable, whereas its absence indicates that
Index: linux/Documentation/x86_64/boot-options.txt
===================================================================
--- linux.orig/Documentation/x86_64/boot-options.txt
+++ linux/Documentation/x86_64/boot-options.txt
@@ -180,40 +180,81 @@ PCI
pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says.
pci=noacpi Don't use ACPI to set up PCI interrupt routing.

-IOMMU
+IOMMU (input/output memory management unit)

- iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
- [,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
- size set size of iommu (in bytes)
- noagp don't initialize the AGP driver and use full aperture.
- off don't use the IOMMU
- leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
- memaper[=order] allocate an own aperture over RAM with size 32MB^order.
- noforce don't force IOMMU usage. Default.
- force Force IOMMU.
- merge Do SG merging. Implies force (experimental)
- nomerge Don't do SG merging.
- forcesac For SAC mode for masks <40bits (experimental)
- fullflush Flush IOMMU on each allocation (default)
- nofullflush Don't use IOMMU fullflush
- allowed overwrite iommu off workarounds for specific chipsets.
- soft Use software bounce buffering (default for Intel machines)
- noaperture Don't touch the aperture for AGP.
- allowdac Allow DMA >4GB
- When off all DMA over >4GB is forced through an IOMMU or bounce
- buffering.
- nodac Forbid DMA >4GB
- panic Always panic when IOMMU overflows
- calgary Use the Calgary IOMMU if it is available
-
- swiotlb=pages[,force]
-
- pages Prereserve that many 128K pages for the software IO bounce buffering.
- force Force all IO through the software TLB.
-
- calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
- calgary=[translate_empty_slots]
- calgary=[disable=<PCI bus number>]
+ Currently four x86-64 PCI-DMA mapping implementations exist:
+
+ 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+ (e.g. because you have < 3 GB memory).
+ Kernel boot message: "PCI-DMA: Disabling IOMMU"
+
+ 2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU.
+ Kernel boot message: "PCI-DMA: using GART IOMMU"
+
+ 3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
+ e.g. if there is no hardware IOMMU in the system and it is needed because
+ you have >3GB memory or told the kernel to use it (iommu=soft).
+ Kernel boot message: "PCI-DMA: Using software bounce buffering
+ for IO (SWIOTLB)"
+
+ 4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM
+ pSeries and xSeries servers. This hardware IOMMU supports DMA address
+ mapping with memory protection, etc.
+ Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
+
+ iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
+ [,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+ [,noaperture][,calgary]
+
+ General iommu options:
+ off Don't initialize and use any kind of IOMMU.
+ noforce Don't force hardware IOMMU usage when it is not needed.
+ (default).
+ force Force the use of the hardware IOMMU even when it is
+ not actually needed (e.g. because < 3 GB memory).
+ soft Use software bounce buffering (SWIOTLB) (default for
+ Intel machines). This can be used to prevent the usage
+ of an available hardware IOMMU.
+
+ iommu options only relevant to the AMD GART hardware IOMMU:
+ <size> Set the size of the remapping area in bytes.
+ allowed Overwrite iommu off workarounds for specific chipsets.
+ fullflush Flush IOMMU on each allocation (default).
+ nofullflush Don't use IOMMU fullflush.
+ leak Turn on simple iommu leak tracing (only when
+ CONFIG_IOMMU_LEAK is on). Default number of leak pages
+ is 20.
+ memaper[=<order>] Allocate its own aperture over RAM with size 32MB<<order.
+ (default: order=1, i.e. 64MB)
+ merge Do scatter-gather (SG) merging. Implies "force"
+ (experimental).
+ nomerge Don't do scatter-gather (SG) merging.
+ noaperture Ask the IOMMU not to touch the aperture for AGP.
+ forcesac Force single-address cycle (SAC) mode for masks <40bits
+ (experimental).
+ noagp Don't initialize the AGP driver and use full aperture.
+ allowdac Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
+ DAC is used with 32-bit PCI to push a 64-bit address in
+ two cycles. When off all DMA over >4GB is forced through
+ an IOMMU or software bounce buffering.
+ nodac Forbid DAC mode, i.e. DMA >4GB.
+ panic Always panic when IOMMU overflows.
+ calgary Use the Calgary IOMMU if it is available
+
+ iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+ implementation:
+ swiotlb=<pages>[,force]
+ <pages> Prereserve that many 128K pages for the software IO
+ bounce buffering.
+ force Force all IO through the software TLB.
+
+ Settings for the IBM Calgary hardware IOMMU currently found in IBM
+ pSeries and xSeries machines:
+
+ calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+ calgary=[translate_empty_slots]
+ calgary=[disable=<PCI bus number>]
+ panic Always panic when IOMMU overflows

64k,...,8M - Set the size of each PCI slot's translation table
when using the Calgary IOMMU. This is the size of the translation
Index: linux/arch/x86_64/Kconfig
===================================================================
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -454,8 +454,8 @@ config IOMMU
on systems with more than 3GB. This is usually needed for USB,
sound, many IDE/SATA chipsets and some other devices.
Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
- based IOMMU and a software bounce buffer based IOMMU used on Intel
- systems and as fallback.
+ based hardware IOMMU and a software bounce buffer based IOMMU used
+ on Intel systems and as fallback.
The code is only active when needed (enough memory and limited
device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
too.
@@ -492,6 +492,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
+ help
+ Support for software bounce buffers used on x86-64 systems
+ which don't have a hardware IOMMU (e.g. the current generation
+ of Intel's x86-64 CPUs). Using this, PCI devices which can only
+ access 32 bits of memory can be used on systems with more than
+ 3 GB of memory. If unsure, say Y.

config X86_MCE
bool "Machine check support" if EMBEDDED
Index: linux/arch/x86_64/kernel/pci-dma.c
===================================================================
--- linux.orig/arch/x86_64/kernel/pci-dma.c
+++ linux/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64
}
EXPORT_SYMBOL(dma_set_mask);

-/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
- [,forcesac][,fullflush][,nomerge][,biomerge]
- size set size of iommu (in bytes)
- noagp don't initialize the AGP driver and use full aperture.
- off don't use the IOMMU
- leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
- memaper[=order] allocate an own aperture over RAM with size 32MB^order.
- noforce don't force IOMMU usage. Default.
- force Force IOMMU.
- merge Do lazy merging. This may improve performance on some block devices.
- Implies force (experimental)
- biomerge Do merging at the BIO layer. This is more efficient than merge,
- but should be only done with very big IOMMUs. Implies merge,force.
- nomerge Don't do SG merging.
- forcesac For SAC mode for masks <40bits (experimental)
- fullflush Flush IOMMU on each allocation (default)
- nofullflush Don't use IOMMU fullflush
- allowed overwrite iommu off workarounds for specific chipsets.
- soft Use software bounce buffering (default for Intel machines)
- noaperture Don't touch the aperture for AGP.
- allowdac Allow DMA >4GB
- nodac Forbid DMA >4GB
- panic Force panic when IOMMU overflows
-*/
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
__init int iommu_setup(char *p)
{
iommu_merge = 1;

2007-02-10 12:00:35

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [24/25] x86_64: do not always end the stack trace with ULONG_MAX


From: Catalin Marinas <[email protected]>

It makes more sense to end the stack trace with ULONG_MAX only if
nr_entries < max_entries. Otherwise, we lose one entry in the long stack
traces and cannot know whether the trace was complete or not.
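
A minimal sketch of what this means for a consumer of the trace (the
helper is hypothetical, not kernel code):

/* With this patch, ULONG_MAX terminates the trace only when it fit. */
static void print_trace_sketch(struct stack_trace *trace)
{
	unsigned int i;

	for (i = 0; i < trace->nr_entries; i++) {
		if (trace->entries[i] == ULONG_MAX)
			return;	/* trace ended before max_entries */
		printk(" [<%016lx>]\n", trace->entries[i]);
	}
	/* No ULONG_MAX seen and nr_entries == max_entries:
	 * the trace may have been truncated. */
}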

Signed-off-by: Catalin Marinas <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Jan Beulich <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/kernel/stacktrace.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/stacktrace.c
===================================================================
--- linux.orig/arch/x86_64/kernel/stacktrace.c
+++ linux/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *dat
trace->skip--;
return;
}
- if (trace->nr_entries < trace->max_entries - 1)
+ if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = addr;
}

@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_
void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
{
dump_trace(task, NULL, NULL, &save_stack_ops, trace);
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);

2007-02-10 12:00:35

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address


From: OGAWA Hirofumi <[email protected]>

Current mmconfig has some problems of remapped range.

a) In the case of broken MCFG tables on Asus etc., we need to remap 256M
range, but currently only remap 1M.

b) The base address always corresponds to bus number 0, but currently we
are assuming it corresponds to start bus number.

This patch fixes the above problems.

(akpm: Arjan suggests that if the MCFG table is broken we just shouldn't use
it, rather than try to work around things).

Signed-off-by: OGAWA Hirofumi <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/pci/mmconfig.c | 46 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 35 insertions(+), 11 deletions(-)

Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -28,6 +28,39 @@ struct mmcfg_virt {
};
static struct mmcfg_virt *pci_mmcfg_virt;

+static inline int mcfg_broken(void)
+{
+ struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0];
+
+ /* Handle more broken MCFG tables on Asus etc.
+ They only contain a single entry for bus 0-0. Assume
+ this applies to all busses. */
+ if (pci_mmcfg_config_num == 1 &&
+ cfg->pci_segment_group_number == 0 &&
+ (cfg->start_bus_number | cfg->end_bus_number) == 0)
+ return 1;
+ return 0;
+}
+
+static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+{
+ void __iomem *addr;
+ u32 size;
+
+ if (mcfg_broken())
+ size = 256 << 20;
+ else
+ size = (cfg->end_bus_number + 1) << 20;
+
+ addr = ioremap_nocache(cfg->base_address, size);
+ if (addr) {
+ printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n",
+ cfg->base_address,
+ cfg->base_address + size - 1);
+ }
+ return addr;
+}
+
static char __iomem *get_virt(unsigned int seg, unsigned bus)
{
int cfg_num = -1;
@@ -45,13 +78,7 @@ static char __iomem *get_virt(unsigned i
return pci_mmcfg_virt[cfg_num].virt;
}

- /* Handle more broken MCFG tables on Asus etc.
- They only contain a single entry for bus 0-0. Assume
- this applies to all busses. */
- cfg = &pci_mmcfg_config[0];
- if (pci_mmcfg_config_num == 1 &&
- cfg->pci_segment == 0 &&
- (cfg->start_bus_number | cfg->end_bus_number) == 0)
+ if (mcfg_broken())
return pci_mmcfg_virt[0].virt;

/* Fall back to type 0 */
@@ -145,16 +172,13 @@ int __init pci_mmcfg_arch_init(void)

for (i = 0; i < pci_mmcfg_config_num; ++i) {
pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
- pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
- MMCONFIG_APER_MAX);
+ pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
if (!pci_mmcfg_virt[i].virt) {
printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
"segment %d\n",
pci_mmcfg_config[i].pci_segment);
return 0;
}
- printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n",
- pci_mmcfg_config[i].address);
}
raw_pci_ops = &pci_mmcfg;
return 1;

2007-02-10 12:01:44

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [22/25] x86_64: use memcpy_uncached_read() in RDMA interrupt handler to reduce packet loss


From: "Bryan O'Sullivan" <[email protected]>

In cases where a large incoming RDMA is being received, we have to copy data
inside the interrupt handler before we can ACK each packet. The source is
DMAed to by the hardware, which means that the CPU won't have it cached. We
only read the source this one time; using normal load instructions pollutes
the dcache with useless data, reducing performance to the point where we can
lose a significant number of packets.

We use memcpy_uncached_read to try to not fill the dcache with useless data.
Avoiding the cache refill penalty lets us keep up better with the sender,
resulting in many fewer dropped packets.
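
memcpy_uncached_read() itself is provided by the ipath driver; as a
rough sketch of the general idea only (not the driver's
implementation), a read-once copy can use non-temporal prefetch hints
so the source doesn't displace useful cache lines:

#include <stddef.h>

/* Illustrative only: copy len bytes while hinting that src is read
 * just once.  __builtin_prefetch(p, 0, 0) is a read prefetch with no
 * temporal locality, which gcc emits as prefetchnta on x86. */
static void copy_read_once(void *dst, const void *src, size_t len)
{
	char *d = dst;
	const char *s = src;
	size_t i;

	for (i = 0; i < len; i++) {
		if ((i & 63) == 0)
			__builtin_prefetch(s + i + 64, 0, 0);
		d[i] = s[i];
	}
}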

Signed-off-by: Bryan O'Sullivan <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Roland Dreier <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

drivers/infiniband/hw/ipath/ipath_verbs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/drivers/infiniband/hw/ipath/ipath_verbs.c
===================================================================
--- linux.orig/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ linux/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -167,7 +167,7 @@ void ipath_copy_sge(struct ipath_sge_sta
BUG_ON(len == 0);
if (len > length)
len = length;
- memcpy(sge->vaddr, data, len);
+ memcpy_uncached_read(sge->vaddr, data, len);
sge->vaddr += len;
sge->length -= len;
sge->sge_length -= len;

2007-02-10 12:02:05

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [8/25] x86_64: optimize & fix APIC mode setup


From: Ingo Molnar <[email protected]>

Fix a couple of inconsistencies/problems I found while reviewing the x86_64
genapic code (when I was chasing mysterious eth0 timeouts that would only
trigger if CPU_HOTPLUG is enabled):

- AMD systems defaulted to the slower flat-physical mode instead
of the flat-logical mode. The only restriction on AMD systems
is that they should not use clustered APIC mode.

- removed the CPU hotplug hacks, switching the default for small
systems back from phys-flat to logical-flat. The switch to logical
flat mode on small systems fixed sporadic ethernet driver timeouts I
was getting on a dual-core Athlon64 system:

NETDEV WATCHDOG: eth0: transmit timed out
eth0: Transmit timeout, status 0c 0005 c07f media 80.
eth0: Tx queue start entry 32 dirty entry 28.
eth0: Tx descriptor 0 is 0008a04a. (queue head)
eth0: Tx descriptor 1 is 0008a04a.
eth0: Tx descriptor 2 is 0008a04a.
eth0: Tx descriptor 3 is 0008a04a.
eth0: link up, 100Mbps, full-duplex, lpa 0xC5E1

- The use of '<= 8' was a bug by itself (the valid APIC IDs
for logical flat mode go from 0 to 7, not 0 to 8; see the sketch
after this list). The new logic is to use logical flat mode on
both AMD and Intel systems, and to only switch to physical mode
when logical mode cannot be used. If CPU hotplug is racy wrt.
APIC shutdown then CPU hotplug needs fixing, rather than the
whole IRQ system being made inconsistent and slowed down.

- minor cleanups: simplified some code constructs

build & booted on a couple of AMD and Intel SMP systems.
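
In case the arithmetic is not obvious, a hedged sketch of the rule
(names as in the hunk below; the helper function is hypothetical):

/* Logical flat destinations are one-hot bits in an 8-bit bitmap:
   the CPU with APIC ID n gets bit (1 << n), so only IDs 0..7 fit.
   An APIC ID of 8 or above therefore forces physical flat mode. */
static void pick_genapic_sketch(int max_apic)
{
	if (max_apic >= 8)
		genapic = &apic_physflat;
	else
		genapic = &apic_flat;
}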

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/kernel/genapic.c | 39 +++++++++++++++------------------------
1 file changed, 15 insertions(+), 24 deletions(-)

Index: linux/arch/x86_64/kernel/genapic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic.c
+++ linux/arch/x86_64/kernel/genapic.c
@@ -32,21 +32,20 @@ extern struct genapic apic_cluster;
extern struct genapic apic_flat;
extern struct genapic apic_physflat;

-struct genapic *genapic = &apic_flat;
-
+struct genapic __read_mostly *genapic = &apic_flat;

/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init clustered_apic_check(void)
{
- long i;
+ int i;
u8 clusters, max_cluster;
u8 id;
u8 cluster_cnt[NUM_APIC_CLUSTERS];
int max_apic = 0;

-#if defined(CONFIG_ACPI)
+#ifdef CONFIG_ACPI
/*
* Some x86_64 machines use physical APIC mode regardless of how many
* procs/clusters are present (x86_64 ES7000 is an example).
@@ -68,20 +67,17 @@ void __init clustered_apic_check(void)
cluster_cnt[APIC_CLUSTERID(id)]++;
}

- /* Don't use clustered mode on AMD platforms. */
+ /*
+ * Don't use clustered mode on AMD platforms, default
+ * to flat logical mode.
+ */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- genapic = &apic_physflat;
-#ifndef CONFIG_HOTPLUG_CPU
- /* In the CPU hotplug case we cannot use broadcast mode
- because that opens a race when a CPU is removed.
- Stay at physflat mode in this case.
- It is bad to do this unconditionally though. Once
- we have ACPI platform support for CPU hotplug
- we should detect hotplug capablity from ACPI tables and
- only do this when really needed. -AK */
- if (max_apic <= 8)
- genapic = &apic_flat;
-#endif
+ /*
+ * Switch to physical flat mode if more than 8 APICs
+ * (In the case of 8 CPUs APIC ID goes from 0 to 7):
+ */
+ if (max_apic >= 8)
+ genapic = &apic_physflat;
goto print;
}

@@ -103,14 +99,9 @@ void __init clustered_apic_check(void)
* (We don't use lowest priority delivery + HW APIC IRQ steering, so
* can ignore the clustered logical case and go straight to physical.)
*/
- if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
-#ifdef CONFIG_HOTPLUG_CPU
- /* Don't use APIC shortcuts in CPU hotplug to avoid races */
- genapic = &apic_physflat;
-#else
+ if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
genapic = &apic_flat;
-#endif
- } else
+ else
genapic = &apic_cluster;

print:

2007-02-10 12:02:06

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [12/25] x86_64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix


From: Amul Shah <[email protected]>

- Removed an extraneous debug message from allocate_cachealigned_map

- Changed extract_lsb_from_nodes to return 63 for the case where there was
only one memory node. This prevents the creation of the dynamic hashmap.

- Changed extract_lsb_from_nodes to use only the starting memory address of
a node. On an ES7000, our nodes overlap at their starting and ending
addresses, meaning that we see nodes like

00000 - 10000
10000 - 20000

But other systems have nodes whose start and end addresses do not overlap.
For example:

00000 - 0FFFF
10000 - 1FFFF

In this case, using the ending address will result in an LSB much lower
than what is possible: an LSB of 1 when in reality it should be 16.
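
A small hypothetical demo of that computation, standalone rather than
kernel code, with __builtin_ctzl standing in for find_first_bit:

#include <stdio.h>

int main(void)
{
	/* Start addresses from the non-overlapping example above. */
	unsigned long starts[] = { 0x00000, 0x10000 };
	unsigned long bitfield = 0;
	int i;

	for (i = 0; i < 2; i++)
		bitfield |= starts[i];
	/* 0x10000 has its lowest set bit at position 16, the LSB the
	   text calls for; a zero bitfield (a single node starting at
	   0) maps to 63, which suppresses the dynamic hashmap. */
	printf("LSB = %d\n", bitfield ? __builtin_ctzl(bitfield) : 63);
	return 0;
}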

Cc: Andi Kleen <[email protected]>
Cc: Rohit Seth <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/x86_64/mm/numa.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -78,11 +78,8 @@ static int __init allocate_cachealigned_
unsigned long pad, pad_addr;

memnodemap = memnode.embedded_map;
- if (memnodemapsize <= 48) {
- printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
- nodemap_addr, nodemap_addr + nodemap_size);
+ if (memnodemapsize <= 48)
return 0;
- }

pad = L1_CACHE_BYTES - 1;
pad_addr = 0x8000;
@@ -110,7 +107,7 @@ static int __init allocate_cachealigned_
static int __init
extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
{
- int i;
+ int i, nodes_used = 0;
unsigned long start, end;
unsigned long bitfield = 0, memtop = 0;

@@ -119,11 +116,15 @@ extract_lsb_from_nodes (const struct boo
end = nodes[i].end;
if (start >= end)
continue;
- bitfield |= start | end;
+ bitfield |= start;
+ nodes_used++;
if (end > memtop)
memtop = end;
}
- i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ if (nodes_used <= 1)
+ i = 63;
+ else
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
memnodemapsize = (memtop >> i)+1;
return i;
}

2007-02-10 12:02:24

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [10/25] x86_64: remove clustered APIC mode


From: Ingo Molnar <[email protected]>

Remove now unused clustered APIC mode code.

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Li, Shaohua" <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86_64/kernel/Makefile | 3
arch/x86_64/kernel/genapic_cluster.c | 137 -----------------------------------
2 files changed, 1 insertion(+), 139 deletions(-)

Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -21,8 +21,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-y += apic.o nmi.o
-obj-y += io_apic.o mpparse.o \
- genapic.o genapic_cluster.o genapic_flat.o
+obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_PM) += suspend.o
Index: linux/arch/x86_64/kernel/genapic_cluster.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic_cluster.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
- * (A more realistic maximum is around 230 CPUs.)
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void cluster_init_apic_ldr(void)
-{
- unsigned long val, id;
- long i, count;
- u8 lid;
- u8 my_id = hard_smp_processor_id();
- u8 my_cluster = APIC_CLUSTER(my_id);
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = NR_CPUS; --i >= 0; ) {
- lid = x86_cpu_to_log_apicid[i];
- if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
- ++count;
- }
- /*
- * We only have a 4 wide bitmap in cluster mode. There's no way
- * to get above 60 CPUs and still give each one it's own bit.
- * But, we're using physical IRQ delivery, so we don't care.
- * Use bit 3 for the 4th through Nth CPU in each cluster.
- */
- if (count >= XAPIC_DEST_CPUS_SHIFT)
- count = 3;
- id = my_cluster | (1UL << count);
- x86_cpu_to_log_apicid[smp_processor_id()] = id;
- apic_write(APIC_DFR, APIC_DFR_CLUSTER);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
-static cpumask_t cluster_target_cpus(void)
-{
- return cpumask_of_cpu(0);
-}
-
-static cpumask_t cluster_vector_allocation_domain(int cpu)
-{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
-}
-
-static void cluster_send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_sequence(mask, vector);
-}
-
-static void cluster_send_IPI_allbutself(int vector)
-{
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
-
- if (!cpus_empty(mask))
- cluster_send_IPI_mask(mask, vector);
-}
-
-static void cluster_send_IPI_all(int vector)
-{
- cluster_send_IPI_mask(cpu_online_map, vector);
-}
-
-static int cluster_apic_id_registered(void)
-{
- return 1;
-}
-
-static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- cpu = first_cpu(cpumask);
- if ((unsigned)cpu < NR_CPUS)
- return x86_cpu_to_apicid[cpu];
- else
- return BAD_APICID;
-}
-
-/* cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static unsigned int phys_pkg_id(int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-struct genapic apic_cluster = {
- .name = "clustered",
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = cluster_target_cpus,
- .vector_allocation_domain = cluster_vector_allocation_domain,
- .apic_id_registered = cluster_apic_id_registered,
- .init_apic_ldr = cluster_init_apic_ldr,
- .send_IPI_all = cluster_send_IPI_all,
- .send_IPI_allbutself = cluster_send_IPI_allbutself,
- .send_IPI_mask = cluster_send_IPI_mask,
- .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
-};

2007-02-10 12:03:13

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [5/25] i386: revert i386-fix-the-verify_quirk_intel_irqbalance


From: Andrew Morton <[email protected]>

This is unneeded with Ingo's genapic rework.

Cc: Suresh Siddha <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

arch/i386/kernel/quirks.c | 33 ++++-----------------------------
1 file changed, 4 insertions(+), 29 deletions(-)

Index: linux/arch/i386/kernel/quirks.c
===================================================================
--- linux.orig/arch/i386/kernel/quirks.c
+++ linux/arch/i386/kernel/quirks.c
@@ -10,38 +10,13 @@
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
{
- u8 config, rev;
- u32 word;
-
- /* BIOS may enable hardware IRQ balancing for
- * E7520/E7320/E7525(revision ID 0x9 and below)
- * based platforms.
- * For those platforms, make sure that the genapic is set to 'flat'
- */
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev > 0x9)
- return;
-
- /* enable access to config space*/
- pci_read_config_byte(dev, 0xf4, &config);
- pci_write_config_byte(dev, 0xf4, config|0x2);
-
- /* read xTPR register */
- raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
-
- if (!(word & (1 << 13))) {
#ifdef CONFIG_X86_64
- if (genapic != &apic_flat)
- panic("APIC mode must be flat on this system\n");
+ if (genapic != &apic_flat)
+ panic("APIC mode must be flat on this system\n");
#elif defined(CONFIG_X86_GENERICARCH)
- if (genapic != &apic_default)
- panic("APIC mode must be default(flat) on this system. Use apic=default\n");
+ if (genapic != &apic_default)
+ panic("APIC mode must be default(flat) on this system. Use apic=default\n");
#endif
- }
-
- /* put back the original value for config space*/
- if (!(config & 0x2))
- pci_write_config_byte(dev, 0xf4, config);
}

void __init quirk_intel_irqbalance(void)

2007-02-10 12:03:26

by Andi Kleen

[permalink] [raw]
Subject: [PATCH 2.6.21 review I] [13/25] i386: Fix a typo in an IRQ handler name


From: "Maciej W. Rozycki" <[email protected]>

The "fasteoi" IRQ handler is named "fasteio" incorrectly. This is a fix.

Signed-off-by: Maciej W. Rozycki <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/i386/kernel/io_apic.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/i386/kernel/io_apic.c
===================================================================
--- linux.orig/arch/i386/kernel/io_apic.c
+++ linux/arch/i386/kernel/io_apic.c
@@ -2310,7 +2310,7 @@ static inline void __init check_timer(vo

disable_8259A_irq(0);
set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- "fasteio");
+ "fasteoi");
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);

2007-02-10 12:07:22

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [18/25] x86_64: Fix x86_64 ioremap base_address

On Saturday 10 February 2007 12:58, Arjan van de Ven wrote:
> Andi Kleen wrote:
> > From: OGAWA Hirofumi <[email protected]>
> >
> > Current mmconfig has some problems of remapped range.
>
>
> eh wasn't there a patch that just ignored the MCFG for the broken
> system instead?

That's done in a followup patch.

-Andi

2007-02-11 11:13:42

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels

Andi Kleen <[email protected]> writes:

> From: Ingo Molnar <[email protected]>
>
> Default to physical mode on hotplug CPU kernels. Further simplify and clean up
> the APIC initialization code.

Where is the code that the subject describes?

I have two problems here.

- I don't see anything handling the hotplug case, and forcing us to
physical mode.
- Ingo's other patch asserts that hotplug should be made to handle
logical delivery mode.

With logical delivery mode the experimental evidence is that the
destination cpu is a hint, and you can arrive at a cpu that is not
in your cpu mask. Now I only saw that problem on hyperthreaded cpus,
but we didn't have the code enabled long enough to see it in other
cases.

Maybe if I can finish getting irq migration back into process context,
and someone verifies that the cpu disable in the hotplug path actually
disables the cpu/hyperthread instead of sitting in a hlt loop, we
won't have a problem. But broadcast ipis and irqs that don't go where
you tell them to are things we need to be very careful with.

Eric

2007-02-12 09:32:44

by Jan Beulich

[permalink] [raw]
Subject: Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thread

>>> Andi Kleen <[email protected]> 10.02.07 12:50 >>>
>
>From: Jeff Dike <[email protected]>
>
>Kernel-mode traps on x86_64 can pollute the trap information for a previous
>userspace trap for which the signal has not yet been delivered to the
>process.
>
>do_trap and do_general_protection set task->thread.error_code and .trapno
>for kernel traps. If a kernel-mode trap arrives between the arrival of a
>userspace trap and the delivery of the associated SIGSEGV to the process,
>the process will get the kernel trap information in its sigcontext.
>
>This causes UML process segfaults, as the trapno that the UML kernel sees
>is 13, rather than the 14 for normal page faults. So, the UML kernel
>passes the SIGSEGV along to its process.
>
>I don't claim to fully understand the problem. On the one hand, a check in
>do_general_protection for a pending SIGSEGV turned up nothing. On the
>other hand, this patch fixed the UML process segfault problem.
>
>The patch below moves the setting of error_code and trapno so that it
>only happens in the case of userspace faults. As a side-effect, this
>should speed up kernel-mode fault handling a tiny bit.

This breaks consumers of notify_die() relying on the proper trap number being
passed, as the call to notify_die() from die() currently reads
current->thread.trap_no.

Also, you seem to leave other places where trap_no gets set untouched -
is this intentional (do_debug - probably correct here, kernel_math_error -
probably incorrect here)?

>I looked at i386, and there is a similar situation. In this case, there is
>duplicate code setting task->thread.error_code and trapno. I deleted one,
>leaving the copy that runs in the case of a userspace fault.

Likewise.

Jan

2007-02-12 09:56:50

by Jan Beulich

[permalink] [raw]
Subject: Re: [patches] [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure

>>> Andi Kleen <[email protected]> 10.02.07 12:50 >>>
>
>From: "Bryan O'Sullivan" <[email protected]>
>
>This copy routine is memcpy-compatible, but on some architectures will use
>cache-bypassing loads to avoid bringing the source data into the cache.
>
>One case where this is useful is when a device issues a DMA to a memory
>region, and the CPU must copy the DMAed data elsewhere before doing any work
>with it. Since the source data is read-once, write-never from the CPU's
>perspective, caching the data at those addresses can only evict potentially
>useful data.
>
>We provide an x86_64 implementation that uses SSE non-temporal loads, and a
>generic version that falls back to plain memcpy.
>
>Implementors for other arches should not use cache-bypassing stores to the
>destination, as in most cases, the destination is accessed almost immediately
>after a copy finishes.

This looks a little strange to me:
- the first 128 bytes are still going through the cache
- up to 192 bytes past the copied area are being marked non-temporal, while
there's nothing known about that area
- sfence seems questionable here, I would have thought this should be lfence,
or perhaps even none at all

Minor remarks would be to remove the double .align before .L12 and to
replace the or-ing of a register with itself with a test.

Jan

2007-02-12 10:25:46

by Andi Kleen

[permalink] [raw]
Subject: Re: [patches] [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure


> This looks a little strange to me:
> - the first 128 bytes are still going through the cache
> - up to 192 bytes past the copied area are being marked non-temporal, while
> there's nothing known about that area

Yes that seems quite bogus.

> - sfence seems questionable here, I would have thought this should be lfence,
> or perhaps even none at all

Agreed -- it's not needed.

I think I also objected earlier to the jump table, which is likely slower.

Will drop for now.

-Andi
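
For readers following the thread, a hedged user-space sketch of the
cache-bypassing-load idea under discussion. It is not the dropped
patch's code: the SSE4.1 _mm_stream_load_si128 intrinsic is used
purely as an illustration, and both pointers are assumed 16-byte
aligned with a length that is a multiple of 16:

#include <smmintrin.h>	/* SSE4.1 streaming load */
#include <stddef.h>

/* Non-temporal loads keep the read-once source out of the cache;
   the destination uses ordinary stores, since it is typically read
   again soon after the copy. */
static void copy_nt_loads(void *dst, void *src, size_t len)
{
	__m128i *d = dst;
	__m128i *s = src;
	size_t i;

	for (i = 0; i < len / 16; i++)
		_mm_store_si128(&d[i], _mm_stream_load_si128(&s[i]));
}

Note that on ordinary write-back memory many CPUs treat MOVNTDQA as a
plain load; the non-temporal hint is only architecturally meaningful
on write-combining mappings, which is part of why such routines are
hard to get right.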

2007-02-12 16:50:42

by Jeff Dike

[permalink] [raw]
Subject: Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thread

On Mon, Feb 12, 2007 at 09:32:10AM +0000, Jan Beulich wrote:
> This breaks consumers of notify_die() relying on the proper trap number being
> passed, as the call to notify_die() from die() currently reads
> current->thread.trap_no.

Rats, good point.

> Also, you seem to leave other places where trap_no gets set untouched -
> is this intentional (do_debug - probably correct here, kernel_math_error -
> probably incorrect here)?

I did check the other trap handlers. kernel_math_error calls die,
which calls do_exit(SIGSEGV). This doesn't seem to allow the process
the opportunity to trap the SIGSEGV and examine the fault information.

> >I looked at i386, and there is a similar situation. In this case, there is
> >duplicate code setting task->thread.error_code and trapno. I deleted one,
> >leaving the copy that runs in the case of a userspace fault.
>
> Likewise.

Yup. How does this patch look to you? We set error_code and trap_no
for userspace faults and kernel faults which call die(). We don't set
them for kernelspace faults which are fixed up.

Index: linux-2.6/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/traps.c
+++ linux-2.6/arch/i386/kernel/traps.c
@@ -619,6 +619,8 @@ gp_in_vm86:

gp_in_kernel:
if (!fixup_exception(regs)) {
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
return;
Index: linux-2.6/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.orig/arch/x86_64/kernel/traps.c
+++ linux-2.6/arch/x86_64/kernel/traps.c
@@ -605,8 +605,11 @@ static void __kprobes do_trap(int trapnr
fixup = search_exception_tables(regs->rip);
if (fixup)
regs->rip = fixup->fixup;
- else
+ else {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
die(str, regs, error_code);
+ }
return;
}
}


--
Work email - jdike at linux dot intel dot com

2007-02-12 17:01:27

by Jan Beulich

[permalink] [raw]
Subject: Re: [patches] [PATCH 2.6.21 review I] [4/25] x86: kernel-mode faults pollute current->thread

>Yup. How does this patch look to you? We set error_code and trap_no
>for userspace faults and kernel faults which call die(). We don't set
>them for kernelspace faults which are fixed up.

That seems a reasonable approach.

Thanks, Jan

2007-02-12 22:38:57

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels

On Sunday 11 February 2007 12:13, Eric W. Biederman wrote:
> Andi Kleen <[email protected]> writes:
>
> > From: Ingo Molnar <[email protected]>
> >
> > Default to physical mode on hotplug CPU kernels. Further simplify and clean up
> > the APIC initialization code.
>
> Where is the code that the subject describes?

True, that seems to be missing.

I agree that the patch seems to consist mostly of renaming, which
doesn't make it any easier to read.

And it's worrying that it doesn't handle the hotplug case at all.

> I have two problems here.
>
> - I don't see anything handling the hotplug case, and forcing us to
> physical mode.
> - Ingo's other patch asserts that hotplug should be made to handle
> logical delivery mode.
>
> With logical delivery mode the experimental evidence is that the
> destination cpu is a hint,

What experimental evidence did you have?

But I'm tempted to drop this unless the hotplug mystery can be cleared
up. There was past information that logical is unsafe for hotplug.

Ingo? Suresh?

-Andi

2007-02-12 23:11:21

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels

Andi Kleen <[email protected]> writes:


> What experimental evidence did you have?
>
> But I'm tempted to drop this unless the hotplug mystery can be cleared
> up. There was past information that logical is unsafe for hotplug.

Basically, as I commented in genapic_flat, at least on hyperthreading
cpus the destination mask is not always honored, and so if you only
allow one hyperthread I have seen the irq show up on the other hyperthread.

Now if the cpu is actually disabled I don't think that is a problem, but
I know early versions of hotplug did actually disable the cpu.

I think the renaming in this patch makes things clearer in a useful way.


The more I look at the hotplug cpu code the more I think it should be
filed under EXPERIMENTAL (as in the code is buggy and not ready for
production use yet).

Trying to see if I can improve the irq migration mess, I stumbled upon
the following. Currently set_affinity needs to be called with
irq_desc[irq].lock held, and it needs to be called from interrupt
context. And the 1 millisecond delay appears utterly bogus, although
the enable-irqs, do-something, disable-irqs sequence likely flushes
pending irqs, a problem that cannot occur differently if the irqs are
migrated from interrupt context.

Looking further, this buggy set_affinity usage also appears in
setup_ioapic_dest, although it is much less dangerous there,
as the code is largely a noop.

Is it just me, or are we crazy for supporting software controlled irq
migration?

void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;

for (irq = 0; irq < NR_IRQS; irq++) {
cpumask_t mask;
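/* irq 2 is the 8259 cascade and is never migrated */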
if (irq == 2)
continue;

cpus_and(mask, irq_desc[irq].affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
if (irq_desc[irq].chip->set_affinity)
irq_desc[irq].chip->set_affinity(irq, mask);
else if (irq_desc[irq].action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}

/* That doesn't seem sufficient. Give it 1ms. */
local_irq_enable();
mdelay(1);
local_irq_disable();
}

Eric

2007-02-13 00:17:57

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels

On Mon, Feb 12, 2007 at 11:36:23PM +0100, Andi Kleen wrote:
> True, that seems to be missing.
>
> I agree that the patch seems to consist mostly of renaming doesn't make
> it any easier to read.
>
> And it's worrying that it doesn't handle the hotplug case at all.

This patch is mostly a cleanup patch and doesn't have anything to do with
hotplug. I think the change log comment about hotplug kernels is a leftover
from an old patch.

> But I'm tempted to drop this unless the hotplug mystery can be cleared
> up. There was past information that logical is unsafe for hotplug.
>
> Ingo? Suresh?

logical clustered mode has problems with cpu hotplug.
( http://marc.theaimsgroup.com/?l=linux-kernel&m=113261865814107&w=2 )

I think logical flat is fine; we should be safe there as long as
we are not using APIC IPI shortcuts.

thanks,
suresh

2007-02-13 00:25:27

by Suresh Siddha

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [11/25] x86: default to physical mode on hotplug CPU kernels

On Mon, Feb 12, 2007 at 04:10:44PM -0700, Eric W. Biederman wrote:
> Basically as I commented in genapic_flat, that at least on hyperthreading
> cpus the destination mask is not always honored, and so if you only
> allow one hyperthread I have seen the irq show up on the other hyperthread.

Which platform is this? I haven't heard this before.

thanks,
suresh

2007-02-13 11:27:54

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH 2.6.21 review I] [21/25] x86_64: a memcpy that tries to reduce cache pressure

Andi Kleen wrote:
> From: "Bryan O'Sullivan" <[email protected]>
>
> This copy routine is memcpy-compatible, but on some architectures will use
> cache-bypassing loads to avoid bringing the source data into the cache.
>
> One case where this is useful is when a device issues a DMA to a memory
> region, and the CPU must copy the DMAed data elsewhere before doing any work
> with it. Since the source data is read-once, write-never from the CPU's
> perspective, caching the data at those addresses can only evict potentially
> useful data.
>
> We provide an x86_64 implementation that uses SSE non-temporal loads, and a
> generic version that falls back to plain memcpy.

> + movq %r11, 56(%rdi)
> + addq %rcx, %rdi
> + cmpq %rdx, %rcx /* is rdx >= 64? */
> + jbe .L42
> + sfence
> + orl %edx, %edx
> + je .L33

I have three questions/remarks

1) Just curious why sfence is necessary here ?

2) Shouldn't we use this for large buffers, and restrict them to a size
multiple of 64, to avoid all these conditional branches ?

3) Also, the first 128 bytes of the source buffer will be brought into the cache.