LinuxLists.cc - [PATCH for 2.6.19] [1/9] x86_64: Fix partial page check to ensure unusable memory is not being marked usable.

2006-11-14 16:08:53

Subject: [PATCH for 2.6.19] [1/9] x86_64: Fix partial page check to ensure unusable memory is not being marked usable.

From: "Aaron Durbin" <[email protected]>
Fix partial page check in e820_register_active_regions to ensure
partial pages are
not being marked as active in the memory pool.

Signed-off-by: Aaron Durbin <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---
This was causing a machine to reboot w/ an area in the e820 that was less
than the page size because the upper address was being used to mark a hole as
active in the memory pool.

arch/x86_64/kernel/e820.c | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -278,7 +278,7 @@ e820_register_active_regions(int nid, un
>> PAGE_SHIFT;

/* Skip map entries smaller than a page */
- if (ei_startpfn > ei_endpfn)
+ if (ei_startpfn >= ei_endpfn)
continue;

/* Check if end_pfn_map should be updated */

2006-11-14 16:09:52

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [8/9] x86_64: Fix vgetcpu when CONFIG_HOTPLUG_CPU is disabled

The vgetcpu per CPU initialization previously relied on CPU hotplug
events for all CPUs to initialize the per CPU state. That
worked only on kernels with CONFIG_HOTPLUG_CPU enabled. On the
others some CPUs didn't get their state initialized properly
and vgetcpu wouldn't work.

Change the initialization sequence to instead run in a normal
initcall (which runs after the normal CPU bootup) and initialize
all running CPUs there. Later hotplug CPUs are still handled
with a hotplug notifier.

This actually simplifies the code somewhat.

Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/kernel/smp.c | 3 --
arch/x86_64/kernel/time.c | 11 ----------
arch/x86_64/kernel/vsyscall.c | 45 +++++++++++++++++++++++-------------------
include/asm-x86_64/vsyscall.h | 2 -
4 files changed, 26 insertions(+), 35 deletions(-)

Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c
+++ linux/arch/x86_64/kernel/vsyscall.c
@@ -27,6 +27,9 @@
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
@@ -243,32 +246,17 @@ static ctl_table kernel_root_table2[] =

#endif

-static void __cpuinit write_rdtscp_cb(void *info)
-{
- write_rdtscp_aux((unsigned long)info);
-}
-
-void __cpuinit vsyscall_set_cpu(int cpu)
+/* Assume __initcall executes before all user space. Hopefully kmod
+ doesn't violate that. We'll find out if it does. */
+static void __cpuinit vsyscall_set_cpu(int cpu)
{
unsigned long *d;
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node[cpu];
#endif
- if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
- void *info = (void *)((node << 12) | cpu);
- /* Can happen on preemptive kernel */
- if (get_cpu() == cpu)
- write_rdtscp_cb(info);
-#ifdef CONFIG_SMP
- else {
- /* the notifier is unfortunately not executed on the
- target CPU */
- smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
- }
-#endif
- put_cpu();
- }
+ if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
+ write_rdtscp_aux((node << 12) | cpu);

/* Store cpu number in limit so that it can be loaded quickly
in user space in vgetcpu.
@@ -280,6 +268,21 @@ void __cpuinit vsyscall_set_cpu(int cpu)
*d |= (node >> 4) << 48;
}

+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+ /* preemption should be already off */
+ vsyscall_set_cpu(raw_smp_processor_id());
+}
+
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+ long cpu = (long)arg;
+ if (action == CPU_ONLINE)
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ return NOTIFY_DONE;
+}
+
static void __init map_vsyscall(void)
{
extern char __vsyscall_0;
@@ -299,6 +302,8 @@ static int __init vsyscall_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2, 0);
#endif
+ on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ hotcpu_notifier(cpu_vsyscall_notifier, 0);
return 0;
}

Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h
+++ linux/include/asm-x86_64/vsyscall.h
@@ -59,8 +59,6 @@ extern seqlock_t xtime_lock;

extern int sysctl_vsyscall;

-extern void vsyscall_set_cpu(int cpu);
-
#define ARCH_HAVE_XTIME_LOCK 1

#endif /* __KERNEL__ */
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -876,15 +876,6 @@ static struct irqaction irq0 = {
timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
};

-static int __cpuinit
-time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
- unsigned cpu = (unsigned long) hcpu;
- if (action == CPU_ONLINE)
- vsyscall_set_cpu(cpu);
- return NOTIFY_DONE;
-}
-
void __init time_init(void)
{
if (nohpet)
@@ -925,8 +916,6 @@ void __init time_init(void)
vxtime.last_tsc = get_cycles_sync();
set_cyc2ns_scale(cpu_khz);
setup_irq(0, &irq0);
- hotcpu_notifier(time_cpu_notifier, 0);
- time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());

#ifndef CONFIG_SMP
time_init_gtod();
Index: linux/arch/x86_64/kernel/smp.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smp.c
+++ linux/arch/x86_64/kernel/smp.c
@@ -376,9 +376,8 @@ int smp_call_function_single (int cpu, v
/* prevent preemption and reschedule on another processor */
int me = get_cpu();
if (cpu == me) {
- WARN_ON(1);
put_cpu();
- return -EBUSY;
+ return 0;
}
spin_lock_bh(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait);

2006-11-14 16:09:21

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [7/9] x86: Add acpi_use_timer_override option for Asus boards

Timer overrides are normally disabled on Nvidia boards because
they are commonly wrong, except on new ones with HPET support.
Unfortunately there are quite a few Asus boards around that
don't have HPET, but need a timer override.

We don't know yet how to handle this transparently,
but at least add a command line option to force the timer override
and let them boot.

Cc: [email protected]

Signed-off-by: Andi Kleen <[email protected]>

---
Documentation/kernel-parameters.txt | 4 ++++
arch/i386/kernel/acpi/boot.c | 8 ++++++++
arch/i386/kernel/acpi/earlyquirk.c | 8 +++++++-
arch/x86_64/kernel/early-quirks.c | 8 ++++++++
include/asm-i386/acpi.h | 1 +
include/asm-x86_64/acpi.h | 1 +
6 files changed, 29 insertions(+), 1 deletion(-)

Index: linux/arch/i386/kernel/acpi/boot.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/boot.c
+++ linux/arch/i386/kernel/acpi/boot.c
@@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict);
acpi_interrupt_flags acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
int acpi_skip_timer_override __initdata;
+int acpi_use_timer_override __initdata;

#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_
return 0;
}
early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+
+static int __init parse_acpi_use_timer_override(char *arg)
+{
+ acpi_use_timer_override = 1;
+ return 0;
+}
+early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
#endif /* CONFIG_X86_IO_APIC */

static int __init setup_acpi_sci(char *s)
Index: linux/arch/i386/kernel/acpi/earlyquirk.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/earlyquirk.c
+++ linux/arch/i386/kernel/acpi/earlyquirk.c
@@ -27,11 +27,17 @@ static int __init check_bridge(int vendo
#ifdef CONFIG_ACPI
/* According to Nvidia all timer overrides are bogus unless HPET
is enabled. */
- if (vendor == PCI_VENDOR_ID_NVIDIA) {
+ if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
nvidia_hpet_detected = 0;
acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
if (nvidia_hpet_detected == 0) {
acpi_skip_timer_override = 1;
+ printk(KERN_INFO "Nvidia board "
+ "detected. Ignoring ACPI "
+ "timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
+
}
}
#endif
Index: linux/arch/x86_64/kernel/early-quirks.c
===================================================================
--- linux.orig/arch/x86_64/kernel/early-quirks.c
+++ linux/arch/x86_64/kernel/early-quirks.c
@@ -45,7 +45,13 @@ static void nvidia_bugs(void)
/*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
+ * Unfortunately that's not true on many Asus boards.
+ * We don't know yet how to detect this automatically, but
+ * at least allow a command line override.
*/
+ if (acpi_use_timer_override)
+ return;
+
nvidia_hpet_detected = 0;
acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
if (nvidia_hpet_detected == 0) {
@@ -53,6 +59,8 @@ static void nvidia_bugs(void)
printk(KERN_INFO "Nvidia board "
"detected. Ignoring ACPI "
"timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
}
#endif
/* RED-PEN skip them on mptables too? */
Index: linux/include/asm-i386/acpi.h
===================================================================
--- linux.orig/include/asm-i386/acpi.h
+++ linux/include/asm-i386/acpi.h
@@ -132,6 +132,7 @@ extern int acpi_gsi_to_irq(u32 gsi, unsi

#ifdef CONFIG_X86_IO_APIC
extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
#endif

static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
Index: linux/include/asm-x86_64/acpi.h
===================================================================
--- linux.orig/include/asm-x86_64/acpi.h
+++ linux/include/asm-x86_64/acpi.h
@@ -163,6 +163,7 @@ extern u8 x86_acpiid_to_apicid[];
#define ARCH_HAS_POWER_INIT 1

extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;

#endif /*__KERNEL__*/

Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -164,6 +164,10 @@ and is between 256 and 4096 characters.
acpi_skip_timer_override [HW,ACPI]
Recognize and ignore IRQ0/pin2 Interrupt Override.
For broken nForce2 BIOS resulting in XT-PIC timer.
+ acpi_use_timer_override [HW,ACPI}
+ Use timer override. For some broken Nvidia NF5 boards
+ that require a timer override, but don't have
+ HPET

acpi_dbg_layer= [HW,ACPI]
Format: <int>

2006-11-14 16:09:52

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [6/9] x86_64: Update MMCONFIG resource insertion to check against e820 map.

From: "Aaron Durbin" <[email protected]>
Check to see if MMCONFIG region is marked as reserved in the e820 map before
inserting the MMCONFIG region into the resource map. If the region is not
entirely marked as reserved in the e820 map attempt to find a region that is.
Only insert the MMCONFIG region into the resource map if there was a region
found marked as reserved in the e820 map. This should fix a known regression
in 2.6.19 by not reserving all of the I/O space on misconfigured systems.

Signed-off-by: Andi Kleen <[email protected]>

---

This patch is against 2.6.19-rc4.

arch/x86_64/pci/mmconfig.c | 76 ++++++++++++++++++++++++++++++++++++++-------
1 files changed, 65 insertions(+), 11 deletions(-)

Index: linux/arch/x86_64/pci/mmconfig.c
===================================================================
--- linux.orig/arch/x86_64/pci/mmconfig.c
+++ linux/arch/x86_64/pci/mmconfig.c
@@ -163,33 +163,87 @@ static __init void unreachable_devices(v
}
}

+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+/* Check the given mcfg_entry to see if its reported address range is marked
+ * as reserved in the e820 map. If it is not entirely marked as reserved it
+ * attempts to find a given bus range that is marked as reserved. If no range
+ * is determined, do not insert the MCFG resource into the resource map. */
+static __init void pci_mmcfg_check_and_insert_resource(int mcfg_entry,
+ struct resource *res)
+{
+ struct acpi_table_mcfg_config *mcfg;
+ unsigned start_bus_num, end_bus_num;
+ unsigned num_buses;
+
+ mcfg = &pci_mmcfg_config[mcfg_entry];
+
+ start_bus_num = mcfg->start_bus_number;
+ end_bus_num = mcfg->end_bus_number;
+
+ if (end_bus_num < start_bus_num) {
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG region %u has "
+ "misconfigured bus entries [%u,%u].\n",
+ mcfg_entry, mcfg->start_bus_number,
+ mcfg->end_bus_number);
+ return;
+ }
+
+ while (end_bus_num >= start_bus_num) {
+ num_buses = end_bus_num - start_bus_num + 1;
+ if (e820_all_mapped(mcfg->base_address,
+ mcfg->base_address + (num_buses << 20) -1,
+ E820_RESERVED))
+ break;
+ end_bus_num--;
+ }
+
+ if (mcfg->end_bus_number != end_bus_num) {
+ unsigned long end_addr;
+ unsigned long start_addr;
+ start_addr = mcfg->base_address;
+ num_buses = mcfg->end_bus_number - mcfg->start_bus_number + 1;
+ end_addr = mcfg->base_address + (num_buses << 20) - 1;
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG region %u not entirely "
+ "marked as e280-reserved (%016lx-%016lx).\n",
+ mcfg_entry, start_addr, end_addr);
+ }
+
+ /* If we could not find a region reserved in the e820 then we should
+ * not reserve the resource. We will hope for the best that there
+ * are no collisions. */
+ if (end_bus_num < start_bus_num)
+ return;
+
+ /* Fixup the resource limits for allocation without affecting the
+ * reported bus number limits in the MCFG table. */
+ num_buses = end_bus_num - start_bus_num + 1;
+ res->start = mcfg->base_address;
+ res->end = res->start + (num_buses << 20) - 1;
+
+ snprintf((char *)res->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+ "PCI MMCONFIG %u", mcfg->pci_segment_group_number);
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+}
+
static __init void pci_mmcfg_insert_resources(void)
{
-#define PCI_MMCFG_RESOURCE_NAME_LEN 19
int i;
struct resource *res;
char *names;
- unsigned num_buses;

res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
pci_mmcfg_config_num, GFP_KERNEL);

if (!res) {
- printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+ printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources.\n");
return;
}

names = (void *)&res[pci_mmcfg_config_num];
for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
- num_buses = pci_mmcfg_config[i].end_bus_number -
- pci_mmcfg_config[i].start_bus_number + 1;
res->name = names;
- snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
- pci_mmcfg_config[i].pci_segment_group_number);
- res->start = pci_mmcfg_config[i].base_address;
- res->end = res->start + (num_buses << 20) - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
+ pci_mmcfg_check_and_insert_resource(i, res);
names += PCI_MMCFG_RESOURCE_NAME_LEN;
}
}

2006-11-14 16:10:36

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [3/9] x86_64: shorten the x86_64 boot setup GDT to what the comment says

From: Steven Rostedt <[email protected]>

Stephen Tweedie, Herbert Xu, and myself have been struggling with a very
nasty bug in Xen. But it also pointed out a small bug in the x86_64
kernel boot setup.

The GDT limit being setup by the initial bzImage code when entering into
protected mode is way too big. The comment by the code states that the
size of the GDT is 2048, but the actual size being set up is much bigger
(32768). This happens simply because of one extra '0'.

Instead of setting up a 0x800 size, 0x8000 is set up. On bare metal this
is fine because the CPU won't load any segments unless they are
explicitly used. But unfortunately, this breaks Xen on vmx FV, since it
(for now) blindly loads all the segments into the VMCS if they are less
than the gdt limit. Since the real mode segments are around 0x3000, we are
getting junk into the VMCS and that later causes an exception.

Stephen Tweedie has written up a patch to fix the Xen side and will be
submitting that to those folks. But that doesn't excuse the GDT limit
being a magnitude too big.

AK: changed to compute true gdt size in assembler, fixed comment

Signed-off-by: Steven Rostedt <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/boot/setup.S | 5 ++---
1 files changed, 2 insertions(+), 3 deletions(-)

Index: linux/arch/x86_64/boot/setup.S
===================================================================
--- linux.orig/arch/x86_64/boot/setup.S
+++ linux/arch/x86_64/boot/setup.S
@@ -836,13 +836,12 @@ gdt:
.word 0x9200 # data read/write
.word 0x00CF # granularity = 4096, 386
# (+5th nibble of limit)
+gdt_end:
idt_48:
.word 0 # idt limit = 0
.word 0, 0 # idt base = 0L
gdt_48:
- .word 0x8000 # gdt limit=2048,
- # 256 GDT entries
-
+ .word gdt_end-gdt-1 # gdt limit
.word 0, 0 # gdt base (filled in later)

# Include video setup & detection code

2006-11-14 16:10:36

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [9/9] x86_64: Fix race in exit_idle

When another interrupt happens in exit_idle the exit idle notifier
could be called an incorrect number of times.

Add a test_and_clear_bit_pda and use it to handle the bit
atomically against interrupts to avoid this.

Pointed out by Stephane Eranian

Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/kernel/process.c | 3 +--
include/asm-x86_64/pda.h | 9 +++++++++
2 files changed, 10 insertions(+), 2 deletions(-)

Index: linux/include/asm-x86_64/pda.h
===================================================================
--- linux.orig/include/asm-x86_64/pda.h
+++ linux/include/asm-x86_64/pda.h
@@ -109,6 +109,15 @@ extern struct x8664_pda _proxy_pda;
#define sub_pda(field,val) pda_to_op("sub",field,val)
#define or_pda(field,val) pda_to_op("or",field,val)

+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define test_and_clear_bit_pda(bit,field) ({ \
+ int old__; \
+ asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
+ : "=r" (old__), "+m" (_proxy_pda.field) \
+ : "dIr" (bit), "i" (pda_offset(field)) : "memory"); \
+ old__; \
+})
+
#endif

#define PDA_STACKOFFSET (5*8)
Index: linux/arch/x86_64/kernel/process.c
===================================================================
--- linux.orig/arch/x86_64/kernel/process.c
+++ linux/arch/x86_64/kernel/process.c
@@ -88,9 +88,8 @@ void enter_idle(void)

static void __exit_idle(void)
{
- if (read_pda(isidle) == 0)
+ if (test_and_clear_bit_pda(0, isidle) == 0)
return;
- write_pda(isidle, 0);
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

2006-11-14 16:08:55

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [2/9] x86_64: Fix PTRACE_[SG]ET_THREAD_AREA regression with ia32 emulation.

ptrace(PTRACE_[SG]ET_THREAD_AREA) calls from ia32 code
should be passed onto the x86_64 implementation.

The default case in sys32_ptrace used to call sys_ptrace(), but is
now EINVAL. This patch fixes a regression caused by that change.

Signed-off-by: Mike McCormack <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/ia32/ptrace32.c | 2 ++
1 files changed, 2 insertions(+)

Index: linux/arch/x86_64/ia32/ptrace32.c
===================================================================
--- linux.orig/arch/x86_64/ia32/ptrace32.c
+++ linux/arch/x86_64/ia32/ptrace32.c
@@ -244,6 +244,8 @@ asmlinkage long sys32_ptrace(long reques
case PTRACE_DETACH:
case PTRACE_SYSCALL:
case PTRACE_SETOPTIONS:
+ case PTRACE_SET_THREAD_AREA:
+ case PTRACE_GET_THREAD_AREA:
return sys_ptrace(request, pid, addr, data);

default:

2006-11-14 16:09:28

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [5/9] x86_64: setup saved_max_pfn correctly (kdump)

From: Magnus Damm <[email protected]>
x86_64: setup saved_max_pfn correctly

2.6.19-rc4 has broken CONFIG_CRASH_DUMP support on x86_64. It is impossible
to read out the kernel contents from /proc/vmcore because saved_max_pfn is set
to zero instead of the max_pfn value before the user map is setup.

This happens because saved_max_pfn is initialized at parse_early_param() time,
and at this time no active regions have been registered. saved_max_pfn is set up
from e820_end_of_ram(), or more exactly find_max_pfn_with_active_regions(), which
returns 0 because no regions exist.

This patch fixes this by registering the active regions before and removing
them after the call to e820_end_of_ram().

Signed-off-by: Magnus Damm <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>

---

Applies to 2.6.19-rc4.

arch/x86_64/kernel/e820.c | 2 ++
1 files changed, 2 insertions(+)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -594,7 +594,9 @@ static int __init parse_memmap_opt(char
* size before original memory map is
* reset.
*/
+ e820_register_active_regions(0, 0, -1UL);
saved_max_pfn = e820_end_of_ram();
+ remove_all_active_ranges();
#endif
end_pfn_map = 0;
e820.nr_map = 0;

2006-11-14 16:09:28

by Andi Kleen

[permalink] [raw]

Subject: [PATCH for 2.6.19] [4/9] x86_64: Handle reserve_bootmem_generic beyond end_pfn

This can happen on kexec kernels with some configurations, in particular
on Unisys ES7000 systems.

Analysis by Amul Shah

Cc: Amul Shah <[email protected]>

Signed-off-by: Andi Kleen <[email protected]>

---
arch/x86_64/mm/init.c | 15 ++++++++++++++-
1 files changed, 14 insertions(+), 1 deletion(-)

Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -655,9 +655,22 @@ void free_initrd_mem(unsigned long start

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
- /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
int nid = phys_to_nid(phys);
+#endif
+ unsigned long pfn = phys >> PAGE_SHIFT;
+ if (pfn >= end_pfn) {
+ /* This can happen with kdump kernels when accessing firmware
+ tables. */
+ if (pfn < end_pfn_map)
+ return;
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ phys, len);
+ return;
+ }
+
+ /* Should check here against the e820 map to avoid double free */
+#ifdef CONFIG_NUMA
reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
reserve_bootmem(phys, len);

2006-11-14 18:38:52

by Andi Kleen

[permalink] [raw]

Subject: Re: [PATCH for 2.6.19] [6/9] x86_64: Update MMCONFIG resource insertion to check against e820 map.

Andi Kleen <[email protected]> writes:

> From: "Aaron Durbin" <[email protected]>
> Check to see if MMCONFIG region is marked as reserved in the e820 map before
> inserting the MMCONFIG region into the resource map. If the region is not
> entirely marked as reserved in the e820 map attempt to find a region that is.
> Only insert the MMCONFIG region into the resource map if there was a region
> found marked as reserved in the e820 map. This should fix a known regression
> in 2.6.19 by not reserving all of the I/O space on misconfigured systems.

[...]

Before anyone complains: this one patch is actually not in, because
Linus' decision was instead to revert the mcfg reservation
code for .19. He already did it for i386 and I followed on x86-64.
But this patch went into the posted patchkit by mistake.
It will probably be revisited for .20.

-Andi

2006-11-14 18:47:30

by Aaron Durbin

[permalink] [raw]

Subject: Re: [PATCH for 2.6.19] [6/9] x86_64: Update MMCONFIG resource insertion to check against e820 map.

On 14 Nov 2006 19:38:49 +0100, Andi Kleen <[email protected]> wrote:
> Andi Kleen <[email protected]> writes:
>
> > From: "Aaron Durbin" <[email protected]>
> > Check to see if MMCONFIG region is marked as reserved in the e820 map before
> > inserting the MMCONFIG region into the resource map. If the region is not
> > entirely marked as reserved in the e820 map attempt to find a region that is.
> > Only insert the MMCONFIG region into the resource map if there was a region
> > found marked as reserved in the e820 map. This should fix a known regression
> > in 2.6.19 by not reserving all of the I/O space on misconfigured systems.
>
>
> [...]
>
> Before anyone complains. This one patch is actually not in, because
> Linus' decision was it instead to revert the mcfg reservation
> code for .19. He already did it for i386 and i followed on x86-64.
> But this patch went into the posted patchkit by mistake.
> Will be probably revisited for .20.

I would like to know what others think regarding this area. I think it
would be a good
idea to converge the mmconfig.c implementations for both x86-64 and i386. Is
this not feasible for some reasons I am unaware of? It should lead to more
code reuse and allow for a more unified stance in how both architectures handle
the PCI memory-mapped config space.

What are everyone's thoughts and ideas on such a suggestion?

I think the resource allocation can be addressed in the future after we have
tackled a unified approach.

-Aaron

2006-11-14 18:58:22

by Andi Kleen

[permalink] [raw]

Subject: Re: [PATCH for 2.6.19] [6/9] x86_64: Update MMCONFIG resource insertion to check against e820 map.

> I would like to know what others think regarding this area. I think it
> would be a good
> idea to converge the mmconfig.c implementations for both x86-64 and i386. Is
> this not feasable for some reasons I am unaware of? It should lead to more
> code reuse and allow for a more unified stance in how both architectures handle
> the PCI memory-mapped config space.

Yes, it should be done. But not 100% because x86-64 can use a much more
efficient mapping scheme than i386.

Probably with a mmconfig-common.c. When mmconfig.c was originally written
there wasn't that much support code and the fork wasn't an issue; it has just
grown over time as we work around more and more bugs.

-Andi

2006-11-15 10:01:21

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH for 2.6.19] [3/9] x86_64: shorten the x86_64 boot setup GDT to what the comment says

> From: Steven Rostedt <[email protected]>
>
> Stephen Tweedie, Herbert Xu, and myself have been struggling with a very
> nasty bug in Xen. But it also pointed out a small bug in the x86_64
> kernel boot setup.
>
> The GDT limit being setup by the initial bzImage code when entering into
> protected mode is way too big. The comment by the code states that the
> size of the GDT is 2048, but the actual size being set up is much bigger
> (32768). This happens simply because of one extra '0'.
>
> Instead of setting up a 0x800 size, 0x8000 is set up. On bare metal this
> is fine because the CPU wont load any segments unless they are
> explicitly used. But unfortunately, this breaks Xen on vmx FV, since it
> (for now) blindly loads all the segments into the VMCS if they are less
> than the gdt limit. Since the real mode segments are around 0x3000, we are
> getting junk into the VMCS and that later causes an exception.
>
> Stephen Tweedie has written up a patch to fix the Xen side and will be
> submitting that to those folks. But that doesn't excuse the GDT limit
> being a magnitude too big.
>
> AK: changed to compute true gdt size in assembler, fixed comment
>
> Signed-off-by: Steven Rostedt <[email protected]>
> Signed-off-by: Andi Kleen <[email protected]>

Acked-by: Ingo Molnar <[email protected]>

note, it seems to me that i386 had this fix years ago already.

Ingo