When the kernel command line option "retain_initrd" is set, we do not
free the initrd memory. However, we also don't expose it to anyone for
consumption. That leaves us in a weird situation where the only user of
this feature is ppc64 and arm64 specific kexec tooling.
To make it more generally useful, this patch adds a binary sysfs
attribute to the firmware kobject that exposes the initrd contents when
"retain_initrd" is set. That way, we can access the initrd any time
after boot from user space and, for example, hand it to kexec as the
--initrd parameter if we want to reboot with the same initrd, or
inspect it directly locally.
With this patch applied, there is a new /sys/firmware/initrd file when
the kernel was booted with an initrd and the "retain_initrd" command
line option is set.
Signed-off-by: Alexander Graf <[email protected]>
---
v1 -> v2:
- Reword commit message to explain the new file path
- Add a Documentation/ABI/testing/sysfs-firmware-initrd file
---
.../ABI/testing/sysfs-firmware-initrd | 8 ++++++++
.../admin-guide/kernel-parameters.txt | 5 +++--
init/initramfs.c | 18 +++++++++++++++++-
3 files changed, 28 insertions(+), 3 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-firmware-initrd
diff --git a/Documentation/ABI/testing/sysfs-firmware-initrd b/Documentation/ABI/testing/sysfs-firmware-initrd
new file mode 100644
index 000000000000..20bf7cf77a19
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-initrd
@@ -0,0 +1,8 @@
+What: /sys/firmware/initrd
+Date: December 2023
+Contact: Alexander Graf <[email protected]>
+Description:
+ When the kernel was booted with an initrd and the
+ "retain_initrd" option is set on the kernel command
+ line, /sys/firmware/initrd contains the contents of the
+ initrd that the kernel was booted with.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..51575cd31741 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2438,7 +2438,7 @@
between unregistering the boot console and initializing
the real console.
- keepinitrd [HW,ARM]
+ keepinitrd [HW,ARM] See retain_initrd.
kernelcore= [KNL,X86,IA-64,PPC]
Format: nn[KMGTPE] | nn% | "mirror"
@@ -5580,7 +5580,8 @@
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
- retain_initrd [RAM] Keep initrd memory after extraction
+ retain_initrd [RAM] Keep initrd memory after extraction. After boot, it will
+ be accessible via /sys/firmware/initrd.
retbleed= [X86] Control mitigation of RETBleed (Arbitrary
Speculative Code Execution with Return Instructions)
diff --git a/init/initramfs.c b/init/initramfs.c
index 8d0fd946cdd2..25244e2a5739 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -574,6 +574,16 @@ extern unsigned long __initramfs_size;
#include <linux/initrd.h>
#include <linux/kexec.h>
+static ssize_t raw_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t pos, size_t count)
+{
+ memcpy(buf, attr->private + pos, count);
+ return count;
+}
+
+static BIN_ATTR(initrd, 0440, raw_read, NULL, 0);
+
void __init reserve_initrd_mem(void)
{
phys_addr_t start;
@@ -715,8 +725,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
*/
- if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
+ if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) {
free_initrd_mem(initrd_start, initrd_end);
+ } else if (do_retain_initrd) {
+ bin_attr_initrd.size = initrd_end - initrd_start;
+ bin_attr_initrd.private = (void *)initrd_start;
+ if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd))
+ pr_err("Failed to create initrd sysfs file");
+ }
initrd_start = 0;
initrd_end = 0;
--
2.40.1
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On Wed, 2023-12-06 at 21:33 +0000, Alexander Graf wrote:
> --- a/init/initramfs.c
> +++ b/init/initramfs.c
> @@ -574,6 +574,16 @@ extern unsigned long __initramfs_size;
> #include <linux/initrd.h>
> #include <linux/kexec.h>
>
> +static ssize_t raw_read(struct file *file, struct kobject *kobj,
> + struct bin_attribute *attr, char *buf,
> + loff_t pos, size_t count)
> +{
> + memcpy(buf, attr->private + pos, count);
> + return count;
> +}
> +
> +static BIN_ATTR(initrd, 0440, raw_read, NULL, 0);
> +
> void __init reserve_initrd_mem(void)
> {
> phys_addr_t start;
> @@ -715,8 +725,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
> * If the initrd region is overlapped with crashkernel reserved region,
> * free only memory that is not part of crashkernel region.
> */
> - if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
> + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) {
> free_initrd_mem(initrd_start, initrd_end);
> + } else if (do_retain_initrd) {
> + bin_attr_initrd.size = initrd_end - initrd_start;
> + bin_attr_initrd.private = (void *)initrd_start;
> + if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd))
> + pr_err("Failed to create initrd sysfs file");
> + }
> initrd_start = 0;
> initrd_end = 0;
When adding this to my dev environment, I forgot to actually give QEMU
an initramfs file, but did add the retain_initrd cmdline param. This
resulted in a zero-sized /sys/firmware/initrd.
Trying to read that zero-sized file triggers a NULL pointer dereference
because attr->private is NULL.
Do you want to do some bounds checking or perhaps not expose the file if
there's not actually an initramfs?
I was also wondering if we need to do bounds checking on pos + count to
prevent reading outside the initrd data in general, but it seems like
the generic code does that.
JG
[ 17.942640] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 17.944465] #PF: supervisor read access in kernel mode
[ 17.945753] #PF: error_code(0x0000) - not-present page
[ 17.946901] PGD 0 P4D 0
[ 17.947397] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.948384] CPU: 0 PID: 325 Comm: cat Not tainted 6.4.0-rc7-00232-g6290264ae247-dirty #415
[ 17.948676] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
[ 17.948988] RIP: 0010:memcpy_orig+0x1e/0x140
[ 17.949142] Code: 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 48 89 f8 48 83 fa 20 0f 82 86 00 00 00 40 38 fe 7c 35 48 83 ea 20 48 83 ea 20 <4c> 8b 06 4c 8b 4e 08 4c 8b 567
[ 17.949914] RSP: 0018:ffffc90000347e18 EFLAGS: 00010206
[ 17.950103] RAX: ffff888104fc0000 RBX: ffff888101991f00 RCX: ffff888104fc0000
[ 17.950381] RDX: 0000000000000fc0 RSI: 0000000000000000 RDI: ffff888104fc0000
[ 17.950680] RBP: ffffc90000347e98 R08: 0000000000000000 R09: 0000000000001000
[ 17.950963] R10: ffff888103448900 R11: ffff888100140040 R12: 0000000000001000
[ 17.951223] R13: ffffc90000347e70 R14: 0000000000001000 R15: ffff888101991f20
[ 17.951552] FS: 00007f4ce18d7580(0000) GS:ffff88813dc00000(0000) knlGS:0000000000000000
[ 17.952021] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.952345] CR2: 0000000000000000 CR3: 000000010368c001 CR4: 0000000000770ef0
[ 17.952833] PKRU: 55555554
[ 17.953086] Call Trace:
[ 17.953234] <TASK>
[ 17.953345] ? __die+0x1f/0x70
[ 17.953518] ? page_fault_oops+0x156/0x420
[ 17.953693] ? exc_page_fault+0x69/0x150
[ 17.953876] ? asm_exc_page_fault+0x26/0x30
[ 17.954059] ? memcpy_orig+0x1e/0x140
[ 17.954220] raw_read+0x1b/0x30
[ 17.954438] kernfs_fop_read_iter+0xa2/0x1a0
[ 17.954696] vfs_read+0x1b4/0x2d0
[ 17.954844] ksys_read+0x5e/0xe0
[ 17.954985] do_syscall_64+0x3c/0x90
[ 17.955158] entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 17.955380] RIP: 0033:0x7f4ce17f1fd2
On Wed, Dec 06, 2023 at 09:33:23PM +0000, Alexander Graf wrote:
> diff --git a/Documentation/ABI/testing/sysfs-firmware-initrd b/Documentation/ABI/testing/sysfs-firmware-initrd
> new file mode 100644
> index 000000000000..20bf7cf77a19
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-firmware-initrd
> @@ -0,0 +1,8 @@
> +What: /sys/firmware/initrd
> +Date: December 2023
> +Contact: Alexander Graf <[email protected]>
> +Description:
> + When the kernel was booted with an initrd and the
> + "retain_initrd" option is set on the kernel command
> + line, /sys/firmware/initrd contains the contents of the
> + initrd that the kernel was booted with.
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 65731b060e3f..51575cd31741 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2438,7 +2438,7 @@
> between unregistering the boot console and initializing
> the real console.
>
> - keepinitrd [HW,ARM]
> + keepinitrd [HW,ARM] See retain_initrd.
>
> kernelcore= [KNL,X86,IA-64,PPC]
> Format: nn[KMGTPE] | nn% | "mirror"
> @@ -5580,7 +5580,8 @@
> Useful for devices that are detected asynchronously
> (e.g. USB and MMC devices).
>
> - retain_initrd [RAM] Keep initrd memory after extraction
> + retain_initrd [RAM] Keep initrd memory after extraction. After boot, it will
> + be accessible via /sys/firmware/initrd.
>
> retbleed= [X86] Control mitigation of RETBleed (Arbitrary
> Speculative Code Execution with Return Instructions)
> diff --git a/init/initramfs.c b/init/initramfs.c
> index 8d0fd946cdd2..25244e2a5739 100644
> --- a/init/initramfs.c
> +++ b/init/initramfs.c
> @@ -574,6 +574,16 @@ extern unsigned long __initramfs_size;
> #include <linux/initrd.h>
> #include <linux/kexec.h>
>
> +static ssize_t raw_read(struct file *file, struct kobject *kobj,
> + struct bin_attribute *attr, char *buf,
> + loff_t pos, size_t count)
> +{
> + memcpy(buf, attr->private + pos, count);
> + return count;
> +}
> +
> +static BIN_ATTR(initrd, 0440, raw_read, NULL, 0);
> +
> void __init reserve_initrd_mem(void)
> {
> phys_addr_t start;
> @@ -715,8 +725,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
> * If the initrd region is overlapped with crashkernel reserved region,
> * free only memory that is not part of crashkernel region.
> */
> - if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
> + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) {
> free_initrd_mem(initrd_start, initrd_end);
> + } else if (do_retain_initrd) {
> + bin_attr_initrd.size = initrd_end - initrd_start;
> + bin_attr_initrd.private = (void *)initrd_start;
> + if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd))
> + pr_err("Failed to create initrd sysfs file");
> + }
> initrd_start = 0;
> initrd_end = 0;
>
On my Arch Linux system, /sys/firmware/initrd is not the same as the
uncompressed initramfs image from the /boot partition. The `ls -l` listing
shows (where /tmp/initramfs-boot is the unzstd'ed initramfs of the same
booted kernel):
```
-r--r----- 1 root root 22967535 Dec 7 19:32 /sys/firmware/initrd
-rw------- 1 root root 40960000 Dec 7 19:26 /tmp/initramfs-boot
```
And thus, the `cpio -i -v` listing differs. With the uncompressed initramfs,
I got the expected initramfs contents (early userspace for booting), while
doing the same to /sys/firmware/initrd only shows Intel microcode.
Regardless, exposing initramfs as advertised in the patch description works for
me.
Tested-by: Bagas Sanjaya <[email protected]>
Thanks.
--
An old man doll... just what I always wanted! - Clara
Hi Bagas,
On 07.12.23 13:37, Bagas Sanjaya wrote:
> On Wed, Dec 06, 2023 at 09:33:23PM +0000, Alexander Graf wrote:
>> diff --git a/Documentation/ABI/testing/sysfs-firmware-initrd b/Documentation/ABI/testing/sysfs-firmware-initrd
>> new file mode 100644
>> index 000000000000..20bf7cf77a19
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-firmware-initrd
>> @@ -0,0 +1,8 @@
>> +What: /sys/firmware/initrd
>> +Date: December 2023
>> +Contact: Alexander Graf <[email protected]>
>> +Description:
>> + When the kernel was booted with an initrd and the
>> + "retain_initrd" option is set on the kernel command
>> + line, /sys/firmware/initrd contains the contents of the
>> + initrd that the kernel was booted with.
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index 65731b060e3f..51575cd31741 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -2438,7 +2438,7 @@
>> between unregistering the boot console and initializing
>> the real console.
>>
>> - keepinitrd [HW,ARM]
>> + keepinitrd [HW,ARM] See retain_initrd.
>>
>> kernelcore= [KNL,X86,IA-64,PPC]
>> Format: nn[KMGTPE] | nn% | "mirror"
>> @@ -5580,7 +5580,8 @@
>> Useful for devices that are detected asynchronously
>> (e.g. USB and MMC devices).
>>
>> - retain_initrd [RAM] Keep initrd memory after extraction
>> + retain_initrd [RAM] Keep initrd memory after extraction. After boot, it will
>> + be accessible via /sys/firmware/initrd.
>>
>> retbleed= [X86] Control mitigation of RETBleed (Arbitrary
>> Speculative Code Execution with Return Instructions)
>> diff --git a/init/initramfs.c b/init/initramfs.c
>> index 8d0fd946cdd2..25244e2a5739 100644
>> --- a/init/initramfs.c
>> +++ b/init/initramfs.c
>> @@ -574,6 +574,16 @@ extern unsigned long __initramfs_size;
>> #include <linux/initrd.h>
>> #include <linux/kexec.h>
>>
>> +static ssize_t raw_read(struct file *file, struct kobject *kobj,
>> + struct bin_attribute *attr, char *buf,
>> + loff_t pos, size_t count)
>> +{
>> + memcpy(buf, attr->private + pos, count);
>> + return count;
>> +}
>> +
>> +static BIN_ATTR(initrd, 0440, raw_read, NULL, 0);
>> +
>> void __init reserve_initrd_mem(void)
>> {
>> phys_addr_t start;
>> @@ -715,8 +725,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
>> * If the initrd region is overlapped with crashkernel reserved region,
>> * free only memory that is not part of crashkernel region.
>> */
>> - if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
>> + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) {
>> free_initrd_mem(initrd_start, initrd_end);
>> + } else if (do_retain_initrd) {
>> + bin_attr_initrd.size = initrd_end - initrd_start;
>> + bin_attr_initrd.private = (void *)initrd_start;
>> + if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd))
>> + pr_err("Failed to create initrd sysfs file");
>> + }
>> initrd_start = 0;
>> initrd_end = 0;
>>
> On my Arch Linux system, /sys/firmware/initrd is not same as initramfs image
> from /boot partition that is uncompressed. `ls -l` listing shows
> (with /tmp/initramfs-boot is unzstd'ed initramfs of the same kernel booted):
>
> ```
> -r--r----- 1 root root 22967535 Dec 7 19:32 /sys/firmware/initrd
> -rw------- 1 root root 40960000 Dec 7 19:26 /tmp/initramfs-boot
> ```
>
> And thus, `cpio -i -v` listing differs. While in uncompressed initramfs,
> I got expected initramfs contents (early userpace for booting), doing the same
> to /sys/firmware/initrd only shows Intel microcode.
>
> Regardless, exposing initramfs as advertised in the patch description works for
> me.
Thanks a bunch for testing the patch!
The reason you're seeing microcode is that something in your boot chain
(grub maybe? sd-boot?) sends multiple initrd blobs to Linux: One that
contains microcode and another that contains the real initrd. Linux
continues extracting past the first cpio archive.
Alex
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On Fri, Dec 08, 2023 at 12:54:18AM +0100, Alexander Graf wrote:
> Hi Bagas,
>
> On 07.12.23 13:37, Bagas Sanjaya wrote:
> > On my Arch Linux system, /sys/firmware/initrd is not same as initramfs image
> > from /boot partition that is uncompressed. `ls -l` listing shows
> > (with /tmp/initramfs-boot is unzstd'ed initramfs of the same kernel booted):
> >
> > ```
> > -r--r----- 1 root root 22967535 Dec 7 19:32 /sys/firmware/initrd
> > -rw------- 1 root root 40960000 Dec 7 19:26 /tmp/initramfs-boot
> > ```
> >
> > And thus, `cpio -i -v` listing differs. While in uncompressed initramfs,
> > I got expected initramfs contents (early userpace for booting), doing the same
> > to /sys/firmware/initrd only shows Intel microcode.
> >
> > Regardless, exposing initramfs as advertised in the patch description works for
> > me.
>
>
> Thanks a bunch for testing the patch!
>
> The reason you're seeing microcode is that something in your boot chain
> (grub maybe? sd-boot?) sends multiple initrd blobs to Linux: One that
> contains microcode and another that contains the real initrd. Linux
> continues extracting past the first cpio archive.
>
Yes, I use grub on my setup.
Ciao!
--
An old man doll... just what I always wanted! - Clara