2002-11-02 08:08:17

by Jeff Garzik

[permalink] [raw]
Subject: [BK PATCHES] initramfs merge, part 1 of N

diff -Nru a/Makefile b/Makefile
--- a/Makefile Sat Nov 2 02:34:50 2002
+++ b/Makefile Sat Nov 2 02:34:50 2002
@@ -209,7 +209,7 @@
drivers-y := drivers/ sound/
net-y := net/
libs-y := lib/
-core-y :=
+core-y := usr/
SUBDIRS :=

ifeq ($(filter $(noconfig_targets),$(MAKECMDGOALS)),)
diff -Nru a/arch/i386/Makefile b/arch/i386/Makefile
--- a/arch/i386/Makefile Sat Nov 2 02:34:50 2002
+++ b/arch/i386/Makefile Sat Nov 2 02:34:50 2002
@@ -18,6 +18,7 @@

LDFLAGS := -m elf_i386
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
+ARCHBLOBLFLAGS := -I binary -O elf32-i386 -B i386
LDFLAGS_vmlinux := -e stext

CFLAGS += -pipe
diff -Nru a/arch/i386/vmlinux.lds.S b/arch/i386/vmlinux.lds.S
--- a/arch/i386/vmlinux.lds.S Sat Nov 2 02:34:50 2002
+++ b/arch/i386/vmlinux.lds.S Sat Nov 2 02:34:50 2002
@@ -77,6 +77,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(4096);
+ __initramfs_start = .;
+ .init.ramfs : { *(.init.initramfs) }
+ __initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
.data.percpu : { *(.data.percpu) }
diff -Nru a/init/Makefile b/init/Makefile
--- a/init/Makefile Sat Nov 2 02:34:50 2002
+++ b/init/Makefile Sat Nov 2 02:34:50 2002
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#

-obj-y := main.o version.o do_mounts.o
+obj-y := main.o version.o do_mounts.o initramfs.o

# files to be removed upon make clean
clean-files := ../include/linux/compile.h
diff -Nru a/init/do_mounts.c b/init/do_mounts.c
--- a/init/do_mounts.c Sat Nov 2 02:34:50 2002
+++ b/init/do_mounts.c Sat Nov 2 02:34:50 2002
@@ -748,9 +748,7 @@
mount_initrd = 0;
real_root_dev = ROOT_DEV;
#endif
- sys_mkdir("/dev", 0700);
- sys_mkdir("/root", 0700);
- sys_mknod("/dev/console", S_IFCHR|0600, MKDEV(TTYAUX_MAJOR, 1));
+
#ifdef CONFIG_DEVFS_FS
sys_mount("devfs", "/dev", "devfs", 0, NULL);
do_devfs = 1;
diff -Nru a/init/initramfs.c b/init/initramfs.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/init/initramfs.c Sat Nov 2 02:34:50 2002
@@ -0,0 +1,462 @@
+#define __KERNEL_SYSCALLS__
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/unistd.h>
+#include <linux/delay.h>
+
+static void __init error(char *x)
+{
+ panic("populate_root: %s\n", x);
+}
+
+static void __init *malloc(int size)
+{
+ return kmalloc(size, GFP_KERNEL);
+}
+
+static void __init free(void *where)
+{
+ kfree(where);
+}
+
+asmlinkage long sys_mkdir(char *name, int mode);
+asmlinkage long sys_mknod(char *name, int mode, dev_t dev);
+asmlinkage long sys_symlink(char *old, char *new);
+asmlinkage long sys_link(char *old, char *new);
+asmlinkage long sys_write(int fd, void *buf, ssize_t size);
+asmlinkage long sys_chown(char *name, uid_t uid, gid_t gid);
+asmlinkage long sys_lchown(char *name, uid_t uid, gid_t gid);
+asmlinkage long sys_fchown(int fd, uid_t uid, gid_t gid);
+asmlinkage long sys_chmod(char *name, mode_t mode);
+asmlinkage long sys_fchmod(int fd, mode_t mode);
+
+/* link hash */
+
+static struct hash {
+ int ino, minor, major;
+ struct hash *next;
+ char *name;
+} *head[32];
+
+static inline int hash(int major, int minor, int ino)
+{
+ unsigned long tmp = ino + minor + (major << 3);
+ tmp += tmp >> 5;
+ return tmp & 31;
+}
+
+static char __init *find_link(int major, int minor, int ino, char *name)
+{
+ struct hash **p, *q;
+ for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) {
+ if ((*p)->ino != ino)
+ continue;
+ if ((*p)->minor != minor)
+ continue;
+ if ((*p)->major != major)
+ continue;
+ return (*p)->name;
+ }
+ q = (struct hash *)malloc(sizeof(struct hash));
+ if (!q)
+ error("can't allocate link hash entry");
+ q->ino = ino;
+ q->minor = minor;
+ q->major = major;
+ q->name = name;
+ q->next = NULL;
+ *p = q;
+ return NULL;
+}
+
+static void __init free_hash(void)
+{
+ struct hash **p, *q;
+ for (p = head; p < head + 32; p++) {
+ while (*p) {
+ q = *p;
+ *p = q->next;
+ free(q);
+ }
+ }
+}
+
+/* cpio header parsing */
+
+static __initdata unsigned long ino, major, minor, nlink;
+static __initdata mode_t mode;
+static __initdata unsigned long body_len, name_len;
+static __initdata uid_t uid;
+static __initdata gid_t gid;
+static __initdata dev_t rdev;
+
+static void __init parse_header(char *s)
+{
+ unsigned long parsed[12];
+ char buf[9];
+ int i;
+
+ buf[8] = '\0';
+ for (i = 0, s += 6; i < 12; i++, s += 8) {
+ memcpy(buf, s, 8);
+ parsed[i] = simple_strtoul(buf, NULL, 16);
+ }
+ ino = parsed[0];
+ mode = parsed[1];
+ uid = parsed[2];
+ gid = parsed[3];
+ nlink = parsed[4];
+ body_len = parsed[6];
+ major = parsed[7];
+ minor = parsed[8];
+ rdev = MKDEV(parsed[9], parsed[10]);
+ name_len = parsed[11];
+}
+
+/* FSM */
+
+enum state {
+ Start,
+ Collect,
+ GotHeader,
+ SkipIt,
+ GotName,
+ CopyFile,
+ GotSymlink,
+ Reset
+} state, next_state;
+
+char *victim;
+unsigned count;
+loff_t this_header, next_header;
+
+static inline void eat(unsigned n)
+{
+ victim += n;
+ this_header += n;
+ count -= n;
+}
+
+#define N_ALIGN(len) ((((len) + 1) & ~3) + 2)
+
+static __initdata char *collected;
+static __initdata int remains;
+static __initdata char *collect;
+
+static void __init read_into(char *buf, unsigned size, enum state next)
+{
+ if (count >= size) {
+ collected = victim;
+ eat(size);
+ state = next;
+ } else {
+ collect = collected = buf;
+ remains = size;
+ next_state = next;
+ state = Collect;
+ }
+}
+
+static __initdata char *header_buf, *symlink_buf, *name_buf;
+
+static int __init do_start(void)
+{
+ read_into(header_buf, 110, GotHeader);
+ return 0;
+}
+
+static int __init do_collect(void)
+{
+ unsigned n = remains;
+ if (count < n)
+ n = count;
+ memcpy(collect, victim, n);
+ eat(n);
+ collect += n;
+ if (remains -= n)
+ return 1;
+ state = next_state;
+ return 0;
+}
+
+static int __init do_header(void)
+{
+ parse_header(collected);
+ next_header = this_header + N_ALIGN(name_len) + body_len;
+ next_header = (next_header + 3) & ~3;
+ if (name_len <= 0 || name_len > PATH_MAX)
+ state = SkipIt;
+ else if (S_ISLNK(mode)) {
+ if (body_len > PATH_MAX)
+ state = SkipIt;
+ else {
+ collect = collected = symlink_buf;
+ remains = N_ALIGN(name_len) + body_len;
+ next_state = GotSymlink;
+ state = Collect;
+ }
+ } else if (body_len && !S_ISREG(mode))
+ state = SkipIt;
+ else
+ read_into(name_buf, N_ALIGN(name_len), GotName);
+ return 0;
+}
+
+static int __init do_skip(void)
+{
+ if (this_header + count <= next_header) {
+ eat(count);
+ return 1;
+ } else {
+ eat(next_header - this_header);
+ state = next_state;
+ return 0;
+ }
+}
+
+static int __init do_reset(void)
+{
+ while(count && *victim == '\0')
+ eat(1);
+ if (count && (this_header & 3))
+ error("broken padding");
+ return 1;
+}
+
+static int __init maybe_link(void)
+{
+ if (nlink >= 2) {
+ char *old = find_link(major, minor, ino, collected);
+ if (old)
+ return (sys_link(old, collected) < 0) ? -1 : 1;
+ }
+ return 0;
+}
+
+static __initdata int wfd;
+
+static int __init do_name(void)
+{
+ state = SkipIt;
+ next_state = Start;
+ if (strcmp(collected, "TRAILER!!!") == 0) {
+ free_hash();
+ next_state = Reset;
+ return 0;
+ }
+ printk(KERN_INFO "-> %s\n", collected);
+ if (S_ISREG(mode)) {
+ if (maybe_link() >= 0) {
+ wfd = sys_open(collected, O_WRONLY|O_CREAT, mode);
+ if (wfd >= 0) {
+ sys_fchown(wfd, uid, gid);
+ sys_fchmod(wfd, mode);
+ state = CopyFile;
+ }
+ }
+ } else if (S_ISDIR(mode)) {
+ sys_mkdir(collected, mode);
+ sys_chown(collected, uid, gid);
+ } else if (S_ISBLK(mode) || S_ISCHR(mode) ||
+ S_ISFIFO(mode) || S_ISSOCK(mode)) {
+ if (maybe_link() == 0) {
+ sys_mknod(collected, mode, rdev);
+ sys_chown(collected, uid, gid);
+ }
+ } else
+ panic("populate_root: bogus mode: %o\n", mode);
+ return 0;
+}
+
+static int __init do_copy(void)
+{
+ if (count >= body_len) {
+ sys_write(wfd, victim, body_len);
+ sys_close(wfd);
+ eat(body_len);
+ state = SkipIt;
+ return 0;
+ } else {
+ sys_write(wfd, victim, count);
+ body_len -= count;
+ eat(count);
+ return 1;
+ }
+}
+
+static int __init do_symlink(void)
+{
+ collected[N_ALIGN(name_len) + body_len] = '\0';
+ sys_symlink(collected + N_ALIGN(name_len), collected);
+ sys_lchown(collected, uid, gid);
+ state = SkipIt;
+ next_state = Start;
+ return 0;
+}
+
+static __initdata int (*actions[])(void) = {
+ [Start] do_start,
+ [Collect] do_collect,
+ [GotHeader] do_header,
+ [SkipIt] do_skip,
+ [GotName] do_name,
+ [CopyFile] do_copy,
+ [GotSymlink] do_symlink,
+ [Reset] do_reset,
+};
+
+static int __init write_buffer(char *buf, unsigned len)
+{
+ count = len;
+ victim = buf;
+
+ while (!actions[state]())
+ ;
+ return len - count;
+}
+
+static void __init flush_buffer(char *buf, unsigned len)
+{
+ int written;
+ while ((written = write_buffer(buf, len)) < len) {
+ char c = buf[written];
+ if (c == '0') {
+ buf += written;
+ len -= written;
+ state = Start;
+ continue;
+ } else
+ error("junk in compressed archive");
+ }
+}
+
+/*
+ * gzip declarations
+ */
+
+#define OF(args) args
+
+#ifndef memzero
+#define memzero(s, n) memset ((s), 0, (n))
+#endif
+
+typedef unsigned char uch;
+typedef unsigned short ush;
+typedef unsigned long ulg;
+
+#define WSIZE 0x8000 /* window size--must be a power of two, and */
+ /* at least 32K for zip's deflate method */
+
+static uch *inbuf;
+static uch *window;
+
+static unsigned insize; /* valid bytes in inbuf */
+static unsigned inptr; /* index of next byte to be processed in inbuf */
+static unsigned outcnt; /* bytes in output buffer */
+static long bytes_out;
+
+#define get_byte() (inptr < insize ? inbuf[inptr++] : -1)
+
+/* Diagnostic functions (stubbed out) */
+#define Assert(cond,msg)
+#define Trace(x)
+#define Tracev(x)
+#define Tracevv(x)
+#define Tracec(c,x)
+#define Tracecv(c,x)
+
+#define STATIC static
+
+static void flush_window(void);
+static void error(char *m);
+static void gzip_mark(void **);
+static void gzip_release(void **);
+
+#include "../lib/inflate.c"
+
+static void __init gzip_mark(void **ptr)
+{
+}
+
+static void __init gzip_release(void **ptr)
+{
+}
+
+/* ===========================================================================
+ * Write the output window window[0..outcnt-1] and update crc and bytes_out.
+ * (Used for the decompressed data only.)
+ */
+static void __init flush_window(void)
+{
+ ulg c = crc; /* temporary variable */
+ unsigned n;
+ uch *in, ch;
+
+ flush_buffer(window, outcnt);
+ in = window;
+ for (n = 0; n < outcnt; n++) {
+ ch = *in++;
+ c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+ }
+ crc = c;
+ bytes_out += (ulg)outcnt;
+ outcnt = 0;
+}
+
+static void __init unpack_to_rootfs(char *buf, unsigned len)
+{
+ int written;
+ header_buf = malloc(110);
+ symlink_buf = malloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1);
+ name_buf = malloc(N_ALIGN(PATH_MAX));
+ window = malloc(WSIZE);
+ if (!window || !header_buf || !symlink_buf || !name_buf)
+ error("can't allocate buffers");
+ state = Start;
+ this_header = 0;
+ while (len) {
+ loff_t saved_offset = this_header;
+ if (*buf == '0' && !(this_header & 3)) {
+ state = Start;
+ written = write_buffer(buf, len);
+ buf += written;
+ len -= written;
+ continue;
+ } else if (!*buf) {
+ buf++;
+ len--;
+ this_header++;
+ continue;
+ }
+ this_header = 0;
+ insize = len;
+ inbuf = buf;
+ inptr = 0;
+ outcnt = 0; /* bytes in output buffer */
+ bytes_out = 0;
+ crc = (ulg)0xffffffffL; /* shift register contents */
+ makecrc();
+ if (gunzip())
+ error("ungzip failed");
+ if (state != Reset)
+ error("junk in gzipped archive");
+ this_header = saved_offset + inptr;
+ buf += inptr;
+ len -= inptr;
+ }
+ free(window);
+ free(name_buf);
+ free(symlink_buf);
+ free(header_buf);
+}
+
+extern unsigned long __initramfs_start, __initramfs_end;
+
+void __init populate_rootfs(void)
+{
+ unpack_to_rootfs((void *) &__initramfs_start,
+ &__initramfs_end - &__initramfs_start);
+}
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c Sat Nov 2 02:34:50 2002
+++ b/init/main.c Sat Nov 2 02:34:50 2002
@@ -72,6 +72,7 @@
extern void pte_chain_init(void);
extern void radix_tree_init(void);
extern void free_initmem(void);
+extern void populate_rootfs(void);

#ifdef CONFIG_TC
extern void tc_init(void);
@@ -433,6 +434,7 @@
vfs_caches_init(num_physpages);
radix_tree_init();
signals_init();
+ populate_rootfs();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
diff -Nru a/usr/Makefile b/usr/Makefile
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/usr/Makefile Sat Nov 2 02:34:50 2002
@@ -0,0 +1,18 @@
+
+include arch/$(ARCH)/Makefile
+
+obj-y := initramfs_data.o
+
+host-progs := gen_init_cpio
+
+clean-files := initramfs_data.cpio.gz
+
+$(obj)/initramfs_data.o: $(obj)/initramfs_data.cpio.gz
+ $(OBJCOPY) $(ARCHBLOBLFLAGS) \
+ --rename-section .data=.init.initramfs \
+ $(obj)/initramfs_data.cpio.gz $(obj)/initramfs_data.o
+ $(STRIP) -s $(obj)/initramfs_data.o
+
+$(obj)/initramfs_data.cpio.gz: $(obj)/gen_init_cpio
+ ( cd $(obj) ; ./gen_init_cpio | gzip -9c > initramfs_data.cpio.gz )
+
diff -Nru a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/usr/gen_init_cpio.c Sat Nov 2 02:34:50 2002
@@ -0,0 +1,137 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+
+static unsigned int offset;
+static unsigned int ino = 721;
+
+static void push_rest(const char *name)
+{
+ unsigned int name_len = strlen(name) + 1;
+ unsigned int tmp_ofs;
+
+ fputs(name, stdout);
+ putchar(0);
+ offset += name_len;
+
+ tmp_ofs = name_len + 110;
+ while (tmp_ofs & 3) {
+ putchar(0);
+ offset++;
+ tmp_ofs++;
+ }
+}
+
+static void push_hdr(const char *s)
+{
+ fputs(s, stdout);
+ offset += 110;
+}
+
+static void cpio_trailer(void)
+{
+ char s[256];
+ const char *name = "TRAILER!!!";
+
+ sprintf(s, "%s%08X%08X%08lX%08lX%08X%08lX"
+ "%08X%08X%08X%08X%08X%08X%08X",
+ "070701", /* magic */
+ 0, /* ino */
+ 0, /* mode */
+ (long) 0, /* uid */
+ (long) 0, /* gid */
+ 1, /* nlink */
+ (long) 0, /* mtime */
+ 0, /* filesize */
+ 0, /* major */
+ 0, /* minor */
+ 0, /* rmajor */
+ 0, /* rminor */
+ strlen(name) + 1, /* namesize */
+ 0); /* chksum */
+ push_hdr(s);
+ push_rest(name);
+
+ while (offset % 512) {
+ putchar(0);
+ offset++;
+ }
+}
+
+static void cpio_mkdir(const char *name, unsigned int mode,
+ uid_t uid, gid_t gid)
+{
+ char s[256];
+ time_t mtime = time(NULL);
+
+ sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX"
+ "%08X%08X%08X%08X%08X%08X%08X",
+ "070701", /* magic */
+ ino++, /* ino */
+ S_IFDIR | mode, /* mode */
+ (long) uid, /* uid */
+ (long) gid, /* gid */
+ 2, /* nlink */
+ (long) mtime, /* mtime */
+ 0, /* filesize */
+ 3, /* major */
+ 1, /* minor */
+ 0, /* rmajor */
+ 0, /* rminor */
+ strlen(name) + 1, /* namesize */
+ 0); /* chksum */
+ push_hdr(s);
+ push_rest(name);
+}
+
+static void cpio_mknod(const char *name, unsigned int mode,
+ uid_t uid, gid_t gid, int dev_type,
+ unsigned int maj, unsigned int min)
+{
+ char s[256];
+ time_t mtime = time(NULL);
+
+ if (dev_type == 'b')
+ mode |= S_IFBLK;
+ else
+ mode |= S_IFCHR;
+
+ sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX"
+ "%08X%08X%08X%08X%08X%08X%08X",
+ "070701", /* magic */
+ ino++, /* ino */
+ mode, /* mode */
+ (long) uid, /* uid */
+ (long) gid, /* gid */
+ 1, /* nlink */
+ (long) mtime, /* mtime */
+ 0, /* filesize */
+ 3, /* major */
+ 1, /* minor */
+ maj, /* rmajor */
+ min, /* rminor */
+ strlen(name) + 1, /* namesize */
+ 0); /* chksum */
+ push_hdr(s);
+ push_rest(name);
+}
+
+int main (int argc, char *argv[])
+{
+ cpio_mkdir("/dev", 0700, 0, 0);
+ cpio_mknod("/dev/console", 0600, 0, 0, 'c', 5, 1);
+ cpio_mkdir("/root", 0700, 0, 0);
+ cpio_trailer();
+
+ exit(0);
+
+ /* silence compiler warnings */
+ return 0;
+ (void) argc;
+ (void) argv;
+}
+


Attachments:
minitramfs-2.5.txt (811.00 B)
patch (15.70 kB)
Download all attachments

2002-11-02 08:12:31

by Jeff Garzik

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

Oh yeah... quick addition.

At some point in the evolution, I will add the ability to load initramfs
in all the ways that initrd is currently loaded now (from the
bootloader, etc.). Substituting a custom initramfs cpio archive in the
kernel link will also be added at a later time.



2002-11-02 08:36:26

by Aaron Lehmann

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
> The Future.
>
> Early userspace is going to be merged in a series of evolutionary
> changes, following what I call "The Al Viro model." NO KERNEL BEHAVIOR
> SHOULD CHANGE. [that's for the lkml listeners, not you <g>] "make"
> will continue to simply Do The Right Thing(tm) on all platforms, while
> the kernel image continues to get progressively smaller.

Won't the initial userspace be linked into the kernel? If so, why will
the kernel image get smaller?

2002-11-02 08:41:01

by Jeff Garzik

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

Aaron Lehmann wrote:

>On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
>
>
>>The Future.
>>
>>Early userspace is going to be merged in a series of evolutionary
>>changes, following what I call "The Al Viro model." NO KERNEL BEHAVIOR
>>SHOULD CHANGE. [that's for the lkml listeners, not you <g>] "make"
>>will continue to simply Do The Right Thing(tm) on all platforms, while
>>the kernel image continues to get progressively smaller.
>>
>>
>
>Won't the initial userspace be linked into the kernel? If so, why will
>the kernel image get smaller?
>
>

Yes and no ;-)

Ignoring for a moment initramfses loaded from your bootloader (a la
initrd)... The amount of code that runs in kernel space shrinks, which
is the main point of early userspace. If you are talking in terms of
overall kernel image size, yes, but the initramfs cpio archive is
ditching along with the rest of __init code, so you're really only
talking about wasting a couple of additional pages in vmlinux -- a
slight increase in disk space usage, and that's it.

So runtime memory usage certainly does not increase...

Jeff




2002-11-02 08:44:41

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

Jeff Garzik wrote:
> Aaron Lehmann wrote:
>
>> On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
>>
>>
>>> The Future.
>>>
>>> Early userspace is going to be merged in a series of evolutionary
>>> changes, following what I call "The Al Viro model." NO KERNEL
>>> BEHAVIOR SHOULD CHANGE. [that's for the lkml listeners, not you
>>> <g>] "make" will continue to simply Do The Right Thing(tm) on all
>>> platforms, while the kernel image continues to get progressively
>>> smaller.
>>>
>>
>>
>> Won't the initial userspace be linked into the kernel? If so, why will
>> the kernel image get smaller?
>>
>>
>
> Yes and no ;-)
>
> Ignoring for a moment initramfses loaded from your bootloader (a la
> initrd)... The amount of code that runs in kernel space shrinks, which
> is the main point of early userspace. If you are talking in terms of
> overall kernel image size, yes, but the initramfs cpio archive is
> ditching along with the rest of __init code, so you're really only
> talking about wasting a couple of additional pages in vmlinux -- a
> slight increase in disk space usage, and that's it.
>
> So runtime memory usage certainly does not increase...
>

By the way, the final initramfs should typically be a union of whatever
sources there are; with the ones linked into the kernel image unpacked
first (so they can be overwritten if so specified to the bootloader.)

-hpa


2002-11-02 10:45:01

by Milton Miller

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N


> Items For Discussion
>
> #1 - shared kinit
>
> "kinit" is _the_ early userspace binary -- but not necessarily the only
> one. Peter Anvin and Russell King have several binaries in the klibc
> tarball, gzip, ash, and several smaller utilities. Peter also put work
> into making klibc a shared object -- that doesn't need an shlib loader.
> It's pretty nifty how he does it, IMO: klibc.so becomes an ELF
> interpreter for any klibc-linked binaries. klibc-linked binaries are,
> to the ELF system, static binaries, but they wind up sharing klibc.so
> anyway due to this trick.
>
> Anyway, there is a certain elegance in adding coding to kinit instead of
> an explosion of binaries and shell scripts. The other side of that coin
> is that with elegance you sacrifice some ease of making changes. I am
> 60% certain we want a shared klibc and multiple binaries, but am willing
> to be convinced in either direction. If you think about it, there _are_
> several benefits to leaving kinit as the lone binary in the stock kernel
> early userspace build, so the decision is not as cut-n-dry as it may
> immediately seem.


One idea I experimented some time ago with (and can revive after
some sleep) is, rather than interpreting cpio in the kernel, objcopy
a binary into a init and copy that into pagecache in a ramfs/libfs
file system. The population was all initfunctions, trying to make
it disappear at runtime. /dev/initrd was left for userspace to
expand the rest of the loaders. With libfs, the write code reinstated
so standard directories, device nodes, console and initrd nodes
can be created and opened in userspace, further shrinking the static
linked-in code.

This argues that this initial code is unshared and uncompressed
(or rather, compressed like the rest of the kernel); for shared we
would have to copy a couple of pieces this way. It traded off a
table of offset,length,mode,name with cpio headers and parsing.

I had this running on 2.4.19-pre10 (around the time of the kernel
summit, just before the fixed directory link counts went in) with
busybox. (I seperated the 2.4 compat vs 2.5 stuff at that time).

Comments?

milton

2002-11-02 16:38:05

by Matt Porter

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
> #4 - move mounting root to userspace
>
> People probably breathed a sigh of relief at patch #3, they will heave a
> bigger sigh for this patch :) This moves mounting of the root
> filesystem to early userspace, including getting rid of
> NFSroot/bootp/dhcp code in the kernel.

For those of us who only develop on nfsroot-based systems, does this
step include adding userspace network interface configuration and
bootp/dhcp client functionality to kinit? I want to assume that
"getting rid of NFSroot/bootp/dhcp" means moving that particular
functionality as part of this step. Just wondering what the
short-term impact will be on the poor embedded guys. :)

Regards,
--
Matt Porter
[email protected]
This is Linux Country. On a quiet night, you can hear Windows reboot.

2002-11-02 18:55:41

by Linus Torvalds

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N


On Sat, 2 Nov 2002, Aaron Lehmann wrote:
>
> Won't the initial userspace be linked into the kernel? If so, why will
> the kernel image get smaller?

Note that the reason I personally really want initramfs is not to make the
kernel boot image smaller, or the kernel sources smaller. That won't
happen for a long time, since I suspect that we'll be carrying the
initramfs user space with us for quite a while (eventually it will
probably split up into a project of its own, but certainly for the
forseeable future it would be very closely tied to the kernel).

The real advantage to me is two-fold:

- make it easier for people to customize their initial system without
having to muck with kernel code or even use a different boot sequence.
One example of this is the difference between vendor install kernels
(using initrd) and a normal install kernel (which doesn't).

So I'd much rather see us _always_ using initrd, and the difference
between an install kernel and a regular kernel is really just the size
of the initrd thing.

- Many things are much more easily done in user space, because user space
has protections, "infinite stack", and in general a lot better
infrastructure (ie easier to debug etc). At the same time, many things
need to be done _before_ the kernel is fully ready to hand over control
to a normal user space: do ACPI parsing so that we can initialize the
devices so that we can use the "real" user space that is installed on
disk etc.

Sometimes there is overlap between these two things (ie the "easier to
do in user space" and "needs to be done before normal user space can be
loaded"). ACPI is one potential example. Mounting the root filesystem
over NFS after having done DHCP or other auto-discovery is another.

So "shrinking the kernel" is not on my list here. It's really a matter of
"some initialization is better done in user space", and not primarily "we
want to make the kernel smaller". I'm not a big believer in microkernels
and trying to get everything out of the kernel itself, but I _do_ believe
that sometimes it's easier to just let the user do his own choices (while
still giving him all the protection implied by running in user space).

Linus

2002-11-02 20:18:03

by Alexander Viro

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N



On Sat, 2 Nov 2002, Linus Torvalds wrote:

> Note that the reason I personally really want initramfs is not to make the
> kernel boot image smaller, or the kernel sources smaller. That won't
> happen for a long time, since I suspect that we'll be carrying the
> initramfs user space with us for quite a while (eventually it will
> probably split up into a project of its own, but certainly for the
> forseeable future it would be very closely tied to the kernel).
>
> The real advantage to me is two-fold:
[snip]

Let me add the third one: userland is more limited. And no, that's not
a typo - and it's a good thing. Userland has to use normal, regular
syscalls instead of poking its fingers into hell knows what parts of
kernel data structures.

Which means that it's more robust and that it doesn't stand in the way
of work on kernel. 90% of PITA with super.c used to be of that kind -
mounting root filesystem had been done with very ugly kludges and what's
more, these kludges got filtered down in the normal codepath. Getting
rid of that took a _lot_ of very careful manipulations with the guts
of the thing. And guess what? There was no reason why all that black
magic would be necessary - current code uses normal, garden-variety
system calls.

In effect, we used to have special cases of mount(2), etc., with very
kludgy semantics. They were not exposed to userland, but that didn't
make them less nasty or less painful to work with. They still cluttered
the code, they still stood in the way of work on the thing and they still
were butt-ugly.

And that's what moving code to userland should prevent - it's much easier
to catch somebody bringing a patch with magical extension of system call
than to catch an attempt to sneak special-case code used only by kernel.

BTW, that's a thing we need to watch for - there obviously will be a lot
of patches moving stuff to userland and there will be a strong temptation
to add magic interfaces just for that. _That_ should be prevented - it's
better to leave ugly crap as is than export the same crap to userland.
The point is to get the things cleaned up and make sure that they stay
clean, not to cement them in place by adding a magic ioctl/syscall/flag/whatnot.
We may very well end up extending existing interfaces, but we'd damn better
make sure that such additions make sense for generic use.

We have a lot of ugly crap that would be unnecessary if we had early
access to writable fs. Basically, we got magic methods, magic codepaths,
etc. simply because the normal access to the functionality in question
required opened file descriptors. Now we _do_ have a writable filesystem
mounted very early, so that cruft can be killed off. And moving code
to userland acts as a filter - there we don't have access to magic, so
all such magic immediately shows up. It could be done in the kernel
(and quite a few things had been done already), but move to userland
acts as a safeguard against reintroduction of magic crap.

2002-11-02 20:01:14

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

Linus Torvalds wrote:
>
> The real advantage to me is two-fold:
>
> - make it easier for people to customize their initial system without
> having to muck with kernel code or even use a different boot sequence.
> One example of this is the difference between vendor install kernels
> (using initrd) and a normal install kernel (which doesn't).
>
> So I'd much rather see us _always_ using initrd, and the difference
> between an install kernel and a regular kernel is really just the size
> of the initrd thing.
>
> - Many things are much more easily done in user space, because user space
> has protections, "infinite stack", and in general a lot better
> infrastructure (ie easier to debug etc). At the same time, many things
> need to be done _before_ the kernel is fully ready to hand over control
> to a normal user space: do ACPI parsing so that we can initialize the
> devices so that we can use the "real" user space that is installed on
> disk etc.
>
> Sometimes there is overlap between these two things (ie the "easier to
> do in user space" and "needs to be done before normal user space can be
> loaded"). ACPI is one potential example. Mounting the root filesystem
> over NFS after having done DHCP or other auto-discovery is another.
>

I agree 100% with this. I don't think <kernel>+<early userspace> will
ever be smaller than the current kernel, but I have invested quite a bit
of effort into it for exactly the reasons done above.

klibc binaries might not be what one usually tends to run, but during
klibc development I could still use standard gdb, strace, and just plain
"run it off the command line" debugging techniques from a full-blown
environment. When that doesn't work (like testing dynamic klibc),
chroot will usually do the trick. The compile-test-debug cycle is so
much faster than for a kernel boot that it's just plain amazing.

-hpa


2002-11-02 20:08:35

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

Matt Porter wrote:
> On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
>
>>#4 - move mounting root to userspace
>>
>>People probably breathed a sigh of relief at patch #3, they will heave a
>>bigger sigh for this patch :) This moves mounting of the root
>>filesystem to early userspace, including getting rid of
>>NFSroot/bootp/dhcp code in the kernel.
>
>
> For those of us who only develop on nfsroot-based systems, does this
> step include adding userspace network interface configuration and
> bootp/dhcp client functionality to kinit? I want to assume that
> "getting rid of NFSroot/bootp/dhcp" means moving that particular
> functionality as part of this step. Just wondering what the
> short-term impact will be on the poor embedded guys. :)
>

Probably not to kinit, but to early userspace, yes. There is no real
reason to put everything into kinit, and a lot of these things we have
already written up as part of the klibc bundle.

-hpa



2002-11-02 20:30:36

by Anu

[permalink] [raw]
Subject: an idling kernel

disclaimer: if this is the wrong ng to be posting this to, its only due to
ignorance.. I dont know the first thing about where to post this
question..

----------------------------------------------------------------------

Hello,
Im ready to be beaten up for asking this question ( I am not sure
which group to post to -- all this is new to me) but, I was wondering how
one could figure out if the kernel was in idle mode (or idling).

I *have* tried to look for the answer and here is waht I have come up with
so far :

Process 0 is the idle process.. but, I dont understand how you can tell if
this means that the kernel is in idle mode. Do we just probe the state
field of all process entries and check to see if everyone is sleeping and
conclude that the kernel is idling??

for_each_process(p)
{
if(process->state == S)
{
countup;
}
}

if countup == number of processes, then the kernel was idling?


-anu

********************************************************************************

Think, Train, Be

*******************************************************************************


2002-11-02 20:32:09

by Alexander Viro

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N



On Sat, 2 Nov 2002, H. Peter Anvin wrote:

> Probably not to kinit, but to early userspace, yes. There is no real
> reason to put everything into kinit, and a lot of these things we have
> already written up as part of the klibc bundle.

s/probably/definitely/

There is a lot of reasons for _not_ putting everything into one binary -
if nothing else, it allows to deal with situations like
/* do a lot of things that are OK for userland */
/* do ugly magic */
/* do a lot of things that are OK for userland */
without exporting ugly crap.

It's much better to have several userland helpers called from init sequence
to do sane stuff in userland and leave remaining crap where it is, than
to add user-visible interfaces that don't make sense.

2002-11-02 21:10:00

by Jos Hulzink

[permalink] [raw]
Subject: Re: an idling kernel

Hi,

Well.. this mailing list is not that bad for questions like this. You got the
idea somewhat right though it is implemented quite different. The big word
here is scheduling. A scheduler is a piece of code that determines what
thread is to be executed next. How this is done is something entire books are
written about, and a topic that will be discussed on the lkml often.

With linux, the idle thread is entered when the scheduler finds no threads
ready for executing. (Not only sleeping, but also waiting for Disk etc) With
some BSD clones, there is something of an idle queue, a list of threads that
is only to be executed when the system is actually idle. When the scheduler
access this queue, you know it has nothing important to do anymore. Linux
uses priority queueing (see the nice manual for info on that). But, for both
solutions holds: as soon as the scheduler reaches the end of the queue(Linux)
/ queues (some BSDs) without finding a thread that can be executed, the
scheduler enters the real idle thread.

In short: you don't really have to count. You only have to check if you reach
the end of your thread list. Checking if a thread is able to run is what your
scheduler already does.

See the scheduler code for more info.

Jos


On Saturday 02 November 2002 21:37, Anu wrote:
> disclaimer: if this is the wrong ng to be posting this to, its only due to
> ignorance.. I dont know the first thing about where to post this
> question..
>
> ----------------------------------------------------------------------
>
> Hello,
> Im ready to be beaten up for asking this question ( I am not sure
> which group to post to -- all this is new to me) but, I was wondering how
> one could figure out if the kernel was in idle mode (or idling).
>
> I *have* tried to look for the answer and here is waht I have come up with
> so far :
>
> Process 0 is the idle process.. but, I dont understand how you can tell if
> this means that the kernel is in idle mode. Do we just probe the state
> field of all process entries and check to see if everyone is sleeping and
> conclude that the kernel is idling??
>
> for_each_process(p)
> {
> if(process->state == S)
> {
> countup;
> }
> }
>
> if countup == number of processes, then the kernel was idling?
>
>
> -anu
>
> ***************************************************************************
>*****
>
> Think, Train, Be
>
> ***************************************************************************
>****
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/


2002-11-02 23:01:44

by Matt Porter

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

On Sat, Nov 02, 2002 at 04:14:34AM -0800, H. Peter Anvin wrote:
> Matt Porter wrote:
> > On Sat, Nov 02, 2002 at 03:13:45AM -0500, Jeff Garzik wrote:
> >
> >>#4 - move mounting root to userspace
> >>
> >>People probably breathed a sigh of relief at patch #3, they will heave a
> >>bigger sigh for this patch :) This moves mounting of the root
> >>filesystem to early userspace, including getting rid of
> >>NFSroot/bootp/dhcp code in the kernel.
> >
> >
> > For those of us who only develop on nfsroot-based systems, does this
> > step include adding userspace network interface configuration and
> > bootp/dhcp client functionality to kinit? I want to assume that
> > "getting rid of NFSroot/bootp/dhcp" means moving that particular
> > functionality as part of this step. Just wondering what the
> > short-term impact will be on the poor embedded guys. :)
> >
>
> Probably not to kinit, but to early userspace, yes. There is no real
> reason to put everything into kinit, and a lot of these things we have
> already written up as part of the klibc bundle.

Ok, sounds good. I only mentioned kinit since Jeff's roadmap seemed
to be hazy on whether there was consensus on the single binary approach
versus several binaries. For maintenance sake, it seems that optional
separate binaries is the only way to go. Glad to hear that this is the
plan.

Regards,
--
Matt Porter
[email protected]
This is Linux Country. On a quiet night, you can hear Windows reboot.

2002-11-02 23:41:31

by Dave Cinege

[permalink] [raw]
Subject: Re: [BK PATCHES] initramfs merge, part 1 of N

On Saturday 02 November 2002 14:01, Linus Torvalds wrote:
>
> Note that the reason I personally really want initramfs is not to make the
> kernel boot image smaller, or the kernel sources smaller.

Again for your consideration:

Initrd Dynamic (Dynamic Initial Ramdisk)

Initrd Dynamic allows extracting tar and tar.gz archives to the rootfs.
It additonally cleans do_mounts, and rewrites the legacy initrd system.

It provides the same functionality of initramfs but in a more mature and
robust system. It does not depend on legacy initrd operation. It will
prepare 'early userspace' with klibc, et al.

With your acceptance an additonal patch will be forthcoming making
the legacy initrd system a compile time option, and moving the call to
initrd_mount() from do_mounts, to main. (IE compile time initrd.o)

Further patches will purge specific legacy initrd operations from the
general code base and move them to initrd.c where appropreate.

A patch against 2.5.45 is here and attached:
http://ftp.psychosis.com/linux/initrd-dyn/kernelpatches/2.5.45/initrd_dynamic-2.5.45.diff.gz

You can view the primary files involved here, already 'post-patched' 2.5.45:
http://ftp.psychosis.com/linux/initrd-dyn/kernelpatches/2.5.45/do_mounts.c
http://ftp.psychosis.com/linux/initrd-dyn/kernelpatches/2.5.45/initrd.c
http://ftp.psychosis.com/linux/initrd-dyn/kernelpatches/2.5.45/untar.c


Attachments:
initrd_dynamic-2.5.45.diff (53.06 kB)

2002-11-03 00:37:18

by Anu

[permalink] [raw]
Subject: identifying the idling kernel and kernel hacking.

Hello,
I am looking at some way of "automatically" figuring out when a
kernel might be idle -- more at the level of the kernel code itself. After
a day's reading i have the following pieces of information:

.a. There is something called the run_queue which has a list of process
that can be run.
.b. When nothing is running, we have the swapper process running (process
0 ) that is the ancestor of all processes.
.c. the scheduler goes in and checks this readyqueue every so often, so,
we can figure out if there are no processes running at any given
time..
.d. I am now trying to modify the kernel to do something interesting when
there is only the idle process running.. No idea what though. Linux
2.4.9 (which is the version i am looking at ) has a bunch of gotos and I
think i have identified that the section under still_running_back: is the
place to identify when the run_queue is empty.. I am thinking of putting
in some printfs() to make the kernel put out a message that says something
like "i am idling.." everytime the kernel is idling.. and execute a ps uax
simultaneously to show that the kernel is indeed idling..

are there any obvious disasters that u chaps see? (im not an OS person..)

-a



********************************************************************************

Think, Train, Be

*******************************************************************************


2002-11-04 19:10:12

by Werner Almesberger

[permalink] [raw]
Subject: Re: an idling kernel

Anu wrote:
> Im ready to be beaten up for asking this question ( I am not sure
> which group to post to -- all this is new to me) but, I was wondering how
> one could figure out if the kernel was in idle mode (or idling).

There's more to is than just processes: if your kernel has runnable
tasklets or pending interrupts, it is not truly idle, even though
there may be no runnable processes.

In umlsim, I have some heuristics that seem to catch most cases, but
may be a bit too paranoid. Look at timer.c:wait_kernel (called from
idle) in http://www.almesberger.net/umlsim/umlsim-4.tar.gz

- Werner

--
_________________________________________________________________________
/ Werner Almesberger, Buenos Aires, Argentina [email protected] /
/_http://www.almesberger.net/____________________________________________/