It has come to our attention that a system running a specific user
space init program will not boot if you add "debug" to the kernel
command line. What happens is that the user space tool parses the
kernel command line, and if it sees "debug" it will spit out so much
information that the system fails to boot. This basically renders the
"debug" option for the kernel useless.
This bug has been reported to the developers of said tool
here:
https://bugs.freedesktop.org/show_bug.cgi?id=76935
The response is:
"Generic terms are generic, not the first user owns them."
That is, the "debug" statement on the *kernel* command line is not
owned by the kernel just because it was the first user of it, and
they refuse to fix their bug.
Well, my response is, we OWN the kernel command line, and as such, we
can keep the users from seeing stuff on it if we so choose. And with
that, I propose this patch, which hides "debug" from /proc/cmdline,
such that we don't have to worry about tools parsing for it and causing
hardship for those trying to debug the kernel.
Reported-by: Borislav Petkov <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index cbd82df..fdb75f7 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -5,7 +5,7 @@
static int cmdline_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%s\n", saved_command_line);
+ seq_printf(m, "%s\n", proc_command_line);
return 0;
}
diff --git a/include/linux/init.h b/include/linux/init.h
index a3ba270..daf978a 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -145,6 +145,7 @@ typedef void (*ctor_fn_t)(void);
extern int do_one_initcall(initcall_t fn);
extern char __initdata boot_command_line[];
extern char *saved_command_line;
+extern char *proc_command_line;
extern unsigned int reset_devices;
/* used by init/main.c */
diff --git a/init/main.c b/init/main.c
index 9c7fd4c..6c2022f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -121,8 +121,10 @@ void (*__initdata late_time_init)(void);
/* Untouched command line saved by arch-specific code. */
char __initdata boot_command_line[COMMAND_LINE_SIZE];
-/* Untouched saved command line (eg. for /proc) */
+/* Untouched saved command line */
char *saved_command_line;
+/* Slightly touched command line (for /proc) */
+char *proc_command_line;
/* Command line for parameter parsing */
static char *static_command_line;
/* Command line for per-initcall parameter parsing */
@@ -349,13 +351,44 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
*/
static void __init setup_command_line(char *command_line)
{
- saved_command_line =
- memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
- initcall_command_line =
- memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
+ char *str;
+ int len = strlen(boot_command_line);
+
+ saved_command_line = memblock_virt_alloc(len + 1, 0);
+ initcall_command_line = memblock_virt_alloc(len + 1, 0);
static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
strcpy (saved_command_line, boot_command_line);
strcpy (static_command_line, command_line);
+
+ proc_command_line = saved_command_line;
+
+ /*
+ * To keep user processes from abusing 'debug', we strip it
+ * from what /proc/cmdline shows.
+ */
+ str = saved_command_line;
+ do {
+ int index;
+
+ str = strstr(str, "debug");
+ if (!str)
+ break;
+
+ /* Make sure debug is by itself */
+ if ((str != saved_command_line && str[-1] != ' ') ||
+ (str[5] != '\0' && str[5] != ' ')) {
+ str += 5;
+ continue;
+ }
+
+ /* Remove "debug" from command line */
+ index = str - saved_command_line;
+ proc_command_line = memblock_virt_alloc(len - 4, 0);
+ strncpy(proc_command_line, boot_command_line, index);
+ strncpy(proc_command_line + index,
+ boot_command_line + index + 5, len - (index + 5));
+ break;
+ } while (*str != '\0');
}
/*
On Wed, Apr 2, 2014 at 11:42 AM, Steven Rostedt <[email protected]> wrote:
>
> The response is:
>
> "Generic terms are generic, not the first user owns them."
And by "their" you mean Kay Sievers.
Key, I'm f*cking tired of the fact that you don't fix problems in the
code *you* write, so that the kernel then has to work around the
problems you cause.
Greg - just for your information, I will *not* be merging any code
from Kay into the kernel until this constant pattern is fixed.
This has been going on for *years*, and doesn't seem to be getting any
better. This is relevant to you because I have seen you talk about the
kdbus patches, and this is a heads-up that you need to keep them
separate from other work. Let distributions merge it as they need to
and maybe we can merge it once it has been proven to be stable by
whatever distro that was willing to play games with the developers.
But I'm not willing to merge something where the maintainer is known
to not care about bugs and regressions and then forces people in other
projects to fix their project. Because I am *not* willing to take
patches from people who don't clean up after their problems, and don't
admit that it's their problem to fix.
Kay - one more time: you caused the problem, you need to fix it. None
of this "I can do whatever I want, others have to clean up after me"
crap.
Linus
On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
> It has come to our attention that a system running a specific user
> space init program will not boot if you add "debug" to the kernel
> command line. What happens is that the user space tool parses the
> kernel command line, and if it sees "debug" it will spit out so much
> information that the system fails to boot. This basically renders the
> "debug" option for the kernel useless.
>
> This bug has been reported to the developers of said tool
> here:
>
> https://bugs.freedesktop.org/show_bug.cgi?id=76935
>
> The response is:
>
> "Generic terms are generic, not the first user owns them."
>
> That is, the "debug" statement on the *kernel* command line is not
> owned by the kernel just because it was the first user of it, and
> they refuse to fix their bug.
>
> Well, my response is, we OWN the kernel command line, and as such, we
> can keep the users from seeing stuff on it if we so choose. And with
> that, I propose this patch, which hides "debug" from /proc/cmdline,
> such that we don't have to worry about tools parsing for it and causing
> hardship for those trying to debug the kernel.
>
I had to check the date on this but surprisingly, it's all post
April 1.
--- a/fs/read_write.c~a
+++ a/fs/read_write.c
@@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
+ BUG_ON(!strcmp(current->comm, "systemd"));
+
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_read(f.file, buf, count, &pos);
_
On Wed, Apr 02, 2014 at 12:04:40PM -0700, Andrew Morton wrote:
> On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
>
> > It has come to our attention that a system running a specific user
> > space init program will not boot if you add "debug" to the kernel
> > command line. What happens is that the user space tool parses the
> > kernel command line, and if it sees "debug" it will spit out so much
> > information that the system fails to boot. This basically renders the
> > "debug" option for the kernel useless.
> >
> > This bug has been reported to the developers of said tool
> > here:
> >
> > https://bugs.freedesktop.org/show_bug.cgi?id=76935
> >
> > The response is:
> >
> > "Generic terms are generic, not the first user owns them."
> >
> > That is, the "debug" statement on the *kernel* command line is not
> > owned by the kernel just because it was the first user of it, and
> > they refuse to fix their bug.
> >
> > Well, my response is, we OWN the kernel command line, and as such, we
> > can keep the users from seeing stuff on it if we so choose. And with
> > that, I propose this patch, which hides "debug" from /proc/cmdline,
> > such that we don't have to worry about tools parsing for it and causing
> > hardship for those trying to debug the kernel.
> >
>
> I had to check the date on this but surprisingly, it's all post
> April 1.
>
> --- a/fs/read_write.c~a
> +++ a/fs/read_write.c
> @@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
> struct fd f = fdget_pos(fd);
> ssize_t ret = -EBADF;
>
> + BUG_ON(!strcmp(current->comm, "systemd"));
> +
> if (f.file) {
> loff_t pos = file_pos_read(f.file);
> ret = vfs_read(f.file, buf, count, &pos);
ACK!
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On 04/02/2014 12:04 PM, Andrew Morton wrote:
> On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
>
>> It has come to our attention that a system running a specific user
>> space init program will not boot if you add "debug" to the kernel
>> command line. What happens is that the user space tool parses the
>> kernel command line, and if it sees "debug" it will spit out so much
>> information that the system fails to boot. This basically renders the
>> "debug" option for the kernel useless.
>>
>> This bug has been reported to the developers of said tool
>> here:
>>
>> https://bugs.freedesktop.org/show_bug.cgi?id=76935
>>
>> The response is:
>>
>> "Generic terms are generic, not the first user owns them."
>>
>> That is, the "debug" statement on the *kernel* command line is not
>> owned by the kernel just because it was the first user of it, and
>> they refuse to fix their bug.
>>
>> Well, my response is, we OWN the kernel command line, and as such, we
>> can keep the users from seeing stuff on it if we so choose. And with
>> that, I propose this patch, which hides "debug" from /proc/cmdline,
>> such that we don't have to worry about tools parsing for it and causing
>> hardship for those trying to debug the kernel.
>>
>
> I had to check the date on this but surprisingly, it's all post
> April 1.
>
> --- a/fs/read_write.c~a
> +++ a/fs/read_write.c
> @@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
> struct fd f = fdget_pos(fd);
> ssize_t ret = -EBADF;
>
> + BUG_ON(!strcmp(current->comm, "systemd"));
> +
> if (f.file) {
> loff_t pos = file_pos_read(f.file);
> ret = vfs_read(f.file, buf, count, &pos);
> _
Ack here.
This situation is ridiculous.
--
~Randy
On Wed, Apr 02, 2014 at 02:42:19PM -0400, Steven Rostedt wrote:
> It has come to our attention that a system running a specific user
> space init program will not boot if you add "debug" to the kernel
> command line. What happens is that the user space tool parses the
> kernel command line, and if it sees "debug" it will spit out so much
> information that the system fails to boot. This basically renders the
> "debug" option for the kernel useless.
>
> This bug has been reported to the developers of said tool
> here:
>
> https://bugs.freedesktop.org/show_bug.cgi?id=76935
>
> The response is:
>
> "Generic terms are generic, not the first user owns them."
>
> That is, the "debug" statement on the *kernel* command line is not
> owned by the kernel just because it was the first user of it, and
> they refuse to fix their bug.
>
> Well, my response is, we OWN the kernel command line, and as such, we
> can keep the users from seeing stuff on it if we so choose. And with
> that, I propose this patch, which hides "debug" from /proc/cmdline,
> such that we don't have to worry about tools parsing for it and causing
> hardship for those trying to debug the kernel.
>
> Reported-by: Borislav Petkov <[email protected]>
> Signed-off-by: Steven Rostedt <[email protected]>
Well, maybe the minor nit that it leaves two spaces in /proc/cmdline now:
"root=/dev/sda1 ignore_loglevel log_buf_len=10M earlyprintk=ttyS0,115200 console=ttyS0,115200 console=tty0"
Also, you need to add
Suggested-by: PeterZ.
Other than that:
Vehemently-and-totally-acked-by: Borislav Petkov <[email protected]>
It is about time this idiotic shit stopped.
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On Wed, 2 Apr 2014 21:08:51 +0200
Borislav Petkov <[email protected]> wrote:
> Well, maybe the minor nit that it leaves two spaces in /proc/cmdline now:
>
> "root=/dev/sda1 ignore_loglevel log_buf_len=10M earlyprintk=ttyS0,115200 console=ttyS0,115200 console=tty0"
>
> Also, you need to add
>
> Suggested-by: PeterZ.
And surprisingly, I forgot to Cc him. How the heck did I miss that :-p
-- Steve
>
> Other than that:
>
> Vehemently-and-totally-acked-by: Borislav Petkov <[email protected]>
>
> It is about time this idiotic shit stopped.
>
On Wed, 2 Apr 2014, Andrew Morton wrote:
> On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
>
> > It has come to our attention that a system running a specific user
> > space init program will not boot if you add "debug" to the kernel
> > command line. What happens is that the user space tool parses the
> > kernel command line, and if it sees "debug" it will spit out so much
> > information that the system fails to boot. This basically renders the
> > "debug" option for the kernel useless.
> >
> > This bug has been reported to the developers of said tool
> > here:
> >
> > https://bugs.freedesktop.org/show_bug.cgi?id=76935
> >
> > The response is:
> >
> > "Generic terms are generic, not the first user owns them."
> >
> > That is, the "debug" statement on the *kernel* command line is not
> > owned by the kernel just because it was the first user of it, and
> > they refuse to fix their bug.
> >
> > Well, my response is, we OWN the kernel command line, and as such, we
> > can keep the users from seeing stuff on it if we so choose. And with
> > that, I propose this patch, which hides "debug" from /proc/cmdline,
> > such that we don't have to worry about tools parsing for it and causing
> > hardship for those trying to debug the kernel.
> >
>
> I had to check the date on this but surprisingly, it's all post
> April 1.
>
> --- a/fs/read_write.c~a
> +++ a/fs/read_write.c
> @@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
> struct fd f = fdget_pos(fd);
> ssize_t ret = -EBADF;
>
> + BUG_ON(!strcmp(current->comm, "systemd"));
Can we make that strncmp(current->comm, "systemd", 7) please? So we
catch the systemd-xxx stuff as well!
Otherwise Acked-by-me
tglx
On Wed, Apr 2, 2014 at 9:50 PM, Thomas Gleixner <[email protected]> wrote:
> On Wed, 2 Apr 2014, Andrew Morton wrote:
>> On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
>>
>> > It has come to our attention that a system running a specific user
>> > space init program will not boot if you add "debug" to the kernel
>> > command line. What happens is that the user space tool parses the
>> > kernel command line, and if it sees "debug" it will spit out so much
>> > information that the system fails to boot. This basically renders the
>> > "debug" option for the kernel useless.
>> >
>> > This bug has been reported to the developers of said tool
>> > here:
>> >
>> > https://bugs.freedesktop.org/show_bug.cgi?id=76935
>> >
>> > The response is:
>> >
>> > "Generic terms are generic, not the first user owns them."
>> >
>> > That is, the "debug" statement on the *kernel* command line is not
>> > owned by the kernel just because it was the first user of it, and
>> > they refuse to fix their bug.
>> >
>> > Well, my response is, we OWN the kernel command line, and as such, we
>> > can keep the users from seeing stuff on it if we so choose. And with
>> > that, I propose this patch, which hides "debug" from /proc/cmdline,
>> > such that we don't have to worry about tools parsing for it and causing
>> > hardship for those trying to debug the kernel.
>> >
>>
>> I had to check the date on this but surprisingly, it's all post
>> April 1.
>>
>> --- a/fs/read_write.c~a
>> +++ a/fs/read_write.c
>> @@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
>> struct fd f = fdget_pos(fd);
>> ssize_t ret = -EBADF;
>>
>> + BUG_ON(!strcmp(current->comm, "systemd"));
>
> Can we make that strncmp(current->comm, "systemd", 7) please? So we
> catch the systemd-xxx stuff as well!
>
> Otherwise Acked-by-me
>
> tglx
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
While we are here, can we please also talk about the !cgroup situation?
https://bugs.freedesktop.org/show_bug.cgi?id=74589
A segfaulting systemd on CONFIG_CGROUPS=n is no fun.
--
Thanks,
//richard
On Wed, 2 Apr 2014, Richard Weinberger wrote:
> While we are here, can we please also talk about the !cgroup situation?
> https://bugs.freedesktop.org/show_bug.cgi?id=74589
>
> A segfaulting systemd on CONFIG_CGROUPS=n is no fun.
Wow, the attitude there is amazing:
"To make this work we'd need a patch, as nobody of us tests this."
Rule #1: systemd maintainers worldview is always correct
Rule #2: In the unexpected case that you think the worldview of
systemd maintainers is incorrect, apply rule #1.
Thanks,
tglx
On Wed, Apr 02, 2014 at 02:42:19PM -0400, Steven Rostedt wrote:
> It has come to our attention that a system running a specific user
> space init program will not boot if you add "debug" to the kernel
> command line. What happens is that the user space tool parses the
> kernel command line, and if it sees "debug" it will spit out so much
> information that the system fails to boot. This basically renders the
> "debug" option for the kernel useless.
>
> This bug has been reported to the developers of said tool
> here:
>
> https://bugs.freedesktop.org/show_bug.cgi?id=76935
>
> The response is:
>
> "Generic terms are generic, not the first user owns them."
>
> That is, the "debug" statement on the *kernel* command line is not
> owned by the kernel just because it was the first user of it, and
> they refuse to fix their bug.
>
> Well, my response is, we OWN the kernel command line, and as such, we
> can keep the users from seeing stuff on it if we so choose. And with
> that, I propose this patch, which hides "debug" from /proc/cmdline,
> such that we don't have to worry about tools parsing for it and causing
> hardship for those trying to debug the kernel.
>
Well, parsing kernel cmdline by systemd is a bad idea, and hiding
"debug" is even worse. What will happen when the next keyword clashes?
And how should I check the kernel is booted with "debug"?
If there is a real need to pass arguments to systemd, how about a
dedicated option (initargs= or whatever, where it has to be last in
cmdline), then systemd would be spawned with these arguments and would
just go over its argv.
--
Mateusz Guzik
On Wed, Apr 02, 2014 at 12:04:40PM -0700, Andrew Morton wrote:
> On Wed, 2 Apr 2014 14:42:19 -0400 Steven Rostedt <[email protected]> wrote:
>
> > It has come to our attention that a system running a specific user
> > space init program will not boot if you add "debug" to the kernel
> > command line. What happens is that the user space tool parses the
> > kernel command line, and if it sees "debug" it will spit out so much
> > information that the system fails to boot. This basically renders the
> > "debug" option for the kernel useless.
> >
> > This bug has been reported to the developers of said tool
> > here:
> >
> > https://bugs.freedesktop.org/show_bug.cgi?id=76935
> >
> > The response is:
> >
> > "Generic terms are generic, not the first user owns them."
> >
> > That is, the "debug" statement on the *kernel* command line is not
> > owned by the kernel just because it was the first user of it, and
> > they refuse to fix their bug.
> >
> > Well, my response is, we OWN the kernel command line, and as such, we
> > can keep the users from seeing stuff on it if we so choose. And with
> > that, I propose this patch, which hides "debug" from /proc/cmdline,
> > such that we don't have to worry about tools parsing for it and causing
> > hardship for those trying to debug the kernel.
> >
>
> I had to check the date on this but surprisingly, it's all post
> April 1.
>
> --- a/fs/read_write.c~a
> +++ a/fs/read_write.c
> @@ -513,6 +513,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd,
> struct fd f = fdget_pos(fd);
> ssize_t ret = -EBADF;
>
> + BUG_ON(!strcmp(current->comm, "systemd"));
> +
> if (f.file) {
> loff_t pos = file_pos_read(f.file);
> ret = vfs_read(f.file, buf, count, &pos);
> _
Sure, bring a box up and the first user that logs in you will crash the
machine? That's going to be fun to explain to users.
Unless Linux doesn't care about local users anymore.
I understand the sediment, but really people...
On 04/02/2014 03:12 PM, Mateusz Guzik wrote:
> On Wed, Apr 02, 2014 at 02:42:19PM -0400, Steven Rostedt wrote:
>> It has come to our attention that a system running a specific user
>> space init program will not boot if you add "debug" to the kernel
>> command line. What happens is that the user space tool parses the
>> kernel command line, and if it sees "debug" it will spit out so much
>> information that the system fails to boot. This basically renders the
>> "debug" option for the kernel useless.
>>
>> This bug has been reported to the developers of said tool
>> here:
>>
>> https://bugs.freedesktop.org/show_bug.cgi?id=76935
>>
>> The response is:
>>
>> "Generic terms are generic, not the first user owns them."
>>
>> That is, the "debug" statement on the *kernel* command line is not
>> owned by the kernel just because it was the first user of it, and
>> they refuse to fix their bug.
>>
>> Well, my response is, we OWN the kernel command line, and as such, we
>> can keep the users from seeing stuff on it if we so choose. And with
>> that, I propose this patch, which hides "debug" from /proc/cmdline,
>> such that we don't have to worry about tools parsing for it and causing
>> hardship for those trying to debug the kernel.
>>
>
> Well, parsing kernel cmdline by systemd is a bad idea, and hiding
> "debug" is even worse. What will happen when the next keyword clashes?
> And how should I check the kernel is booted with "debug"?
>
> If there is a real need to pass arguments to systemd, how about a
> dedicated option (initargs= or whatever, where it has to be last in
> cmdline), then systemd would be spawned with these arguments and would
> just go over its argv.
>
What if systemd only parsed kernel command line arguments of the form:
... systemd.arg_foo systemd.arg_bar=x ...
Then it wouldn't get confused by arguments that weren't directly
targeted at it.
There is precedence for this form, as it is what we already use for
built-in modules. As a bonus, we would be ready for when systemd is
integrated into the kernel as a module itself.
Thanks,
David Daney
On Thu, Apr 03, 2014 at 12:12:13AM +0200, Mateusz Guzik wrote:
> On Wed, Apr 02, 2014 at 02:42:19PM -0400, Steven Rostedt wrote:
> > It has come to our attention that a system running a specific user
> > space init program will not boot if you add "debug" to the kernel
> > command line. What happens is that the user space tool parses the
> > kernel command line, and if it sees "debug" it will spit out so much
> > information that the system fails to boot. This basically renders the
> > "debug" option for the kernel useless.
> >
> > This bug has been reported to the developers of said tool
> > here:
> >
> > https://bugs.freedesktop.org/show_bug.cgi?id=76935
> >
> > The response is:
> >
> > "Generic terms are generic, not the first user owns them."
> >
> > That is, the "debug" statement on the *kernel* command line is not
> > owned by the kernel just because it was the first user of it, and
> > they refuse to fix their bug.
> >
> > Well, my response is, we OWN the kernel command line, and as such, we
> > can keep the users from seeing stuff on it if we so choose. And with
> > that, I propose this patch, which hides "debug" from /proc/cmdline,
> > such that we don't have to worry about tools parsing for it and causing
> > hardship for those trying to debug the kernel.
> >
>
> Well, parsing kernel cmdline by systemd is a bad idea, and hiding
> "debug" is even worse. What will happen when the next keyword clashes?
> And how should I check the kernel is booted with "debug"?
>
> If there is a real need to pass arguments to systemd, how about a
> dedicated option (initargs= or whatever, where it has to be last in
> cmdline), then systemd would be spawned with these arguments and would
> just go over its argv.
Right now it looks like systemd relies on everything with a "systemd."
namespace on the command line, with the exception of "quiet" and
"debug". I've just submitted a patch to systemd to fix up "debug", and
odds are, "quiet" should be switched as well, but unless anyone runs
into a problem with it, I'll leave that alone for now.
thanks,
greg k-h
On Wed, Apr 2, 2014 at 3:12 PM, Mateusz Guzik <[email protected]> wrote:
>
> Well, parsing kernel cmdline by systemd is a bad idea
No, we very much expose /proc/cmdline for a reason. System services
are *supposed* to parse it, because it gives a unified way for people
to pass in various flags. The kernel doesn't complain about flags it
doesn't recognize, exactly because the kernel realizes that "hey,
maybe this flag is for something else".
The classic example of this is things like "charset" markers, but also
options to modules that modprobe parses etc etc.
And yes, that does include "quiet" and "debug". Parsing them and doing
something sane with them is not a bug, it's a feature.
But the problem appears when system services seem to think that they
*own* those flags, and nothing else matters, and they don't do
something "sane" any more.
And the thing is, this has never really been a problem in practice.
Because nobody minds if some kernel option like "debug" makes not only
the kernel enable debugging, but some system deamon log a bit more
too.
HOWEVER.
It does become a problem when you have a system service developer who
thinks the universe revolves around him, and nobody else matters, and
people sending him bug-reports are annoyances that should be ignored
rather than acknowledged and fixed. At that point, it's a problem.
It looks like Greg has stepped in as a baby-sitter for Kay, and things
are going to be fixed. And I'd really like to avoid adding hacky code
to the kernel because of Kay's continued bad behavior, so I hope this
works. But it's really sad that things like this get elevated to this
kind of situation, and I personally find it annoying that it's always
the same f*cking primadonna involved.
Steven, Borislav, one thing that strikes me might be a good idea is to
limit the amount of non-kernel noise in dmesg. We already have the
concept of rate-limiting various spammy internal kernel messages for
when device drivers misbehave etc. Maybe we can just add rate-limiting
to the interfaces that add messages to the kernel buffers, and work
around this problem that way instead while waiting for Gregs fix to
percolate? Or are the systemd debug messages going to so many other
places too that that wouldn't really help?
Linus
On Wed, 2 Apr 2014, Linus Torvalds wrote:
> Steven, Borislav, one thing that strikes me might be a good idea is to
> limit the amount of non-kernel noise in dmesg. We already have the
> concept of rate-limiting various spammy internal kernel messages for
> when device drivers misbehave etc. Maybe we can just add rate-limiting
> to the interfaces that add messages to the kernel buffers, and work
> around this problem that way instead while waiting for Gregs fix to
> percolate? Or are the systemd debug messages going to so many other
> places too that that wouldn't really help?
I think that it's in principle a good idea, however ... the in-kernel
ratelimiting always happens per sourcecode location, but this will be
rather hard to achieve with interface such as /dev/kmsg.
If /dev/kmsg is going to be ratelimited as a whole, it might potentially
create a severely unfair situation between individual userspace programs
trying to do logging (although there is apparently only one userspace
service doing any logging through this interface whatsoever, right?).
--
Jiri Kosina
SUSE Labs
On Thu, 3 Apr 2014 01:23:15 +0200 (CEST) Jiri Kosina <[email protected]> wrote:
> On Wed, 2 Apr 2014, Linus Torvalds wrote:
>
> > Steven, Borislav, one thing that strikes me might be a good idea is to
> > limit the amount of non-kernel noise in dmesg. We already have the
> > concept of rate-limiting various spammy internal kernel messages for
> > when device drivers misbehave etc. Maybe we can just add rate-limiting
> > to the interfaces that add messages to the kernel buffers, and work
> > around this problem that way instead while waiting for Gregs fix to
> > percolate? Or are the systemd debug messages going to so many other
> > places too that that wouldn't really help?
>
> I think that it's in principle a good idea, however ... the in-kernel
> ratelimiting always happens per sourcecode location, but this will be
> rather hard to achieve with interface such as /dev/kmsg.
>
> If /dev/kmsg is going to be ratelimited as a whole, it might potentially
> create a severely unfair situation between individual userspace programs
> trying to do logging (although there is apparently only one userspace
> service doing any logging through this interface whatsoever, right?).
Could be done per-fd: put a struct ratelimit_state into struct
devkmsg_user.
On Wed, Apr 2, 2014 at 4:23 PM, Jiri Kosina <[email protected]> wrote:
>
> I think that it's in principle a good idea, however ... the in-kernel
> ratelimiting always happens per sourcecode location, but this will be
> rather hard to achieve with interface such as /dev/kmsg.
>
> If /dev/kmsg is going to be ratelimited as a whole, it might potentially
> create a severely unfair situation between individual userspace programs
> trying to do logging (although there is apparently only one userspace
> service doing any logging through this interface whatsoever, right?).
So I think we could/should make the rate limiting of kmsg be tied to
perhaps 'struct cred' and then just use current_cred()->rs. Or perhaps
even make it per-task.
It wouldn't be hard to just embed one "struct ratelimit_state" into
the appropriate struct. They're not *that* huge.
Linus
On Wed, Apr 2, 2014 at 4:28 PM, Andrew Morton <[email protected]> wrote:
>
> Could be done per-fd: put a struct ratelimit_state into struct
> devkmsg_user.
Yeah, what Andrew said. My suggestion of per-task or per-cred is
obviously moronic in comparison.
Linus "hangs head in shame" Torvalds
On Wed, 2 Apr 2014, Linus Torvalds wrote:
> > Could be done per-fd: put a struct ratelimit_state into struct
> > devkmsg_user.
>
> Yeah, what Andrew said. My suggestion of per-task or per-cred is
> obviously moronic in comparison.
Which doesn't really protect you from tasks that do open()/write()/close()
cycle for /dev/kmsg write every 2ms though.
--
Jiri Kosina
SUSE Labs
On Wed, 2014-04-02 at 16:42 -0700, Linus Torvalds wrote:
> On Wed, Apr 2, 2014 at 4:28 PM, Andrew Morton <[email protected]> wrote:
> >
> > Could be done per-fd: put a struct ratelimit_state into struct
> > devkmsg_user.
>
> Yeah, what Andrew said.
The default ratelimit state values are pretty low
and may need to be exposed or increased here.
include/linux/ratelimit.h:#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
include/linux/ratelimit.h:#define DEFAULT_RATELIMIT_BURST 10
On Wed, Apr 2, 2014 at 4:47 PM, Jiri Kosina <[email protected]> wrote:
>
> Which doesn't really protect you from tasks that do open()/write()/close()
> cycle for /dev/kmsg write every 2ms though.
I don't think we should try to protect against wilful bad behavior
unless that is shown to be necessary. Yeah, if it turns out that
systemd really does that just to mess with us, we'd need to extend it,
but in the absence of proof to the contrary, maybe this simple
attached patch works?
TOTALLY UNTESTED. But it really isn't complex.
The complexity is likely to be about setting the actual default
values. So consider this a RFC and a "does this fix the problem that
Borislav reported?" test-patch.
Borislav?
Linus
On Wed, 2 Apr 2014, Linus Torvalds wrote:
> > Which doesn't really protect you from tasks that do open()/write()/close()
> > cycle for /dev/kmsg write every 2ms though.
>
> I don't think we should try to protect against wilful bad behavior
> unless that is shown to be necessary. Yeah, if it turns out that
> systemd really does that just to mess with us, we'd need to extend it,
> but in the absence of proof to the contrary, maybe this simple
> attached patch works?
>
> TOTALLY UNTESTED. But it really isn't complex.
[ ... snip ... [
@@ -483,6 +484,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (!user)
return -EBADF;
+ if (!___ratelimit(&user->rs, current->comm))
+ return 0;
I am admittedly rather new to this 'abuse the hell out of kernel
ringbuffer' thing, but shouldn't we better be limiting the
devkmsg_writev()?
--
Jiri Kosina
SUSE Labs
On Thu, 3 Apr 2014 00:12:13 +0200
Mateusz Guzik <[email protected]> wrote:
> Well, parsing kernel cmdline by systemd is a bad idea, and hiding
> "debug" is even worse. What will happen when the next keyword clashes?
> And how should I check the kernel is booted with "debug"?
No, I think you got it backwards. Hiding "debug" is a bad idea, having
systemd abuse the hell out of it is even worse.
>
> If there is a real need to pass arguments to systemd, how about a
> dedicated option (initargs= or whatever, where it has to be last in
> cmdline), then systemd would be spawned with these arguments and would
> just go over its argv.
>
No, the correct solution was to have systemd use its own flag. Say
"systemd.debug", which I believe from froward reading my emails, Greg
has already sent a patch to do so.
That solution was even suggested in the BZ and was turned down with the
quote I had in my change log. That we don't own the generic terms, even
though it's our f*cking command line, not theirs!
-- Steve
On Wed, 2 Apr 2014 16:52:59 -0700
Linus Torvalds <[email protected]> wrote:
> On Wed, Apr 2, 2014 at 4:47 PM, Jiri Kosina <[email protected]> wrote:
> >
> > Which doesn't really protect you from tasks that do open()/write()/close()
> > cycle for /dev/kmsg write every 2ms though.
>
> I don't think we should try to protect against wilful bad behavior
> unless that is shown to be necessary. Yeah, if it turns out that
> systemd really does that just to mess with us, we'd need to extend it,
> but in the absence of proof to the contrary, maybe this simple
> attached patch works?
I wonder if we should also add a "ignore_devkmsg" that basically
turns writing into /dev/kmsg into /dev/null. That way, a simple "debug"
and "ignore_devkmsg" wont blow away the printk ring buffer from data you
want to look at on boot up.
-- Steve
On Wed, Apr 2, 2014 at 4:52 PM, Linus Torvalds
<[email protected]> wrote:
>
> TOTALLY UNTESTED. But it really isn't complex.
Oh, and here's a patch that is actually lightly tested. I did
while :; do echo hello; done > /dev/kmsg
(the 'yes' program buffers output, so won't work) and I get
[ 122.062912] hello
[ 122.062915] hello
[ 122.062918] hello
[ 122.062921] hello
[ 122.062924] hello
[ 122.062927] hello
[ 122.062930] hello
[ 122.062932] hello
[ 122.062935] hello
[ 122.062938] hello
[ 127.062671] bash: 2104439 callbacks suppressed
so it works (repeating every five seconds, as expected).
It's definitely not perfect - if we suppress output, and the process
then closes the file descriptor rather than continuing to write more,
you won't get that "suppressed" message. But it's a usable starting
point for testing and commentary on the actual limits.
So we should probably add reporting about suppressed messages at file
close time, and we should tweak the limits (for example, perhaps not
limit things if the buffers are largely empty - which happens at
bootup), but on the whole I think this is a reasonable thing to do.
Whether it actually fixes the problem that Borislav had is
questionable, of course. For all I know, systemd debug mode generates
so much data in *other* ways and then causes feedback loops with the
kernel debugging that this patch is totally immaterial, and dmesg was
never the main issue. But unlike the "hide 'debug' from
/proc/cmdline", I think this patch at least _conceptually_ makes a lot
of sense, even if systemd gets fixed, so ...
Borislav?
Linus
On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> Borislav?
We're trying to reproduce the original issue with the assertion firing
and drowning dmesg but it is a huuge box and a bit flaky so it'll take
some time.
I'll let you know as soon as I have something.
Thanks.
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
Linus Torvalds <[email protected]> writes:
> On Wed, Apr 2, 2014 at 4:47 PM, Jiri Kosina <[email protected]> wrote:
>>
>> Which doesn't really protect you from tasks that do open()/write()/close()
>> cycle for /dev/kmsg write every 2ms though.
>
> I don't think we should try to protect against wilful bad behavior
> unless that is shown to be necessary. Yeah, if it turns out that
> systemd really does that just to mess with us, we'd need to extend it,
> but in the absence of proof to the contrary, maybe this simple
> attached patch works?
Once is an accident. Twice is incompetence. Three times is malice.
--
M?ns Rullg?rd
[email protected]
On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> Whether it actually fixes the problem that Borislav had is
> questionable, of course. For all I know, systemd debug mode generates
> so much data in *other* ways and then causes feedback loops with the
> kernel debugging that this patch is totally immaterial, and dmesg was
> never the main issue. But unlike the "hide 'debug' from
> /proc/cmdline", I think this patch at least _conceptually_ makes a lot
> of sense, even if systemd gets fixed, so ...
How about just ignoring writes to /dev/kmsg altogether by default
(unless explicitly enabled in Kconfig or on the kernel cmdline)? Maybe I
am missing something but what are the legitimate use-cases for generally
allowing user-space to write into the kernel-log?
Joerg
On Thu, Apr 03, 2014 at 11:34:15AM +0100, Måns Rullgård wrote:
> Once is an accident. Twice is incompetence. Three times is malice.
Yeah, maybe it is time Linus started his own init daemon project, like
that other thing, git, he did start a while ago. We can put it in
tools/. I'm sure it can get off the ground pretty quickly, judging by
other projects kernel people have jumped on in the past.
:-) :-)
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> Whether it actually fixes the problem that Borislav had is
> questionable, of course. For all I know, systemd debug mode generates
> so much data in *other* ways and then causes feedback loops with
> the kernel debugging that this patch is totally immaterial, and
> dmesg was never the main issue. But unlike the "hide 'debug' from
> /proc/cmdline", I think this patch at least _conceptually_ makes a lot
> of sense, even if systemd gets fixed, so ...
>
> Borislav?
Yes, ratelimiting makes the box actually boot all the way. Here's how it
looks like:
^MLoading Linux 3.12.16-boris-00441-g60dc93e99a75-dirty ...
^MLoading initial ramdisk ...
^M[ 0.000000] Initializing cgroup subsys cpuset
[ 0.000000] Initializing cgroup subsys cpu
[ 0.000000] Initializing cgroup subsys cpuacct
[ 0.000000] Linux version 3.12.16-boris-00441-g60dc93e99a75-dirty (gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux) ) #11 SMP Thu Apr 3 10:19:49 CEST 2014
[ 0.000000] Command line: BOOT_IMAGE=/@/boot/vmlinuz-3.12.16-boris-00441-g60dc93e99a75-dirty root=UUID=5f1a28cf-d910-
4ed0-85ac-0113d43e553f rootflags=subvol=@ resume=/dev/disk/by-uuid/45671aef-cef3-4fcd-9087-7d4f3f81aa70 splash=off showo
pts crashkernel=256M-:128M pci=hpiosize=0 pci=hpmemsize=4m pciehp.pciehp_force=1 notsc mem=0x40000000000 loop.max_loop=6
4 log_buf_len=16M console=tty0 console=ttyS0,38400n8 ignore_loglevel debug
^^^^^
We haz the "debug" thing on.
...
Here we go:
[ 50.636000] be2net 0000:db:00.0: irq 102 for MSI/MSI-X
[ 50.636000] be2net 0000:db:00.0: irq 103 for MSI/MSI-X
[ 50.636000] be2net 0000:db:00.0: irq 104 for MSI/MSI-X
[ 50.636000] be2net 0000:db:00.0: enabled 4 MSI-x vector(s) for NIC
[ 50.896000] systemd: 1008 callbacks suppressed
^^^^
[ 50.896000] systemd[1]: Started Show Plymouth Boot Screen.
[ 50.896000] systemd[1]: Got SIGCHLD for process 1192 (swapon)
[ 50.896000] systemd[1]: Child 1192 died (code=exited, status=0/SUCCESS)
[ 50.896000] systemd[1]: Child 1192 belongs to dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap
[ 50.896000] systemd[1]: dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap swap process exited, code=exited status=0
[ 50.896000] systemd[1]: dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap changed activating -> active
[ 50.896000] systemd[1]: Job dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap/start finished, result=done
...
[ 57.988000] scsi2 : Emulex 10Gbe open-iscsi Initiator Driver
[ 63.028000] be2iscsi 0000:db:00.3: irq 117 for MSI/MSI-X
[ 63.044000] be2iscsi 0000:db:00.3: irq 118 for MSI/MSI-X
[ 63.064000] be2iscsi 0000:db:00.3: irq 119 for MSI/MSI-X
[ 63.080000] be2iscsi 0000:db:00.3: irq 120 for MSI/MSI-X
[ 63.100000] be2iscsi 0000:db:00.3: irq 121 for MSI/MSI-X
[ 63.116000] be2iscsi 0000:db:00.3: irq 122 for MSI/MSI-X
[ 63.132000] be2iscsi 0000:db:00.3: irq 123 for MSI/MSI-X
[ 63.152000] be2iscsi 0000:db:00.3: irq 124 for MSI/MSI-X
[ 63.572000] scsi host2: BC_373 : Link Down on Port 1
[ 63.588000] scsi host2: BM_4336 : No boot session
[ 63.656000] systemd: 11828 callbacks suppressed
^^^^^
We luvz to talk a lot of crap, don't we.
[ 63.672000] systemd[1]: Received SIGCHLD from PID 833 (udevadm).
[ 63.692000] systemd[1]: Got SIGCHLD for process 833 (udevadm)
[ 63.712000] systemd[1]: Child 833 died (code=exited, status=0/SUCCESS)
[ 63.732000] systemd[1]: Child 833 belongs to systemd-udev-settle.service
[ 63.756000] systemd[1]: systemd-udev-settle.service: main process exited, code=exited, status=0/SUCCESS
[ 63.788000] systemd[1]: systemd-udev-settle.service changed start -> exited
[ 63.812000] systemd[1]: Job systemd-udev-settle.service/start finished, result=done
[ 63.836000] systemd[1]: Started udev Wait for Complete Device Initialization.
and so on until we get to the boot prompt:
[ 186.828000] systemd[3512]: Executing: /etc/postfix/system/wait_qmgr 60
[ 186.856000] systemd[3514]: Executing: /etc/postfix/system/cond_slp register
[ 186.892000] systemd[3524]: Executing: /usr/sbin/cron -n
[ 186.892000] systemd[3525]: Executing: /usr/lib/systemd/systemd-update-utmp runlevel
[ 186.896000] systemd[3418]: Executing: /etc/init.d/after.local
[ 186.956000] systemd[3416]: Execung: /sbin/agetty --noclear tty1
[ 186.956000] systemd[3415]: Executing: /sbin/agetty --keep-baud ttyS0 115200,38400,9600
^M
Welcome to SUSE Linux Enterprise Server 12 Beta1 (x86_64) - Kernel 3.12.16-boris-00441-g60dc93e99a75-dirty (ttyS0).
...
Then it keeps spitting out crap off and on but it is much more
manageable. As it should be.
So thanks for that fix:
Acked-and-tested-by: Borislav Petkov <[email protected]>
I'll pick it up for SLE12 once it sees a bit more upstream time.
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
Jiri Kosina <[email protected]> writes:
> On Wed, 2 Apr 2014, Linus Torvalds wrote:
>
>> Steven, Borislav, one thing that strikes me might be a good idea is to
>> limit the amount of non-kernel noise in dmesg. We already have the
>> concept of rate-limiting various spammy internal kernel messages for
>> when device drivers misbehave etc. Maybe we can just add rate-limiting
>> to the interfaces that add messages to the kernel buffers, and work
>> around this problem that way instead while waiting for Gregs fix to
>> percolate? Or are the systemd debug messages going to so many other
>> places too that that wouldn't really help?
>
> I think that it's in principle a good idea, however ... the in-kernel
> ratelimiting always happens per sourcecode location, but this will be
> rather hard to achieve with interface such as /dev/kmsg.
>
> If /dev/kmsg is going to be ratelimited as a whole, it might potentially
> create a severely unfair situation between individual userspace programs
> trying to do logging (although there is apparently only one userspace
> service doing any logging through this interface whatsoever, right?).
The point is that /dev/kmsg is *not* intended as a syslog replacement.
--
M?ns Rullg?rd
[email protected]
* Borislav Petkov <[email protected]> wrote:
> On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> > Whether it actually fixes the problem that Borislav had is
> > questionable, of course. For all I know, systemd debug mode generates
> > so much data in *other* ways and then causes feedback loops with
> > the kernel debugging that this patch is totally immaterial, and
> > dmesg was never the main issue. But unlike the "hide 'debug' from
> > /proc/cmdline", I think this patch at least _conceptually_ makes a lot
> > of sense, even if systemd gets fixed, so ...
> >
> > Borislav?
>
> Yes, ratelimiting makes the box actually boot all the way. Here's how it
> looks like:
>
> ^MLoading Linux 3.12.16-boris-00441-g60dc93e99a75-dirty ...
> ^MLoading initial ramdisk ...
> ^M[ 0.000000] Initializing cgroup subsys cpuset
> [ 0.000000] Initializing cgroup subsys cpu
> [ 0.000000] Initializing cgroup subsys cpuacct
> [ 0.000000] Linux version 3.12.16-boris-00441-g60dc93e99a75-dirty (gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux) ) #11 SMP Thu Apr 3 10:19:49 CEST 2014
> [ 0.000000] Command line: BOOT_IMAGE=/@/boot/vmlinuz-3.12.16-boris-00441-g60dc93e99a75-dirty root=UUID=5f1a28cf-d910-
> 4ed0-85ac-0113d43e553f rootflags=subvol=@ resume=/dev/disk/by-uuid/45671aef-cef3-4fcd-9087-7d4f3f81aa70 splash=off showo
> pts crashkernel=256M-:128M pci=hpiosize=0 pci=hpmemsize=4m pciehp.pciehp_force=1 notsc mem=0x40000000000 loop.max_loop=6
> 4 log_buf_len=16M console=tty0 console=ttyS0,38400n8 ignore_loglevel debug
> ^^^^^
>
> We haz the "debug" thing on.
>
> ...
>
> Here we go:
>
> [ 50.636000] be2net 0000:db:00.0: irq 102 for MSI/MSI-X
> [ 50.636000] be2net 0000:db:00.0: irq 103 for MSI/MSI-X
> [ 50.636000] be2net 0000:db:00.0: irq 104 for MSI/MSI-X
> [ 50.636000] be2net 0000:db:00.0: enabled 4 MSI-x vector(s) for NIC
> [ 50.896000] systemd: 1008 callbacks suppressed
> ^^^^
>
> [ 50.896000] systemd[1]: Started Show Plymouth Boot Screen.
> [ 50.896000] systemd[1]: Got SIGCHLD for process 1192 (swapon)
> [ 50.896000] systemd[1]: Child 1192 died (code=exited, status=0/SUCCESS)
> [ 50.896000] systemd[1]: Child 1192 belongs to dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap
> [ 50.896000] systemd[1]: dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap swap process exited, code=exited status=0
> [ 50.896000] systemd[1]: dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap changed activating -> active
> [ 50.896000] systemd[1]: Job dev-disk-by\x2duuid-45671aef\x2dcef3\x2d4fcd\x2d9087\x2d7d4f3f81aa70.swap/start finished, result=done
>
> ...
>
> [ 57.988000] scsi2 : Emulex 10Gbe open-iscsi Initiator Driver
> [ 63.028000] be2iscsi 0000:db:00.3: irq 117 for MSI/MSI-X
> [ 63.044000] be2iscsi 0000:db:00.3: irq 118 for MSI/MSI-X
> [ 63.064000] be2iscsi 0000:db:00.3: irq 119 for MSI/MSI-X
> [ 63.080000] be2iscsi 0000:db:00.3: irq 120 for MSI/MSI-X
> [ 63.100000] be2iscsi 0000:db:00.3: irq 121 for MSI/MSI-X
> [ 63.116000] be2iscsi 0000:db:00.3: irq 122 for MSI/MSI-X
> [ 63.132000] be2iscsi 0000:db:00.3: irq 123 for MSI/MSI-X
> [ 63.152000] be2iscsi 0000:db:00.3: irq 124 for MSI/MSI-X
> [ 63.572000] scsi host2: BC_373 : Link Down on Port 1
> [ 63.588000] scsi host2: BM_4336 : No boot session
> [ 63.656000] systemd: 11828 callbacks suppressed
> ^^^^^
>
> We luvz to talk a lot of crap, don't we.
When I suggested that systemd should reuse the tracing and perf ring
buffer infrastructure instead of cluttering the syslog it was
handwaved away...
But at least there's an upside for me: I don't have to deal with the
systemd maintainers' excessively passive-aggressive behavior ;-)
Thanks,
Ingo
On Thu, Apr 3, 2014 at 4:25 AM, M?ns Rullg?rd <[email protected]> wrote:
> Jiri Kosina <[email protected]> writes:
>
>> On Wed, 2 Apr 2014, Linus Torvalds wrote:
>>
>>> Steven, Borislav, one thing that strikes me might be a good idea is to
>>> limit the amount of non-kernel noise in dmesg. We already have the
>>> concept of rate-limiting various spammy internal kernel messages for
>>> when device drivers misbehave etc. Maybe we can just add rate-limiting
>>> to the interfaces that add messages to the kernel buffers, and work
>>> around this problem that way instead while waiting for Gregs fix to
>>> percolate? Or are the systemd debug messages going to so many other
>>> places too that that wouldn't really help?
>>
>> I think that it's in principle a good idea, however ... the in-kernel
>> ratelimiting always happens per sourcecode location, but this will be
>> rather hard to achieve with interface such as /dev/kmsg.
>>
>> If /dev/kmsg is going to be ratelimited as a whole, it might potentially
>> create a severely unfair situation between individual userspace programs
>> trying to do logging (although there is apparently only one userspace
>> service doing any logging through this interface whatsoever, right?).
>
> The point is that /dev/kmsg is *not* intended as a syslog replacement.
Agreed in general for many systems. I'll just point out that for ultra-tiny
(i.e. embedded) systems, it's nice to only have one logging implementation
in the system.
I had no idea systemd was so verbose and was abusing the kernel
log buffers so badly. I'm not a big fan of the rate-limiting, as this just
seems to encourage this kind of abuse.
-- Tim Bird
On Thu, Apr 03, 2014 at 12:43:08PM +0200, Joerg Roedel wrote:
>
> How about just ignoring writes to /dev/kmsg altogether by default
> (unless explicitly enabled in Kconfig or on the kernel cmdline)? Maybe I
> am missing something but what are the legitimate use-cases for generally
> allowing user-space to write into the kernel-log?
I'll give you one example which where /dev/kmesg is useful --- if you
are running automated kernel tests, echoing "running test shared/127"
.... several minutes later .... "running test shared/128", is very
useful since if the kernel starts spewing warnings, or even
oops/panics/livelocks, you know what test was running at the time of
the failure.
So in general, most of the valid use cases I can see for /dev/kmesg
are small amounts of information where understanding the relationship
between what is going in userspace can help understand the kernel
warnings, errors, or other printk messages. Which is why I think rate
limiting, even with a very low threshold, is a perfectly good alternative.
If you need to do bulk logging, and the problem is you want to make
sure the information doesn't get lost because syslogd/journald hasn't
started up yet, or the file system hasn't been remounted read/write
yet, there is a simple answer to this, and it doesn't involve spamming
the kernel ring buffer (because kernel memory is non-swappable).
The answer is logsave(8), which I developed to solve this specific
problem. I wanted to make sure distributions could capture the output
of fsck, even when checking the read-only root file system. Note that
I did not even *consider* spamming the dmesg log with e2fsck output.
Instead, I created a userspace logsave process which could buffer the
output (which of course was still displayed on the console) until the
root file system was writeable (and/or /var was mounted), at which
point the contents could be saved to a file in /var/log.
So there are so many other ways of solving this problem without trying
to abuse the kernel logging facilities (which were never intended to
be a general-purpose syslog replacement). I suspect some systemd
developer was being lazy....
- Ted
On 04/03/2014 10:05 AM, Theodore Ts'o wrote:
>
> So there are so many other ways of solving this problem without trying
> to abuse the kernel logging facilities (which were never intended to
> be a general-purpose syslog replacement). I suspect some systemd
> developer was being lazy....
>
Having the kernel be the keeper of the logging IPC isn't at all
unreasonable. However, kmsg in its current form isn't adequate.
Augmenting it into a proper logging IPC might be the right thing to do.
(Hmm... new IPC... does this sound a bit like kdbus to anyone?)
The original motivation for /dev/kmsg was of course in the context of
klibc to maintain the current behavior of the items I was hoping to move
from kernel to user space.
-hpa
On Thu, Apr 03, 2014 at 10:09:29AM -0700, H. Peter Anvin wrote:
>
> Having the kernel be the keeper of the logging IPC isn't at all
> unreasonable. However, kmsg in its current form isn't adequate.
> Augmenting it into a proper logging IPC might be the right thing to do.
> (Hmm... new IPC... does this sound a bit like kdbus to anyone?)
I'm not sure it makes sense for the kernel to be stashing log entries
until there is a good place to save them. So if systemd wants to send
hundreds of thousands of messages before the file system is remounted
read-only, we don't really want to storing them in non-swappable
kernel memory. In a userspace process, you can do things like do
compression, and in the worst case, the oom killer can kill the
logging daemon if the systemd verbosity has been turned up too high,
and it's taking too long to get /var/log mounted and writeable.
That's why I wrote logsave(8) --- because IMHO, this is a userspace
problem, and not something that we should even be trying to solve in
kernel space.
- Ted
On Thu, Apr 03, 2014 at 08:17:33AM -0700, Tim Bird wrote:
>
> I had no idea systemd was so verbose and was abusing the kernel
> log buffers so badly. I'm not a big fan of the rate-limiting, as this just
> seems to encourage this kind of abuse.
That was a bug in systemd, and has been fixed up in the latest versions,
so it shouldn't happen anymore, even with debugging enabled.
thanks,
greg k-h
They will be in memory one way or another, and during boot memory is usually plentiful to the kernel. Also, of the kernel knows it is log data it can be dropped if needed.
On April 3, 2014 10:18:55 AM PDT, Theodore Ts'o <[email protected]> wrote:
>On Thu, Apr 03, 2014 at 10:09:29AM -0700, H. Peter Anvin wrote:
>>
>> Having the kernel be the keeper of the logging IPC isn't at all
>> unreasonable. However, kmsg in its current form isn't adequate.
>> Augmenting it into a proper logging IPC might be the right thing to
>do.
>> (Hmm... new IPC... does this sound a bit like kdbus to anyone?)
>
>I'm not sure it makes sense for the kernel to be stashing log entries
>until there is a good place to save them. So if systemd wants to send
>hundreds of thousands of messages before the file system is remounted
>read-only, we don't really want to storing them in non-swappable
>kernel memory. In a userspace process, you can do things like do
>compression, and in the worst case, the oom killer can kill the
>logging daemon if the systemd verbosity has been turned up too high,
>and it's taking too long to get /var/log mounted and writeable.
>
>That's why I wrote logsave(8) --- because IMHO, this is a userspace
>problem, and not something that we should even be trying to solve in
>kernel space.
>
> - Ted
--
Sent from my mobile phone. Please pardon brevity and lack of formatting.
On 04/03/2014 10:05 AM, Theodore Ts'o wrote:
> On Thu, Apr 03, 2014 at 12:43:08PM +0200, Joerg Roedel wrote:
>>
>> How about just ignoring writes to /dev/kmsg altogether by default
>> (unless explicitly enabled in Kconfig or on the kernel cmdline)? Maybe I
>> am missing something but what are the legitimate use-cases for generally
>> allowing user-space to write into the kernel-log?
>
> I'll give you one example which where /dev/kmesg is useful --- if you
> are running automated kernel tests, echoing "running test shared/127"
> .... several minutes later .... "running test shared/128", is very
> useful since if the kernel starts spewing warnings, or even
> oops/panics/livelocks, you know what test was running at the time of
> the failure.
I'm using /dev/kmsg in virtme so that I can easily capture, with
timestamps, the ten or so log lines that it produces. It would be sad
if I had to worry about small ratelimits here.
/dev/kmsg is genuinely useful for the case where an initramfs wants to
log something (preferably only a little bit) and doesn't want to invent
a whole protocol for passing logging data through to the final logging
system.
The other thing I've used /dev/kmsg for is to shove a "I'm starting
something now" message in. This is only really necessary because the
current kernel log timestamps are unusable crap. (We could fix that,
hint hint.)
--Andy
On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>
> The other thing I've used /dev/kmsg for is to shove a "I'm starting
> something now" message in. This is only really necessary because the
> current kernel log timestamps are unusable crap. (We could fix that,
> hint hint.)
I'd actually love to fix that, but I disagree with the "we could fix
it". There are tons of people who know how to parse them (admittedly
often only to ignore them), so changing the format is not likely to
work.
The good news is that "dmesg -H" does help if you're human. While at
the same time being an example of that very "there are tools that know
about the current horrid format" issue.. D'oh.
Linus
On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>
> I'm using /dev/kmsg in virtme so that I can easily capture, with
> timestamps, the ten or so log lines that it produces. It would be sad
> if I had to worry about small ratelimits here.
So the _default_ rate limits (which is what my example patch used) are
almost certainly not appropriate for /dev/kmsg.
It would be good to hear suggestions for what *would* be reasonable.
Because it does sound like we'll have to add rate limiting.
Most users will never notice. The whole "writable /dev/kmsg" may go
back to 2002, but there aren't _that_ many users, and pretty much all
I've ever seen tend to write the occasional "I started up" messages or
other very small notes. I had to write a test-script to trigger even
the (very draconian) default ratelimits.
Linus
On Fri, 4 Apr 2014 11:42:51 -0700 Linus Torvalds <[email protected]> wrote:
> Most users will never notice. The whole "writable /dev/kmsg" may go
> back to 2002, but there aren't _that_ many users, and pretty much all
> I've ever seen tend to write the occasional "I started up" messages or
> other very small notes. I had to write a test-script to trigger even
> the (very draconian) default ratelimits.
Is there anything here that we really need to fix? What goes wrong if
we leave kmsg as-is and systemd gets fixed?
On Fri, Apr 4, 2014 at 11:51 AM, Andrew Morton
<[email protected]> wrote:
>
> Is there anything here that we really need to fix? What goes wrong if
> we leave kmsg as-is and systemd gets fixed?
Since I haven't even heard a "my bad" from the systemd people, I'd be
inclined to say that a bit of protection for future issues would be a
good idea.
But I'm not going to force the issue if people think it's solvable.
That said, the fact that we rate limit even our own sources does tend
to argue that "once bitten, twice shy" is the right approach. It
didn't take a lot of spammy device drivers overwriting out message
logs to make us go "let's rate limit device output".
Linus
On Fri, Apr 4, 2014 at 11:32 AM, Linus Torvalds
<[email protected]> wrote:
> On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>>
>> The other thing I've used /dev/kmsg for is to shove a "I'm starting
>> something now" message in. This is only really necessary because the
>> current kernel log timestamps are unusable crap. (We could fix that,
>> hint hint.)
>
> I'd actually love to fix that, but I disagree with the "we could fix
> it". There are tons of people who know how to parse them (admittedly
> often only to ignore them), so changing the format is not likely to
> work.
What would break if we used CLOCK_BOOTTIME or CLOCK_MONOTONIC? It's
currently sched_clock, which is measured in seconds but otherwise
seems to behave rather unpredictably. We could keep exactly the same
format, but just use numbers measured in actual seconds instead of in
"seconds, but not really".
There are also log message metadata fields now; we could stick more
than one timestamp in there, and existing tools either won't see them
at all (if they use klogctl) or they already know how to ignore
unknown fields (/dev/kmsg).
For example, instead of logging:
[12345.67890] You wish you know what 12345 meant
DEVICE=blah
we could log:
[12345.67890] Hah! CLOCK_BOOTTIME says 12345.67890
DEVICE=blah
CLOCK_REALTIME=89173929281.171273
It would be really neat if we even went back and edited the queued up
messages once we have a clocksource. Userspace isn't running yet, so
it can't be surprised. We just won't show CLOCK_REALTIME at all until
someone sets the clock or we've read the RTC. Interested users still
have CLOCK_BOOTTIME.
>
> The good news is that "dmesg -H" does help if you're human. While at
> the same time being an example of that very "there are tools that know
> about the current horrid format" issue.. D'oh.
>
I admit I have no idea wtf causes this lovely excerpt from dmesg -H:
[ +26.385160] systemd-readahead[629]: Failed to rename readahead file: Permissio
[Apr 3 20:50] TCP: lp registered
I assume it's because dmesg -H decides that it can't handle old
timestamps. I don't know how it does with recent ones, though.
--Andy
On Fri, Apr 4, 2014 at 11:42 AM, Linus Torvalds
<[email protected]> wrote:
> On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>>
>> I'm using /dev/kmsg in virtme so that I can easily capture, with
>> timestamps, the ten or so log lines that it produces. It would be sad
>> if I had to worry about small ratelimits here.
>
> So the _default_ rate limits (which is what my example patch used) are
> almost certainly not appropriate for /dev/kmsg.
>
If this kind of patch goes in, I'd like to see a burst of 50 or 100
lines allowed, at least early on. If virtme's init stuff runs for
more than five seconds, something's wrong anyway.
That being said, virtme will be happily immune to a per-struct file
approach, since it's mostly written in sh and it does things like
"echo virtme-initramfs: in ur box eating ur cycles" >/dev/kmsg.
--Andy
On Fri, Apr 4, 2014 at 11:57 AM, Andy Lutomirski <[email protected]> wrote:
>
> What would break if we used CLOCK_BOOTTIME or CLOCK_MONOTONIC?
I wouldn't object to trying - I thought you meant changing the format
itself. If we keep the semantics (seconds since boot) but just change
the clock source in other ways, I wouldn't worry.
Of course, one issue is locking and performance. Most clock sources
are nasty as hell to read. "sched_clock()" is special.
But I'm absolutely ok with trying things out, so if you have an idea
for a patch...
> It would be really neat if we even went back and edited the queued up
> messages once we have a clocksource.
Sounds great, I'd love to see it. Patches welcome ;)
> There are also log message metadata fields now; we could stick more
> than one timestamp in there, and existing tools either won't see them
> at all (if they use klogctl) or they already know how to ignore
> unknown fields (/dev/kmsg).
That sounds like overkill, but trying to aim for something that is
actually accurate to within a second is fine.
That said, I *also* suspect that we could easily just have a simple
heartbeat message that gives accurate real time. Adding a kernel
heartbeat to dmesg is something I've often wanted to have myself.
Having something that prints out real wall-clock time every 30 minutes
or so in dmesg does not sound like a bad idea, and could be used to
correct for time skew with the existing one even if we decided that
CLOCK_BOOTTIME has too many locking/performance issues.
Linus
On Fri, 4 Apr 2014 11:51:39 -0700
Andrew Morton <[email protected]> wrote:
> On Fri, 4 Apr 2014 11:42:51 -0700 Linus Torvalds <[email protected]> wrote:
>
> > Most users will never notice. The whole "writable /dev/kmsg" may go
> > back to 2002, but there aren't _that_ many users, and pretty much all
> > I've ever seen tend to write the occasional "I started up" messages or
> > other very small notes. I had to write a test-script to trigger even
> > the (very draconian) default ratelimits.
>
> Is there anything here that we really need to fix? What goes wrong if
> we leave kmsg as-is and systemd gets fixed?
I saw one commenter say that this was a kernel bug because writing to
kmsg shouldn't cause the system to hang.
The rate-limit patch would go along with that idea, and I honestly
think it would be good to rate-limit it in case something else breaks
and starts spamming kmsg.
-- Steve
On Fri, Apr 04, 2014 at 03:44:26PM -0400, Steven Rostedt wrote:
> I saw one commenter say that this was a kernel bug because writing to
> kmsg shouldn't cause the system to hang.
>
> The rate-limit patch would go along with that idea, and I honestly
> think it would be good to rate-limit it in case something else breaks
> and starts spamming kmsg.
I agree; the only question is what is the appropriate rate limits,
which is the question Linus was asking.
Personally, I like keeping the current limits (no more than ten
messages every 5 seconds) because I don't think dmesg, which is a
circular buffer and deliberately kept simple so that printk is
guaranteed to work even when things go really bad (and if things do go
really bad, there are ways of reading dmesg out from a crash dump, for
example, so we want to keep things simple).
Peter has argued that it might be cool if the Kernel had a
purpose-built in-kernel syslogd sort of interface, that could accept
arbitrarily large amounts of data, and presumably it would allocate
memory as needed, and since the kernel knows this is log data, if we
are under memory pressure, it could release some of the log data, even
if the userspace hasn't picked it up yet, under extreme memory
pressure.
I don't know that it makes sense to do this, since IMHO we can just as
easily do this in a user-space process.
But I *do* think we should keep the facility used by printk to be as
simple and as bulletproof as possible, which means we should really
try to keep users of /dev/kmesg to the simple "I'm starting test
<foo>", or similar messages. And that argues for using things like
the current ratelimit defaults.
- Ted
>>>>> "Linus" == Linus Torvalds <[email protected]> writes:
Linus> On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>>
>> The other thing I've used /dev/kmsg for is to shove a "I'm starting
>> something now" message in. This is only really necessary because the
>> current kernel log timestamps are unusable crap. (We could fix that,
>> hint hint.)
Linus> I'd actually love to fix that, but I disagree with the "we could fix
Linus> it". There are tons of people who know how to parse them (admittedly
Linus> often only to ignore them), so changing the format is not likely to
Linus> work.
Linus> The good news is that "dmesg -H" does help if you're
Linus> human. While at the same time being an example of that very
Linus> "there are tools that know about the current horrid format"
Linus> issue.. D'oh.
I think you mean "dmesg -T", and unfortunately it seems Debian 6.0.9
(or older) doesn ship a new enough linux-util since I've only got
2.17.2-9 install.
And RHEL/Centos 5.6 and 6.5 don't seem to ship that by default either,
they have got util-linux-2.13-0.56.el5 and
util-linux-ng-2.17.2-12.14.el6.x86_64 respectively. Blech! It's in
Linux Mint 16 at least, haven't checked older versions.
John
On Fri, Apr 4, 2014 at 3:45 PM, Alexei Starovoitov
<[email protected]> wrote:
>
> can there be two bulletproof buffers: one for in kernel printk
> and another ratelimited one for writes into /dev/kmsg.
> On the read from /dev/kmsg they're combined by time.
Or, you know, people could just stop spamming /dev/kmsg.
Let's not overdesign this. No sane use will be impacted by rate
limiting, and insane uses aren't something we should design for. Even
the systemd spam was a *bug*, for chrissake.
Linus
On Fri, Apr 4, 2014 at 1:17 PM, Theodore Ts'o <[email protected]> wrote:
> On Fri, Apr 04, 2014 at 03:44:26PM -0400, Steven Rostedt wrote:
>> I saw one commenter say that this was a kernel bug because writing to
>> kmsg shouldn't cause the system to hang.
>>
>> The rate-limit patch would go along with that idea, and I honestly
>> think it would be good to rate-limit it in case something else breaks
>> and starts spamming kmsg.
>
> I agree; the only question is what is the appropriate rate limits,
> which is the question Linus was asking.
>
> Personally, I like keeping the current limits (no more than ten
> messages every 5 seconds) because I don't think dmesg, which is a
> circular buffer and deliberately kept simple so that printk is
> guaranteed to work even when things go really bad (and if things do go
> really bad, there are ways of reading dmesg out from a crash dump, for
> example, so we want to keep things simple).
>
> Peter has argued that it might be cool if the Kernel had a
> purpose-built in-kernel syslogd sort of interface, that could accept
> arbitrarily large amounts of data, and presumably it would allocate
> memory as needed, and since the kernel knows this is log data, if we
> are under memory pressure, it could release some of the log data, even
> if the userspace hasn't picked it up yet, under extreme memory
> pressure.
>
> I don't know that it makes sense to do this, since IMHO we can just as
> easily do this in a user-space process.
>
> But I *do* think we should keep the facility used by printk to be as
> simple and as bulletproof as possible, which means we should really
> try to keep users of /dev/kmesg to the simple "I'm starting test
> <foo>", or similar messages. And that argues for using things like
> the current ratelimit defaults.
can there be two bulletproof buffers: one for in kernel printk
and another ratelimited one for writes into /dev/kmsg.
On the read from /dev/kmsg they're combined by time.
On Fri, Apr 04, 2014 at 05:17:09PM -0400, John Stoffel wrote:
> >>>>> "Linus" == Linus Torvalds <[email protected]> writes:
>
> Linus> On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
> >>
> >> The other thing I've used /dev/kmsg for is to shove a "I'm starting
> >> something now" message in. This is only really necessary because the
> >> current kernel log timestamps are unusable crap. (We could fix that,
> >> hint hint.)
>
> Linus> I'd actually love to fix that, but I disagree with the "we could fix
> Linus> it". There are tons of people who know how to parse them (admittedly
> Linus> often only to ignore them), so changing the format is not likely to
> Linus> work.
>
> Linus> The good news is that "dmesg -H" does help if you're
> Linus> human. While at the same time being an example of that very
> Linus> "there are tools that know about the current horrid format"
> Linus> issue.. D'oh.
>
> I think you mean "dmesg -T", and unfortunately it seems Debian 6.0.9
> (or older) doesn ship a new enough linux-util since I've only got
> 2.17.2-9 install.
No, 'dmesg -H' is the right thing, you just need a modern version of
util-linux :)
> And RHEL/Centos 5.6 and 6.5 don't seem to ship that by default either,
> they have got util-linux-2.13-0.56.el5 and
> util-linux-ng-2.17.2-12.14.el6.x86_64 respectively. Blech! It's in
> Linux Mint 16 at least, haven't checked older versions.
util-linux is on version 2.24.1 at the moment:
https://www.kernel.org/pub/linux/utils/util-linux/v2.24/
2.17.2 was from back in 2012, I think it's time to switch to a modern
distro...
thanks,
greg k-h
>>>>> "Greg" == Greg Kroah-Hartman <[email protected]> writes:
Greg> On Fri, Apr 04, 2014 at 05:17:09PM -0400, John Stoffel wrote:
>> >>>>> "Linus" == Linus Torvalds <[email protected]> writes:
>>
Linus> On Fri, Apr 4, 2014 at 11:21 AM, Andy Lutomirski <[email protected]> wrote:
>> >>
>> >> The other thing I've used /dev/kmsg for is to shove a "I'm starting
>> >> something now" message in. This is only really necessary because the
>> >> current kernel log timestamps are unusable crap. (We could fix that,
>> >> hint hint.)
>>
Linus> I'd actually love to fix that, but I disagree with the "we could fix
Linus> it". There are tons of people who know how to parse them (admittedly
Linus> often only to ignore them), so changing the format is not likely to
Linus> work.
>>
Linus> The good news is that "dmesg -H" does help if you're
Linus> human. While at the same time being an example of that very
Linus> "there are tools that know about the current horrid format"
Linus> issue.. D'oh.
>>
>> I think you mean "dmesg -T", and unfortunately it seems Debian 6.0.9
>> (or older) doesn ship a new enough linux-util since I've only got
>> 2.17.2-9 install.
Greg> No, 'dmesg -H' is the right thing, you just need a modern version of
Greg> util-linux :)
Probably. But I'm working within my limitations, esp at work we can
upgrade too aggresively because of support needs for the tools we
use. And at home... usually Debian is pretty good about updates like
this, but for my main NAS box, I'm not aggresive either.
>> And RHEL/Centos 5.6 and 6.5 don't seem to ship that by default either,
>> they have got util-linux-2.13-0.56.el5 and
>> util-linux-ng-2.17.2-12.14.el6.x86_64 respectively. Blech! It's in
>> Linux Mint 16 at least, haven't checked older versions.
Greg> util-linux is on version 2.24.1 at the moment:
Greg> https://www.kernel.org/pub/linux/utils/util-linux/v2.24/
Greg> 2.17.2 was from back in 2012, I think it's time to switch to a
Greg> modern distro...
2012 isn't ancient... :-) 2002 is ancient. And I still run some
Solaris 5.8 hosts at work from that era.
John
On Fri, Apr 04, 2014 at 04:17:49PM -0700, Greg Kroah-Hartman wrote:
> > I think you mean "dmesg -T", and unfortunately it seems Debian 6.0.9
> > (or older) doesn ship a new enough linux-util since I've only got
> > 2.17.2-9 install.
>
> No, 'dmesg -H' is the right thing, you just need a modern version of
> util-linux :)
I've been poking the Debian folks. Hopefully they'll be able to ship
a newer util-linux soon. Unfortunately, the previous package
maintainer had something like a thousand local patches, and it's not
clear which patches got pushed upstream, which have been resolved some
other way, which are now moot, and which might result in an actual
regression if Debian were to go to the latest version of util-linux.
Most Debian packages are better maintained; hopefully this issue will
be resolved soon. I believe the right way at this point is to declare
"patch bankruptcy", ala "inbox bankruptcy", and then fix up the bugs
as users report them. And I think the new maintainer believes that
too; he just has to convince a few other people of that.
Anyway, that's largely out of scope for LKML, except that many us
kernel developers who use Debian have been rather handicapped by this.
(I keep a set of binaries of the latest util-linux when I need modern
functionality.)
- Ted
On Thu, 3 Apr 2014 13:03:39 +0200
Borislav Petkov <[email protected]> wrote:
> On Thu, Apr 03, 2014 at 11:34:15AM +0100, Måns Rullgård wrote:
> > Once is an accident. Twice is incompetence. Three times is malice.
>
> Yeah, maybe it is time Linus started his own init daemon project, like
> that other thing, git, he did start a while ago. We can put it in
> tools/. I'm sure it can get off the ground pretty quickly, judging by
> other projects kernel people have jumped on in the past.
>
We have a perfectly good, dependancy understanding, self ordering,
paralleising startup, service level and event management tool.
Its called "make"
We shipped devices using it about fifteen years ago 8)
Alan
On Fri, Apr 4, 2014 at 11:57 AM, Linus Torvalds
<[email protected]> wrote:
> Since I haven't even heard a "my bad" from the systemd people, I'd be
> inclined to say that a bit of protection for future issues would be a
> good idea.
Just coming back to this thread now, I'll say something. I'm a systemd
maintainer, and I'm sorry systemd's assertion bug, combined with
aliasing the debug option with the kernel's, broke a bunch of
workflows (including boot) and caused a bunch of unnecessary pain.
As a few other people have already said, this assertion should now be
fixed. The question is now about the debug option, and I've given my
+1 to Greg's patch for that. I think it's justified to put a condom on
systemd's debug mode given how badly the assertion bug cascaded into
affecting non-systemd work.
Long-term, it would be nice to find a way to satisfy both the "admin
who's just trying to find what's broken case" without harming the
kernel developer case. Maybe distros could ship a "debug everything"
boot option that independently enables it for the kernel, systemd, and
possibly other tools. But, that's out of scope for LKML. I'll look
into that as part of Fedora's Server Working Group assuming we
namespace to systemd.debug. We're already shipping a "recovery" boot
option.
Linus Torvalds <[email protected]> writes:
> On Wed, Apr 2, 2014 at 3:12 PM, Mateusz Guzik <[email protected]> wrote:
>>
>> Well, parsing kernel cmdline by systemd is a bad idea
>
> No, we very much expose /proc/cmdline for a reason. System services
> are *supposed* to parse it, because it gives a unified way for people
> to pass in various flags. The kernel doesn't complain about flags it
> doesn't recognize, exactly because the kernel realizes that "hey,
> maybe this flag is for something else".
How's this in future?
Cheers,
Rusty.
Subject: param: hand arguments after -- straight to init
The kernel passes any args it doesn't need through to init, except it
assumes anything containing '.' belongs to the kernel (for a module).
This change means all users can clearly distinguish which arguments
are for init.
For example, the kernel uses debug ("dee-bug") to mean log everything to
the console, where systemd uses the debug from the Scandinavian "day-boog"
meaning "fail to boot". If a future versions uses argv[] instead of
reading /proc/cmdline, this confusion will be avoided.
eg: test 'FOO="this is --foo"' -- 'systemd.debug="true true true"'
Gives:
argv[0] = '/debug-init'
argv[1] = 'test'
argv[2] = 'systemd.debug=true true true'
envp[0] = 'HOME=/'
envp[1] = 'TERM=linux'
envp[2] = 'FOO=this is --foo'
Signed-off-by: Rusty Russell <[email protected]>
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 204a67743804..b1990c5524e1 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -321,7 +321,7 @@ extern bool parameq(const char *name1, const char *name2);
extern bool parameqn(const char *name1, const char *name2, size_t n);
/* Called on module insert or kernel boot */
-extern int parse_args(const char *name,
+extern char *parse_args(const char *name,
char *args,
const struct kernel_param *params,
unsigned num,
diff --git a/init/main.c b/init/main.c
index 9c7fd4c9249f..e9d458b5d77b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -252,6 +252,27 @@ static int __init repair_env_string(char *param, char *val, const char *unused)
return 0;
}
+/* Anything after -- gets handed straight to init. */
+static int __init set_init_arg(char *param, char *val, const char *unused)
+{
+ unsigned int i;
+
+ if (panic_later)
+ return 0;
+
+ repair_env_string(param, val, unused);
+
+ for (i = 0; argv_init[i]; i++) {
+ if (i == MAX_INIT_ARGS) {
+ panic_later = "init";
+ panic_param = param;
+ return 0;
+ }
+ }
+ argv_init[i] = param;
+ return 0;
+}
+
/*
* Unknown boot options get handed to init, unless they look like
* unused parameters (modprobe will find them in /proc/cmdline).
@@ -478,7 +499,7 @@ static void __init mm_init(void)
asmlinkage void __init start_kernel(void)
{
- char * command_line;
+ char * command_line, *after_dashes;
extern const struct kernel_param __start___param[], __stop___param[];
/*
@@ -519,9 +540,13 @@ asmlinkage void __init start_kernel(void)
pr_notice("Kernel command line: %s\n", boot_command_line);
parse_early_param();
- parse_args("Booting kernel", static_command_line, __start___param,
- __stop___param - __start___param,
- -1, -1, &unknown_bootoption);
+ after_dashes = parse_args("Booting kernel",
+ static_command_line, __start___param,
+ __stop___param - __start___param,
+ -1, -1, &unknown_bootoption);
+ if (after_dashes)
+ parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
+ set_init_arg);
jump_label_init();
diff --git a/kernel/module.c b/kernel/module.c
index 29f7790eaa14..600d1fbf1773 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3193,6 +3193,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
{
struct module *mod;
long err;
+ char *after_dashes;
err = module_sig_check(info);
if (err)
@@ -3277,10 +3278,15 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto ddebug_cleanup;
/* Module is ready to execute: parsing args may do that. */
- err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, unknown_module_param_cb);
- if (err < 0)
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
goto bug_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
/* Link in to syfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..1e52ca233fd9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
-int parse_args(const char *doing,
- char *args,
- const struct kernel_param *params,
- unsigned num,
- s16 min_level,
- s16 max_level,
- int (*unknown)(char *param, char *val, const char *doing))
+char *parse_args(const char *doing,
+ char *args,
+ const struct kernel_param *params,
+ unsigned num,
+ s16 min_level,
+ s16 max_level,
+ int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
int irq_was_disabled;
args = next_arg(args, ¶m, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ return args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
switch (ret) {
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ret;
+ return ERR_PTR(ret);
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ret;
+ return ERR_PTR(ret);
case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ret;
+ return ERR_PTR(ret);
}
}
/* All parsed OK. */
- return 0;
+ return NULL;
}
/* Lazy bastard, eh? */
Hi Linus,
just checking on the status here: what did we decide on this one in
the end?
It works as expected, it is a good idea to have it as a protection
against every user space abuser, maybe we should apply it now that the
merge window is over and things are calming down?
Or should I remind you the next merge window?
Thanks.
On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> On Wed, Apr 2, 2014 at 4:52 PM, Linus Torvalds
> <[email protected]> wrote:
> >
> > TOTALLY UNTESTED. But it really isn't complex.
>
> Oh, and here's a patch that is actually lightly tested. I did
>
> while :; do echo hello; done > /dev/kmsg
>
> (the 'yes' program buffers output, so won't work) and I get
>
> [ 122.062912] hello
> [ 122.062915] hello
> [ 122.062918] hello
> [ 122.062921] hello
> [ 122.062924] hello
> [ 122.062927] hello
> [ 122.062930] hello
> [ 122.062932] hello
> [ 122.062935] hello
> [ 122.062938] hello
> [ 127.062671] bash: 2104439 callbacks suppressed
>
> so it works (repeating every five seconds, as expected).
>
> It's definitely not perfect - if we suppress output, and the process
> then closes the file descriptor rather than continuing to write more,
> you won't get that "suppressed" message. But it's a usable starting
> point for testing and commentary on the actual limits.
>
> So we should probably add reporting about suppressed messages at file
> close time, and we should tweak the limits (for example, perhaps not
> limit things if the buffers are largely empty - which happens at
> bootup), but on the whole I think this is a reasonable thing to do.
>
> Whether it actually fixes the problem that Borislav had is
> questionable, of course. For all I know, systemd debug mode generates
> so much data in *other* ways and then causes feedback loops with the
> kernel debugging that this patch is totally immaterial, and dmesg was
> never the main issue. But unlike the "hide 'debug' from
> /proc/cmdline", I think this patch at least _conceptually_ makes a lot
> of sense, even if systemd gets fixed, so ...
>
> Borislav?
>
> Linus
> kernel/printk/printk.c | 26 ++++++++++++++++----------
> 1 file changed, 16 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 4dae9cbe9259..b01ba10fb1b9 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -410,6 +410,7 @@ struct devkmsg_user {
> u64 seq;
> u32 idx;
> enum log_flags prev;
> + struct ratelimit_state rs;
> struct mutex lock;
> char buf[8192];
> };
> @@ -421,11 +422,15 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
> int i;
> int level = default_message_loglevel;
> int facility = 1; /* LOG_USER */
> + struct file *file = iocb->ki_filp;
> + struct devkmsg_user *user = file->private_data;
> size_t len = iov_length(iv, count);
> ssize_t ret = len;
>
> - if (len > LOG_LINE_MAX)
> + if (!user || len > LOG_LINE_MAX)
> return -EINVAL;
> + if (!___ratelimit(&user->rs, current->comm))
> + return ret;
> buf = kmalloc(len+1, GFP_KERNEL);
> if (buf == NULL)
> return -ENOMEM;
> @@ -656,21 +661,22 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
> static int devkmsg_open(struct inode *inode, struct file *file)
> {
> struct devkmsg_user *user;
> - int err;
> -
> - /* write-only does not need any file context */
> - if ((file->f_flags & O_ACCMODE) == O_WRONLY)
> - return 0;
>
> - err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
> - SYSLOG_FROM_READER);
> - if (err)
> - return err;
> + /* write-only does not need to check read permissions */
> + if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
> + int err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
> + SYSLOG_FROM_READER);
> + if (err)
> + return err;
> + }
>
> user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
> if (!user)
> return -ENOMEM;
>
> + /* Configurable? */
> + ratelimit_state_init(&user->rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> +
> mutex_init(&user->lock);
>
> raw_spin_lock_irq(&logbuf_lock);
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
Hi Linus,
here's some more massaging of your patch. (Btw, let's start a new
thread).
On Wed, Apr 02, 2014 at 06:47:57PM -0700, Linus Torvalds wrote:
> It's definitely not perfect - if we suppress output, and the process
> then closes the file descriptor rather than continuing to write more,
> you won't get that "suppressed" message. But it's a usable starting
> point for testing and commentary on the actual limits.
>
> So we should probably add reporting about suppressed messages at file
> close time,
see below.
> and we should tweak the limits (for example, perhaps not limit things
> if the buffers are largely empty - which happens at bootup), but on
> the whole I think this is a reasonable thing to do.
Err, help me out here pls, which buffers? Do you mean we should look at
log_buf's fill level?
> Whether it actually fixes the problem that Borislav had is
> questionable, of course. For all I know, systemd debug mode generates
> so much data in *other* ways and then causes feedback loops with the
> kernel debugging that this patch is totally immaterial, and dmesg was
> never the main issue. But unlike the "hide 'debug' from
> /proc/cmdline", I think this patch at least _conceptually_ makes a lot
> of sense, even if systemd gets fixed, so ...
Ok, here's a dirty hack that issues ratelimit messages at release time.
I probably should wrap it nicely in ratelimit_*() accessors instead
of poking directly at ratelimit_state. Yeah, maybe a ratelimit_exit()
wrapper which does all the fun automatically.
Anyway, with it, it looks like this:
[ 3.098474] systemd-fstab-g: 4 callbacks suppressed
[ 9.256317] audit_printk_skb: 108 callbacks suppressed
[ 31.486281] systemd-journal: 464 callbacks suppressed
In dmesg, it basically shuts up:
...
[ 3.603657] systemd-journald[115]: Vacuuming...
[ 3.603666] systemd-journald[115]: Vacuuming done, freed 0 bytes
[ 3.603759] systemd-journald[115]: Assertion 'dual_timestamp_is_set(&e->timestamp)' failed at src/libsystemd/sd-event/sd-event.c:2204, function sd_event_get_now_monotonic(). Ignoring.
[ 3.603759] systemd-journald[115]: Flushing /dev/kmsg...
[ 3.603759] systemd-journald[115]: Assertion 'dual_timestamp_is_set(&e->timestamp)' failed at src/libsystemd/sd-event/sd-event.c:2204, function sd_event_get_now_monotonic(). Ignoring.
[ 3.603759] systemd-journald[115]: Assertion 'dual_timestamp_is_set(&e->timestamp)' failed at src/libsystemd/sd-event/sd-event.c:2204, function sd_event_get_now_monotonic(). Ignoring.
[ 3.603759] systemd-journald[115]: Assertion 'dual_timestamp_is_set(&e->timestamp)' failed at src/libsystemd/sd-event/sd-event.c:2204, function sd_event_get_now_monotonic(). Ignoring.
[ 3.640216] BTRFS info (device sda2): disk space caching is enabled
[ 3.815059] systemd-udevd[142]: starting version 210
and then on shutdown, when it releases kmsg:
...
[ OK ] Stopped target Local File Systems (Pre).
Stopping Remount Root and Kernel File Systems...
[ OK ] Stopped Remount Root and Kernel File Systems.
[ OK ] Reached target Shutdown.
Sending SIGTERM to remaining processes...
[ 31.486281] systemd-journal: 464 callbacks suppressed
[ 32.246116] mtrr: no MTRR for fc000000,400000 found
Sending SIGKILL to remaining processes...
Unmounting file systems.
[ 32.356186] BTRFS info (device sda2): disk space caching is enabled
Unmounting /tmp.
[ 32.392842] BTRFS info (device sda2): disk space caching is enabled
Unmounting /var/log.
...
Comments?
Thanks.
---
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 0a260d8a18bf..ab8d9fb76789 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -7,6 +7,8 @@
#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
#define DEFAULT_RATELIMIT_BURST 10
+#define RATELIMIT_MSG_ON_RELEASE BIT(0)
+
struct ratelimit_state {
raw_spinlock_t lock; /* protect the state */
@@ -15,6 +17,7 @@ struct ratelimit_state {
int printed;
int missed;
unsigned long begin;
+ unsigned long flags;
};
#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \
@@ -28,12 +31,11 @@ struct ratelimit_state {
static inline void ratelimit_state_init(struct ratelimit_state *rs,
int interval, int burst)
{
+ memset(rs, 0, sizeof(*rs));
+
raw_spin_lock_init(&rs->lock);
rs->interval = interval;
rs->burst = burst;
- rs->printed = 0;
- rs->missed = 0;
- rs->begin = 0;
}
extern struct ratelimit_state printk_ratelimit_state;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5fdd8eeb75..18cfa5f5b058 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -450,6 +450,7 @@ struct devkmsg_user {
u64 seq;
u32 idx;
enum log_flags prev;
+ struct ratelimit_state rs;
struct mutex lock;
char buf[8192];
};
@@ -461,11 +462,15 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_length(iv, count);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (!user || len > LOG_LINE_MAX)
return -EINVAL;
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
@@ -696,21 +701,25 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
static int devkmsg_open(struct inode *inode, struct file *file)
{
struct devkmsg_user *user;
- int err;
-
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need to check read permissions */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ int err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ /* Configurable? */
+ ratelimit_state_init(&user->rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+
+ /* We'll say something at release time. */
+ user->rs.flags |= RATELIMIT_MSG_ON_RELEASE;
+
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
@@ -729,6 +738,10 @@ static int devkmsg_release(struct inode *inode, struct file *file)
if (!user)
return 0;
+ if (user->rs.missed)
+ pr_warning("%s: %d callbacks suppressed\n",
+ current->comm, user->rs.missed);
+
mutex_destroy(&user->lock);
kfree(user);
return 0;
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 40e03ea2a967..97b461a9fd52 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -46,12 +46,13 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
rs->begin = jiffies;
if (time_is_before_jiffies(rs->begin + rs->interval)) {
- if (rs->missed)
+ if (rs->missed && !(rs->flags & RATELIMIT_MSG_ON_RELEASE))
printk(KERN_WARNING "%s: %d callbacks suppressed\n",
func, rs->missed);
rs->begin = 0;
rs->printed = 0;
- rs->missed = 0;
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
+ rs->missed = 0;
}
if (rs->burst && rs->burst > rs->printed) {
rs->printed++;
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
On Wed, Apr 23, 2014 at 05:15:07PM +0200, Borislav Petkov wrote:
> Ok, here's a dirty hack that issues ratelimit messages at release
> time. I probably should wrap it nicely in ratelimit_*() accessors
> instead of poking directly at ratelimit_state. Yeah, maybe a
> ratelimit_exit() wrapper which does all the fun automatically.
I.e., something like that:
---
fs/fat/inode.c | 2 +-
include/linux/ratelimit.h | 28 ++++++++++++++++++++++------
kernel/printk/printk.c | 32 ++++++++++++++++++++++----------
lib/ratelimit.c | 6 ++++--
4 files changed, 49 insertions(+), 19 deletions(-)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0062da21dd8b..33550f4d6ae8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1277,7 +1277,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sb->s_export_op = &fat_export_ops;
mutex_init(&sbi->nfs_build_inode_lock);
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ DEFAULT_RATELIMIT_BURST, 0);
error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
if (error)
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 0a260d8a18bf..fb4cf72fcfc7 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -2,11 +2,15 @@
#define _LINUX_RATELIMIT_H
#include <linux/param.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
+
#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
#define DEFAULT_RATELIMIT_BURST 10
+#define RATELIMIT_MSG_ON_RELEASE BIT(0)
+
struct ratelimit_state {
raw_spinlock_t lock; /* protect the state */
@@ -15,6 +19,7 @@ struct ratelimit_state {
int printed;
int missed;
unsigned long begin;
+ unsigned long flags;
};
#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \
@@ -26,14 +31,25 @@ struct ratelimit_state {
}
static inline void ratelimit_state_init(struct ratelimit_state *rs,
- int interval, int burst)
+ int interval, int burst,
+ unsigned long flags)
{
+ memset(rs, 0, sizeof(*rs));
+
raw_spin_lock_init(&rs->lock);
- rs->interval = interval;
- rs->burst = burst;
- rs->printed = 0;
- rs->missed = 0;
- rs->begin = 0;
+ rs->interval = interval;
+ rs->burst = burst;
+ rs->flags = flags;
+}
+
+static inline void ratelimit_state_exit(struct ratelimit_state *rs)
+{
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
+ return;
+
+ if (rs->missed)
+ printk(KERN_WARNING "%s: %d callbacks suppressed\n",
+ current->comm, rs->missed);
}
extern struct ratelimit_state printk_ratelimit_state;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5fdd8eeb75..195f7733e6bb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -450,6 +450,7 @@ struct devkmsg_user {
u64 seq;
u32 idx;
enum log_flags prev;
+ struct ratelimit_state rs;
struct mutex lock;
char buf[8192];
};
@@ -461,11 +462,17 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_length(iv, count);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (!user || len > LOG_LINE_MAX)
return -EINVAL;
+
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
+
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
@@ -696,21 +703,24 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
static int devkmsg_open(struct inode *inode, struct file *file)
{
struct devkmsg_user *user;
- int err;
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
-
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need to check read permissions */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ int err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ /* Configurable? */
+ ratelimit_state_init(&user->rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST,
+ RATELIMIT_MSG_ON_RELEASE);
+
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
@@ -729,6 +739,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
if (!user)
return 0;
+ ratelimit_state_exit(&user->rs);
+
mutex_destroy(&user->lock);
kfree(user);
return 0;
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 40e03ea2a967..b4763cb10d8f 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -46,12 +46,14 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
rs->begin = jiffies;
if (time_is_before_jiffies(rs->begin + rs->interval)) {
- if (rs->missed)
+ if (rs->missed && !(rs->flags & RATELIMIT_MSG_ON_RELEASE))
printk(KERN_WARNING "%s: %d callbacks suppressed\n",
func, rs->missed);
rs->begin = 0;
rs->printed = 0;
- rs->missed = 0;
+
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
+ rs->missed = 0;
}
if (rs->burst && rs->burst > rs->printed) {
rs->printed++;
--
1.9.0
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--