taskstats information may be used for gathering private information.
E.g. for openssh and vsftpd daemons read_characters/write_characters may
be used to learn the precise password length. Restrict it to processes
being able to ptrace the target process.
For TASKSTATS_CMD_ATTR_REGISTER_CPUMASK the fix is euid check instead of
a ptrace check as the handler is processed in the context of the target
process, not the listener process'. When ptrace_task_may_access_current()
is introduced, it should be used instead of euid check. Currently there
is a small race when a process temporarily changes its euid (e.g. to
access user's files), until the process sets euid back user's processes
may gather privileged process' statistics.
Signed-off-by: Vasiliy Kulikov <[email protected]>
---
kernel/taskstats.c | 23 ++++++++++++++++++++++-
1 files changed, 22 insertions(+), 1 deletions(-)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea36..d92c95a 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/ptrace.h>
#include <net/genetlink.h>
#include <asm/atomic.h>
@@ -132,6 +133,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
int rc, delcount = 0;
+ const struct cred *cred = current_cred();
+ struct task_struct *task;
rc = genlmsg_end(skb, reply);
if (rc < 0) {
@@ -142,6 +145,15 @@ static void send_cpu_listeners(struct sk_buff *skb,
rc = 0;
down_read(&listeners->sem);
list_for_each_entry(s, &listeners->list, list) {
+
+ rcu_read_lock();
+ task = find_task_by_vpid(s->pid);
+ if (!task || __task_cred(task)->euid != cred->euid) {
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+
skb_next = NULL;
if (!list_is_last(&s->list, &listeners->list)) {
skb_next = skb_clone(skb_cur, GFP_KERNEL);
@@ -199,14 +211,19 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
{
struct task_struct *tsk;
+ int rc = -ESRCH;
rcu_read_lock();
tsk = find_task_by_vpid(pid);
+ if (tsk && !ptrace_may_access(tsk, PTRACE_MODE_READ)) {
+ tsk = NULL;
+ rc = -EACCES;
+ }
if (tsk)
get_task_struct(tsk);
rcu_read_unlock();
if (!tsk)
- return -ESRCH;
+ return rc;
fill_stats(tsk, stats);
put_task_struct(tsk);
return 0;
@@ -224,6 +241,10 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
*/
rcu_read_lock();
first = find_task_by_vpid(tgid);
+ if (first && !ptrace_may_access(first, PTRACE_MODE_READ)) {
+ rc = -EACCES;
+ goto out;
+ }
if (!first || !lock_task_sighand(first, &flags))
goto out;
--
1.7.0.4
On Fri, Jun 24, 2011 at 5:39 PM, Vasiliy Kulikov <[email protected]> wrote:
> taskstats information may be used for gathering private information.
> E.g. for openssh and vsftpd daemons read_characters/write_characters may
> be used to learn the precise password length. ?Restrict it to processes
> being able to ptrace the target process.
>
> For TASKSTATS_CMD_ATTR_REGISTER_CPUMASK the fix is euid check instead of
> a ptrace check as the handler is processed in the context of the target
> process, not the listener process'. ?When ptrace_task_may_access_current()
> is introduced, it should be used instead of euid check. ?Currently there
> is a small race when a process temporarily changes its euid (e.g. to
> access user's files), until the process sets euid back user's processes
> may gather privileged process' statistics.
>
> Signed-off-by: Vasiliy Kulikov <[email protected]>
> ---
> ?kernel/taskstats.c | ? 23 ++++++++++++++++++++++-
> ?1 files changed, 22 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/taskstats.c b/kernel/taskstats.c
> index 9ffea36..d92c95a 100644
> --- a/kernel/taskstats.c
> +++ b/kernel/taskstats.c
> @@ -27,6 +27,7 @@
> ?#include <linux/cgroup.h>
> ?#include <linux/fs.h>
> ?#include <linux/file.h>
> +#include <linux/ptrace.h>
> ?#include <net/genetlink.h>
> ?#include <asm/atomic.h>
>
> @@ -132,6 +133,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
> ? ? ? ?struct sk_buff *skb_next, *skb_cur = skb;
> ? ? ? ?void *reply = genlmsg_data(genlhdr);
> ? ? ? ?int rc, delcount = 0;
> + ? ? ? const struct cred *cred = current_cred();
> + ? ? ? struct task_struct *task;
>
> ? ? ? ?rc = genlmsg_end(skb, reply);
> ? ? ? ?if (rc < 0) {
> @@ -142,6 +145,15 @@ static void send_cpu_listeners(struct sk_buff *skb,
> ? ? ? ?rc = 0;
> ? ? ? ?down_read(&listeners->sem);
Why not grab RCU lock here
> ? ? ? ?list_for_each_entry(s, &listeners->list, list) {
> +
> + ? ? ? ? ? ? ? rcu_read_lock();
You'll end up grabbing RCU read lock too often here, do you need to?
> + ? ? ? ? ? ? ? task = find_task_by_vpid(s->pid);
> + ? ? ? ? ? ? ? if (!task || __task_cred(task)->euid != cred->euid) {
> + ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? rcu_read_unlock();
> +
Release the lock prior to up_read()
> ? ? ? ? ? ? ? ?skb_next = NULL;
> ? ? ? ? ? ? ? ?if (!list_is_last(&s->list, &listeners->list)) {
> ? ? ? ? ? ? ? ? ? ? ? ?skb_next = skb_clone(skb_cur, GFP_KERNEL);
> @@ -199,14 +211,19 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
> ?static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
> ?{
> ? ? ? ?struct task_struct *tsk;
> + ? ? ? int rc = -ESRCH;
>
> ? ? ? ?rcu_read_lock();
> ? ? ? ?tsk = find_task_by_vpid(pid);
> + ? ? ? if (tsk && !ptrace_may_access(tsk, PTRACE_MODE_READ)) {
> + ? ? ? ? ? ? ? tsk = NULL;
> + ? ? ? ? ? ? ? rc = -EACCES;
> + ? ? ? }
> ? ? ? ?if (tsk)
> ? ? ? ? ? ? ? ?get_task_struct(tsk);
> ? ? ? ?rcu_read_unlock();
> ? ? ? ?if (!tsk)
> - ? ? ? ? ? ? ? return -ESRCH;
> + ? ? ? ? ? ? ? return rc;
> ? ? ? ?fill_stats(tsk, stats);
> ? ? ? ?put_task_struct(tsk);
> ? ? ? ?return 0;
> @@ -224,6 +241,10 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
> ? ? ? ? */
> ? ? ? ?rcu_read_lock();
> ? ? ? ?first = find_task_by_vpid(tgid);
> + ? ? ? if (first && !ptrace_may_access(first, PTRACE_MODE_READ)) {
> + ? ? ? ? ? ? ? rc = -EACCES;
> + ? ? ? ? ? ? ? goto out;
> + ? ? ? }
>
> ? ? ? ?if (!first || !lock_task_sighand(first, &flags))
> ? ? ? ? ? ? ? ?goto out;
Balbir Singh
On Wed, Jun 29, 2011 at 06:57 +0530, Balbir Singh wrote:
> On Fri, Jun 24, 2011 at 5:39 PM, Vasiliy Kulikov <[email protected]> wrote:
> > taskstats information may be used for gathering private information.
> > E.g. for openssh and vsftpd daemons read_characters/write_characters may
> > be used to learn the precise password length. ?Restrict it to processes
> > being able to ptrace the target process.
> >
> > For TASKSTATS_CMD_ATTR_REGISTER_CPUMASK the fix is euid check instead of
> > a ptrace check as the handler is processed in the context of the target
> > process, not the listener process'. ?When ptrace_task_may_access_current()
> > is introduced, it should be used instead of euid check. ?Currently there
> > is a small race when a process temporarily changes its euid (e.g. to
> > access user's files), until the process sets euid back user's processes
> > may gather privileged process' statistics.
> >
> > Signed-off-by: Vasiliy Kulikov <[email protected]>
> > ---
> > ?kernel/taskstats.c | ? 23 ++++++++++++++++++++++-
> > ?1 files changed, 22 insertions(+), 1 deletions(-)
> >
> > diff --git a/kernel/taskstats.c b/kernel/taskstats.c
> > index 9ffea36..d92c95a 100644
> > --- a/kernel/taskstats.c
> > +++ b/kernel/taskstats.c
> > @@ -27,6 +27,7 @@
> > ?#include <linux/cgroup.h>
> > ?#include <linux/fs.h>
> > ?#include <linux/file.h>
> > +#include <linux/ptrace.h>
> > ?#include <net/genetlink.h>
> > ?#include <asm/atomic.h>
> >
> > @@ -132,6 +133,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
> > ? ? ? ?struct sk_buff *skb_next, *skb_cur = skb;
> > ? ? ? ?void *reply = genlmsg_data(genlhdr);
> > ? ? ? ?int rc, delcount = 0;
> > + ? ? ? const struct cred *cred = current_cred();
> > + ? ? ? struct task_struct *task;
> >
> > ? ? ? ?rc = genlmsg_end(skb, reply);
> > ? ? ? ?if (rc < 0) {
> > @@ -142,6 +145,15 @@ static void send_cpu_listeners(struct sk_buff *skb,
> > ? ? ? ?rc = 0;
> > ? ? ? ?down_read(&listeners->sem);
>
> Why not grab RCU lock here
Yes, it makes sense. I was thinking about not holding a lock for a too
long time, but it should be rather cheap anyway.
Thank you,
--
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments
On Fri, Jun 24, 2011 at 5:09 AM, Vasiliy Kulikov <[email protected]> wrote:
> taskstats information may be used for gathering private information.
> E.g. for openssh and vsftpd daemons read_characters/write_characters may
> be used to learn the precise password length. ?Restrict it to processes
> being able to ptrace the target process.
Ok, having looked at this some more, I'm quite ready to just mark the
whole TASKSTATS config option as BROKEN. It does seem to be a horrible
security hazard, and very little seems to use it.
It also seems to be really fundamentally broken. Afaik, there's no way
to filter taskstats not only by security issues (Vasiliy's patch
really is very ugly), but it seems to be some global cross-namespace
thing too, so it exposes taskstats across pid namespaces afaik. It
does that even with Vasiliy's patch, afaik, although then I think you
need to have collissions in the namespaces if I read the code
correctly.
I suspect that could be fixed by adding a pid namespace to the
'listener' structure. Also adding a 'cred' pointer (or the actual
listener thread pointer) to it would make Vasiliy's patch more
palatable, since then you wouldn't need to look up the credentials at
send_cpu_listeners() time.
Maybe I have mis-read the code. But it does all make me shudder. There
doesn't even seem to be all that many _users_ of the thing, so the
problems it has really makes me go "is that code worth it"? We
probably should never have merged it in the first place.
Linus
On Wed, Jun 29, 2011 at 06:57 +0530, Balbir Singh wrote:
> On Fri, Jun 24, 2011 at 5:39 PM, Vasiliy Kulikov <[email protected]> wrote:
> > + ? ? ? ? ? ? ? task = find_task_by_vpid(s->pid);
> > + ? ? ? ? ? ? ? if (!task || __task_cred(task)->euid != cred->euid) {
If consider this patch for inclusion, it also needs some check for
the listener root ability. __task_cred(task)->euid == 0 or smth like
that. But ptrace_task_may_access_current() is better.
> > + ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> > + ? ? ? ? ? ? ? ? ? ? ? continue;
> > + ? ? ? ? ? ? ? }
> > + ? ? ? ? ? ? ? rcu_read_unlock();
> > +
>
> Release the lock prior to up_read()
>
> > ? ? ? ? ? ? ? ?skb_next = NULL;
> > ? ? ? ? ? ? ? ?if (!list_is_last(&s->list, &listeners->list)) {
> > ? ? ? ? ? ? ? ? ? ? ? ?skb_next = skb_clone(skb_cur, GFP_KERNEL);
> > @@ -199,14 +211,19 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
> > ?static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
> > ?{
> > ? ? ? ?struct task_struct *tsk;
> > + ? ? ? int rc = -ESRCH;
> >
> > ? ? ? ?rcu_read_lock();
> > ? ? ? ?tsk = find_task_by_vpid(pid);
> > + ? ? ? if (tsk && !ptrace_may_access(tsk, PTRACE_MODE_READ)) {
> > + ? ? ? ? ? ? ? tsk = NULL;
> > + ? ? ? ? ? ? ? rc = -EACCES;
> > + ? ? ? }
> > ? ? ? ?if (tsk)
> > ? ? ? ? ? ? ? ?get_task_struct(tsk);
> > ? ? ? ?rcu_read_unlock();
> > ? ? ? ?if (!tsk)
> > - ? ? ? ? ? ? ? return -ESRCH;
> > + ? ? ? ? ? ? ? return rc;
> > ? ? ? ?fill_stats(tsk, stats);
> > ? ? ? ?put_task_struct(tsk);
> > ? ? ? ?return 0;
> > @@ -224,6 +241,10 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
> > ? ? ? ? */
> > ? ? ? ?rcu_read_lock();
> > ? ? ? ?first = find_task_by_vpid(tgid);
> > + ? ? ? if (first && !ptrace_may_access(first, PTRACE_MODE_READ)) {
> > + ? ? ? ? ? ? ? rc = -EACCES;
> > + ? ? ? ? ? ? ? goto out;
> > + ? ? ? }
> >
> > ? ? ? ?if (!first || !lock_task_sighand(first, &flags))
> > ? ? ? ? ? ? ? ?goto out;
>
> Balbir Singh
--
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments
On Wed, Jun 29, 2011 at 13:09 -0700, Linus Torvalds wrote:
> Ok, having looked at this some more, I'm quite ready to just mark the
> whole TASKSTATS config option as BROKEN. It does seem to be a horrible
> security hazard, and very little seems to use it.
>
> It also seems to be really fundamentally broken. Afaik, there's no way
> to filter taskstats not only by security issues (Vasiliy's patch
> really is very ugly), but it seems to be some global cross-namespace
> thing too, so it exposes taskstats across pid namespaces afaik.
The problem here is that it keeps a pid in a global list. This list is
then browsed by all namespaces. Looking into the code, I see 2 real
problems (didn't check, though):
1) If there are 2+ pid namespaces, one listener in pid_ns #1 and some
process dies in pid_ns #2, then the dying task wouldn't find the
listener via find_task_by_vpid() and would just delete the listener from
listeners list. Looks like this problem was created by optimization
patch f9fd8914c1acca0.
2) Netlink sockets are per net namespace, but this accounting thing is per
pid namespace. So, if the dying task and the listener are in a single
pid namespace, but in different net namespaces, the message will not be
sent via genlmsg_unicast(). I suspect it is a problem of all non-net
netlink sockets.
> It
> does that even with Vasiliy's patch, afaik, although then I think you
> need to have collissions in the namespaces if I read the code
> correctly.
>
> I suspect that could be fixed by adding a pid namespace to the
> 'listener' structure. Also adding a 'cred' pointer (or the actual
> listener thread pointer) to it would make Vasiliy's patch more
> palatable, since then you wouldn't need to look up the credentials at
> send_cpu_listeners() time.
>
> Maybe I have mis-read the code. But it does all make me shudder. There
> doesn't even seem to be all that many _users_ of the thing, so the
> problems it has really makes me go "is that code worth it"? We
> probably should never have merged it in the first place.
> but it seems to be some global cross-namespace
> thing too, so it exposes taskstats across pid namespaces afaik.
Thanks,
--
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments
On Thu, Jun 30, 2011 at 1:27 PM, Vasiliy Kulikov <[email protected]> wrote:
> On Wed, Jun 29, 2011 at 13:09 -0700, Linus Torvalds wrote:
>> Ok, having looked at this some more, I'm quite ready to just mark the
>> whole TASKSTATS config option as BROKEN. It does seem to be a horrible
>> security hazard, and very little seems to use it.
>>
I am not sure how you define BROKEN, BROKEN as per security rules?
/proc/$pid/xxx also expose similar information.
>> It also seems to be really fundamentally broken. Afaik, there's no way
>> to filter taskstats not only by security issues (Vasiliy's patch
>> really is very ugly), but it seems to be some global cross-namespace
>> thing too, so it exposes taskstats across pid namespaces afaik.
>
> The problem here is that it keeps a pid in a global list. ?This list is
> then browsed by all namespaces. ?Looking into the code, I see 2 real
> problems (didn't check, though):
>
> 1) If there are 2+ pid namespaces, one listener in pid_ns #1 and some
> process dies in pid_ns #2, then the dying task wouldn't find the
> listener via find_task_by_vpid() and would just delete the listener from
> listeners list. ?Looks like this problem was created by optimization
> patch f9fd8914c1acca0.
I must admit, I did not pay much attention to pid namespace changes as
they were being made to taskstats. I'll look at the code later this
week.
>
> 2) Netlink sockets are per net namespace, but this accounting thing is per
> pid namespace. ?So, if the dying task and the listener are in a single
> pid namespace, but in different net namespaces, the message will not be
> sent via genlmsg_unicast(). ?I suspect it is a problem of all non-net
> netlink sockets.
>
Good catch, under what circumstances would tasks have the same pid
namespace, but different net namespaces?
>
>> It
>> does that even with Vasiliy's patch, afaik, although then I think you
>> need to have collissions in the namespaces if I read the code
>> correctly.
>>
>> I suspect that could be fixed by adding a pid namespace to the
>> 'listener' structure. Also adding a 'cred' pointer (or the actual
>> listener thread pointer) to it would make Vasiliy's patch more
>> palatable, since then you wouldn't need to look up the credentials at
>> send_cpu_listeners() time.
>>
A simple work around could be to disable taskstats under network
namespace or even the current behaviour should just warn the user that
the network namespace of the listener is distinct and disallow the
listener
>> Maybe I have mis-read the code. But it does all make me shudder. There
>> doesn't even seem to be all that many _users_ of the thing, so the
>> problems it has really makes me go "is that code worth it"? We
>> probably should never have merged it in the first place.
iotop(), getdelays (in kernel) with many developers using it and many
more. The interface has been published for a while now
Thanks for the review,
Balbir Singh
On Thu, Jun 30, 2011 at 16:29 +0530, Balbir Singh wrote:
> On Thu, Jun 30, 2011 at 1:27 PM, Vasiliy Kulikov <[email protected]> wrote:
> > On Wed, Jun 29, 2011 at 13:09 -0700, Linus Torvalds wrote:
> >> Ok, having looked at this some more, I'm quite ready to just mark the
> >> whole TASKSTATS config option as BROKEN. It does seem to be a horrible
> >> security hazard, and very little seems to use it.
> >>
>
> I am not sure how you define BROKEN, BROKEN as per security rules?
> /proc/$pid/xxx also expose similar information.
I'm sure this information is also dangerous.
> >> It also seems to be really fundamentally broken. Afaik, there's no way
> >> to filter taskstats not only by security issues (Vasiliy's patch
> >> really is very ugly), but it seems to be some global cross-namespace
> >> thing too, so it exposes taskstats across pid namespaces afaik.
> >
> > The problem here is that it keeps a pid in a global list. ?This list is
> > then browsed by all namespaces. ?Looking into the code, I see 2 real
> > problems (didn't check, though):
> >
> > 1) If there are 2+ pid namespaces, one listener in pid_ns #1 and some
> > process dies in pid_ns #2, then the dying task wouldn't find the
> > listener via find_task_by_vpid() and would just delete the listener from
> > listeners list. ?Looks like this problem was created by optimization
> > patch f9fd8914c1acca0.
>
> I must admit, I did not pay much attention to pid namespace changes as
> they were being made to taskstats. I'll look at the code later this
> week.
>
> >
> > 2) Netlink sockets are per net namespace, but this accounting thing is per
> > pid namespace. ?So, if the dying task and the listener are in a single
> > pid namespace, but in different net namespaces, the message will not be
> > sent via genlmsg_unicast(). ?I suspect it is a problem of all non-net
> > netlink sockets.
This is not a problem of all non-net netlink sockets, but only of those
who finds a task by the pid. AFAIU, the normal netlink policy is using
broadcast packets.
3) A process cannot own 2 taskstats sockets because the information
stored in listeners array doesn't make difference between different
sockets of the same task. Both searches would stop on the first socket,
the first socket would receive 2 messages while the second would be
silent. It was introduced by f9fd8914c1acca0.
> Good catch, under what circumstances would tasks have the same pid
> namespace, but different net namespaces?
Umm, if one did unshare() or clone() to unshare only net namespace?
> >> It
> >> does that even with Vasiliy's patch, afaik, although then I think you
> >> need to have collissions in the namespaces if I read the code
> >> correctly.
> >>
> >> I suspect that could be fixed by adding a pid namespace to the
> >> 'listener' structure. Also adding a 'cred' pointer (or the actual
> >> listener thread pointer) to it would make Vasiliy's patch more
> >> palatable, since then you wouldn't need to look up the credentials at
> >> send_cpu_listeners() time.
> >>
>
> A simple work around could be to disable taskstats under network
> namespace or even the current behaviour should just warn the user that
> the network namespace of the listener is distinct and disallow the
> listener
>
> >> Maybe I have mis-read the code. But it does all make me shudder. There
> >> doesn't even seem to be all that many _users_ of the thing, so the
> >> problems it has really makes me go "is that code worth it"? We
> >> probably should never have merged it in the first place.
>
> iotop(), getdelays (in kernel) with many developers using it and many
> more. The interface has been published for a while now
>
> Thanks for the review,
> Balbir Singh
Thanks,
--
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments
On Thu, Jun 30, 2011 at 3:59 AM, Balbir Singh <[email protected]> wrote:
> On Thu, Jun 30, 2011 at 1:27 PM, Vasiliy Kulikov <[email protected]> wrote:
>> On Wed, Jun 29, 2011 at 13:09 -0700, Linus Torvalds wrote:
>>> Ok, having looked at this some more, I'm quite ready to just mark the
>>> whole TASKSTATS config option as BROKEN. It does seem to be a horrible
>>> security hazard, and very little seems to use it.
>>>
>
> I am not sure how you define BROKEN, BROKEN as per security rules?
> /proc/$pid/xxx also expose similar information.
No it doesn't.
/proc/$pid/xyz has always had proper security checks. Now, sometimes
they've been a bit too lax (iow, we've had cases where we just allowed
things we shouldn't - like this "io" file), but /proc/pid/ at least
*conceptually* has always had the "do I have permission to read this
or not". Do a "cat /proc/*/* > /dev/null" and you'll be getting a lot
of "permission denied" etc.
TASKSTATS? Nothing. Nada. Completely open.
That's just broken.
TASKSTATS also isn't apparently used by any normal program, and so
never got updated to namespaces. Again, /proc/xyz/ at least *tries*:
I'm not guaranteeing that it doesn't have some gaping holes, but I can
at least find the logic that saves and uses namespace information.
Again, TASKSTATS? Not so much.
> I must admit, I did not pay much attention to pid namespace changes as
> they were being made to taskstats. I'll look at the code later this
> week.
Some of the pid namespace issues could be fixed with the attached kind
of starting point.
But it's broken. See the comment I added. Why is it broken? Again -
taskstats is fundamentally broken, and doesn't seem to actually clean
up after itself. The "cleanup" depends on noticing at run-time that
some pid is stale and no longer listening. Again, that's just
fundamental brokenness in taskstats.
And it also only look sat pid namespaces. The network namespace issue
is separate.
So that's why I think it should be marked BROKEN. What applications
actually depend on this? iotop and what else? Because if it's just
iotop, I do suspect we might be better off telling people "ok,
disabling this will break iotop, but quite frankly, you're better off
without it".
Linus
On Thu, Jun 30, 2011 at 10:10 PM, Linus Torvalds
<[email protected]> wrote:
> On Thu, Jun 30, 2011 at 3:59 AM, Balbir Singh <[email protected]> wrote:
>> On Thu, Jun 30, 2011 at 1:27 PM, Vasiliy Kulikov <[email protected]> wrote:
>>> On Wed, Jun 29, 2011 at 13:09 -0700, Linus Torvalds wrote:
>>>> Ok, having looked at this some more, I'm quite ready to just mark the
>>>> whole TASKSTATS config option as BROKEN. It does seem to be a horrible
>>>> security hazard, and very little seems to use it.
>>>>
>>
>> I am not sure how you define BROKEN, BROKEN as per security rules?
>> /proc/$pid/xxx also expose similar information.
>
> No it doesn't.
>
> /proc/$pid/xyz has always had proper security checks. Now, sometimes
> they've been a bit too lax (iow, we've had cases where we just allowed
> things we shouldn't - like this "io" file), but /proc/pid/ at least
> *conceptually* has always had the "do I have permission to read this
> or not". Do a "cat /proc/*/* > /dev/null" and you'll be getting a lot
> of "permission denied" etc.
>
I've tried that, but fundamentally the statistics we export in
taskstats is what a utility like top would need. Compare them against
cat /proc/$pid/sched and /proc/$pid/status
> TASKSTATS? Nothing. Nada. Completely open.
>
> That's just broken.
>
Except for the I/O part, which turned about to be a cause of concern,
look at struct taskstats and please see the comment above. We have
some additional delay statistics, but I don't see them as a core
broken issue
> TASKSTATS also isn't apparently used by any normal program, and so
> never got updated to namespaces. Again, /proc/xyz/ at least *tries*:
> I'm not guaranteeing that it doesn't have some gaping holes, but I can
> at least find the logic that saves and uses namespace information.
>
Or namespaces are not being used as much, I know a lot of users use
iotop(), I've frequently gotten and handled requests to enable the
feature across distros
> Again, TASKSTATS? Not so much.
>
>> I must admit, I did not pay much attention to pid namespace changes as
>> they were being made to taskstats. I'll look at the code later this
>> week.
>
> Some of the pid namespace issues could be fixed with the attached kind
> of starting point.
>
> But it's broken. See the comment I added. Why is it broken? Again -
> taskstats is fundamentally broken, and doesn't seem to actually clean
> up after itself. The "cleanup" depends on noticing at run-time that
> some pid is stale and no longer listening. Again, that's just
> fundamental brokenness in taskstats.
>
That is lazy cleanup to avoid trapping exit events of listeners and
then having to check against the list. But if the cleanup needs to be
more active, it should be easy to fix. What I am saying is that it is
not as BROKEN as you make it sound. The attached patch is definitely a
step in the right direction and should be applied.
> And it also only look sat pid namespaces. The network namespace issue
> is separate.
>
> So that's why I think it should be marked BROKEN. What applications
> actually depend on this? iotop and what else? Because if it's just
> iotop, I do suspect we might be better off telling people "ok,
> disabling this will break iotop, but quite frankly, you're better off
> without it".
I beg to differ, due to the reasons above. I'd rather find time and
fix the pending issues (network namespace), you've fixed the pid
namespace issue. I'd also look for exiting listeners
Balbir Singh