2008-01-02 16:26:09

by Ingo Molnar

[permalink] [raw]
Subject: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

revert commit:

commit 6f5391c283d7fdcf24bf40786ea79061919d1e1d
Author: Matthew Wilcox <[email protected]>
Date: Tue Sep 25 12:42:04 2007 -0400

[SCSI] Get rid of scsi_cmnd->done

this is a supposed-to-be-cleanup commit, but apparently it causes
regressions:

Bug 9370 - v2.6.24-rc2-409-g9418d5d: attempt to access beyond end of device
http://bugzilla.kernel.org/show_bug.cgi?id=9370

this patch should be reintroduced in a more split-up form to make
testing of it easier.

Signed-off-by: Ingo Molnar <[email protected]>
---
drivers/scsi/scsi.c | 20 +++-----------------
drivers/scsi/scsi_error.c | 1 +
drivers/scsi/scsi_lib.c | 14 ++++++++++++++
drivers/scsi/scsi_priv.h | 1 -
drivers/scsi/sd.c | 28 ++++++++++------------------
drivers/scsi/sr.c | 21 +++++++++++++++------
include/scsi/scsi_cmnd.h | 2 ++
include/scsi/scsi_driver.h | 1 -
include/scsi/sd.h | 13 +++++++++++++
9 files changed, 58 insertions(+), 43 deletions(-)

Index: linux/drivers/scsi/scsi.c
===================================================================
--- linux.orig/drivers/scsi/scsi.c
+++ linux/drivers/scsi/scsi.c
@@ -59,7 +59,6 @@
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
-#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_tcq.h>
@@ -368,8 +367,9 @@ void scsi_log_send(struct scsi_cmnd *cmd
scsi_print_command(cmd);
if (level > 3) {
printk(KERN_INFO "buffer = 0x%p, bufflen = %d,"
- " queuecommand 0x%p\n",
+ " done = 0x%p, queuecommand 0x%p\n",
scsi_sglist(cmd), scsi_bufflen(cmd),
+ cmd->done,
cmd->device->host->hostt->queuecommand);

}
@@ -654,12 +654,6 @@ void __scsi_done(struct scsi_cmnd *cmd)
blk_complete_request(rq);
}

-/* Move this to a header if it becomes more generally useful */
-static struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
-{
- return *(struct scsi_driver **)cmd->request->rq_disk->private_data;
-}
-
/*
* Function: scsi_finish_command
*
@@ -671,8 +665,6 @@ void scsi_finish_command(struct scsi_cmn
{
struct scsi_device *sdev = cmd->device;
struct Scsi_Host *shost = sdev->host;
- struct scsi_driver *drv;
- unsigned int good_bytes;

scsi_device_unbusy(sdev);

@@ -698,13 +690,7 @@ void scsi_finish_command(struct scsi_cmn
"Notifying upper driver of completion "
"(result %x)\n", cmd->result));

- good_bytes = cmd->request_bufflen;
- if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
- drv = scsi_cmd_to_driver(cmd);
- if (drv->done)
- good_bytes = drv->done(cmd);
- }
- scsi_io_completion(cmd, good_bytes);
+ cmd->done(cmd);
}
EXPORT_SYMBOL(scsi_finish_command);

Index: linux/drivers/scsi/scsi_error.c
===================================================================
--- linux.orig/drivers/scsi/scsi_error.c
+++ linux/drivers/scsi/scsi_error.c
@@ -1699,6 +1699,7 @@ scsi_reset_provider(struct scsi_device *
memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));

scmd->scsi_done = scsi_reset_provider_done_command;
+ scmd->done = NULL;
scmd->request_buffer = NULL;
scmd->request_bufflen = 0;

Index: linux/drivers/scsi/scsi_lib.c
===================================================================
--- linux.orig/drivers/scsi/scsi_lib.c
+++ linux/drivers/scsi/scsi_lib.c
@@ -1092,6 +1092,7 @@ void scsi_io_completion(struct scsi_cmnd
}
scsi_end_request(cmd, 0, this_count, !result);
}
+EXPORT_SYMBOL(scsi_io_completion);

/*
* Function: scsi_init_io()
@@ -1170,6 +1171,18 @@ static struct scsi_cmnd *scsi_get_cmd_fr
return cmd;
}

+static void scsi_blk_pc_done(struct scsi_cmnd *cmd)
+{
+ BUG_ON(!blk_pc_request(cmd->request));
+ /*
+ * This will complete the whole command with uptodate=1 so
+ * as far as the block layer is concerned the command completed
+ * successfully. Since this is a REQ_BLOCK_PC command the
+ * caller should check the request's errors value
+ */
+ scsi_io_completion(cmd, cmd->request_bufflen);
+}
+
int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
{
struct scsi_cmnd *cmd;
@@ -1219,6 +1232,7 @@ int scsi_setup_blk_pc_cmnd(struct scsi_d
cmd->transfersize = req->data_len;
cmd->allowed = req->retries;
cmd->timeout_per_command = req->timeout;
+ cmd->done = scsi_blk_pc_done;
return BLKPREP_OK;
}
EXPORT_SYMBOL(scsi_setup_blk_pc_cmnd);
Index: linux/drivers/scsi/scsi_priv.h
===================================================================
--- linux.orig/drivers/scsi/scsi_priv.h
+++ linux/drivers/scsi/scsi_priv.h
@@ -68,7 +68,6 @@ extern int scsi_maybe_unblock_host(struc
extern void scsi_device_unbusy(struct scsi_device *sdev);
extern int scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
extern void scsi_next_command(struct scsi_cmnd *cmd);
-extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
extern void scsi_run_host_queues(struct Scsi_Host *shost);
extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
extern void scsi_free_queue(struct request_queue *q);
Index: linux/drivers/scsi/sd.c
===================================================================
--- linux.orig/drivers/scsi/sd.c
+++ linux/drivers/scsi/sd.c
@@ -86,19 +86,6 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);

-static int sd_revalidate_disk(struct gendisk *);
-static int sd_probe(struct device *);
-static int sd_remove(struct device *);
-static void sd_shutdown(struct device *);
-static int sd_suspend(struct device *, pm_message_t state);
-static int sd_resume(struct device *);
-static void sd_rescan(struct device *);
-static int sd_done(struct scsi_cmnd *);
-static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
-static void scsi_disk_release(struct class_device *cdev);
-static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
-static void sd_print_result(struct scsi_disk *, int);
-
static DEFINE_IDR(sd_index_idr);
static DEFINE_SPINLOCK(sd_index_lock);

@@ -253,7 +240,6 @@ static struct scsi_driver sd_template =
.shutdown = sd_shutdown,
},
.rescan = sd_rescan,
- .done = sd_done,
};

/*
@@ -523,6 +509,12 @@ static int sd_prep_fn(struct request_que
SCpnt->timeout_per_command = timeout;

/*
+ * This is the completion routine we use. This is matched in terms
+ * of capability to this function.
+ */
+ SCpnt->done = sd_rw_intr;
+
+ /*
* This indicates that the command is ready from our end to be
* queued.
*/
@@ -895,13 +887,13 @@ static struct block_device_operations sd
};

/**
- * sd_done - bottom half handler: called when the lower level
+ * sd_rw_intr - bottom half handler: called when the lower level
* driver has completed (successfully or otherwise) a scsi command.
* @SCpnt: mid-level's per command structure.
*
* Note: potentially run from within an ISR. Must not block.
**/
-static int sd_done(struct scsi_cmnd *SCpnt)
+static void sd_rw_intr(struct scsi_cmnd * SCpnt)
{
int result = SCpnt->result;
unsigned int xfer_size = SCpnt->request_bufflen;
@@ -922,7 +914,7 @@ static int sd_done(struct scsi_cmnd *SCp
SCSI_LOG_HLCOMPLETE(1, scsi_print_result(SCpnt));
if (sense_valid) {
SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
- "sd_done: sb[respc,sk,asc,"
+ "sd_rw_intr: sb[respc,sk,asc,"
"ascq]=%x,%x,%x,%x\n",
sshdr.response_code,
sshdr.sense_key, sshdr.asc,
@@ -994,7 +986,7 @@ static int sd_done(struct scsi_cmnd *SCp
break;
}
out:
- return good_bytes;
+ scsi_io_completion(SCpnt, good_bytes);
}

static int media_not_present(struct scsi_disk *sdkp,
Index: linux/drivers/scsi/sr.c
===================================================================
--- linux.orig/drivers/scsi/sr.c
+++ linux/drivers/scsi/sr.c
@@ -78,7 +78,6 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM);

static int sr_probe(struct device *);
static int sr_remove(struct device *);
-static int sr_done(struct scsi_cmnd *);

static struct scsi_driver sr_template = {
.owner = THIS_MODULE,
@@ -87,7 +86,6 @@ static struct scsi_driver sr_template =
.probe = sr_probe,
.remove = sr_remove,
},
- .done = sr_done,
};

static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG];
@@ -210,12 +208,12 @@ static int sr_media_change(struct cdrom_
}

/*
- * sr_done is the interrupt routine for the device driver.
+ * rw_intr is the interrupt routine for the device driver.
*
- * It will be notified on the end of a SCSI read / write, and will take one
+ * It will be notified on the end of a SCSI read / write, and will take on
* of several actions based on success or failure.
*/
-static int sr_done(struct scsi_cmnd *SCpnt)
+static void rw_intr(struct scsi_cmnd * SCpnt)
{
int result = SCpnt->result;
int this_count = SCpnt->request_bufflen;
@@ -288,7 +286,12 @@ static int sr_done(struct scsi_cmnd *SCp
}
}

- return good_bytes;
+ /*
+ * This calls the generic completion function, now that we know
+ * how many actual sectors finished, and how many sectors we need
+ * to say have failed.
+ */
+ scsi_io_completion(SCpnt, good_bytes);
}

static int sr_prep_fn(struct request_queue *q, struct request *rq)
@@ -425,6 +428,12 @@ static int sr_prep_fn(struct request_que
SCpnt->timeout_per_command = timeout;

/*
+ * This is the completion routine we use. This is matched in terms
+ * of capability to this function.
+ */
+ SCpnt->done = rw_intr;
+
+ /*
* This indicates that the command is ready from our end to be
* queued.
*/
Index: linux/include/scsi/scsi_cmnd.h
===================================================================
--- linux.orig/include/scsi/scsi_cmnd.h
+++ linux/include/scsi/scsi_cmnd.h
@@ -34,6 +34,7 @@ struct scsi_cmnd {
struct list_head list; /* scsi_cmnd participates in queue lists */
struct list_head eh_entry; /* entry for the host eh_cmd_q */
int eh_eflags; /* Used by error handlr */
+ void (*done) (struct scsi_cmnd *); /* Mid-level done function */

/*
* A SCSI Command is assigned a nonzero serial_number before passed
@@ -121,6 +122,7 @@ extern struct scsi_cmnd *__scsi_get_comm
extern void scsi_put_command(struct scsi_cmnd *);
extern void __scsi_put_command(struct Scsi_Host *, struct scsi_cmnd *,
struct device *);
+extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
extern void scsi_finish_command(struct scsi_cmnd *cmd);
extern void scsi_req_abort_cmd(struct scsi_cmnd *cmd);

Index: linux/include/scsi/scsi_driver.h
===================================================================
--- linux.orig/include/scsi/scsi_driver.h
+++ linux/include/scsi/scsi_driver.h
@@ -15,7 +15,6 @@ struct scsi_driver {
struct device_driver gendrv;

void (*rescan)(struct device *);
- int (*done)(struct scsi_cmnd *);
};
#define to_scsi_driver(drv) \
container_of((drv), struct scsi_driver, gendrv)
Index: linux/include/scsi/sd.h
===================================================================
--- linux.orig/include/scsi/sd.h
+++ linux/include/scsi/sd.h
@@ -47,6 +47,19 @@ struct scsi_disk {
};
#define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)

+static int sd_revalidate_disk(struct gendisk *disk);
+static void sd_rw_intr(struct scsi_cmnd * SCpnt);
+static int sd_probe(struct device *);
+static int sd_remove(struct device *);
+static void sd_shutdown(struct device *dev);
+static int sd_suspend(struct device *dev, pm_message_t state);
+static int sd_resume(struct device *dev);
+static void sd_rescan(struct device *);
+static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
+static void scsi_disk_release(struct class_device *cdev);
+static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
+static void sd_print_result(struct scsi_disk *, int);
+
#define sd_printk(prefix, sdsk, fmt, a...) \
(sdsk)->disk ? \
sdev_printk(prefix, (sdsk)->device, "[%s] " fmt, \


2008-01-02 16:46:36

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Wed, 2008-01-02 at 17:25 +0100, Ingo Molnar wrote:
> revert commit:
>
> commit 6f5391c283d7fdcf24bf40786ea79061919d1e1d
> Author: Matthew Wilcox <[email protected]>
> Date: Tue Sep 25 12:42:04 2007 -0400
>
> [SCSI] Get rid of scsi_cmnd->done
>
> this is a supposed-to-be-cleanup commit, but apparently it causes
> regressions:
>
> Bug 9370 - v2.6.24-rc2-409-g9418d5d: attempt to access beyond end of device
> http://bugzilla.kernel.org/show_bug.cgi?id=9370
>
> this patch should be reintroduced in a more split-up form to make
> testing of it easier.

I disagree with this. We only have one reporter of a problem and it
appears to be some type of obscure interaction with pktdvd which no-one
can track down (although it's not really helped by the reporter not
being very responsive).

The correct thing to do is root cause the problem and fix it at source,
since it's very likely that this is a pre-existing bug that was simply
uncovered by the patch you're recommending we revert.

Unfortunately, I suspect it won't get fixed until someone else actually
manages to reproduce it (which I haven't been able to so far).

James

2008-01-02 19:19:49

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Wed, 2 Jan 2008, James Bottomley wrote:
>
> I disagree with this. We only have one reporter of a problem and it
> appears to be some type of obscure interaction with pktdvd which no-one
> can track down (although it's not really helped by the reporter not
> being very responsive).

It's totally immaterial if we have one reporter or many. The fact is, that
thing has been outstanding for almost two months now. The root cause is
almost certainly known (and Matthew is apparently even aware of it), but
it has been dicked-around-with by having totally weak excuses ("is it an
ide-scsi thing?")

> The correct thing to do is root cause the problem and fix it at source,
> since it's very likely that this is a pre-existing bug that was simply
> uncovered by the patch you're recommending we revert.

No, the correct thing would have been to fix it a month ago.

By now, it needs to either have a patch, or to be reverted. It's too late
to make excuses.

And no, it doesn't really need any more reporters or ways to reproduce it,
all it needs is looking at the patch and seeing what the differences are
AT A SOURCE level. And quite frankly, I see one huge honking difference,
which is that new test for

if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC)

which will disable all the crud that sd_done() does.

And guess what? Look at what sd_done() does: it checks whether the request
was partially filled, and tries to handle that end-of-medium case nicely.
Which is *exactly* what seems to have broken.

So I think you are making totally idiotic excuses. I may not know the SCSI
layer all that intimately, and maybe I'm wrong and there is some other
cause for this, but quite frankly, I doubt it. And I can look at just the
patch and have a damn good idea of why something is broken, but I can't
actually fix it myself because I don't know how to look up a a
"scsi_driver" from a "scsi_cmnd" sanely.

But almost two months after the fact, we should have had a patch that does
that, or we should revert it.

NO MORE BOGUS EXCUSES!

Linus

2008-01-02 19:40:39

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Wed, Jan 02, 2008 at 11:19:14AM -0800, Linus Torvalds wrote:
> It's totally immaterial if we have one reporter or many. The fact is, that
> thing has been outstanding for almost two months now. The root cause is
> almost certainly known (and Matthew is apparently even aware of it), but

I really don't think the root cause is known.

> By now, it needs to either have a patch, or to be reverted. It's too late
> to make excuses.

I think reverting it is the correct thing to do.

> And no, it doesn't really need any more reporters or ways to reproduce it,
> all it needs is looking at the patch and seeing what the differences are
> AT A SOURCE level. And quite frankly, I see one huge honking difference,
> which is that new test for
>
> if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC)
>
> which will disable all the crud that sd_done() does.

sd_done and sr_done are called for REQ_TYPE_FS -- if the request comes
in through one of the SG interfaces, we call scsi_setup_blk_pc_cmnd()
which sets the ->done callback to scsi_blk_pc_done.

> So I think you are making totally idiotic excuses. I may not know the SCSI
> layer all that intimately, and maybe I'm wrong and there is some other
> cause for this, but quite frankly, I doubt it. And I can look at just the
> patch and have a damn good idea of why something is broken, but I can't
> actually fix it myself because I don't know how to look up a a
> "scsi_driver" from a "scsi_cmnd" sanely.

That's in the patch. But it would be the wrong thing to do because SG
commands should not be molested by the sr/sd driver.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-02 19:58:21

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Wed, 2 Jan 2008, Matthew Wilcox wrote:
>
> sd_done and sr_done are called for REQ_TYPE_FS -- if the request comes
> in through one of the SG interfaces, we call scsi_setup_blk_pc_cmnd()
> which sets the ->done callback to scsi_blk_pc_done.

Why do you think that REQ_TYPE_BLOCK_PC has anything to do with SG?

It has *nothing* to do with SG, and anybody who uses SG in this day and
age on a block device is just crazy.

The way you do generic SCSI commands (on perfectly normal block device
nodes) is using the SCSI ioctl() interfaces. That's how you are supposed
to do things like burn DVD's or do any kind of special ops.

So REQ_TYPE_BLOCK_PC does quite commonly happen on perfectly regular block
devices, it's how all commands that aren't pure reads or writes done by
the kernel behave.

If you actually use /dev/sg*, you will be using the SG driver, and if you
don't want that to have a ->done callback, then just set "done" to NULL
for sg_driver.

Linus

2008-01-02 20:12:42

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Wed, 2008-01-02 at 12:40 -0700, Matthew Wilcox wrote:
> On Wed, Jan 02, 2008 at 11:19:14AM -0800, Linus Torvalds wrote:
> > It's totally immaterial if we have one reporter or many. The fact is, that
> > thing has been outstanding for almost two months now. The root cause is
> > almost certainly known (and Matthew is apparently even aware of it), but
>
> I really don't think the root cause is known.
>
> > By now, it needs to either have a patch, or to be reverted. It's too late
> > to make excuses.
>
> I think reverting it is the correct thing to do.

OK ... I'll revert it. However, I still think it's the wrong course of
action, because as far as my analysis goes, this code is functionally
equivalent to what went before with the exception that we now rely on
the request->cmd_type information in the post processing (previously we
just relied on the cmnd->done pointer).

As far as I understand what's going on, the reporter is misusing the
interface. He sets up a packet mapping and then accesses the underlying
device, which is wrong (you're supposed to access now via the packet
device). The one thing the packet mapping does is to alter the
blocksize, which could lead to the errors reported (because the
underlying fs block size and the actual block size differ) ... however,
I think it should do this all the time, so what I'm unable to explain is
why it doesn't write past the end of the device with this commit
reverted.

But all of the above is the hallmark of an existing bug that this commit
exposes ... revert it, and we'll cover the underlying problem up again.

James

2008-01-02 20:18:16

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Wed, Jan 02, 2008 at 11:57:10AM -0800, Linus Torvalds wrote:
>
>
> On Wed, 2 Jan 2008, Matthew Wilcox wrote:
> >
> > sd_done and sr_done are called for REQ_TYPE_FS -- if the request comes
> > in through one of the SG interfaces, we call scsi_setup_blk_pc_cmnd()
> > which sets the ->done callback to scsi_blk_pc_done.
>
> Why do you think that REQ_TYPE_BLOCK_PC has anything to do with SG?
>
> It has *nothing* to do with SG, and anybody who uses SG in this day and
> age on a block device is just crazy.

I think you misunderstood Matthew here. REQ_TYPE_BLOCK_PC is indeed
used by any kind of SG_IO or similar passthrough no matter where it
originates. And exactly because REQ_TYPE_BLOCK_PC are entirely passthru
the actual driver (sd, sr or sg) is not doing the actual command
completion but it's handled in the scsi layer because it's exactly
the same no matter what driver it came on.

2008-01-02 20:18:46

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Wed, Jan 02, 2008 at 11:57:10AM -0800, Linus Torvalds wrote:
> On Wed, 2 Jan 2008, Matthew Wilcox wrote:
> >
> > sd_done and sr_done are called for REQ_TYPE_FS -- if the request comes
> > in through one of the SG interfaces, we call scsi_setup_blk_pc_cmnd()
> > which sets the ->done callback to scsi_blk_pc_done.
>
> Why do you think that REQ_TYPE_BLOCK_PC has anything to do with SG?
>
> It has *nothing* to do with SG, and anybody who uses SG in this day and
> age on a block device is just crazy.
>
> The way you do generic SCSI commands (on perfectly normal block device
> nodes) is using the SCSI ioctl() interfaces. That's how you are supposed
> to do things like burn DVD's or do any kind of special ops.
>
> So REQ_TYPE_BLOCK_PC does quite commonly happen on perfectly regular block
> devices, it's how all commands that aren't pure reads or writes done by
> the kernel behave.

I spoke imprecisely; a raw command one through SG or ioctl is
REQ_TYPE_BLOCK_PC and did not go through sd_done or sr_done before this
patch.

> If you actually use /dev/sg*, you will be using the SG driver, and if you
> don't want that to have a ->done callback, then just set "done" to NULL
> for sg_driver.

Unless you think that we should see different behaviour when using
/dev/sg* and /dev/sd*, the aspect of the patch you criticised was
correct.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-02 20:45:53

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Wed, 2 Jan 2008, James Bottomley wrote:
>
> OK ... I'll revert it. However, I still think it's the wrong course of
> action, because as far as my analysis goes, this code is functionally
> equivalent to what went before with the exception that we now rely on
> the request->cmd_type information in the post processing (previously we
> just relied on the cmnd->done pointer).

To say that another way:

"the code is functionally equivalent, EXCEPT IT ISN'T, and it's
known to be broken".

wouldn't you say my version is more honest and correct?

The old code did a per-command callback. The new one doesn't. The code was
*supposed* to be equivalent, but it clearly isn't. Why argue the point?

And no, maybe it's not that REQ_TYPE_BLOCK_PC should be calling ->done,
maybe it's that some REQ_TYPE_FS commands should *not* be calling ->done.
Or maybe we somehow got the wrong ->done in the first place, because we
now get it from a different source.

I don't know, but what I'm arguing (very strongly) against is this
attitude of "we don't know what's wrong, but wë́'ll leave it broken because
we can't be bothered to figure it out".

That is exactly what reverting is there for. It doesn't matter one *whit*
if the new code is cleaner and prettier, if it doesn't work.

Linus

2008-01-02 20:50:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Wed, 2 Jan 2008, Christoph Hellwig wrote:
>
> I think you misunderstood Matthew here. REQ_TYPE_BLOCK_PC is indeed
> used by any kind of SG_IO or similar passthrough no matter where it
> originates. And exactly because REQ_TYPE_BLOCK_PC are entirely passthru
> the actual driver (sd, sr or sg) is not doing the actual command
> completion but it's handled in the scsi layer because it's exactly
> the same no matter what driver it came on.

You say that, but you then ignore that something *did* change.

Maybe it's not that one suspicious test. Maybe it's somethign else. But
that commit was confirmed to break something, almost two months ago. You
guys seem to be in denial, and saying "it didn't change anything".

And no, waiting for more reporters when one reporter has already narrowed
it down to the exact (smallish) commit, is simply not good. Either you can
fix it by looking at the source, or it gets reverted.

I was hoping somebody in SCSI-land would actually look at the commit and
try to find out what's wrong, instead of all of you apparently trying to
say "nothing is wrong".

I hate the excuses of "but, but, but.. it *should* work". It doesn't. Face
that, *then* you can argue about why.

Linus

2008-01-02 20:53:31

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Wed, Jan 02, 2008 at 12:49:32PM -0800, Linus Torvalds wrote:
> Maybe it's not that one suspicious test. Maybe it's somethign else. But
> that commit was confirmed to break something, almost two months ago. You
> guys seem to be in denial, and saying "it didn't change anything".
>
> And no, waiting for more reporters when one reporter has already narrowed
> it down to the exact (smallish) commit, is simply not good. Either you can
> fix it by looking at the source, or it gets reverted.
>
> I was hoping somebody in SCSI-land would actually look at the commit and
> try to find out what's wrong, instead of all of you apparently trying to
> say "nothing is wrong".

I've spent about a week of time looking at it over the last couple of
months. I haven't been able to figure it out. That's why I'm calling
for it to be reverted, not because I'm "in denial".

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-02 23:33:22

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Wed, 2008-01-02 at 12:45 -0800, Linus Torvalds wrote:
>
> On Wed, 2 Jan 2008, James Bottomley wrote:
> >
> > OK ... I'll revert it. However, I still think it's the wrong course of
> > action, because as far as my analysis goes, this code is functionally
> > equivalent to what went before with the exception that we now rely on
> > the request->cmd_type information in the post processing (previously we
> > just relied on the cmnd->done pointer).
>
> To say that another way:
>
> "the code is functionally equivalent, EXCEPT IT ISN'T, and it's
> known to be broken".
>
> wouldn't you say my version is more honest and correct?

No. Just because a bug appears when a particular piece of code is in
and disappears when it is reverted doesn't automatically equate to the
code in question being buggy. We seem to get a lot of these second
order effect type things; sometimes its just a problem caused by a
particular routine compiling to a longer byte sequence and pushing
something else out.

Do give us credit for thinking "functional equivalency problem" when
this bug report first came in ... I've had myself and several other
people over the code. If there's an inequivalency somewhere I'm damned
if I can spot it. The most promising other failure mode we tried was
request type changes over the lifetime of the command, but we can't make
that one fly either.

Look at the taxonomy of the bug. This is the form of the error:

buffer I/O error on device sr0, logical block 20304
attempt to access beyond end of device
sr0: rw=0, want=81224, limit=40944

The last limit is the most suggestive, that comes straight from
bdev->bd_inode->i_size>>9 and is supposed to be the size of the block
device in 512 byte blocks. For a 4.7GB DVD, it's a little small.
Nothing in the sr code sets this directly (although it does come from
get_blkdev() for the first opener). pktcdvd does set it, though ... and
probably wrongly if the drive in question isn't UDF formatted.

I have also tried on many occasions to reproduce this without success
(there's a simple recipe in the bug report, but it just doesn't work for
me). My setup is with an aic94xxx->expander->SATAPI DVD, whereas the
original reporter is ata_piix -> PATA DVD, so it could be stack
differences---but again, if it is, the bug itself can't be a simple one
in the generic code. The fact that there are no other reporters of
problems like this also indicate to me that it isn't a widespread
problem (again, pointing to something more specific in the setup of the
reporter).

The unreproduceability coupled with the lack of other error reports
leads me to be about 90% confident the problem isn't in the code you
want reverted. However, I grant that we cannot seem to find the root
cause, and reverting the code will cause our bug metrics to go down by
one (at least until something else causes it to reappear), so it is the
corporate thing to do, I suppose. I'll send in a reversion with the
sr_mod removal bug fix.

James

2008-01-03 01:59:19

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Wed, 2 Jan 2008, James Bottomley wrote:
> >
> > To say that another way:
> >
> > "the code is functionally equivalent, EXCEPT IT ISN'T, and it's
> > known to be broken".
> >
> > wouldn't you say my version is more honest and correct?
>
> No. Just because a bug appears when a particular piece of code is in
> and disappears when it is reverted doesn't automatically equate to the
> code in question being buggy.

But it *DOES* mean that it's not equivalent.

> Look at the taxonomy of the bug. This is the form of the error:
>
> buffer I/O error on device sr0, logical block 20304
> attempt to access beyond end of device
> sr0: rw=0, want=81224, limit=40944
>
> The last limit is the most suggestive, that comes straight from
> bdev->bd_inode->i_size>>9 and is supposed to be the size of the block
> device in 512 byte blocks. For a 4.7GB DVD, it's a little small.
> Nothing in the sr code sets this directly (although it does come from
> get_blkdev() for the first opener). pktcdvd does set it, though ... and
> probably wrongly if the drive in question isn't UDF formatted.

.. but you're ignoring the fact that if pktcdvd sets it wrong, then it
should be visible with the pre-commit kernel *also*.

In other words, you continue to ignore the fact that BEHAVIOUR CHANGED.

Why?

Linus

2008-01-06 02:55:47

by Peter Osterlund

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

Linus Torvalds <[email protected]> writes:

> On Wed, 2 Jan 2008, James Bottomley wrote:
>
> > Look at the taxonomy of the bug. This is the form of the error:
> >
> > buffer I/O error on device sr0, logical block 20304
> > attempt to access beyond end of device
> > sr0: rw=0, want=81224, limit=40944
> >
> > The last limit is the most suggestive, that comes straight from
> > bdev->bd_inode->i_size>>9 and is supposed to be the size of the block
> > device in 512 byte blocks. For a 4.7GB DVD, it's a little small.
> > Nothing in the sr code sets this directly (although it does come from
> > get_blkdev() for the first opener). pktcdvd does set it, though ... and
> > probably wrongly if the drive in question isn't UDF formatted.

pktcdvd sets it when opening the /dev/pktcdvd device, but when the
drive is later opened as /dev/scd0, there is nothing that sets it
back. (Btw, 40944 is possible if the disk is a CDRW that was formatted
with "cdrwtool -m 10236".)

The problem is that pktcdvd opens the cd device in non-blocking mode
when pktsetup is run, and doesn't close it again until pktsetup -d
is run. The effect is that if you meanwhile open the cd device,
blkdev.c:do_open() doesn't call bd_set_size() because bdev->bd_openers
is non-zero.

I don't know the correct way to fix this. Maybe adding bd_set_size()
to sr.c:get_sectorsize() which already does set_capacity() would
work.

> .. but you're ignoring the fact that if pktcdvd sets it wrong, then it
> should be visible with the pre-commit kernel *also*.

I can repeat this bug, both with and without the scsi patch that is
claimed to make a difference, both with an external USB drive and an
internal IDE drive.

To repeat:

1. Start with an empty drive.
2. pktsetup 0 /dev/scd0
3. Insert a CD containing an isofs filesystem.
4. mount /dev/pktcdvd/0 /mnt/tmp
5. umount /mnt/tmp
6. Press the eject button.
7. Insert a DVD containing a non-writable filesystem.
8. mount /dev/scd0 /mnt/tmp
9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
10. If the DVD contains data beyond the physical size of a CD, you
get I/O errors in the terminal, and dmesg reports lots of
"attempt to access beyond end of device" errors.

--
Peter Osterlund - [email protected]
http://web.telia.com/~u89404340

2008-01-06 03:44:06

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Sat, 6 Jan 2008, Peter Osterlund wrote:
>
> The problem is that pktcdvd opens the cd device in non-blocking mode
> when pktsetup is run, and doesn't close it again until pktsetup -d
> is run. The effect is that if you meanwhile open the cd device,
> blkdev.c:do_open() doesn't call bd_set_size() because bdev->bd_openers
> is non-zero.
>
> I don't know the correct way to fix this. Maybe adding bd_set_size()
> to sr.c:get_sectorsize() which already does set_capacity() would
> work.

Hmm.. I wouldn't say that's the correct way to fix it. The thing is, if
somebody has explicitly set the size of the device, than that *is* the
size.

The kernel should do what it is told, and it very much on purpose does the
size probing only on the first open: exactly because other open fd's in
progress may be there explicitly to fix up something, or may simply depend
on some size it already knew about (ie we don't want to change the size
behind its back).

And in particular, setting the size with bd_set_size also sets the
block-size, and while most things migt react fairly well to the pure
*size* changing, they will react very badly indeed to the blocksize
changing!

So no, doing a "bd_set_size()" in any but the outermost opener simply
isn't acceptable. It would cause serious problems and total chaos for the
block cache (we do not handle aliasing of multiple different blocksizes).
So we are very careful indeed to only call bd_set_size() when bd_openers
is zero.

That said, at least in your scenario:

> 1. Start with an empty drive.
> 2. pktsetup 0 /dev/scd0
> 3. Insert a CD containing an isofs filesystem.
> 4. mount /dev/pktcdvd/0 /mnt/tmp
> 5. umount /mnt/tmp
> 6. Press the eject button.
> 7. Insert a DVD containing a non-writable filesystem.
> 8. mount /dev/scd0 /mnt/tmp
> 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> 10. If the DVD contains data beyond the physical size of a CD, you
> get I/O errors in the terminal, and dmesg reports lots of
> "attempt to access beyond end of device" errors.

part of the problem here seems to be that the "media change" notification
that /dev/scd0 hopefully handled correctly and caused the "set_capacity()"
hass not made it to the i_size thing (that the block device layer checks).

So the way things are *supposed* to work is that the media-change function
("revalidate_disk()") should have triggered as part of the media change,
and that *should* have already done the set_capacity(), and that in turn
is the thing that should do it all (sets the disk capacity *without*
changing the blocksize!)

But since "set_capacity()" doesn't actually change "i_size", only
dev->capacity (which is correct - there may be many inodes associated with
one device), we actually ended up having this subtle dependency on
calling bd_set_size() (which we can *only* do on the first open, due to
the blocksize issues).

Which means that media-change won't fix these things like it is supposed
to. And I suspect we've had this bug (well, it *appears* to be a bug) for
a while, simply because all *normal* uses will open and close the device
properly.

Maybe a patch something like this might work out. I haven't really thought
it through entirely - but it basically just sets the size, without doing
the whole blocksize thing.

Jens? Al? Comments?

Does a patch like this change the behaviour you see at all?

This all still leaves the question unanswered why that commit
6f5391c283d7fdcf24bf40786ea79061919d1e1d changed any behaviour at all.
Because the thing that Peter is describing has nothing to do with any
low-level drivers what-so-ever.

Linus

---
fs/block_dev.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c..6a20da9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1191,6 +1191,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
}
if (bdev->bd_invalidated)
rescan_partitions(bdev->bd_disk, bdev);
+ bd_inode->i_size = (loff_t)get_capacity(disk)<<9;
}
}
bdev->bd_openers++;

2008-01-06 12:23:48

by Peter Osterlund

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

Linus Torvalds <[email protected]> writes:

> Does a patch like this change the behaviour you see at all?

> + bd_inode->i_size = (loff_t)get_capacity(disk)<<9;

It does fix my scenario, with the trivial fix of adding bdev-> at the
beginning of that line, ie:

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c..a8ed344 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1191,6 +1191,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
}
if (bdev->bd_invalidated)
rescan_partitions(bdev->bd_disk, bdev);
+ bdev->bd_inode->i_size = (loff_t)get_capacity(disk)<<9;
}
}
bdev->bd_openers++;

--
Peter Osterlund - [email protected]
http://web.telia.com/~u89404340

2008-01-06 13:56:04

by Thomas Meyer

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

Hi,

I confirm Peter's observations:

> To repeat:
>
> 1. Start with an empty drive.
> 2. pktsetup 0 /dev/scd0
> 3. Insert a CD containing an isofs filesystem.
> 4. mount /dev/pktcdvd/0 /mnt/tmp
> 5. umount /mnt/tmp
> 6. Press the eject button.
> 7. Insert a DVD containing a non-writable filesystem.
> 8. mount /dev/scd0 /mnt/tmp
> 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> 10. If the DVD contains data beyond the physical size of a CD, you
> get I/O errors in the terminal, and dmesg reports lots of
> "attempt to access beyond end of device" errors.

This is the correct setup to trigger the bug. And the commit
6f5391c283d7fdcf24bf40786ea79061919d1e1d has nothing to do with this
bug. It was bad luck that i couldn't trigger the bug with said commit
reverted (in my tests, the second boot with the reverted commit missed
the first mount/umount smaller cd step, so...)

So sorry for all the mess and stress.

I'm glad that Peter found the real nature of this bug.

mfg
thomas

2008-01-06 13:58:18

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 03:55 +0100, Peter Osterlund wrote:
> Linus Torvalds <[email protected]> writes:
>
> > On Wed, 2 Jan 2008, James Bottomley wrote:
> >
> > > Look at the taxonomy of the bug. This is the form of the error:
> > >
> > > buffer I/O error on device sr0, logical block 20304
> > > attempt to access beyond end of device
> > > sr0: rw=0, want=81224, limit=40944
> > >
> > > The last limit is the most suggestive, that comes straight from
> > > bdev->bd_inode->i_size>>9 and is supposed to be the size of the block
> > > device in 512 byte blocks. For a 4.7GB DVD, it's a little small.
> > > Nothing in the sr code sets this directly (although it does come from
> > > get_blkdev() for the first opener). pktcdvd does set it, though ... and
> > > probably wrongly if the drive in question isn't UDF formatted.
>
> pktcdvd sets it when opening the /dev/pktcdvd device, but when the
> drive is later opened as /dev/scd0, there is nothing that sets it
> back. (Btw, 40944 is possible if the disk is a CDRW that was formatted
> with "cdrwtool -m 10236".)
>
> The problem is that pktcdvd opens the cd device in non-blocking mode
> when pktsetup is run, and doesn't close it again until pktsetup -d
> is run. The effect is that if you meanwhile open the cd device,
> blkdev.c:do_open() doesn't call bd_set_size() because bdev->bd_openers
> is non-zero.
>
> I don't know the correct way to fix this. Maybe adding bd_set_size()
> to sr.c:get_sectorsize() which already does set_capacity() would
> work.

Could be ... this is deep viro magic, though; I've added him to the Cc
list to get his input.

> > .. but you're ignoring the fact that if pktcdvd sets it wrong, then it
> > should be visible with the pre-commit kernel *also*.
>
> I can repeat this bug, both with and without the scsi patch that is
> claimed to make a difference, both with an external USB drive and an
> internal IDE drive.
>
> To repeat:
>
> 1. Start with an empty drive.
> 2. pktsetup 0 /dev/scd0
> 3. Insert a CD containing an isofs filesystem.
> 4. mount /dev/pktcdvd/0 /mnt/tmp
> 5. umount /mnt/tmp
> 6. Press the eject button.
> 7. Insert a DVD containing a non-writable filesystem.
> 8. mount /dev/scd0 /mnt/tmp
> 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> 10. If the DVD contains data beyond the physical size of a CD, you
> get I/O errors in the terminal, and dmesg reports lots of
> "attempt to access beyond end of device" errors.

Brilliant! I can confirm the reproduction of the bug too (that's with
the originally fingered commit reverted).

James

2008-01-06 14:05:19

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sat, 2008-01-05 at 19:43 -0800, Linus Torvalds wrote:
> This all still leaves the question unanswered why that commit
> 6f5391c283d7fdcf24bf40786ea79061919d1e1d changed any behaviour at
> all.
> Because the thing that Peter is describing has nothing to do with any
> low-level drivers what-so-ever.

It isn't even a secondary effect like I thought. This commit genuinely
didn't have anything to do with the bug, it was purely accidental. It
came about because if you look at the reporter's recipe to reproduce,
which all of us tried without success, it's missing several steps. To
get the bug, these steps must have been done somehow, but I bet by pure
chance they weren't when reverting
6f5391c283d7fdcf24bf40786ea79061919d1e1d which led to wrongly fingering
this commit.

Now, if only someone who understood the mechanics of what the commit was
doing tried to stop you reverting it we could have saved a lot of
trouble ...

James

2008-01-06 14:51:38

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sat, 2008-01-05 at 19:43 -0800, Linus Torvalds wrote:
> fs/block_dev.c | 1 +
> 1 files changed, 1 insertions(+), 0 deletions(-)
>
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 993f78c..6a20da9 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1191,6 +1191,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
> }
> if (bdev->bd_invalidated)
> rescan_partitions(bdev->bd_disk, bdev);
> + bd_inode->i_size = (loff_t)get_capacity(disk)<<9;
> }
> }
> bdev->bd_openers++;

Actually, I think that code is fine ... the block size shouldn't change
across positive values of bd_openers. I think the true fix is below:
pktcdvd shouldn't be mucking with the size of the underlying CD/DVD ...
it can change its own layered device capacity, certainly, but it
shouldn't be mucking with the capacity that was already set in the
sr_probe routine. I'm in two minds as to whether the set capacity on
the underlying device should be removed as well ... I think it should,
but it is harmless as the sr_probe will also reset it.

However, I'll defer to what Al wants to do.

James

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 3535ef8..10f3692 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2344,7 +2344,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write)

set_capacity(pd->disk, lba << 2);
set_capacity(pd->bdev->bd_disk, lba << 2);
- bd_set_size(pd->bdev, (loff_t)lba << 11);

q = bdev_get_queue(pd->bdev);
if (write) {

2008-01-06 14:52:16

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


* James Bottomley <[email protected]> wrote:

> > I can repeat this bug, both with and without the scsi patch that is
> > claimed to make a difference, both with an external USB drive and an
> > internal IDE drive.
> >
> > To repeat:
> >
> > 1. Start with an empty drive.
> > 2. pktsetup 0 /dev/scd0
> > 3. Insert a CD containing an isofs filesystem.
> > 4. mount /dev/pktcdvd/0 /mnt/tmp
> > 5. umount /mnt/tmp
> > 6. Press the eject button.
> > 7. Insert a DVD containing a non-writable filesystem.
> > 8. mount /dev/scd0 /mnt/tmp
> > 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> > 10. If the DVD contains data beyond the physical size of a CD, you
> > get I/O errors in the terminal, and dmesg reports lots of
> > "attempt to access beyond end of device" errors.
>
> Brilliant! I can confirm the reproduction of the bug too (that's with
> the originally fingered commit reverted).

may i point out the obvious at this stage? The thing that finally got
movement into this bug was ... :

exposure on lkml

The reproducer came to you via Peter Osterlund who has never authored a
single drivers/scsi/ commit before (according to git-log) and who (and
here i'm out on a limb guessing it) does not even follow
[email protected].

this bug was obscure and hidden on [email protected] for
_months_, (it is a rarely visited and rarely read mailing list) and
there was just not enough "critical mass" to get this issue fixed.

_THAT_ is the power of lkml. People who are not generally interested in
your subsystem come and help. There is extra noise, but it's manageable.

so may i at this point suggest that you as the SCSI maintainer start
reading SCSI bugreports on lkml and start participating in SCSI topics
there, without extra prompting? It _is_ an important aggregation mailing
list for development, just like -mm or the upstream kernel is an
aggregation point of all things Linux.

I believe the "I only read linux-scsi, please post bugs there" approach
is harmful. If lkml traffic is too big then i'd suggest to set up email
filters to separate out mails that have 'SCSI' in their subject line or
body. Fortunately it's a really easy key to filter on. [ Scheduler mails
are much harder to filter out :-/ ] In fact i'd suggest to do all SCSI
development on lkml. We've got one upstream kernel codebase, one git
stream of commits and hence we should use one lkml feed to discuss
things on.

Ingo

2008-01-06 15:02:24

by Peter Osterlund

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, 6 Jan 2008, James Bottomley wrote:

> On Sat, 2008-01-05 at 19:43 -0800, Linus Torvalds wrote:
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 993f78c..6a20da9 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1191,6 +1191,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
>> }
>> if (bdev->bd_invalidated)
>> rescan_partitions(bdev->bd_disk, bdev);
>> + bd_inode->i_size = (loff_t)get_capacity(disk)<<9;
>> }
>> }
>> bdev->bd_openers++;
>
> Actually, I think that code is fine ... the block size shouldn't change
> across positive values of bd_openers. I think the true fix is below:
> pktcdvd shouldn't be mucking with the size of the underlying CD/DVD ...
> it can change its own layered device capacity, certainly, but it
> shouldn't be mucking with the capacity that was already set in the
> sr_probe routine. I'm in two minds as to whether the set capacity on
> the underlying device should be removed as well ... I think it should,
> but it is harmless as the sr_probe will also reset it.
>
> However, I'll defer to what Al wants to do.
>
> James
>
> diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
> index 3535ef8..10f3692 100644
> --- a/drivers/block/pktcdvd.c
> +++ b/drivers/block/pktcdvd.c
> @@ -2344,7 +2344,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write)
>
> set_capacity(pd->disk, lba << 2);
> set_capacity(pd->bdev->bd_disk, lba << 2);
> - bd_set_size(pd->bdev, (loff_t)lba << 11);
>
> q = bdev_get_queue(pd->bdev);
> if (write) {

That code was added to fix a similar bug, see:

http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-08/4288.html

Maybe that fix was wrong and should have just set bd_inode->i_size instead
of calling bd_set_size().

--
Peter Osterlund - [email protected]
http://web.telia.com/~u89404340

2008-01-06 15:21:02

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 15:47 +0100, Ingo Molnar wrote:
> * James Bottomley <[email protected]> wrote:
>
> > > I can repeat this bug, both with and without the scsi patch that is
> > > claimed to make a difference, both with an external USB drive and an
> > > internal IDE drive.
> > >
> > > To repeat:
> > >
> > > 1. Start with an empty drive.
> > > 2. pktsetup 0 /dev/scd0
> > > 3. Insert a CD containing an isofs filesystem.
> > > 4. mount /dev/pktcdvd/0 /mnt/tmp
> > > 5. umount /mnt/tmp
> > > 6. Press the eject button.
> > > 7. Insert a DVD containing a non-writable filesystem.
> > > 8. mount /dev/scd0 /mnt/tmp
> > > 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> > > 10. If the DVD contains data beyond the physical size of a CD, you
> > > get I/O errors in the terminal, and dmesg reports lots of
> > > "attempt to access beyond end of device" errors.
> >
> > Brilliant! I can confirm the reproduction of the bug too (that's with
> > the originally fingered commit reverted).
>
> may i point out the obvious at this stage? The thing that finally got
> movement into this bug was ... :
>
> exposure on lkml

I won't disagree with that. That's why my philosophy is to try to force
all bug reports out of bugzilla and on to the relevant mailing list
because of the many eyes approach this engenders.

> The reproducer came to you via Peter Osterlund who has never authored a
> single drivers/scsi/ commit before (according to git-log) and who (and
> here i'm out on a limb guessing it) does not even follow
> [email protected].
>
> this bug was obscure and hidden on [email protected] for
> _months_, (it is a rarely visited and rarely read mailing list) and
> there was just not enough "critical mass" to get this issue fixed.

If I were you, I'd actually make a cursory effort to get my facts
straight before spouting off.

This bug was actually hidden in bugzilla for ages, where Matthew Wilcox
was trying to deal with it on his own. The first I heard of it (apart
from a linux-scsi question on 13 November, when regrettably, I was busy
with other things) was on 18 Dec when Natalie added me to the bugzilla
cc list. The first thing I did on that date was finger pktcdvd and add
Jens to the cc list ... however, since there was no mailing list thread
to follow he ended up asking for context which no-one provided.

The whole problem with this bug was generated precisely because it was
kept in bugzilla where too few people actually looked at it. You're the
one who annotated the bugzilla entries with trite little homilies asking
why there was no action *without* ever notifying any mailing list, I
might add.

The fault lies in our bug processing methodology. Bugzilla is a fine
tracking tool, but it's a bloody useless workflow one for actually
solving problems because, as you say, and I agree, the mailing lists are
where we produce the solutions.

> _THAT_ is the power of lkml. People who are not generally interested in
> your subsystem come and help. There is extra noise, but it's manageable.
>
> so may i at this point suggest that you as the SCSI maintainer start
> reading SCSI bugreports on lkml and start participating in SCSI topics
> there, without extra prompting? It _is_ an important aggregation mailing
> list for development, just like -mm or the upstream kernel is an
> aggregation point of all things Linux.
>
> I believe the "I only read linux-scsi, please post bugs there" approach
> is harmful. If lkml traffic is too big then i'd suggest to set up email
> filters to separate out mails that have 'SCSI' in their subject line or
> body. Fortunately it's a really easy key to filter on. [ Scheduler mails
> are much harder to filter out :-/ ] In fact i'd suggest to do all SCSI
> development on lkml. We've got one upstream kernel codebase, one git
> stream of commits and hence we should use one lkml feed to discuss
> things on.

Oh good grief ... we do add other mailing lists to the cc list (most
often lkml) when it becomes evident that it's not a SCSI problem. Most
bug reports actually start off going to a set of lists (including lkml)
anyway, so there's usually full context. You may love drinking from the
firehose ... I find it makes me want to pee a lot. Forcing your work
habits on everyone else isn't really a very community way of solving
anything.

James

2008-01-06 15:46:38

by Adrian Bunk

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 09:20:45AM -0600, James Bottomley wrote:
>
> On Sun, 2008-01-06 at 15:47 +0100, Ingo Molnar wrote:
> > * James Bottomley <[email protected]> wrote:
> >
> > > > I can repeat this bug, both with and without the scsi patch that is
> > > > claimed to make a difference, both with an external USB drive and an
> > > > internal IDE drive.
> > > >
> > > > To repeat:
> > > >
> > > > 1. Start with an empty drive.
> > > > 2. pktsetup 0 /dev/scd0
> > > > 3. Insert a CD containing an isofs filesystem.
> > > > 4. mount /dev/pktcdvd/0 /mnt/tmp
> > > > 5. umount /mnt/tmp
> > > > 6. Press the eject button.
> > > > 7. Insert a DVD containing a non-writable filesystem.
> > > > 8. mount /dev/scd0 /mnt/tmp
> > > > 9. find /mnt/tmp -type f -print0 | xargs -0 sha1sum >/dev/null
> > > > 10. If the DVD contains data beyond the physical size of a CD, you
> > > > get I/O errors in the terminal, and dmesg reports lots of
> > > > "attempt to access beyond end of device" errors.
> > >
> > > Brilliant! I can confirm the reproduction of the bug too (that's with
> > > the originally fingered commit reverted).
> >
> > may i point out the obvious at this stage? The thing that finally got
> > movement into this bug was ... :
> >
> > exposure on lkml
>
> I won't disagree with that. That's why my philosophy is to try to force
> all bug reports out of bugzilla and on to the relevant mailing list
> because of the many eyes approach this engenders.

The problem is that mailing lists are far too often equivalent to
/dev/null for many bug reports.

Tracking e.g. helps with not missing regressions and getting more of
them fixed.

And another of the advantages of using Bugzilla is that it gives us
numbers how bad we are in terms of introducing regressions and having
unfixed bugs, so developers are no longer able to tell we didn't have a
problem in this area...

> > The reproducer came to you via Peter Osterlund who has never authored a
> > single drivers/scsi/ commit before (according to git-log) and who (and
> > here i'm out on a limb guessing it) does not even follow
> > [email protected].
> >
> > this bug was obscure and hidden on [email protected] for
> > _months_, (it is a rarely visited and rarely read mailing list) and
> > there was just not enough "critical mass" to get this issue fixed.
>
> If I were you, I'd actually make a cursory effort to get my facts
> straight before spouting off.
>
> This bug was actually hidden in bugzilla for ages, where Matthew Wilcox
> was trying to deal with it on his own. The first I heard of it (apart
> from a linux-scsi question on 13 November, when regrettably, I was busy
> with other things) was on 18 Dec when Natalie added me to the bugzilla
> cc list. The first thing I did on that date was finger pktcdvd and add
> Jens to the cc list ... however, since there was no mailing list thread
> to follow he ended up asking for context which no-one provided.
>
> The whole problem with this bug was generated precisely because it was
> kept in bugzilla where too few people actually looked at it. You're the
> one who annotated the bugzilla entries with trite little homilies asking
> why there was no action *without* ever notifying any mailing list, I
> might add.
>
> The fault lies in our bug processing methodology. Bugzilla is a fine
> tracking tool, but it's a bloody useless workflow one for actually
> solving problems because, as you say, and I agree, the mailing lists are
> where we produce the solutions.
>...

Bugzilla for tracking and mailing lists for discussing are not mutually
exclusive.

What about asking the Bugzilla admins to set the default owner of new
SCSI bugs to [email protected]?

This way all SCSI bugs submitted in Bugzilla will automatically be
forwarded to the linux-scsi mailing list.

> James

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2008-01-06 16:00:48

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, 2008-01-06 at 17:45 +0200, Adrian Bunk wrote:
> Bugzilla for tracking and mailing lists for discussing are not mutually
> exclusive.

I didn't ever say they were. What I said was we needed the work flow on
the mailing lists.

> What about asking the Bugzilla admins to set the default owner of new
> SCSI bugs to [email protected]?
>
> This way all SCSI bugs submitted in Bugzilla will automatically be
> forwarded to the linux-scsi mailing list.

Well, I've only been asking for this for the last two years. If you can
actually make it happen, I'd be eternally grateful.

James

2008-01-06 16:12:36

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


* James Bottomley <[email protected]> wrote:

> > The reproducer came to you via Peter Osterlund who has never
> > authored a single drivers/scsi/ commit before (according to git-log)
> > and who (and here i'm out on a limb guessing it) does not even
> > follow [email protected].
> >
> > this bug was obscure and hidden on [email protected] for
> > _months_, (it is a rarely visited and rarely read mailing list) and
> > there was just not enough "critical mass" to get this issue fixed.
>
> If I were you, I'd actually make a cursory effort to get my facts
> straight before spouting off.
>
> This bug was actually hidden in bugzilla for ages, where Matthew
> Wilcox was trying to deal with it on his own. [...]

Huh? The bugzilla just tracked a bug reported to lkml. The very
description of the bugzilla says:

Subject : v2.6.24-rc2-409-g9418d5d: attempt to access beyond end of device
Submitter : Thomas Meyer <[email protected]>
References : http://lkml.org/lkml/2007/11/13/250

so no, it was evidently not "hidden in bugzilla for ages" - all the
important action happened on lkml.

> The whole problem with this bug was generated precisely because it was
> kept in bugzilla where too few people actually looked at it. You're
> the one who annotated the bugzilla entries with trite little homilies
> asking why there was no action *without* ever notifying any mailing
> list, I might add.

again, this bugzilla entry originated from lkml. I did ping the bugzilla
because i saw that the suspected commit's author was Cc:-ed already. Why
should every bug reporter and debugger be fully aware of the absolutely
SILLY little details and preferences of maintainers about how and whom
to report bugs?

YOU made the workflow fragile in the first place, by going away to
linux-scsi and ignoring lkml reports. Or in your own words, in the
bugzilla, comment #9:

http://bugzilla.kernel.org/show_bug.cgi?id=9370#c9

Reply-To: [email protected]

[...]
Erm, actually no ... this is the first I've heard of it, except as a
passing question from matthew. It's usually safe to assume if it's
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
not on linux-scsi I haven't seen it.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

that's fundamentally flawed. For testers there should be only one,
simple as possible rule:

"if you have a problem with the Linux kernel, then report it to lkml"

[ or report it to your distro or bugzilla.kernel.org, where it will be
propagated towards lkml by others. ]

not to "report it to one of the 100+ lists listed here - good luck
getting it right":

L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: http://lists.twibble.org/mailman/listinfo/dc395x/
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected] (subscribers-only)
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscription required)
L: [email protected], [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (in Japanese)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only, general discussion)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)
L: [email protected] (subscribers-only)
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected]
L: [email protected] (subscribers-only)

yes, there can be other lists and mails, so this isnt a rigid rule in
any way, but the _default_ is for maintainers of major subsystems to be
aware of all bugs related (or suspected to be related) to their
subsystems reported to lkml. Not to be dragged on to lkml kicking and
screaming ;-) lkml isnt just for the core kernel, it's for the _whole_
kernel.

Ingo

2008-01-06 16:20:05

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06 2008 at 5:43 +0200, Linus Torvalds <[email protected]> wrote:
>
>
> This all still leaves the question unanswered why that commit
> 6f5391c283d7fdcf24bf40786ea79061919d1e1d changed any behaviour at all.
> Because the thing that Peter is describing has nothing to do with any
> low-level drivers what-so-ever.
>
> Linus
>

James Matthew.
I have a (very) wild guess at what maybe have changed with the cmnd->done
patch:

Do you remember the effective loop in scsi_lib:scsi_end_request() where
if bufflen was smaller then original request size, do to truncation
of bufflen by ULD, then the remaining of the request is re-queued again
as a new scsi-command. Well I think that the old system would call
cmnd->done for every iteration, and the new system, since the done is
called by the block-Q, does not see the resubmit of the new command.

I have not followed all code path of the matter, but I know that sr does
alters bufflen in some cases.
All this is not a bug in itself, but it is a change in behavior that might
cause the current sr hack to fail.

Boaz

2008-01-06 16:48:11

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 18:19 +0200, Boaz Harrosh wrote:
> On Sun, Jan 06 2008 at 5:43 +0200, Linus Torvalds <[email protected]> wrote:
> >
> >
> > This all still leaves the question unanswered why that commit
> > 6f5391c283d7fdcf24bf40786ea79061919d1e1d changed any behaviour at all.
> > Because the thing that Peter is describing has nothing to do with any
> > low-level drivers what-so-ever.
> >
> > Linus
> >
>
> James Matthew.
> I have a (very) wild guess at what maybe have changed with the cmnd->done
> patch:
>
> Do you remember the effective loop in scsi_lib:scsi_end_request() where
> if bufflen was smaller then original request size, do to truncation
> of bufflen by ULD, then the remaining of the request is re-queued again
> as a new scsi-command. Well I think that the old system would call
> cmnd->done for every iteration, and the new system, since the done is
> called by the block-Q, does not see the resubmit of the new command.

Actually, this is cmnd->done, not req->done we're removing.
cmnd->done() isn't seen by the block layer; all its uses are in the SCSI
mid-layer.

> I have not followed all code path of the matter, but I know that sr does
> alters bufflen in some cases.
> All this is not a bug in itself, but it is a change in behavior that might
> cause the current sr hack to fail.

It's a good thought. You're right, the old code calls done for every
iteration. However, it calls it in scsi_finish_completion. The new
code will actually call drv->done() in that same spot for every
iteration as well.

The requeue is done via scsi_requeue_request which calls
blk_requeue_request, which resets the START flag and sends the command
right back through the system (including the prep function because
scsi_requeue_request unpreps the command), so even with the new code
we'll go back through all the same done paths.

James

2008-01-06 16:56:50

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 02:55:07PM +0100, Thomas Meyer wrote:
> This is the correct setup to trigger the bug. And the commit
> 6f5391c283d7fdcf24bf40786ea79061919d1e1d has nothing to do with this
> bug. It was bad luck that i couldn't trigger the bug with said commit
> reverted (in my tests, the second boot with the reverted commit missed
> the first mount/umount smaller cd step, so...)

Great. Linus, can you revert the revert now?

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-06 17:10:30

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 17:12 +0100, Ingo Molnar wrote:
> * James Bottomley <[email protected]> wrote:
>
> > > The reproducer came to you via Peter Osterlund who has never
> > > authored a single drivers/scsi/ commit before (according to git-log)
> > > and who (and here i'm out on a limb guessing it) does not even
> > > follow [email protected].
> > >
> > > this bug was obscure and hidden on [email protected] for
> > > _months_, (it is a rarely visited and rarely read mailing list) and
> > > there was just not enough "critical mass" to get this issue fixed.
> >
> > If I were you, I'd actually make a cursory effort to get my facts
> > straight before spouting off.
> >
> > This bug was actually hidden in bugzilla for ages, where Matthew
> > Wilcox was trying to deal with it on his own. [...]
>
> Huh? The bugzilla just tracked a bug reported to lkml. The very
> description of the bugzilla says:
>
> Subject : v2.6.24-rc2-409-g9418d5d: attempt to access beyond end of device
> Submitter : Thomas Meyer <[email protected]>
> References : http://lkml.org/lkml/2007/11/13/250
>
> so no, it was evidently not "hidden in bugzilla for ages" - all the
> important action happened on lkml.

... and your original accusation was "this bug was obscure and hidden on
[email protected] for _months_" which I was pointing out wasn't
true.

Even the original lkml report was obscured by sweeping the report into
bugzilla and forgetting about it, so in fact, no action happened, even
on lkml.

The history is that I was made aware of the bug on 18 Dec. I suggested
then it was a pktcdvd problem and actually cc'd the wrong maintainer ...
again an error which is compounded by following this single person
workflow bugzilla requires. I also told you in no uncertain terms that
it wasn't caused by the commit you were trying to blame, but that didn't
stop the crusade to affix blame and close the bug without doing proper
root cause analysis.

Can we stop it with the recriminations and blame shifting now. The
problem we need to solve is fixing our broken bug resolution workflow.

My suggestion (for actually the third time in various fora) is:

to get the best of both worlds, file a bugzilla and note the
bugid. Then email a complete report to the relevant list, but
add [BUG <bugid>] to the subject line and cc
[email protected] If you do this, bugzilla will
keep track of the entire discussion as it progresses and allow
those who track bugs through bugzilla to get a pretty accurate
idea of the status. You should never need to touch bugzilla
again once the initial bug report is filed: all future
information flow is via the mailing lists.

I don't give a toss what you recommend the default list to be ... use
lkml if you wish. There are a lot of people who will vector it back on
to linux-scsi and keep lkml in the cc.

I also think that bug reports need to be complete, so copy the
information from the mailing list, don't just give a URL ... one of the
other enforced breaks in workflow is that the URL in the original
bugzilla wasn't working when we all tried to look at the original email
on 18 Dec, that's why you get several comments asking for more
information and where the original thread is.

James

2008-01-06 17:12:16

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 09:20:45AM -0600, James Bottomley wrote:
> This bug was actually hidden in bugzilla for ages, where Matthew Wilcox
> was trying to deal with it on his own. The first I heard of it (apart
> from a linux-scsi question on 13 November, when regrettably, I was busy
> with other things) was on 18 Dec when Natalie added me to the bugzilla
> cc list. The first thing I did on that date was finger pktcdvd and add
> Jens to the cc list ... however, since there was no mailing list thread
> to follow he ended up asking for context which no-one provided.

Not true. I sent my response to the bug to linux-scsi and cc'd bugzilla.
Unfortunately, the bug reporter sent the information requested to bugzilla
instead of linux-scsi. So it got lost and overlooked.

If there's willingness to change, I'm willing to put some effort into
moving us to a bug tracking system that fits our workflow better than
bugzilla. But if that effort will be disregarded, then let me know now
so I don't waste my time.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-06 17:30:37

by Stefan Richter

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

Ingo Molnar wrote:
> If lkml traffic is too big then i'd suggest to set up email
> filters to separate out mails that have 'SCSI' in their subject line

Good idea. Minor flaw: If somebody forgets to Cc LSML, he might also
forget to put SCSI or scsi into the Subject.

> or body.

This yields false positives whenever a .config is posted.

Also, filtering based on message body contents is costlier than
filtering by headers. I for one use Sieve to sort mails into different
IMAP folders at my mail provider's server, and I think Sieve doesn't
offer tests for patterns in message bodies at all.
--
Stefan Richter
-=====-==--- ---= --==-
http://arcgraph.de/sr/

2008-01-06 17:36:37

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 10:11 -0700, Matthew Wilcox wrote:
> On Sun, Jan 06, 2008 at 09:20:45AM -0600, James Bottomley wrote:
> > This bug was actually hidden in bugzilla for ages, where Matthew Wilcox
> > was trying to deal with it on his own. The first I heard of it (apart
> > from a linux-scsi question on 13 November, when regrettably, I was busy
> > with other things) was on 18 Dec when Natalie added me to the bugzilla
> > cc list. The first thing I did on that date was finger pktcdvd and add
> > Jens to the cc list ... however, since there was no mailing list thread
> > to follow he ended up asking for context which no-one provided.
>
> Not true. I sent my response to the bug to linux-scsi and cc'd bugzilla.
> Unfortunately, the bug reporter sent the information requested to bugzilla
> instead of linux-scsi. So it got lost and overlooked.
>
> If there's willingness to change, I'm willing to put some effort into
> moving us to a bug tracking system that fits our workflow better than
> bugzilla. But if that effort will be disregarded, then let me know now
> so I don't waste my time.

Well, I'd say yes, certainly, and I'll support the effort ... but the
problem is that I'm not one of the powers that be who control our
current bugzilla ... that's the constituency we need to convince. As I
keep saying, just getting new SCSI bug reports tipped onto the SCSI
mailing list will give us about 90% of what we need, but I haven't even
managed to get that to happen.

James

2008-01-06 18:23:05

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Sun, 6 Jan 2008, James Bottomley wrote:
> > index 993f78c..6a20da9 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -1191,6 +1191,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
> > }
> > if (bdev->bd_invalidated)
> > rescan_partitions(bdev->bd_disk, bdev);
> > + bd_inode->i_size = (loff_t)get_capacity(disk)<<9;
> > }
> > }
> > bdev->bd_openers++;
>
> Actually, I think that code is fine ... the block size shouldn't change
> across positive values of bd_openers.

The block size should indeed not change, and that's why we must *not* do
bd_set_size() there (because that changes the block size too, not just the
size of the device).

But I would argue that if we have had a device invalidation event (ie
media change), then we should indeed change the actual *size* of the block
device as seen by anybody who opens the file again.

And yes, it will affect old openers of the same inode too (since the size
is in the inode, not the file descriptor), but considering that this
really should only happen for media change events, I think that's better
than what we used to have.

Now, we could have made it even more clear by moving the "i_size" setting
into the same "if (dbev->bd_invalidated)" conditional, but the thing is,
we only set bd_invalidated for devices that have minors, so things like
floppies etc that don't have partition support also don't have
bd_invalidated set (in effect, bd_invalidated really *is* just a flag for
the partitioning code, not for disk change in general).

That said:

> pktcdvd shouldn't be mucking with the size of the underlying CD/DVD ...

I'm not sure if it should be mucking with the size or not, but it
definitely shouldn't be mucking with the block-size, because that can
indeed cause huge problems.

So removing the bd_set_size() is almost certainly the right thing to do
(because it's always incorrect to do at an "inner" open), and the real
size should have already been set by the regular open.

But this should be tested by somebody who uses the dang thing. My personal
favourite for a real fix would actually be to always make the capacity of
the pktcdvd device match the capacity of the underlying device, ie the
diff would look something like the appended (untested as usual), but that
may be a bit too extensive a change.

But regardless, I think the i_size change makes sense - at least in some
form. It doesn't necessarily have to be the explicit i_size setting: I
also considered just changing the block_dev.c do_open() make bd_set_size()
_unconditionally_, and then move the "if (!bdev->bd_openers)" test into
bd_set_size(), so that people couldn't even change the blocksize by
mistake!

I'd still like to hear comments from Al in particular, if he has something
to say.

Linus
---
drivers/block/pktcdvd.c | 4 +---
1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 3535ef8..1b23681 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2342,9 +2342,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write)
goto out_unclaim;
}

- set_capacity(pd->disk, lba << 2);
- set_capacity(pd->bdev->bd_disk, lba << 2);
- bd_set_size(pd->bdev, (loff_t)lba << 11);
+ set_capacity(pd->disk, get_capacity(pd->bdev->bd_disk));

q = bdev_get_queue(pd->bdev);
if (write) {

2008-01-06 18:43:26

by Willy Tarreau

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 11:36:23AM -0600, James Bottomley wrote:
> On Sun, 2008-01-06 at 10:11 -0700, Matthew Wilcox wrote:
> > If there's willingness to change, I'm willing to put some effort into
> > moving us to a bug tracking system that fits our workflow better than
> > bugzilla. But if that effort will be disregarded, then let me know now
> > so I don't waste my time.
>
> Well, I'd say yes, certainly, and I'll support the effort ... but the
> problem is that I'm not one of the powers that be who control our
> current bugzilla ... that's the constituency we need to convince. As I
> keep saying, just getting new SCSI bug reports tipped onto the SCSI
> mailing list will give us about 90% of what we need, but I haven't even
> managed to get that to happen.

As long as the process will be that much complicated, it will never
correctly work. The primary requirement for a useful bug reporting
workflow is *not to put the burden on the reporter*.

I agree with Ingo here. How can a user know where to post ? He has
a problem with his Linux distro. He finally understands or gets told
that the strange numbers on the screen indicate a kernel oops and
that he must report it if he wants it to be fixed. He then googles
for how/where to report a bug, and the first reply says :

http://www.kernel.org/pub/linux/docs/lkml/reporting-bugs.html

in which you can read :

"If you are totally stumped as to whom to send the report, send
it to [email protected]".

So *this* is the official central entry point, like it or not. And
in fact it works, given the number of off-topic reports we get. It
proves that end users know how to report their problems there.

Other lists should be used when the problem is clearly qualified.
And most of the time, it's not up to the end user to qualify his
problem, but to *us*. Our mission is not to blindly write lines of
code, but to spend some time getting user feedback, and bug reports
are part of this feedback.

In my opinion, the most important reader contribution to LKML is
just reading bugs reports and redirecting them to the proper list
if obviously required. People do this all the time and it has
always worked.

In parallel, bug entries may be added into bugzilla, either by the
reporter once suggested to do so, or by the person redirecting him
to the proper list.

So a working workflow would look like this :

1) from: user
to: lkml
subject: help needed with my CD burner

2) from: any LKML reader
to: user
cc: lkml, linux-scsi
subject: re: help needed with my CD burner

Message forwarded to linux-scsi. You may accelerate resolution
by describing your problem in bugzilla [url, mail...]

=> user knows his problem is being considered (very important)
=> user is connected with the proper list and possibly with a
bugzilla entry. As long as the bug is not 100% sure relevant
to linux-scsi, lkml should remain CCed so that other readers
may occasionally spot the problem.

I would also like to make a parallel with how support works in
commercial products. Generally, you as the end user don't know
anything about the vendor's internal process. You don't even
know if you have an account on your vendor's support site (another
blocking factor of bugzilla BTW). So what you do is call any of
your contacts overthere, quickly describe your problem, and most
of the time he gives you all the indications required to report
a fine bug. And if he understands it will be too hard for you
(classically because of missing account), he will initiate it
by himself. At this point the bug is tracked and followed by the
appropriate persons.

LKML *is* the contact for any Linux Kernel problem or question.
As long as we will try to bypass it and create parallel ways,
it will only maintain confusion and lead to bugs getting dropped
with user frustration.

IMHO, the only missing indication in "reporting-bugs.html" above
is :

"if you don't get any reply within 2 days, surely your mail
has not been noticed, repost it, if possible with a more
appropriate subject".

We will *never* be able to educate newcomers, but we can (and must)
educate regular readers to help newcomers report bugs. If only 100
regular readers randomly forward 2 mails a month, those are 200 bugs
which get properly handled. Just don't forget to change your reply-to
to lkml if you don't want to get polluted by the discussion.

As to using bugzilla for bug tracking... Well, I agree that bug
tracking is important when you work on multiple problems at once.
But bugzilla should be the developer's tool, not the end user's.
That means that it should only be our job to create entries there
if end users find it too difficult, and that we should just invite
them to *try* to report there to save us some time.

Willy

2008-01-06 18:45:24

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Sun, 6 Jan 2008, Linus Torvalds wrote:
>
> That said:
>
> > pktcdvd shouldn't be mucking with the size of the underlying CD/DVD ...
>
> I'm not sure if it should be mucking with the size or not, but it
> definitely shouldn't be mucking with the block-size, because that can
> indeed cause huge problems.

Hmm. Looking closer, it's probably ok in that case, because it does do a
"bd_claim()" to make sure that it has exclusive access, so while there may
be other openers around, at least those other openers won't be filesystem
mounts or anything that opened with O_EXCL.

So changing the blocksize is probably ok in this case.

That still leaves the question whether pktcdvd *should* muck with the base
device at all, and I'm not at all sure about that. But I'm no longer sure
that the pktcdvd code is necessarily *clearly* broken, now it's more of a
"should it really do that?" thing.

So I still suspect that this:

> - set_capacity(pd->disk, lba << 2);
> - set_capacity(pd->bdev->bd_disk, lba << 2);
> - bd_set_size(pd->bdev, (loff_t)lba << 11);
> + set_capacity(pd->disk, get_capacity(pd->bdev->bd_disk));

is likely a good thing to do (in conjunction with my patch that made
i_size be "reliable" after an open), but there may be some reason why
pktcdvd really wants to control the size rather than be on the receiving
end of the size.

Peter, this is your decision. Apparently my one-liner fixes the immediate
bug (but it's not really a regression either - I think the i_size issue
has been there since pretty much day #1), and what pktcdvd does is
somewhat less critical an issue?

Linus

2008-01-06 18:55:19

by James Bottomley

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


On Sun, 2008-01-06 at 10:44 -0800, Linus Torvalds wrote:
>
> On Sun, 6 Jan 2008, Linus Torvalds wrote:
> >
> > That said:
> >
> > > pktcdvd shouldn't be mucking with the size of the underlying CD/DVD ...
> >
> > I'm not sure if it should be mucking with the size or not, but it
> > definitely shouldn't be mucking with the block-size, because that can
> > indeed cause huge problems.
>
> Hmm. Looking closer, it's probably ok in that case, because it does do a
> "bd_claim()" to make sure that it has exclusive access, so while there may
> be other openers around, at least those other openers won't be filesystem
> mounts or anything that opened with O_EXCL.
>
> So changing the blocksize is probably ok in this case.
>
> That still leaves the question whether pktcdvd *should* muck with the base
> device at all, and I'm not at all sure about that. But I'm no longer sure
> that the pktcdvd code is necessarily *clearly* broken, now it's more of a
> "should it really do that?" thing.
>
> So I still suspect that this:
>
> > - set_capacity(pd->disk, lba << 2);
> > - set_capacity(pd->bdev->bd_disk, lba << 2);
> > - bd_set_size(pd->bdev, (loff_t)lba << 11);
> > + set_capacity(pd->disk, get_capacity(pd->bdev->bd_disk));
>
> is likely a good thing to do (in conjunction with my patch that made
> i_size be "reliable" after an open), but there may be some reason why
> pktcdvd really wants to control the size rather than be on the receiving
> end of the size.
>
> Peter, this is your decision. Apparently my one-liner fixes the immediate
> bug (but it's not really a regression either - I think the i_size issue
> has been there since pretty much day #1), and what pktcdvd does is
> somewhat less critical an issue?

I think perhaps the true bug lies in the way we handle layered devices
like this. pktcdvd holds the underlying device open, so its refcount
never drops to zero. This is what causes the gendisk/block layer never
to update the sizes, and what lead to pktcdvd doing it instead.

However, what perhaps really needs to happen is that pktcdvd needs to
take over the media change events as well. That way it could see the
disk change and invalidate and reread its own setting of the block size
(and possibly re set the size of the underlying device).

I agree, though, this isn't a regression. It's probably obscure enough
in reproduction to warrant not holding up 2.6.24 --- especially as I
think the true fix will do small perturbations to a lot of subsystems.
If this were a product and I were the release manager, I'd be updating
the release notes with a note about having to break pktcdvd binding
across media changes to work around this bug and a promise to fix it in
the next release.

James

2008-01-06 18:57:13

by Adrian Bunk

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 07:34:02PM +0100, Willy Tarreau wrote:
>...
> As to using bugzilla for bug tracking... Well, I agree that bug
> tracking is important when you work on multiple problems at once.
> But bugzilla should be the developer's tool, not the end user's.
> That means that it should only be our job to create entries there
> if end users find it too difficult, and that we should just invite
> them to *try* to report there to save us some time.

Where does this opinion end users would find Bugzilla too difficult come
from? Many other projects use Bugzilla without big problems.

Is this just plain FUD or based on real reports from end users?
In the latter case, please give pointers to them so that whatever
problems exist can be fixed.

And different to LKML, you don't run into problems like majordomo
silently dropping your email because it contains HTML or a vCard.

> Willy

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2008-01-06 19:20:22

by Willy Tarreau

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 08:56:25PM +0200, Adrian Bunk wrote:
> On Sun, Jan 06, 2008 at 07:34:02PM +0100, Willy Tarreau wrote:
> >...
> > As to using bugzilla for bug tracking... Well, I agree that bug
> > tracking is important when you work on multiple problems at once.
> > But bugzilla should be the developer's tool, not the end user's.
> > That means that it should only be our job to create entries there
> > if end users find it too difficult, and that we should just invite
> > them to *try* to report there to save us some time.
>
> Where does this opinion end users would find Bugzilla too difficult come
> from? Many other projects use Bugzilla without big problems.
>
> Is this just plain FUD or based on real reports from end users?
> In the latter case, please give pointers to them so that whatever
> problems exist can be fixed.

I, as an end user of ntpd, have been harrassed to use it to get an
ntp bug reported "because by mail it would get lost". What complicated
an interface it is when you don't know it ! I remember I wanted to
attach a patch and it didn't even get through the first time. I did
it wrong. Blame me if you want, but an interface which need training
for proper use is certainly not for casual end users.

Also, it's very annoying to have to create an account somewhere, leaving
there one of the passwords you use on many other sites, just to help a
random developer fix a bug in his code. You quickly wonder if someone
else will report it and have more patience.

Another recent example: a coworker recently told me he installed the
latest beta from ubuntu, and that he had some problems with his WIFI
randomly hanging. I asked him if he filed a bug, he replied me "no,
it's too much boring, I'm not the only one with this hardware, others
have certainly already done it". When the release went out, he insisted
telling me he was right not filing the bug because indeed it was fixed !

We must accept that end users :
1) do not like creating accounts (remember or divulgate passwords,
and risk of getting spam)

2) do not know how to classify their problem, and are not even
sure it's a real bug. On the first page, when uncertain they
would probably click "Other". Adding doubt in the reporter's
mind is counter-productive as it will refrain him from being
precise about what he did to get the problem.

3) are not familiar with our vocabulary :
- "Tree" : mainline? mm? mjb? ac? what's that ?
- "Component" : Configuration? LSM? Modules? Other?

=> finally, I'm not sure I had to click "Other" in the first place,
I want to choose something else, I click "Back" and I get back
to the login page! Bye bye.

Also :
"No binary modules - NVIDIA users this means YOU!"
=> about half the reporters will wonder if they should stop here
or not. Most of those with an NVidia chipset and/or graphics
card will wonder, while the bug may still interest us.

At least, on the mailing list, there's no real rules, the mail will
be posted anyway. And if the user gets flamed, at least we have the
report.

> And different to LKML, you don't run into problems like majordomo
> silently dropping your email because it contains HTML or a vCard.

That's true. But do we have statistics on the ratio of client IP
addresses which go as far as the login page and which have finally
not filed their bug (either stopped at the login page or given up
after logging in) ? It should be very interesting...

Regards,
Willy

2008-01-06 19:58:43

by Adrian Bunk

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 08:10:44PM +0100, Willy Tarreau wrote:
> On Sun, Jan 06, 2008 at 08:56:25PM +0200, Adrian Bunk wrote:
> > On Sun, Jan 06, 2008 at 07:34:02PM +0100, Willy Tarreau wrote:
> > >...
> > > As to using bugzilla for bug tracking... Well, I agree that bug
> > > tracking is important when you work on multiple problems at once.
> > > But bugzilla should be the developer's tool, not the end user's.
> > > That means that it should only be our job to create entries there
> > > if end users find it too difficult, and that we should just invite
> > > them to *try* to report there to save us some time.
> >
> > Where does this opinion end users would find Bugzilla too difficult come
> > from? Many other projects use Bugzilla without big problems.
> >
> > Is this just plain FUD or based on real reports from end users?
> > In the latter case, please give pointers to them so that whatever
> > problems exist can be fixed.
>
> I, as an end user of ntpd, have been harrassed to use it to get an
> ntp bug reported "because by mail it would get lost". What complicated

Noone knows how many thousand bug reports have never reached lkml
since majordomo silently dropped them.

This is the price for having lkml relatively spam-free.

> an interface it is when you don't know it ! I remember I wanted to
> attach a patch and it didn't even get through the first time. I did
> it wrong. Blame me if you want, but an interface which need training
> for proper use is certainly not for casual end users.

What exactly is the problem with attaching files?
What is "it didn't even get through the first time" exactly?

> Also, it's very annoying to have to create an account somewhere, leaving
> there one of the passwords you use on many other sites, just to help a
> random developer fix a bug in his code. You quickly wonder if someone
> else will report it and have more patience.

If you already lack patience at this point, would you be willing to
bisect which requires more than a dozen kernel recompiles and reboots?

> Another recent example: a coworker recently told me he installed the
> latest beta from ubuntu, and that he had some problems with his WIFI
> randomly hanging. I asked him if he filed a bug, he replied me "no,
> it's too much boring, I'm not the only one with this hardware, others
> have certainly already done it". When the release went out, he insisted
> telling me he was right not filing the bug because indeed it was fixed !

He wouldn't have sent a bug report no matter how to report it.

> We must accept that end users :
> 1) do not like creating accounts (remember or divulgate passwords,
> and risk of getting spam)

Send _one_ email to lkml and you'll get forever spam to this address.

With one email addresses of mine exactly that happened.

> 2) do not know how to classify their problem, and are not even
> sure it's a real bug. On the first page, when uncertain they
> would probably click "Other". Adding doubt in the reporter's
> mind is counter-productive as it will refrain him from being
> precise about what he did to get the problem.

If the bug ends at Other/Other that's not a problem - this usually gets
fixed within a few hours.

> 3) are not familiar with our vocabulary :
> - "Tree" : mainline? mm? mjb? ac? what's that ?
> - "Component" : Configuration? LSM? Modules? Other?

Then let's improve that.

> => finally, I'm not sure I had to click "Other" in the first place,
> I want to choose something else, I click "Back" and I get back
> to the login page! Bye bye.

Works fine here.

Did you disable cookies?

> Also :
> "No binary modules - NVIDIA users this means YOU!"
> => about half the reporters will wonder if they should stop here
> or not. Most of those with an NVidia chipset and/or graphics
> card will wonder, while the bug may still interest us.

Then sugggest a better text.

> At least, on the mailing list, there's no real rules, the mail will
> be posted anyway. And if the user gets flamed, at least we have the
> report.
>...

If majordomo didn't drop it.

And if it didn't get ignored and forgotten.

> Regards,
> Willy

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2008-01-06 20:27:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


* Stefan Richter <[email protected]> wrote:

> Ingo Molnar wrote:
> > If lkml traffic is too big then i'd suggest to set up email
> > filters to separate out mails that have 'SCSI' in their subject line
>
> Good idea. Minor flaw: If somebody forgets to Cc LSML, he might also
> forget to put SCSI or scsi into the Subject.

yeah, that's very frequent, but i dont think it's a big issue:

> > or body.
>
> This yields false positives whenever a .config is posted.

these two patterns:

"^CONFIG_"
"^# "

will exclude .config-ish lines.

Ingo

2008-01-06 21:17:53

by Willy Tarreau

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 09:58:02PM +0200, Adrian Bunk wrote:
> On Sun, Jan 06, 2008 at 08:10:44PM +0100, Willy Tarreau wrote:
> > I, as an end user of ntpd, have been harrassed to use it to get an
> > ntp bug reported "because by mail it would get lost". What complicated
>
> Noone knows how many thousand bug reports have never reached lkml
> since majordomo silently dropped them.

Since none of my mails has been dropped yet, I think that the false
positives are rather rare. Yes, sometimes someone complains about a
mail getting repeatedly killed. But that's not *that* much frequent
IMO and people are already used to re-post when mailing their friends,
coworkers or customers. It's no different here. Still, I agree that
it is a problem.

> This is the price for having lkml relatively spam-free.

yes and I think it works fairly well.

> > an interface it is when you don't know it ! I remember I wanted to
> > attach a patch and it didn't even get through the first time. I did
> > it wrong. Blame me if you want, but an interface which need training
> > for proper use is certainly not for casual end users.
>
> What exactly is the problem with attaching files?
> What is "it didn't even get through the first time" exactly?

Well, it's quite old in my memories, it may be 2 years ago. IIRC, when
I wanted to attach files, I got brought to another page for this, and
once done there was some confusing indication about how to complete the
filing or get back to terminate the report. Sorry for not being much
precise on this one, it's too far ago.

> > Also, it's very annoying to have to create an account somewhere, leaving
> > there one of the passwords you use on many other sites, just to help a
> > random developer fix a bug in his code. You quickly wonder if someone
> > else will report it and have more patience.
>
> If you already lack patience at this point,

Well, it took me 2 minutes to send my patch to the maintainer by then,
he very politely told me that the only way was through bugzilla, and
then it took me half an hour if not more to create a bugzilla account,
find how to use it, attach the files and put a description in a text-area.

Also, I remember that the ongoing mail exchanges through bugzilla
systematically removed the mail history, which made it very hard to
follow a discussion, because all mails I received were almost single-lined
looking like "how did that happen ?" or "in what circumstances ?" without
any history... Maybe this is configurable though.

> would you be willing to
> bisect which requires more than a dozen kernel recompiles and reboots?

Certainly not! But I would like kernel people to become less egocentric
and understand that what they routinely do all the day appears very
complicated and time-consuming to many users, and that by imposing them
complex and/or costly methods to report bugs, they filter a lot of
reports out. Sure, there are still a bunch of them doing everything
up to and including the git bisects. But what percentage ?

People who encounter problems at work will not do that to start with,
because they cannot delay all their work to spend half a day building
kernels when their boss or customers are waiting for their work. Others
will report the problems they encounter at a customer's and will not
even be able to git-bisect because the customer's mail server is not
like a notebook they have everywhere with them and can reboot at will.
Some of them are more free of their time and will probably enjoy the
experience, but they are a minority IMHO.

If we had stats on the periods git bisects are run on, I suspect that
night and week-ends are the most frequent moments, simply because it's
when people have time.

IMHO, git bisect is excellent for kernel people. Not for random users.
They first have to install git, find free space, *clone the kernel tree*
and start discovering the beast.

> > Another recent example: a coworker recently told me he installed the
> > latest beta from ubuntu, and that he had some problems with his WIFI
> > randomly hanging. I asked him if he filed a bug, he replied me "no,
> > it's too much boring, I'm not the only one with this hardware, others
> > have certainly already done it". When the release went out, he insisted
> > telling me he was right not filing the bug because indeed it was fixed !
>
> He wouldn't have sent a bug report no matter how to report it.

I don't agree. It's a matter of effort vs expected advantage. Just
sending a 5-lines mail from work presents a lower entry barrier than
having to create an account and discover a new tool.

In fact, from the user's perspective, filing a kernel bug report is seen
as something annoying and useless, simply because the kernel is so much
used that someone else will file the same bug anyway. They act just like
microsoft users. Do you know anyone in your relatives who has *ever*
filed a bug to microsoft ? Probably zero. They passively wait for the
next patch, and just whine if their bug does not get magically fixed.

We must understand that our users who file bug reports are not doing this
*for them* anymore, they are offering us *presents for free*, because
someone tells them "report it before the release so that it gets fixed".
We must do everything to incitate them to do so. If the present becomes
even slightly annoying, we never get it. Have you noticed the number of
"me too" on the list ? Users find any sort of excuse for not having filed
a report in the first time, but are still willing to confirm another
one's bug. That's normal, they're just humans after all.

The ones making the most efforts are those with driver problems on rare
hardware, or those who encounter problems which look very specific to
their setups, because they know that nobody else will work on a fix if
they don't report the problem.

> > We must accept that end users :
> > 1) do not like creating accounts (remember or divulgate passwords,
> > and risk of getting spam)
>
> Send _one_ email to lkml and you'll get forever spam to this address.
> With one email addresses of mine exactly that happened.

That's true too. But given the number of people who randomly forward
stupidities by mails to lists of "friends" from their work address, I
think that getting their address spammed is not a problem for many of
them.

Oh and BTW, mail addresses entered in bugzilla are publicly readable
anyway. I've just randomly picked bug #1234 and the reporter and
participants may trivially be spammed.

> > 2) do not know how to classify their problem, and are not even
> > sure it's a real bug. On the first page, when uncertain they
> > would probably click "Other". Adding doubt in the reporter's
> > mind is counter-productive as it will refrain him from being
> > precise about what he did to get the problem.
>
> If the bug ends at Other/Other that's not a problem - this usually gets
> fixed within a few hours.

OK, but we should indicate it clearly at the very beginning :

"If you don't know which category your report belongs to, simply
select Other, and it will be qualified by a more skilled person"

> > 3) are not familiar with our vocabulary :
> > - "Tree" : mainline? mm? mjb? ac? what's that ?
> > - "Component" : Configuration? LSM? Modules? Other?
>
> Then let's improve that.
>
> > => finally, I'm not sure I had to click "Other" in the first place,
> > I want to choose something else, I click "Back" and I get back
> > to the login page! Bye bye.
>
> Works fine here.
>
> Did you disable cookies?

Not at all (I don't even think the web works well anymore without cookies
these days). I retried. In fact I was still authenticated it seems, I
think that it's just that the login page is poorly placed in the sequence
of pages. By clicking on "back", I intuitively expected to get back to
the selection of another category.

> > Also :
> > "No binary modules - NVIDIA users this means YOU!"
> > => about half the reporters will wonder if they should stop here
> > or not. Most of those with an NVidia chipset and/or graphics
> > card will wonder, while the bug may still interest us.
>
> Then sugggest a better text.

"Before reporting a bug, please ensure that you can reproduce the
problem without any binary module loaded (such as Nvidia's drivers,
ndiswrapper, etc...). Ensuring that a problem happens without any
third party drivers (which are not provided with the kernel) increases
your report's accuracy and avoids wasting developer's time. If you
are not sure, please check for [blacklist names here] by running
'lsmod|more' in a shell window. If you do not know how to run without
those drivers, please still file the report, it may still be considered
valuable, but you may be recontacted and guided to unload them."

> > At least, on the mailing list, there's no real rules, the mail will
> > be posted anyway. And if the user gets flamed, at least we have the
> > report.
> >...
>
> If majordomo didn't drop it.

You seem to have experienced a lot of trouble with LKML!

> And if it didn't get ignored and forgotten.

... by maintainers who deliberately refuse to read LKML ? :-)

Seriously, LKML is bad for *long term* tracking, as most people will
rotate their mailboxes once a month or week and old mails become dead
archives with no reminder. Something like bugzilla makes it possible
never to forget them. But the expensive work is still to get the bug
description there without discouraging newcomers.

Regards,
Willy

2008-01-06 22:26:18

by Adrian Bunk

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, Jan 06, 2008 at 10:08:13PM +0100, Willy Tarreau wrote:
> On Sun, Jan 06, 2008 at 09:58:02PM +0200, Adrian Bunk wrote:
> > On Sun, Jan 06, 2008 at 08:10:44PM +0100, Willy Tarreau wrote:
> > > I, as an end user of ntpd, have been harrassed to use it to get an
> > > ntp bug reported "because by mail it would get lost". What complicated
> >
> > Noone knows how many thousand bug reports have never reached lkml
> > since majordomo silently dropped them.
>
> Since none of my mails has been dropped yet, I think that the false
> positives are rather rare. Yes, sometimes someone complains about a
> mail getting repeatedly killed. But that's not *that* much frequent
> IMO and people are already used to re-post when mailing their friends,
> coworkers or customers. It's no different here. Still, I agree that
> it is a problem.

If someone works in a company where the default MUA setting is to also
attach the text as HTML and to add a vCard to all emails people might
try once to submit their bug report, not notice that it didn't arrive
on the list, and then simply give up.

> > This is the price for having lkml relatively spam-free.
>
> yes and I think it works fairly well.
>
> > > an interface it is when you don't know it ! I remember I wanted to
> > > attach a patch and it didn't even get through the first time. I did
> > > it wrong. Blame me if you want, but an interface which need training
> > > for proper use is certainly not for casual end users.
> >
> > What exactly is the problem with attaching files?
> > What is "it didn't even get through the first time" exactly?
>
> Well, it's quite old in my memories, it may be 2 years ago. IIRC, when
> I wanted to attach files, I got brought to another page for this, and
> once done there was some confusing indication about how to complete the
> filing or get back to terminate the report. Sorry for not being much
> precise on this one, it's too far ago.

Currently the page for attaching a file has a "Submit" button.

> > > Also, it's very annoying to have to create an account somewhere, leaving
> > > there one of the passwords you use on many other sites, just to help a
> > > random developer fix a bug in his code. You quickly wonder if someone
> > > else will report it and have more patience.
> >
> > If you already lack patience at this point,
>
> Well, it took me 2 minutes to send my patch to the maintainer by then,
> he very politely told me that the only way was through bugzilla, and
> then it took me half an hour if not more to create a bugzilla account,
> find how to use it, attach the files and put a description in a text-area.

People reporting bugs together with a patch to fix it are not the usual
case but an exception.

> Also, I remember that the ongoing mail exchanges through bugzilla
> systematically removed the mail history, which made it very hard to
> follow a discussion, because all mails I received were almost single-lined
> looking like "how did that happen ?" or "in what circumstances ?" without
> any history... Maybe this is configurable though.

This depends on how people answer in Bugzilla.

But an advantage of Bugzilla is that each email contains a link to the
Bugzilla bug containing all discussions in the bug.

> > would you be willing to
> > bisect which requires more than a dozen kernel recompiles and reboots?
>
> Certainly not! But I would like kernel people to become less egocentric
> and understand that what they routinely do all the day appears very
> complicated and time-consuming to many users, and that by imposing them
> complex and/or costly methods to report bugs, they filter a lot of
> reports out. Sure, there are still a bunch of them doing everything
> up to and including the git bisects. But what percentage ?

If you report a regression in the kernel and are not willing to bisect
the probability of the bug being resolved becomes _much_ smaller.

Partially due to this requiring much more developer time, but also
partially due to the fact that many regressions are undebuggable without
a bisection.

> People who encounter problems at work will not do that to start with,
> because they cannot delay all their work to spend half a day building
> kernels when their boss or customers are waiting for their work. Others
> will report the problems they encounter at a customer's and will not
> even be able to git-bisect because the customer's mail server is not
> like a notebook they have everywhere with them and can reboot at will.
> Some of them are more free of their time and will probably enjoy the
> experience, but they are a minority IMHO.

People tend to report bugs if and only they have no other choice (like
some workaround).

So when they report a bug they really need a fix for their problem.

And have you ever worked in a company that pays a seven digit amount of
Euros each year to Oracle for licences and support for their database?
I have. It's not that spending some man days on debugging or working
around one of the many regressions in the POS they ship to their paying
costumers was unusual.

But you might need the new release e.g. because the older release no
longer has security support or has another bug that is fixed in the
latest release, so your boss has no choice than assigning you for
as long as required at helping Oracle support to figure out what they
have broken this time.

> If we had stats on the periods git bisects are run on, I suspect that
> night and week-ends are the most frequent moments, simply because it's
> when people have time.
>
> IMHO, git bisect is excellent for kernel people. Not for random users.
> They first have to install git, find free space, *clone the kernel tree*
> and start discovering the beast.

It's good for finding what caused a regression.

If a user doesn't want to spend some time helping to find a problem he
experiences there's nothing we can do.

Where we can and should improve is to no longer scare people who do
report bugs and who are willing to spend some time on helping to debug
bugs away by keeping their bug reports unanswered.

> > > Another recent example: a coworker recently told me he installed the
> > > latest beta from ubuntu, and that he had some problems with his WIFI
> > > randomly hanging. I asked him if he filed a bug, he replied me "no,
> > > it's too much boring, I'm not the only one with this hardware, others
> > > have certainly already done it". When the release went out, he insisted
> > > telling me he was right not filing the bug because indeed it was fixed !
> >
> > He wouldn't have sent a bug report no matter how to report it.
>
> I don't agree. It's a matter of effort vs expected advantage. Just
> sending a 5-lines mail from work presents a lower entry barrier than
> having to create an account and discover a new tool.

And how would he react when he gets a request to bisect the bug?

> In fact, from the user's perspective, filing a kernel bug report is seen
> as something annoying and useless, simply because the kernel is so much
> used that someone else will file the same bug anyway. They act just like
> microsoft users. Do you know anyone in your relatives who has *ever*
> filed a bug to microsoft ? Probably zero. They passively wait for the
> next patch, and just whine if their bug does not get magically fixed.

That's neither our fault nor our problem.

Our problem are the people who whine because bugs they actually
reported stay unfixed.

> We must understand that our users who file bug reports are not doing this
> *for them* anymore, they are offering us *presents for free*, because
> someone tells them "report it before the release so that it gets fixed".
> We must do everything to incitate them to do so. If the present becomes
> even slightly annoying, we never get it. Have you noticed the number of
> "me too" on the list ? Users find any sort of excuse for not having filed
> a report in the first time, but are still willing to confirm another
> one's bug. That's normal, they're just humans after all.

If users don't report a bug they run into that's their problem.

> The ones making the most efforts are those with driver problems on rare
> hardware, or those who encounter problems which look very specific to
> their setups, because they know that nobody else will work on a fix if
> they don't report the problem.
>
> > > We must accept that end users :
> > > 1) do not like creating accounts (remember or divulgate passwords,
> > > and risk of getting spam)
> >
> > Send _one_ email to lkml and you'll get forever spam to this address.
> > With one email addresses of mine exactly that happened.
>
> That's true too. But given the number of people who randomly forward
> stupidities by mails to lists of "friends" from their work address, I
> think that getting their address spammed is not a problem for many of
> them.
>
> Oh and BTW, mail addresses entered in bugzilla are publicly readable
> anyway. I've just randomly picked bug #1234 and the reporter and
> participants may trivially be spammed.
>...

Sure, that's no different from email addresses in lkml archives...

> > And if it didn't get ignored and forgotten.
>
> ... by maintainers who deliberately refuse to read LKML ? :-)
>
> Seriously, LKML is bad for *long term* tracking, as most people will
> rotate their mailboxes once a month or week and old mails become dead
> archives with no reminder. Something like bugzilla makes it possible
> never to forget them. But the expensive work is still to get the bug
> description there without discouraging newcomers.

When the expensive part of bug reporting is pasting the bug report
somewhere the submitter most likely hasn't spent enough time on writing
a proper bugreport.

Bug reports are important contributions to development, but our
problems are not related to getting more bug reports but to coping with
the incoming bug reports.

> Regards,
> Willy

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2008-01-07 15:26:42

by John Stoffel

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


I'll agree with what Willy wrote here, Bugzilla is a pain to use, you
can't just dump an email into it and have it captured. I think we
should be looking at something more like 'WebRT' which is an *issue*
tracker software. But that too might be too heavy weight and too
noisy as well.

And suddenly it would starting putting ticket numbers onto all the
non-problem conversations we have here on lkml as well, which I'm not
sure we really want.

Does it mean that we needs to have a '[email protected]' email for
people to report bugs/problems/issues, while lkml remains (but is
copied for all '[email protected]' emails) primarily the developer point
of contact?

The question to me really revolves around how do you automate the
process in a transparent manner so that people don't have to change
much how they interact with lkml.

John

2008-01-07 19:05:53

by Stefan Richter

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

John Stoffel wrote:
> The question to me really revolves around how do you automate the
> process in a transparent manner so that people don't have to change
> much how they interact with lkml.

I think the more important questions are:
- Are there people who know how to get reports to developers?
(We can't train all potential reporters to get reports right.)
- Are there enough developers to work towards bug fixes?
The technical means to capture reports, let alone the bugtracking tools,
are of secondary importance.
--
Stefan Richter
-=====-==--- ---= --===
http://arcgraph.de/sr/

2008-01-07 20:00:18

by John Stoffel

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

>>>>> "Stefan" == Stefan Richter <[email protected]> writes:

Stefan> John Stoffel wrote:
>> The question to me really revolves around how do you automate the
>> process in a transparent manner so that people don't have to change
>> much how they interact with lkml.

Stefan> I think the more important questions are:
Stefan> - Are there people who know how to get reports to developers?
Stefan> (We can't train all potential reporters to get reports right.)

Sure, but telling them "email to [email protected]" isn't tough.

Stefan> - Are there enough developers to work towards bug fixes?

Nope, never enough.

Stefan> The technical means to capture reports, let alone the
Stefan> bugtracking tools, are of secondary importance.

The technical means are secondary, it's the human factors of how bug
reports are captured which is of prime concern. It's obvious people
aren't happy with bugzilla. Most, but not all, are happy with LKML,
but a dumb mailing list isn't ideal for tracking bug reports.

As a SysAdmin, I love being able to tell me user 'just email help' to
open a ticket. As a SysAdmin, I can then interact with the tickets
which are created and managed using 'RT' (http://www.fsck.org/rt) from
inside my email client without having to think about it too much.

Again, transparency to the end-user AND tothe developers is key. If
neither finds if easy to use, it won't be. So we need to come up with
a work flow which works with us (read you, I just read and contribute
bug reports myself... :-) and for us. But which never gets in the way
if at all possible.

John

2008-01-07 20:51:27

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Sun, 06 Jan 2008 22:08:13 +0100, Willy Tarreau said:

> even slightly annoying, we never get it. Have you noticed the number of
> "me too" on the list ? Users find any sort of excuse for not having filed
> a report in the first time, but are still willing to confirm another
> one's bug. That's normal, they're just humans after all.

Personally, I've lost count of the number of times I've *not* reported a
bug/oops just because of the "NVidia users this means you" statement, only
to have the exact same issue reported several weeks/months later by somebody
who's able to replicate it with an untainted kernel.

> 'lsmod|more' in a shell window. If you do not know how to run without
> those drivers, please still file the report, it may still be considered
> valuable, but you may be recontacted and guided to unload them."

Exactly.


Attachments:
(No filename) (226.00 B)

2008-01-07 21:34:48

by Alan

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

> Personally, I've lost count of the number of times I've *not* reported a
> bug/oops just because of the "NVidia users this means you" statement, only
> to have the exact same issue reported several weeks/months later by somebody
> who's able to replicate it with an untainted kernel.

And I've lost count of the number of bugs reported that turned out to be
the Nvidia modules.

Alan

2008-01-07 21:37:31

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

> Personally, I've lost count of the number of times I've *not* reported a
> bug/oops just because of the "NVidia users this means you" statement, only
> to have the exact same issue reported several weeks/months later by somebody
> who's able to replicate it with an untainted kernel.

If you can reproduce a bug reliably, you can reproduce it without the
nvidia module loaded.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-07 23:05:09

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Mon, 07 Jan 2008 14:37:17 MST, Matthew Wilcox said:
> If you can reproduce a bug reliably, you can reproduce it without the
> nvidia module loaded.

Theoretically, at least. Sometimes, in the real world, other constraints
enter into it...

You're welcome to stop by and figure out why (I've sunk multiple tens of hours
into trying already) the NVidia binary driver can handle the Gateway monitor
hanging off my docking station and the open-source one selects a non-functional
sync rate, or pay for a replacement (management doesn't want to shell out $$ to
replace a monitor that works just fine when the manufacturer's drivers/config
are installed - surprising, that). Or I could get behind the docking station,
unplug the ergonomic keyboard, the power brick, the trackball, and the cat-5,
set the laptop up on the desk, find some way to arrange the laptop, cat-5,
trackball, power brick, and keyboard so things are reasonable to use for a long
day of typing, and wait for it to *maybe* happen again, and then put everything
back when it's safe to use the monitor again. Oh, and the open source driver
has some *horrid* color-banding issues on the laptop's LCD, as well, you're
welcome to fix those while you're at it.. ;)

And no, I'm *not* just winging it with the laptop for the day - I have enough
carpal tunnel issues *with* an ergonomic keyboard and a trackball, I'm not
going to press my luck with the onboard keyboard and mouse.

Or I can just say "screw it" and not bother reporting any bugs that aren't
easily repeatable before GDM gets started.

Guess which one ends up happening in the real world?


Attachments:
(No filename) (226.00 B)

2008-01-07 23:19:40

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Mon, Jan 07, 2008 at 06:04:25PM -0500, [email protected] wrote:
> Theoretically, at least. Sometimes, in the real world, other constraints
> enter into it...

So you're saying that you can't find reliable ways to reproduce problems
on demand? Those are some of the lower quality bug reports, so I don't
think we're losing much by having you not report them.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-01-08 16:48:30

by Stefan Richter

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

Matthew Wilcox wrote:
> So you're saying that you can't find reliable ways to reproduce problems
> on demand? Those are some of the lower quality bug reports,

Or those are the more difficult problems.
--
Stefan Richter
-=====-==--- ---= -=---
http://arcgraph.de/sr/

2008-01-08 16:56:15

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


* James Bottomley <[email protected]> wrote:
>
> On Sun, 2008-01-06 at 17:12 +0100, Ingo Molnar wrote:
> > * James Bottomley <[email protected]> wrote:
> >
> > > > The reproducer came to you via Peter Osterlund who has never
> > > > authored a single drivers/scsi/ commit before (according to git-log)
> > > > and who (and here i'm out on a limb guessing it) does not even
> > > > follow [email protected].
> > > >
> > > > this bug was obscure and hidden on [email protected]
> > > > for _months_, (it is a rarely visited and rarely read mailing
> > > > list) and there was just not enough "critical mass" to get this
> > > > issue fixed.
> > >
> > > If I were you, I'd actually make a cursory effort to get my facts
> > > straight before spouting off.
> > >
> > > This bug was actually hidden in bugzilla for ages, where Matthew
> > > Wilcox was trying to deal with it on his own. [...]
> >
> > Huh? The bugzilla just tracked a bug reported to lkml. The very
> > description of the bugzilla says:
> >
> > Subject : v2.6.24-rc2-409-g9418d5d: attempt to access beyond end of device
> > Submitter : Thomas Meyer <[email protected]>
> > References : http://lkml.org/lkml/2007/11/13/250
> >
> > so no, it was evidently not "hidden in bugzilla for ages" - all the
> > important action happened on lkml.
>
> ... and your original accusation was "this bug was obscure and hidden
> on [email protected] for _months_" which I was pointing out
> wasn't true.

you are right - let me rephrase it as: "this issue was mainly hidden due
to the unhealthy ping-pong between lkml (which you said you didnt read),
linux-scsi and bugzilla".

> Even the original lkml report was obscured by sweeping the report into
> bugzilla and forgetting about it, so in fact, no action happened, even
> on lkml.

all the "action" already happened on the first day of reporting the bug.
(the wrong commit was identified, but that's besides the point - it all
sat inactive after that point. I pinged the bugzilla to get the lkml
discussion active again, not to debug it there.)

what got movement into it all again was the revert.

> Can we stop it with the recriminations and blame shifting now. [...]

what "blame shifting" ???

all i'm worried about here is the long latency for a bugfix which very
apparently (to me) happened due to the isolation of linux-scsi and the
resulting bug processing inefficiencies. Bugs happen and nobody is to be
"blamed" for the bug itself - but the bug processing flow was broken and
i've pointed that out. (If you see similar cases for code i maintain,
and if you can pinpoint the reason why you think it happened and how to
improve that, then please point it out to me as well.)

Ingo

2008-01-08 17:13:38

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"



On Tue, 8 Jan 2008, Stefan Richter wrote:

> Matthew Wilcox wrote:
> > So you're saying that you can't find reliable ways to reproduce problems
> > on demand? Those are some of the lower quality bug reports,
>
> Or those are the more difficult problems.

Indeed. If it's some race condition, or dependent on memory pressure at
just the right time, or a use-after-free that corrupts some memory that
normally nobody will even notice (it's freed, after all, and not
necessarily re-allocated), reproduction can be really very hard.

It's happily not exactly *common*, but it's certainly not unheard of
either, when you need to run some specific workload for hours to trigger
the bug - and then when it doesn't happen, you have to ask yourself: "was
I just lunky, punk?"

Some of those things also go away magically between kernel versions or
subtly different configurations. A use-after-free problem might be obvious
in one config, but then another configuration might change the size of a
structure, and suddenly the two kmalloc's that used to be in the same slab
(and made the problem more visible) end up in different slabs, and now you
suddenly cannot reproduce it with that particular load at all any more!

These things *are* fairly rare (most bugs by _far_ are of the trivial
stupid kind), but some of those things can stay around for a long time,
and it can take months of different people reporting similar problems
until somebody finally puts two and two together and sees the pattern.

When we get a good bug-reporter that is willing and able to reproduce and
test kernels, that's wonderful, but when we get some "background noise" of
bad bug-reports, that's usually good too - even if it's good only in the
long run (ie sometimes we just have to accept that the bug-report didn't
contain enough information for us to really do anythign about it, and just
let it be - and hope that future events will clarify things)

Linus

2008-01-08 20:02:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"


* Linus Torvalds <[email protected]> wrote:

> These things *are* fairly rare (most bugs by _far_ are of the trivial
> stupid kind), but some of those things can stay around for a long
> time, and it can take months of different people reporting similar
> problems until somebody finally puts two and two together and sees the
> pattern.

one common pattern i've noticed is bug dependency. In some areas we need
to fix a series of 2-3 increasingly less trivial bugs to get enough test
exposure, tester confidence and developer attention to trigger (and fix)
the _truly_ bad bugs.

That's why agressive regression elimination (and prevention) is so
important IMHO - trivial regressions can totally block testing of
certain areas of code, and with an agressive 90 days release schedule
every day counts.

Ingo

2008-01-09 04:01:57

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Mon, 07 Jan 2008 16:19:30 MST, Matthew Wilcox said:
> On Mon, Jan 07, 2008 at 06:04:25PM -0500, [email protected] wrote:
> > Theoretically, at least. Sometimes, in the real world, other constraints
> > enter into it...
>
> So you're saying that you can't find reliable ways to reproduce problems
> on demand? Those are some of the lower quality bug reports, so I don't
> think we're losing much by having you not report them.

I'm sure that *everybody* on this list would *love* to know how you find
a reliable way to reproduce all the bugs that start off with "after X days of
uptime". But when you're chasing what might be a race condition with a
very small timing hole, you may need an event to happen several million times
before the accumulated chance of hitting it becomes appreciable.


Attachments:
(No filename) (226.00 B)

2008-01-09 04:03:50

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Mon, 07 Jan 2008 16:19:30 MST, Matthew Wilcox said:

> So you're saying that you can't find reliable ways to reproduce problems
> on demand? Those are some of the lower quality bug reports, so I don't
> think we're losing much by having you not report them.

And in the next e-mail in my lkml folder we see:

On Mon, 07 Jan 2008 18:21:45 EST, Parag Warudkar said:
> BTW, I have so far tested 2.6.24-rc4/5/6/7 and 2.6.23.12 - all of
> which have this problem.
>
> Yesterday I went back to using 2.6.22.15 and after a day's uptime it
> has not reproduced with the same config.
>
> Time for git-bisect I suppose? (the only problem is that this takes
> anywhere between 20 minutes to 8 hrs to confirm reliably.)

Are you saying that we're not losing much if Parag says "screw it" and
doesn't report the problem?


Attachments:
(No filename) (226.00 B)

2008-01-09 04:11:49

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Tue, 08 Jan 2008 23:01:07 -0500 [email protected] wrote:

> On Mon, 07 Jan 2008 16:19:30 MST, Matthew Wilcox said:
> > On Mon, Jan 07, 2008 at 06:04:25PM -0500, [email protected] wrote:
> > > Theoretically, at least. Sometimes, in the real world, other constraints
> > > enter into it...
> >
> > So you're saying that you can't find reliable ways to reproduce problems
> > on demand? Those are some of the lower quality bug reports, so I don't
> > think we're losing much by having you not report them.
>
> I'm sure that *everybody* on this list would *love* to know how you find
> a reliable way to reproduce all the bugs that start off with "after X days of
> uptime". But when you're chasing what might be a race condition with a
> very small timing hole, you may need an event to happen several million times
> before the accumulated chance of hitting it becomes appreciable.
>

I must say that the number of bugs which actually go away when the user
stops using nvidia/fglrx/ndiswrapper/etc is a small minority.

And you can usually tell beforehand too: if the user reports bad_page
warnings or pte table scroggage or whatever and they're using nvidia I just
hit 'd'. But people who think that removing the nvidia driver will
magically fix that khubd-got-stuck-in-D-state bug are urinating up an
incline.


Facts:

- lots of people use nvidia/etc

- most bugs they report aren't caused by nvidia/etc

- we need lots of testers

draw you own conclusions.

2008-01-09 06:15:58

by Willy Tarreau

[permalink] [raw]
Subject: Re: [patch] scsi: revert "[SCSI] Get rid of scsi_cmnd->done"

On Tue, Jan 08, 2008 at 08:10:42PM -0800, Andrew Morton wrote:
[...]
> I must say that the number of bugs which actually go away when the user
> stops using nvidia/fglrx/ndiswrapper/etc is a small minority.

[...]
> But people who think that removing the nvidia driver will
> magically fix that khubd-got-stuck-in-D-state bug are urinating up an
> incline.
>
>
> Facts:
>
> - lots of people use nvidia/etc
>
> - most bugs they report aren't caused by nvidia/etc
>
> - we need lots of testers
>
> draw you own conclusions.

Thanks Andrew for this demonstration. At least now I know I'm not the only
one to think that. And no, I do not have any nvidia/etc. It's just that I
value their users' reports as much as the other ones just because otherwise
we would only track some elite's bugs, thus reducing the amount of information
we have to understand the circumstances under which it happens.

Cheers,
Willy