Hello:
One problem I see when trying to use a box with 128 SCSI disks
is that sd_mod sometimes refuses to load. Earlier kernels simply
oopsed when it happened, but that is fixed in 2.4.18. The root
of the evil is the enormous array sd[] that sd_init allocates.
Alan suggested to split the allocation, which is what I did.
Arjan said that it may be easier to use vmalloc, and sure it is.
However, I heard that vmalloc space is not too big, so it may
make sense to conserve it (especially on non-x86 32-bitters).
Does anyone care to give it a test run?
-- Pete
diff -ur -X dontdiff linux-2.4.19-pre4/drivers/block/ll_rw_blk.c linux-2.4.19-pre4-sd/drivers/block/ll_rw_blk.c
--- linux-2.4.19-pre4/drivers/block/ll_rw_blk.c Thu Mar 21 15:46:13 2002
+++ linux-2.4.19-pre4-sd/drivers/block/ll_rw_blk.c Fri Mar 22 11:42:01 2002
@@ -85,7 +85,7 @@
int * blk_size[MAX_BLKDEV];
/*
- * blksize_size contains the size of all block-devices:
+ * blksize_size contains the block size of all block-devices:
*
* blksize_size[MAJOR][MINOR]
*
diff -ur -X dontdiff linux-2.4.19-pre4/drivers/scsi/sd.c linux-2.4.19-pre4-sd/drivers/scsi/sd.c
--- linux-2.4.19-pre4/drivers/scsi/sd.c Thu Mar 21 15:46:21 2002
+++ linux-2.4.19-pre4-sd/drivers/scsi/sd.c Fri Mar 22 11:42:01 2002
@@ -65,8 +65,13 @@
* static const char RCSid[] = "$Header:";
*/
+/* system major --> sd_gendisks index */
+#define SD_MAJOR_IDX(i) (MAJOR(i) & SD_MAJOR_MASK)
+/* sd_gendisks index --> system major */
#define SD_MAJOR(i) (!(i) ? SCSI_DISK0_MAJOR : SCSI_DISK1_MAJOR-1+(i))
+#define SD_PARTITION(dev) ((SD_MAJOR_IDX(dev) << 8) | (MINOR(dev) & 255))
+
#define SCSI_DISKS_PER_MAJOR 16
#define SD_MAJOR_NUMBER(i) SD_MAJOR((i) >> 8)
#define SD_MINOR_NUMBER(i) ((i) & 255)
@@ -84,9 +89,8 @@
#define SD_TIMEOUT (30 * HZ)
#define SD_MOD_TIMEOUT (75 * HZ)
-struct hd_struct *sd;
-
static Scsi_Disk *rscsi_disks;
+static struct gendisk *sd_gendisks;
static int *sd_sizes;
static int *sd_blocksizes;
static int *sd_hardsizes; /* Hardware sector size */
@@ -195,7 +199,9 @@
if (put_user(diskinfo[0], &loc->heads) ||
put_user(diskinfo[1], &loc->sectors) ||
put_user(diskinfo[2], &loc->cylinders) ||
- put_user(sd[SD_PARTITION(inode->i_rdev)].start_sect, &loc->start))
+ put_user(sd_gendisks[SD_MAJOR_IDX(
+ inode->i_rdev)].part[MINOR(
+ inode->i_rdev)].start_sect, &loc->start))
return -EFAULT;
return 0;
}
@@ -226,7 +232,9 @@
if (put_user(diskinfo[0], &loc->heads) ||
put_user(diskinfo[1], &loc->sectors) ||
put_user(diskinfo[2], (unsigned int *) &loc->cylinders) ||
- put_user(sd[SD_PARTITION(inode->i_rdev)].start_sect, &loc->start))
+ put_user(sd_gendisks[SD_MAJOR_IDX(
+ inode->i_rdev)].part[MINOR(
+ inode->i_rdev)].start_sect, &loc->start))
return -EFAULT;
return 0;
}
@@ -286,30 +294,32 @@
static int sd_init_command(Scsi_Cmnd * SCpnt)
{
- int dev, devm, block, this_count;
+ int dev, block, this_count;
+ struct hd_struct *ppnt;
Scsi_Disk *dpnt;
#if CONFIG_SCSI_LOGGING
char nbuff[6];
#endif
- devm = SD_PARTITION(SCpnt->request.rq_dev);
+ ppnt = &sd_gendisks[SD_MAJOR_IDX(SCpnt->request.rq_dev)].part[MINOR(SCpnt->request.rq_dev)];
dev = DEVICE_NR(SCpnt->request.rq_dev);
block = SCpnt->request.sector;
this_count = SCpnt->request_bufflen >> 9;
- SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = %d, block = %d\n", devm, block));
+ SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = 0x%x, block = %d\n",
+ SCpnt->request.rq_dev, block));
dpnt = &rscsi_disks[dev];
- if (devm >= (sd_template.dev_max << 4) ||
+ if (dev >= sd_template.dev_max ||
!dpnt->device ||
!dpnt->device->online ||
- block + SCpnt->request.nr_sectors > sd[devm].nr_sects) {
+ block + SCpnt->request.nr_sectors > ppnt->nr_sects) {
SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n", SCpnt->request.nr_sectors));
SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt));
return 0;
}
- block += sd[devm].start_sect;
+ block += ppnt->start_sect;
if (dpnt->device->changed) {
/*
* quietly refuse to do anything to a changed disc until the changed
@@ -318,7 +328,7 @@
/* printk("SCSI disk has been changed. Prohibiting further I/O.\n"); */
return 0;
}
- SCSI_LOG_HLQUEUE(2, sd_devname(devm, nbuff));
+ SCSI_LOG_HLQUEUE(2, sd_devname(dev, nbuff));
SCSI_LOG_HLQUEUE(2, printk("%s : real dev = /dev/%d, block = %d\n",
nbuff, dev, block));
@@ -576,8 +586,6 @@
fops: &sd_fops,
};
-static struct gendisk *sd_gendisks = &sd_gendisk;
-
#define SD_GENDISK(i) sd_gendisks[(i) / SCSI_DISKS_PER_MAJOR]
/*
@@ -644,7 +652,9 @@
default:
break;
}
- error_sector -= sd[SD_PARTITION(SCpnt->request.rq_dev)].start_sect;
+ error_sector -= sd_gendisks[SD_MAJOR_IDX(
+ SCpnt->request.rq_dev)].part[MINOR(
+ SCpnt->request.rq_dev)].start_sect;
error_sector &= ~(block_sectors - 1);
good_sectors = error_sector - SCpnt->request.sector;
if (good_sectors < 0 || good_sectors >= this_count)
@@ -1146,23 +1156,12 @@
hardsect_size[SD_MAJOR(i)] = sd_hardsizes + i * (SCSI_DISKS_PER_MAJOR << 4);
max_sectors[SD_MAJOR(i)] = sd_max_sectors + i * (SCSI_DISKS_PER_MAJOR << 4);
}
- /*
- * FIXME: should unregister blksize_size, hardsect_size and max_sectors when
- * the module is unloaded.
- */
- sd = kmalloc((sd_template.dev_max << 4) *
- sizeof(struct hd_struct),
- GFP_ATOMIC);
- if (!sd)
- goto cleanup_sd;
- memset(sd, 0, (sd_template.dev_max << 4) * sizeof(struct hd_struct));
-
- if (N_USED_SD_MAJORS > 1)
- sd_gendisks = kmalloc(N_USED_SD_MAJORS * sizeof(struct gendisk), GFP_ATOMIC);
- if (!sd_gendisks)
- goto cleanup_sd_gendisks;
+
+ sd_gendisks = kmalloc(N_USED_SD_MAJORS * sizeof(struct gendisk), GFP_ATOMIC);
+ if (!sd_gendisks)
+ goto cleanup_sd_gendisks;
for (i = 0; i < N_USED_SD_MAJORS; i++) {
- sd_gendisks[i] = sd_gendisk;
+ sd_gendisks[i] = sd_gendisk; /* memcpy */
sd_gendisks[i].de_arr = kmalloc (SCSI_DISKS_PER_MAJOR * sizeof *sd_gendisks[i].de_arr,
GFP_ATOMIC);
if (!sd_gendisks[i].de_arr)
@@ -1179,7 +1178,11 @@
sd_gendisks[i].major_name = "sd";
sd_gendisks[i].minor_shift = 4;
sd_gendisks[i].max_p = 1 << 4;
- sd_gendisks[i].part = sd + (i * SCSI_DISKS_PER_MAJOR << 4);
+ sd_gendisks[i].part = kmalloc((SCSI_DISKS_PER_MAJOR << 4) * sizeof(struct hd_struct),
+ GFP_ATOMIC);
+ if (!sd_gendisks[i].part)
+ goto cleanup_gendisks_part;
+ memset(sd_gendisks[i].part, 0, (SCSI_DISKS_PER_MAJOR << 4) * sizeof(struct hd_struct));
sd_gendisks[i].sizes = sd_sizes + (i * SCSI_DISKS_PER_MAJOR << 4);
sd_gendisks[i].nr_real = 0;
sd_gendisks[i].real_devices =
@@ -1188,18 +1191,19 @@
return 0;
+cleanup_gendisks_part:
+ kfree(sd_gendisks[i].flags);
cleanup_gendisks_flags:
kfree(sd_gendisks[i].de_arr);
cleanup_gendisks_de_arr:
while (--i >= 0 ) {
kfree(sd_gendisks[i].de_arr);
kfree(sd_gendisks[i].flags);
+ kfree(sd_gendisks[i].part);
}
- if (sd_gendisks != &sd_gendisk)
- kfree(sd_gendisks);
+ kfree(sd_gendisks);
+ sd_gendisks = NULL;
cleanup_sd_gendisks:
- kfree(sd);
-cleanup_sd:
kfree(sd_max_sectors);
cleanup_max_sectors:
kfree(sd_hardsizes);
@@ -1320,6 +1324,7 @@
*/
int revalidate_scsidisk(kdev_t dev, int maxusage)
{
+ struct gendisk *sdgd;
int target;
int max_p;
int start;
@@ -1333,14 +1338,15 @@
}
DEVICE_BUSY = 1;
- max_p = sd_gendisks->max_p;
- start = target << sd_gendisks->minor_shift;
+ sdgd = &SD_GENDISK(target);
+ max_p = sd_gendisk.max_p;
+ start = target << sd_gendisk.minor_shift;
for (i = max_p - 1; i >= 0; i--) {
int index = start + i;
invalidate_device(MKDEV_SD_PARTITION(index), 1);
- sd_gendisks->part[index].start_sect = 0;
- sd_gendisks->part[index].nr_sects = 0;
+ sdgd->part[SD_MINOR_NUMBER(index)].start_sect = 0;
+ sdgd->part[SD_MINOR_NUMBER(index)].nr_sects = 0;
/*
* Reset the blocksize for everything so that we can read
* the partition table. Technically we will determine the
@@ -1372,6 +1378,7 @@
static void sd_detach(Scsi_Device * SDp)
{
Scsi_Disk *dpnt;
+ struct gendisk *sdgd;
int i, j;
int max_p;
int start;
@@ -1384,17 +1391,18 @@
/* If we are disconnecting a disk driver, sync and invalidate
* everything */
+ sdgd = &SD_GENDISK(i);
max_p = sd_gendisk.max_p;
start = i << sd_gendisk.minor_shift;
for (j = max_p - 1; j >= 0; j--) {
int index = start + j;
invalidate_device(MKDEV_SD_PARTITION(index), 1);
- sd_gendisks->part[index].start_sect = 0;
- sd_gendisks->part[index].nr_sects = 0;
+ sdgd->part[SD_MINOR_NUMBER(index)].start_sect = 0;
+ sdgd->part[SD_MINOR_NUMBER(index)].nr_sects = 0;
sd_sizes[index] = 0;
}
- devfs_register_partitions (&SD_GENDISK (i),
+ devfs_register_partitions (sdgd,
SD_MINOR_NUMBER (start), 1);
/* unregister_disk() */
dpnt->has_part_table = 0;
@@ -1430,16 +1438,22 @@
kfree(sd_sizes);
kfree(sd_blocksizes);
kfree(sd_hardsizes);
- kfree((char *) sd);
+ for (i = 0; i < N_USED_SD_MAJORS; i++) {
+#if 0 /* XXX aren't we forgetting to deallocate something? */
+ kfree(sd_gendisks[i].de_arr);
+ kfree(sd_gendisks[i].flags);
+#endif
+ kfree(sd_gendisks[i].part);
+ }
}
for (i = 0; i < N_USED_SD_MAJORS; i++) {
del_gendisk(&sd_gendisks[i]);
- blk_size[SD_MAJOR(i)] = NULL;
+ blk_size[SD_MAJOR(i)] = NULL; /* XXX blksize_size actually? */
hardsect_size[SD_MAJOR(i)] = NULL;
read_ahead[SD_MAJOR(i)] = 0;
}
sd_template.dev_max = 0;
- if (sd_gendisks != &sd_gendisk)
+ if (sd_gendisks != NULL) /* kfree tests for 0, but leave explicit */
kfree(sd_gendisks);
}
diff -ur -X dontdiff linux-2.4.19-pre4/drivers/scsi/sd.h linux-2.4.19-pre4-sd/drivers/scsi/sd.h
--- linux-2.4.19-pre4/drivers/scsi/sd.h Thu Nov 22 11:49:15 2001
+++ linux-2.4.19-pre4-sd/drivers/scsi/sd.h Fri Mar 22 11:42:01 2002
@@ -23,8 +23,6 @@
#include <linux/genhd.h>
#endif
-extern struct hd_struct *sd;
-
typedef struct scsi_disk {
unsigned capacity; /* size in blocks */
Scsi_Device *device;
@@ -45,7 +43,6 @@
#define N_SD_MAJORS 8
#define SD_MAJOR_MASK (N_SD_MAJORS - 1)
-#define SD_PARTITION(i) (((MAJOR(i) & SD_MAJOR_MASK) << 8) | (MINOR(i) & 255))
#endif
Pete Zaitcev wrote:
> Hello:
>
> One problem I see when trying to use a box with 128 SCSI disks
> is that sd_mod sometimes refuses to load. Earlier kernels simply
> oopsed when it happened, but that is fixed in 2.4.18. The root
> of the evil is the enormous array sd[] that sd_init allocates.
> Alan suggested to split the allocation, which is what I did.
>
> Arjan said that it may be easier to use vmalloc, and sure it is.
> However, I heard that vmalloc space is not too big, so it may
> make sense to conserve it (especially on non-x86 32-bitters).
kmalloc is spare - the vmalloc space is *HUUUUUGE*.
(The v stands for virtual as in virtual memmory...)
In article <[email protected]> you wrote:
> Pete Zaitcev wrote:
>> Hello:
>>
>> One problem I see when trying to use a box with 128 SCSI disks
>> is that sd_mod sometimes refuses to load. Earlier kernels simply
>> oopsed when it happened, but that is fixed in 2.4.18. The root
>> of the evil is the enormous array sd[] that sd_init allocates.
>> Alan suggested to split the allocation, which is what I did.
>>
>> Arjan said that it may be easier to use vmalloc, and sure it is.
>> However, I heard that vmalloc space is not too big, so it may
>> make sense to conserve it (especially on non-x86 32-bitters).
>
> kmalloc is spare - the vmalloc space is *HUUUUUGE*.
64Mb (effective usable size) is HUGE. sure. but you share it
with all other vmalloc users. I agree with Pete that kmalloc is the
superior solution; I mentioned vmalloc to him before because that would
have been the minimal fix; Pete decided to fix it for real...
(also vmalloc is also rather slow for hotpaths due to tlb effects)
Pete Zaitcev wrote:
>
> Hello:
>
> One problem I see when trying to use a box with 128 SCSI disks
> is that sd_mod sometimes refuses to load. Earlier kernels simply
> oopsed when it happened, but that is fixed in 2.4.18. The root
> of the evil is the enormous array sd[] that sd_init allocates.
> Alan suggested to split the allocation, which is what I did.
Pete,
I haven't looked too closely at your patch, but I
can recount a little history: all the upper level scsi
drivers used to have an "EXTRA_DEVS" config option.
This put an upper limit on the number of new devices
that could be attached _after_ the upper level driver
was loaded. With help from Eric Y. we got rid of those
in the sg and st drivers using another level of
indirection.
So the only thing that is now contiguous is an array of
pointers (to device state structures). A driver-scope
read-write lock protects that structure. The sg
driver does a small overallocation on this array
(#define SG_DEV_ARR_LUMP 6). If new devices are
registered after driver initialization and the device
pointer array is not big enough then a write lock is
taken and the pointers moved over into a larger array.
By using the private_data member of the file structure,
most reads of that array can be avoided (otherwise
a read_lock is taken).
There have been no reported errors with this approach
during the lk 2.4 series. A patched sg driver (together
with Richard Gooch's sd-many patch) has been able to
address over 300 (similated) disks without noticeable
memory problems on a modestly-sized box.
I believe that it was Eric's intention to implement the
same solution in sd. The generic disk stuff and the
partitions are a complicating factor.
All those parallel arrays set up by sd_init (e.g.
rscsi_disks[], sd_sizes[], sd_blocksizes[],
sd_hardsizes[], sd_max_sectors[] and sd[] are a mess.
It is false economy to do the number of index
operations that sd does, my guess is that 90% of them
are redundant. Couldn't the sd driver just
have a device structure that contains an (16 element)
array of pointers to partition structures?
BTW. It is probably worth looking at the sd-many patch
as it must have been faced with a similar problem.
Doug Gilbert
> Arjan said that it may be easier to use vmalloc, and sure it is.
> However, I heard that vmalloc space is not too big, so it may
> make sense to conserve it (especially on non-x86 32-bitters).
>
> Does anyone care to give it a test run?
>
> -- Pete
>
> diff -ur -X dontdiff linux-2.4.19-pre4/drivers/block/ll_rw_blk.c linux-2.4.19-pre4-sd/drivers/block/ll_rw_blk.c
> --- linux-2.4.19-pre4/drivers/block/ll_rw_blk.c Thu Mar 21 15:46:13 2002
> +++ linux-2.4.19-pre4-sd/drivers/block/ll_rw_blk.c Fri Mar 22 11:42:01 2002
> @@ -85,7 +85,7 @@
> int * blk_size[MAX_BLKDEV];
>
> /*
> - * blksize_size contains the size of all block-devices:
> + * blksize_size contains the block size of all block-devices:
> *
> * blksize_size[MAJOR][MINOR]
> *
> diff -ur -X dontdiff linux-2.4.19-pre4/drivers/scsi/sd.c linux-2.4.19-pre4-sd/drivers/scsi/sd.c
> --- linux-2.4.19-pre4/drivers/scsi/sd.c Thu Mar 21 15:46:21 2002
> +++ linux-2.4.19-pre4-sd/drivers/scsi/sd.c Fri Mar 22 11:42:01 2002
> @@ -65,8 +65,13 @@
> * static const char RCSid[] = "$Header:";
> */
>
> +/* system major --> sd_gendisks index */
> +#define SD_MAJOR_IDX(i) (MAJOR(i) & SD_MAJOR_MASK)
> +/* sd_gendisks index --> system major */
> #define SD_MAJOR(i) (!(i) ? SCSI_DISK0_MAJOR : SCSI_DISK1_MAJOR-1+(i))
>
> +#define SD_PARTITION(dev) ((SD_MAJOR_IDX(dev) << 8) | (MINOR(dev) & 255))
> +
> #define SCSI_DISKS_PER_MAJOR 16
> #define SD_MAJOR_NUMBER(i) SD_MAJOR((i) >> 8)
> #define SD_MINOR_NUMBER(i) ((i) & 255)
> @@ -84,9 +89,8 @@
> #define SD_TIMEOUT (30 * HZ)
> #define SD_MOD_TIMEOUT (75 * HZ)
>
> -struct hd_struct *sd;
> -
> static Scsi_Disk *rscsi_disks;
> +static struct gendisk *sd_gendisks;
> static int *sd_sizes;
> static int *sd_blocksizes;
> static int *sd_hardsizes; /* Hardware sector size */
> @@ -195,7 +199,9 @@
> if (put_user(diskinfo[0], &loc->heads) ||
> put_user(diskinfo[1], &loc->sectors) ||
> put_user(diskinfo[2], &loc->cylinders) ||
> - put_user(sd[SD_PARTITION(inode->i_rdev)].start_sect, &loc->start))
> + put_user(sd_gendisks[SD_MAJOR_IDX(
> + inode->i_rdev)].part[MINOR(
> + inode->i_rdev)].start_sect, &loc->start))
> return -EFAULT;
> return 0;
> }
> @@ -226,7 +232,9 @@
> if (put_user(diskinfo[0], &loc->heads) ||
> put_user(diskinfo[1], &loc->sectors) ||
> put_user(diskinfo[2], (unsigned int *) &loc->cylinders) ||
> - put_user(sd[SD_PARTITION(inode->i_rdev)].start_sect, &loc->start))
> + put_user(sd_gendisks[SD_MAJOR_IDX(
> + inode->i_rdev)].part[MINOR(
> + inode->i_rdev)].start_sect, &loc->start))
> return -EFAULT;
> return 0;
> }
> @@ -286,30 +294,32 @@
>
> static int sd_init_command(Scsi_Cmnd * SCpnt)
> {
> - int dev, devm, block, this_count;
> + int dev, block, this_count;
> + struct hd_struct *ppnt;
> Scsi_Disk *dpnt;
> #if CONFIG_SCSI_LOGGING
> char nbuff[6];
> #endif
>
> - devm = SD_PARTITION(SCpnt->request.rq_dev);
> + ppnt = &sd_gendisks[SD_MAJOR_IDX(SCpnt->request.rq_dev)].part[MINOR(SCpnt->request.rq_dev)];
> dev = DEVICE_NR(SCpnt->request.rq_dev);
>
> block = SCpnt->request.sector;
> this_count = SCpnt->request_bufflen >> 9;
>
> - SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = %d, block = %d\n", devm, block));
> + SCSI_LOG_HLQUEUE(1, printk("Doing sd request, dev = 0x%x, block = %d\n",
> + SCpnt->request.rq_dev, block));
>
> dpnt = &rscsi_disks[dev];
> - if (devm >= (sd_template.dev_max << 4) ||
> + if (dev >= sd_template.dev_max ||
> !dpnt->device ||
> !dpnt->device->online ||
> - block + SCpnt->request.nr_sectors > sd[devm].nr_sects) {
> + block + SCpnt->request.nr_sectors > ppnt->nr_sects) {
> SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n", SCpnt->request.nr_sectors));
> SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt));
> return 0;
> }
> - block += sd[devm].start_sect;
> + block += ppnt->start_sect;
> if (dpnt->device->changed) {
> /*
> * quietly refuse to do anything to a changed disc until the changed
> @@ -318,7 +328,7 @@
> /* printk("SCSI disk has been changed. Prohibiting further I/O.\n"); */
> return 0;
> }
> - SCSI_LOG_HLQUEUE(2, sd_devname(devm, nbuff));
> + SCSI_LOG_HLQUEUE(2, sd_devname(dev, nbuff));
> SCSI_LOG_HLQUEUE(2, printk("%s : real dev = /dev/%d, block = %d\n",
> nbuff, dev, block));
>
> @@ -576,8 +586,6 @@
> fops: &sd_fops,
> };
>
> -static struct gendisk *sd_gendisks = &sd_gendisk;
> -
> #define SD_GENDISK(i) sd_gendisks[(i) / SCSI_DISKS_PER_MAJOR]
>
> /*
> @@ -644,7 +652,9 @@
> default:
> break;
> }
> - error_sector -= sd[SD_PARTITION(SCpnt->request.rq_dev)].start_sect;
> + error_sector -= sd_gendisks[SD_MAJOR_IDX(
> + SCpnt->request.rq_dev)].part[MINOR(
> + SCpnt->request.rq_dev)].start_sect;
> error_sector &= ~(block_sectors - 1);
> good_sectors = error_sector - SCpnt->request.sector;
> if (good_sectors < 0 || good_sectors >= this_count)
> @@ -1146,23 +1156,12 @@
> hardsect_size[SD_MAJOR(i)] = sd_hardsizes + i * (SCSI_DISKS_PER_MAJOR << 4);
> max_sectors[SD_MAJOR(i)] = sd_max_sectors + i * (SCSI_DISKS_PER_MAJOR << 4);
> }
> - /*
> - * FIXME: should unregister blksize_size, hardsect_size and max_sectors when
> - * the module is unloaded.
> - */
> - sd = kmalloc((sd_template.dev_max << 4) *
> - sizeof(struct hd_struct),
> - GFP_ATOMIC);
> - if (!sd)
> - goto cleanup_sd;
> - memset(sd, 0, (sd_template.dev_max << 4) * sizeof(struct hd_struct));
> -
> - if (N_USED_SD_MAJORS > 1)
> - sd_gendisks = kmalloc(N_USED_SD_MAJORS * sizeof(struct gendisk), GFP_ATOMIC);
> - if (!sd_gendisks)
> - goto cleanup_sd_gendisks;
> +
> + sd_gendisks = kmalloc(N_USED_SD_MAJORS * sizeof(struct gendisk), GFP_ATOMIC);
> + if (!sd_gendisks)
> + goto cleanup_sd_gendisks;
> for (i = 0; i < N_USED_SD_MAJORS; i++) {
> - sd_gendisks[i] = sd_gendisk;
> + sd_gendisks[i] = sd_gendisk; /* memcpy */
> sd_gendisks[i].de_arr = kmalloc (SCSI_DISKS_PER_MAJOR * sizeof *sd_gendisks[i].de_arr,
> GFP_ATOMIC);
> if (!sd_gendisks[i].de_arr)
> @@ -1179,7 +1178,11 @@
> sd_gendisks[i].major_name = "sd";
> sd_gendisks[i].minor_shift = 4;
> sd_gendisks[i].max_p = 1 << 4;
> - sd_gendisks[i].part = sd + (i * SCSI_DISKS_PER_MAJOR << 4);
> + sd_gendisks[i].part = kmalloc((SCSI_DISKS_PER_MAJOR << 4) * sizeof(struct hd_struct),
> + GFP_ATOMIC);
> + if (!sd_gendisks[i].part)
> + goto cleanup_gendisks_part;
> + memset(sd_gendisks[i].part, 0, (SCSI_DISKS_PER_MAJOR << 4) * sizeof(struct hd_struct));
> sd_gendisks[i].sizes = sd_sizes + (i * SCSI_DISKS_PER_MAJOR << 4);
> sd_gendisks[i].nr_real = 0;
> sd_gendisks[i].real_devices =
> @@ -1188,18 +1191,19 @@
>
> return 0;
>
> +cleanup_gendisks_part:
> + kfree(sd_gendisks[i].flags);
> cleanup_gendisks_flags:
> kfree(sd_gendisks[i].de_arr);
> cleanup_gendisks_de_arr:
> while (--i >= 0 ) {
> kfree(sd_gendisks[i].de_arr);
> kfree(sd_gendisks[i].flags);
> + kfree(sd_gendisks[i].part);
> }
> - if (sd_gendisks != &sd_gendisk)
> - kfree(sd_gendisks);
> + kfree(sd_gendisks);
> + sd_gendisks = NULL;
> cleanup_sd_gendisks:
> - kfree(sd);
> -cleanup_sd:
> kfree(sd_max_sectors);
> cleanup_max_sectors:
> kfree(sd_hardsizes);
> @@ -1320,6 +1324,7 @@
> */
> int revalidate_scsidisk(kdev_t dev, int maxusage)
> {
> + struct gendisk *sdgd;
> int target;
> int max_p;
> int start;
> @@ -1333,14 +1338,15 @@
> }
> DEVICE_BUSY = 1;
>
> - max_p = sd_gendisks->max_p;
> - start = target << sd_gendisks->minor_shift;
> + sdgd = &SD_GENDISK(target);
> + max_p = sd_gendisk.max_p;
> + start = target << sd_gendisk.minor_shift;
>
> for (i = max_p - 1; i >= 0; i--) {
> int index = start + i;
> invalidate_device(MKDEV_SD_PARTITION(index), 1);
> - sd_gendisks->part[index].start_sect = 0;
> - sd_gendisks->part[index].nr_sects = 0;
> + sdgd->part[SD_MINOR_NUMBER(index)].start_sect = 0;
> + sdgd->part[SD_MINOR_NUMBER(index)].nr_sects = 0;
> /*
> * Reset the blocksize for everything so that we can read
> * the partition table. Technically we will determine the
> @@ -1372,6 +1378,7 @@
> static void sd_detach(Scsi_Device * SDp)
> {
> Scsi_Disk *dpnt;
> + struct gendisk *sdgd;
> int i, j;
> int max_p;
> int start;
> @@ -1384,17 +1391,18 @@
>
> /* If we are disconnecting a disk driver, sync and invalidate
> * everything */
> + sdgd = &SD_GENDISK(i);
> max_p = sd_gendisk.max_p;
> start = i << sd_gendisk.minor_shift;
>
> for (j = max_p - 1; j >= 0; j--) {
> int index = start + j;
> invalidate_device(MKDEV_SD_PARTITION(index), 1);
> - sd_gendisks->part[index].start_sect = 0;
> - sd_gendisks->part[index].nr_sects = 0;
> + sdgd->part[SD_MINOR_NUMBER(index)].start_sect = 0;
> + sdgd->part[SD_MINOR_NUMBER(index)].nr_sects = 0;
> sd_sizes[index] = 0;
> }
> - devfs_register_partitions (&SD_GENDISK (i),
> + devfs_register_partitions (sdgd,
> SD_MINOR_NUMBER (start), 1);
> /* unregister_disk() */
> dpnt->has_part_table = 0;
> @@ -1430,16 +1438,22 @@
> kfree(sd_sizes);
> kfree(sd_blocksizes);
> kfree(sd_hardsizes);
> - kfree((char *) sd);
> + for (i = 0; i < N_USED_SD_MAJORS; i++) {
> +#if 0 /* XXX aren't we forgetting to deallocate something? */
> + kfree(sd_gendisks[i].de_arr);
> + kfree(sd_gendisks[i].flags);
> +#endif
> + kfree(sd_gendisks[i].part);
> + }
> }
> for (i = 0; i < N_USED_SD_MAJORS; i++) {
> del_gendisk(&sd_gendisks[i]);
> - blk_size[SD_MAJOR(i)] = NULL;
> + blk_size[SD_MAJOR(i)] = NULL; /* XXX blksize_size actually? */
> hardsect_size[SD_MAJOR(i)] = NULL;
> read_ahead[SD_MAJOR(i)] = 0;
> }
> sd_template.dev_max = 0;
> - if (sd_gendisks != &sd_gendisk)
> + if (sd_gendisks != NULL) /* kfree tests for 0, but leave explicit */
> kfree(sd_gendisks);
> }
>
> diff -ur -X dontdiff linux-2.4.19-pre4/drivers/scsi/sd.h linux-2.4.19-pre4-sd/drivers/scsi/sd.h
> --- linux-2.4.19-pre4/drivers/scsi/sd.h Thu Nov 22 11:49:15 2001
> +++ linux-2.4.19-pre4-sd/drivers/scsi/sd.h Fri Mar 22 11:42:01 2002
> @@ -23,8 +23,6 @@
> #include <linux/genhd.h>
> #endif
>
> -extern struct hd_struct *sd;
> -
> typedef struct scsi_disk {
> unsigned capacity; /* size in blocks */
> Scsi_Device *device;
> @@ -45,7 +43,6 @@
> #define N_SD_MAJORS 8
>
> #define SD_MAJOR_MASK (N_SD_MAJORS - 1)
> -#define SD_PARTITION(i) (((MAJOR(i) & SD_MAJOR_MASK) << 8) | (MINOR(i) & 255))
>
> #endif
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> I believe that it was Eric's intention to implement the
> same solution in sd. The generic disk stuff and the
> partitions are a complicating factor.
> All those parallel arrays set up by sd_init (e.g.
> rscsi_disks[], sd_sizes[], sd_blocksizes[],
> sd_hardsizes[], sd_max_sectors[] and sd[] are a mess.
> It is false economy to do the number of index
> operations that sd does, my guess is that 90% of them
> are redundant. Couldn't the sd driver just
> have a device structure that contains an (16 element)
> array of pointers to partition structures?
Correct on all counts. Part of what was holding things up was all of
the nonsense related to partition handling. To a lesser degree the cdrom
driver is being held up for similar reasons.
The proper cleanup is to eliminate those damned arrays (and make the
access SMP safe at the same time) that live in ll_rw_blk.c. There would
need to be a generic SMP safe way of obtaining the same information (things
like filesystems need to know this info, for example), and then add SMP safe
ways for low-level drivers to update the sizes of things as required.
The arrays you mention above are just inserted into the even messier
arrays that live in ll_rw_blk.c - as things currently stand, it really isn't
possible to clean up the arrays in sd.c without solving the larger problem
of the mess of arrays in ll_rw_blk.c.
I believe it was Jens who had been talking about folding some of this
information into the request_queue_t, and I don't know where this went if
anywhere. Maybe there was some problem that couldn't be easily resolved.
Another design goal of messing with this would be to do it in such a way
that support for a larger dev_t is possible.
Once the basic design is complete, the actual changes shouldn't be too
hard - just tedious.
-Eric
> Date: Sat, 23 Mar 2002 12:07:15 -0500
> From: Douglas Gilbert <[email protected]>
> > One problem I see when trying to use a box with 128 SCSI disks
> > is that sd_mod sometimes refuses to load. Earlier kernels simply
> > oopsed when it happened, but that is fixed in 2.4.18. The root
> > of the evil is the enormous array sd[] that sd_init allocates.
> > Alan suggested to split the allocation, which is what I did.
> So the only thing that is now contiguous is an array of
> pointers (to device state structures). [...]
> There have been no reported errors with this approach
> during the lk 2.4 series. A patched sg driver (together
> with Richard Gooch's sd-many patch) has been able to
> address over 300 (similated) disks without noticeable
> memory problems on a modestly-sized box.
The sg driver does not have any hd_struct arrays to allocate,
because it's not a disk.
> I believe that it was Eric's intention to implement the
> same solution in sd. The generic disk stuff and the
> partitions are a complicating factor.
> All those parallel arrays set up by sd_init (e.g.
> rscsi_disks[], sd_sizes[], sd_blocksizes[],
> sd_hardsizes[], sd_max_sectors[] and sd[] are a mess.
Excuse me, but I think you are trying to solve quite different
problem here. It looks that you target the code cleanliness first,
and the biggest allocation as an afterthought: "partitions
are a complicating factor". I target the biggest allocation,
which is the array of hd_struct (without loosing any code
cleanliness, if any remains in that rathole). Do you see the
difference?
Even after my patch broke the biggest allocation into 8 parts,
it is still the biggest! Every one of those other arrays is smaller
than an array of 256 hd_struct's. There is no way to switch to
arrays of pointers for hd_struct, because it is indexed with
minor in ll_rw_blk. Really, my change is independent of any
cleanups for other arrays (such as rscsi_disks[]).
It would be very nice if someone actually looked into detangling
those arrays in 2.5. Currently, Andreas Jaeger rewrote that part
without changing anything, only adding a bunch of butt-ugly macroses.
2.5 is where the better place for array squashing excercises is,
because I certainly would like to see this GONE:
if (rscsi_disks)
return 0;
/* allocate memory */
#define init_mem_lth(x,n) x = kmalloc((n) * sizeof(*x), GFP_ATOMIC)
#define zero_mem_lth(x,n) memset(x, 0, (n) * sizeof(*x))
>[...]
> BTW. It is probably worth looking at the sd-many patch
> as it must have been faced with a similar problem.
It just occured to me after I sent the patch.
I would appreciate if someone applied and used my patch and told
me how it went. Array cleanups are parallel to the break-up of
the biggest allocation in sd (which must stay an array :-P).
-- Pete
Pete Zaitcev writes:
> > Date: Sat, 23 Mar 2002 12:07:15 -0500
> > From: Douglas Gilbert <[email protected]>
>
> > > One problem I see when trying to use a box with 128 SCSI disks
> > > is that sd_mod sometimes refuses to load. Earlier kernels simply
> > > oopsed when it happened, but that is fixed in 2.4.18. The root
> > > of the evil is the enormous array sd[] that sd_init allocates.
> > > Alan suggested to split the allocation, which is what I did.
>
> > So the only thing that is now contiguous is an array of
> > pointers (to device state structures). [...]
> > There have been no reported errors with this approach
> > during the lk 2.4 series. A patched sg driver (together
> > with Richard Gooch's sd-many patch) has been able to
> > address over 300 (similated) disks without noticeable
> > memory problems on a modestly-sized box.
>
> The sg driver does not have any hd_struct arrays to allocate,
> because it's not a disk.
>
> > I believe that it was Eric's intention to implement the
> > same solution in sd. The generic disk stuff and the
> > partitions are a complicating factor.
> > All those parallel arrays set up by sd_init (e.g.
> > rscsi_disks[], sd_sizes[], sd_blocksizes[],
> > sd_hardsizes[], sd_max_sectors[] and sd[] are a mess.
>
> Excuse me, but I think you are trying to solve quite different
> problem here. It looks that you target the code cleanliness first,
> and the biggest allocation as an afterthought: "partitions
> are a complicating factor". I target the biggest allocation,
> which is the array of hd_struct (without loosing any code
> cleanliness, if any remains in that rathole). Do you see the
> difference?
>
> Even after my patch broke the biggest allocation into 8 parts,
> it is still the biggest! Every one of those other arrays is smaller
> than an array of 256 hd_struct's. There is no way to switch to
> arrays of pointers for hd_struct, because it is indexed with
> minor in ll_rw_blk. Really, my change is independent of any
> cleanups for other arrays (such as rscsi_disks[]).
>
> It would be very nice if someone actually looked into detangling
> those arrays in 2.5. Currently, Andreas Jaeger rewrote that part
> without changing anything, only adding a bunch of butt-ugly macroses.
> 2.5 is where the better place for array squashing excercises is,
> because I certainly would like to see this GONE:
>
> if (rscsi_disks)
> return 0;
>
> /* allocate memory */
> #define init_mem_lth(x,n) x = kmalloc((n) * sizeof(*x), GFP_ATOMIC)
> #define zero_mem_lth(x,n) memset(x, 0, (n) * sizeof(*x))
>
> >[...]
> > BTW. It is probably worth looking at the sd-many patch
> > as it must have been faced with a similar problem.
>
> It just occured to me after I sent the patch.
>
> I would appreciate if someone applied and used my patch and told
> me how it went. Array cleanups are parallel to the break-up of
> the biggest allocation in sd (which must stay an array :-P).
One of the things my sd-many patch did was to switch to vmalloc(). I
checked all the paths leading to these allocations, and they are all
in process context. Ergo, vmalloc() is safe, and thus allows many more
SD's.
Regards,
Richard....
Permanent: [email protected]
Current: [email protected]
Pete Zaitcev wrote:
>
> > Date: Sat, 23 Mar 2002 12:07:15 -0500
> > From: Douglas Gilbert <[email protected]>
>
> > > One problem I see when trying to use a box with 128 SCSI disks
> > > is that sd_mod sometimes refuses to load. Earlier kernels simply
> > > oopsed when it happened, but that is fixed in 2.4.18. The root
> > > of the evil is the enormous array sd[] that sd_init allocates.
> > > Alan suggested to split the allocation, which is what I did.
>
> > So the only thing that is now contiguous is an array of
> > pointers (to device state structures). [...]
> > There have been no reported errors with this approach
> > during the lk 2.4 series. A patched sg driver (together
> > with Richard Gooch's sd-many patch) has been able to
> > address over 300 (similated) disks without noticeable
> > memory problems on a modestly-sized box.
>
> The sg driver does not have any hd_struct arrays to allocate,
> because it's not a disk.
>
> > I believe that it was Eric's intention to implement the
> > same solution in sd. The generic disk stuff and the
> > partitions are a complicating factor.
> > All those parallel arrays set up by sd_init (e.g.
> > rscsi_disks[], sd_sizes[], sd_blocksizes[],
> > sd_hardsizes[], sd_max_sectors[] and sd[] are a mess.
>
> Excuse me, but I think you are trying to solve quite different
> problem here. It looks that you target the code cleanliness first,
> and the biggest allocation as an afterthought: "partitions
> are a complicating factor". I target the biggest allocation,
> which is the array of hd_struct (without loosing any code
> cleanliness, if any remains in that rathole). Do you see the
> difference?
>
> Even after my patch broke the biggest allocation into 8 parts,
> it is still the biggest! Every one of those other arrays is smaller
> than an array of 256 hd_struct's. There is no way to switch to
> arrays of pointers for hd_struct, because it is indexed with
> minor in ll_rw_blk. Really, my change is independent of any
> cleanups for other arrays (such as rscsi_disks[]).
>
> It would be very nice if someone actually looked into detangling
> those arrays in 2.5. Currently, Andreas Jaeger rewrote that part
> without changing anything, only adding a bunch of butt-ugly macroses.
> 2.5 is where the better place for array squashing excercises is,
> because I certainly would like to see this GONE:
>
> if (rscsi_disks)
> return 0;
>
> /* allocate memory */
> #define init_mem_lth(x,n) x = kmalloc((n) * sizeof(*x), GFP_ATOMIC)
> #define zero_mem_lth(x,n) memset(x, 0, (n) * sizeof(*x))
>
> >[...]
> > BTW. It is probably worth looking at the sd-many patch
> > as it must have been faced with a similar problem.
>
> It just occured to me after I sent the patch.
>
> I would appreciate if someone applied and used my patch and told
> me how it went. Array cleanups are parallel to the break-up of
> the biggest allocation in sd (which must stay an array :-P).
Your patch worked ok for me. I have a couple of real
disks and 120 simulated ones with scsi_debug. My last disk
was /dev/sddq and I was able to fdisk, mke2fs, mount
and copy files to it ok.
I had a look at ide-disk.c (lk 2.4.19-pre4) and it
looks remarkably clean compared to sd.c . It seems
to warrant further study.
Doug Gilbert
On Sat, 23 Mar 2002, Douglas Gilbert wrote:
> Your patch worked ok for me. I have a couple of real
> disks and 120 simulated ones with scsi_debug. My last disk
> was /dev/sddq and I was able to fdisk, mke2fs, mount
> and copy files to it ok.
>
>
> I had a look at ide-disk.c (lk 2.4.19-pre4) and it
> looks remarkably clean compared to sd.c . It seems
> to warrant further study.
>
> Doug Gilbert
WOW, that is the first compliment I have ever heard about my work from
another storage expert. Doug if I could have a minute to make a
suggestion about the ./drivers/scsi/, would you concider making sg.c into
the core transport layer for the subsystem? This would be similar to what
I am doing in ./drivers/ide with ide-taskfile.c. Where as mine intial
migration will cover all "ATA" commands, but there are ZERO real state
machine engines for ATAPI. I have considered and still looking at the
scope of pkt-taskfile.c as a generic transport layer for all atapi but
mating all the various standards into one is ugly. I would prefer to
provide an ASPI layer between ATA/SCSI and work with you to create real
personality extentions.
sd.c direct sane ide-disk.c
sr.c optical/rom more sg'ish ide-cd.c/ide-floppy.c
st.c stream noise makes from hell. ide-tape.c
My goal is to force the personalities in ata/atapi to deal with their own
errors and destroy the mainloop error thread/jungle. Also export to the
personality cores their own request and completion mappings.
I think similar things could be done in SCSI vi SG, then close up some of
the goofiness we both have (me more so) on HBA or OBHA (onboard).
Comments?
Cheers,
Andre Hedrick
LAD Storage Consulting Group
PS: I already popped the balloon head so no need to get out the voodoo dolls.
> Your patch worked ok for me. I have a couple of real
> disks and 120 simulated ones with scsi_debug. My last disk
> was /dev/sddq and I was able to fdisk, mke2fs, mount
> and copy files to it ok.
Great many thanks. I'll give it some time to settle and will ask
Alan or Marcelo to take it. BTW, the real test is not being able
to load modules and do stuff. The bad part is what happens when
you do a string of rmmod/modprobe in random order and with varying
parameters for scsi_debug (scsi_debug_num_devs=NN). Then it's going
to show if memory leaks or get corrupted. I certainly hope that
I did it right, but I was known to make mistakes before.
Another side note: towards the end of the patch, there is a reminder
about an entirely different issue that noted when doing the patch:
+ for (i = 0; i < N_USED_SD_MAJORS; i++) {
+#if 0 /* XXX aren't we forgetting to deallocate something? */
+ kfree(sd_gendisks[i].de_arr);
+ kfree(sd_gendisks[i].flags);
+#endif
+ kfree(sd_gendisks[i].part);
+ }
}
for (i = 0; i < N_USED_SD_MAJORS; i++) {
del_gendisk(&sd_gendisks[i]);
- blk_size[SD_MAJOR(i)] = NULL;
+ blk_size[SD_MAJOR(i)] = NULL; /* XXX blksize_size actually? */ hardsect_size[SD_MAJOR(i)] = NULL;
read_ahead[SD_MAJOR(i)] = 0;
}
E.g. I do not see a place where .flags and .de_arr are freed; also,
I am not sure why we assign blksize_size[], but we zero blk_size[].
I do not want to mix this up with the split-allocation patch, but
it is curious. I think we leak memory on sd_mod unload here.
-- Pete