2009-04-02 14:49:32

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 1/5] Block: Discard may need to allocate pages

From: Matthew Wilcox <[email protected]>

SCSI and ATA both need to send data to the device. In order to do this,
the BIO must be allocated with room for a page to be added, and the bio
needs to be passed to the discard prep function. We also need to free
the page attached to the BIO before we free it.

init_request_from_bio() is not currently called from a context which
forbids sleeping, and to make sure it stays that way (so we don't have
to use GFP_ATOMIC), add a might_sleep() to it.

Signed-off-by: Matthew Wilcox <[email protected]>
---
block/blk-barrier.c | 4 +++-
block/blk-core.c | 4 +++-
block/ioctl.c | 4 +++-
drivers/mtd/mtd_blkdevs.c | 2 +-
include/linux/blkdev.h | 3 ++-
5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f7dae57..82a3035 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -356,6 +356,8 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}

+ if (bio_has_data(bio))
+ __free_page(bio_page(bio));
bio_put(bio);
}

@@ -387,7 +389,7 @@ int blkdev_issue_discard(struct block_device *bdev,
return -EOPNOTSUPP;

while (nr_sects && !ret) {
- bio = bio_alloc(gfp_mask, 0);
+ bio = bio_alloc(gfp_mask, 1);
if (!bio)
return -ENOMEM;

diff --git a/block/blk-core.c b/block/blk-core.c
index 996ed90..7899761 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1095,6 +1095,8 @@ EXPORT_SYMBOL(blk_put_request);

void init_request_from_bio(struct request *req, struct bio *bio)
{
+ might_sleep();
+
req->cpu = bio->bi_comp_cpu;
req->cmd_type = REQ_TYPE_FS;

@@ -1118,7 +1120,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
req->cmd_flags |= REQ_DISCARD;
if (bio_barrier(bio))
req->cmd_flags |= REQ_SOFTBARRIER;
- req->q->prepare_discard_fn(req->q, req);
+ req->q->prepare_discard_fn(req->q, req, bio);
} else if (unlikely(bio_barrier(bio)))
req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

diff --git a/block/ioctl.c b/block/ioctl.c
index 0f22e62..088a9ba 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -145,7 +145,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
DECLARE_COMPLETION_ONSTACK(wait);
struct bio *bio;

- bio = bio_alloc(GFP_KERNEL, 0);
+ bio = bio_alloc(GFP_KERNEL, 1);
if (!bio)
return -ENOMEM;

@@ -170,6 +170,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
ret = -EOPNOTSUPP;
else if (!bio_flagged(bio, BIO_UPTODATE))
ret = -EIO;
+ if (bio_has_data(bio))
+ __free_page(bio_page(bio));
bio_put(bio);
}
return ret;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 1409f01..2b6ed4b 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -33,7 +33,7 @@ struct mtd_blkcore_priv {
};

static int blktrans_discard_request(struct request_queue *q,
- struct request *req)
+ struct request *req, struct bio *bio)
{
req->cmd_type = REQ_TYPE_LINUX_BLOCK;
req->cmd[0] = REQ_LB_OP_DISCARD;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 465d6ba..9d9bd7b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -260,7 +260,8 @@ typedef void (request_fn_proc) (struct request_queue *q);
typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
typedef int (prep_rq_fn) (struct request_queue *, struct request *);
typedef void (unplug_fn) (struct request_queue *);
-typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
+typedef int (prepare_discard_fn) (struct request_queue *, struct request *,
+ struct bio *bio);

struct bio_vec;
struct bvec_merge_data {
--
1.6.2.1


2009-04-02 14:49:16

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 5/5] libata: Add support for TRIM

Signed-off-by: Matthew Wilcox <[email protected]>
---
drivers/ata/libata-scsi.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b9747fa..d4c8b8b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1029,6 +1029,46 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
desc[11] = block;
}

+static int ata_discard_fn(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ unsigned size;
+ struct page *page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto error;
+
+ size = ata_set_lba_range_entries(page_address(page), PAGE_SIZE / 8,
+ bio->bi_sector, bio_sectors(bio));
+ bio->bi_size = 0;
+ if (bio_add_pc_page(q, bio, page, size, 0) < size)
+ goto free_page;
+
+ req->cmd_type = REQ_TYPE_BLOCK_PC;
+ req->cmd_len = 16;
+ req->cmd[0] = ATA_16;
+ req->cmd[1] = (6 << 1) | 1; /* dma, 48-bit */
+ req->cmd[2] = 0x6; /* length, direction */
+ req->cmd[3] = 0; /* feature high */
+ req->cmd[4] = ATA_DSM_TRIM; /* feature low */
+ req->cmd[5] = (size / 512) >> 8; /* nsect high */
+ req->cmd[6] = size / 512; /* nsect low */
+ req->cmd[7] = 0; /* lba */
+ req->cmd[8] = 0; /* lba */
+ req->cmd[9] = 0; /* lba */
+ req->cmd[10] = 0; /* lba */
+ req->cmd[11] = 0; /* lba */
+ req->cmd[12] = 0; /* lba */
+ req->cmd[13] = ATA_LBA; /* device */
+ req->cmd[14] = ATA_CMD_DSM; /* command */
+ req->cmd[15] = 0; /* control */
+
+ return 0;
+ free_page:
+ __free_page(page);
+ error:
+ return -ENOMEM;
+}
+
static void ata_scsi_sdev_config(struct scsi_device *sdev)
{
sdev->use_10_for_rw = 1;
@@ -1077,6 +1117,9 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
/* configure max sectors */
blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);

+ if (ata_id_has_trim(dev->id))
+ blk_queue_set_discard(sdev->request_queue, ata_discard_fn);
+
if (dev->class == ATA_DEV_ATAPI) {
struct request_queue *q = sdev->request_queue;
void *buf;
@@ -2385,6 +2428,9 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
/* sector size */
rbuf[10] = ATA_SECT_SIZE >> 8;
rbuf[11] = ATA_SECT_SIZE & 0xff;
+
+ if (ata_id_has_trim(args->id))
+ rbuf[14] = 0x80;
}

return 0;
--
1.6.2.1

2009-04-02 15:05:19

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 2/5] Make DISCARD_BARRIER and DISCARD_NOBARRIER writes instead of reads

From: David Woodhouse <[email protected]>

The commands are conceptually writes, and in the case of IDE and SCSI
commands actually are writes. They were only reads because we thought
that would interact better with the elevators. Now the elevators know
about discard requests, that advantage no longer exists.

Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/fs.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 61211ad..e5dc992 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -98,8 +98,8 @@ struct inodes_stat_t {
#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
-#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
-#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD))
+#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER))

#define SEL_IN 1
#define SEL_OUT 2
--
1.6.2.1

2009-04-02 15:04:49

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 4/5] ide: Add support for TRIM

From: David Woodhouse <[email protected]>

Signed-off-by: David Woodhouse <[email protected]>
[modified to reduce amount of special casing needed for discard]
Signed-off-by: Matthew Wilcox <[email protected]>
---
drivers/ide/ide-disk.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 49 insertions(+), 0 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index c998cf8..ed34bd3 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -404,6 +404,52 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
rq->special = cmd;
}

+static int idedisk_prepare_discard(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ ide_task_t *task;
+ unsigned size;
+ struct page *page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto error;
+
+ /* FIXME: map struct ide_taskfile on rq->cmd[] */
+ task = kzalloc(sizeof(*task), GFP_KERNEL);
+ if (!task)
+ goto free_page;
+
+ size = ata_set_lba_range_entries(page_address(page), PAGE_SIZE,
+ bio->bi_sector, bio_sectors(bio));
+ bio->bi_size = 0;
+ if (bio_add_pc_page(q, bio, page, size, 0) < size)
+ goto free_task;
+
+ task->tf.command = ATA_CMD_DSM;
+ task->tf.feature = ATA_DSM_TRIM;
+ task->tf.hob_feature = 0x00;
+ task->tf.nsect = size / 512;
+ task->tf.hob_nsect = (size / 512) >> 8;
+
+ task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
+ IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
+ IDE_TFLAG_DYN;
+ task->data_phase = TASKFILE_OUT_DMA;
+
+ rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+ rq->special = task;
+ rq->nr_sectors = size / 512;
+
+ return 0;
+
+ free_task:
+ kfree(task);
+ free_page:
+ __free_page(page);
+ error:
+ return -ENOMEM;
+}
+
+
ide_devset_get(multcount, mult_count);

/*
@@ -521,6 +567,9 @@ static int set_wcache(ide_drive_t *drive, int arg)

update_ordered(drive);

+ if (ata_id_has_trim(drive->id))
+ blk_queue_set_discard(drive->queue, idedisk_prepare_discard);
+
return err;
}

--
1.6.2.1

2009-04-02 15:05:51

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 3/5] ata: Add TRIM infrastructure

This is common code shared between the IDE and libata implementations

Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/ata.h | 41 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 6617c9f..cb79b7a 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -29,6 +29,8 @@
#ifndef __LINUX_ATA_H__
#define __LINUX_ATA_H__

+#include <linux/kernel.h>
+#include <linux/string.h>
#include <linux/types.h>
#include <asm/byteorder.h>

@@ -91,6 +93,7 @@ enum {
ATA_ID_CFA_POWER = 160,
ATA_ID_CFA_KEY_MGMT = 162,
ATA_ID_CFA_MODES = 163,
+ ATA_ID_DATA_SET_MGMT = 169,
ATA_ID_ROT_SPEED = 217,
ATA_ID_PIO4 = (1 << 1),

@@ -248,6 +251,7 @@ enum {
ATA_CMD_SMART = 0xB0,
ATA_CMD_MEDIA_LOCK = 0xDE,
ATA_CMD_MEDIA_UNLOCK = 0xDF,
+ ATA_CMD_DSM = 0x06,
/* marked obsolete in the ATA/ATAPI-7 spec */
ATA_CMD_RESTORE = 0x10,

@@ -321,6 +325,9 @@ enum {
ATA_SMART_READ_VALUES = 0xD0,
ATA_SMART_READ_THRESHOLDS = 0xD1,

+ /* feature values for Data Set Management */
+ ATA_DSM_TRIM = 0x01,
+
/* password used in LBA Mid / LBA High for executing SMART commands */
ATA_SMART_LBAM_PASS = 0x4F,
ATA_SMART_LBAH_PASS = 0xC2,
@@ -723,6 +730,14 @@ static inline int ata_id_has_unload(const u16 *id)
return 0;
}

+static inline int ata_id_has_trim(const u16 *id)
+{
+ if (ata_id_major_version(id) >= 7 &&
+ (id[ATA_ID_DATA_SET_MGMT] & 1))
+ return 1;
+ return 0;
+}
+
static inline int ata_id_current_chs_valid(const u16 *id)
{
/* For ATA-1 devices, if the INITIALIZE DEVICE PARAMETERS command
@@ -863,6 +878,32 @@ static inline void ata_id_to_hd_driveid(u16 *id)
#endif
}

+/*
+ * Write up to 'max' LBA Range Entries to the buffer that will cover the
+ * extent from sector to sector + count. This is used for TRIM and for
+ * ADD LBA(S) TO NV CACHE PINNED SET.
+ */
+static inline unsigned ata_set_lba_range_entries(void *_buffer, unsigned max,
+ u64 sector, unsigned long count)
+{
+ __le64 *buffer = _buffer;
+ unsigned i = 0;
+
+ while (i < max) {
+ u64 entry = sector |
+ ((u64)(count > 0xffff ? 0xffff : count) << 48);
+ buffer[i++] = __cpu_to_le64(entry);
+ if (count <= 0xffff)
+ break;
+ count -= 0xffff;
+ sector += 0xffff;
+ }
+
+ max = ALIGN(i * 8, 512);
+ memset(buffer + i, 0, max - i * 8);
+ return max;
+}
+
static inline int is_multi_taskfile(struct ata_taskfile *tf)
{
return (tf->command == ATA_CMD_READ_MULTI) ||
--
1.6.2.1

2009-04-02 15:55:28

by Sergei Shtylyov

[permalink] [raw]
Subject: Re: [PATCH 3/5] ata: Add TRIM infrastructure

Matthew Wilcox wrote:

> This is common code shared between the IDE and libata implementations

> Signed-off-by: David Woodhouse <[email protected]>
> Signed-off-by: Matthew Wilcox <[email protected]>

[...]

> diff --git a/include/linux/ata.h b/include/linux/ata.h
> index 6617c9f..cb79b7a 100644
> --- a/include/linux/ata.h
> +++ b/include/linux/ata.h
> @@ -29,6 +29,8 @@
> #ifndef __LINUX_ATA_H__
> #define __LINUX_ATA_H__
>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> #include <linux/types.h>
> #include <asm/byteorder.h>
>
> @@ -91,6 +93,7 @@ enum {
> ATA_ID_CFA_POWER = 160,
> ATA_ID_CFA_KEY_MGMT = 162,
> ATA_ID_CFA_MODES = 163,
> + ATA_ID_DATA_SET_MGMT = 169,
> ATA_ID_ROT_SPEED = 217,
> ATA_ID_PIO4 = (1 << 1),
>
> @@ -248,6 +251,7 @@ enum {
> ATA_CMD_SMART = 0xB0,
> ATA_CMD_MEDIA_LOCK = 0xDE,
> ATA_CMD_MEDIA_UNLOCK = 0xDF,
> + ATA_CMD_DSM = 0x06,
> /* marked obsolete in the ATA/ATAPI-7 spec */
> ATA_CMD_RESTORE = 0x10,
>
> @@ -321,6 +325,9 @@ enum {
> ATA_SMART_READ_VALUES = 0xD0,
> ATA_SMART_READ_THRESHOLDS = 0xD1,
>
> + /* feature values for Data Set Management */
> + ATA_DSM_TRIM = 0x01,
> +
> /* password used in LBA Mid / LBA High for executing SMART commands */
> ATA_SMART_LBAM_PASS = 0x4F,
> ATA_SMART_LBAH_PASS = 0xC2,

I wonder where to read about the word 169 and commnand 0x06... what's it
all about?

WBR, Sergei

2009-04-02 15:58:39

by Sergei Shtylyov

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Hello.

Matthew Wilcox wrote:

> From: David Woodhouse <[email protected]>

> Signed-off-by: David Woodhouse <[email protected]>
> [modified to reduce amount of special casing needed for discard]
> Signed-off-by: Matthew Wilcox <[email protected]>



> diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
> index c998cf8..ed34bd3 100644
> --- a/drivers/ide/ide-disk.c
> +++ b/drivers/ide/ide-disk.c
> @@ -404,6 +404,52 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
> rq->special = cmd;
> }
>
> +static int idedisk_prepare_discard(struct request_queue *q, struct request *rq,
> + struct bio *bio)

Weird indentation.

> +{
> + ide_task_t *task;

This patch is already obsolete as 'ide_task_t' is gone. Use 'struct
ide_cmd' instead.

> + unsigned size;
> + struct page *page = alloc_page(GFP_KERNEL);

Missing empty line after the declaration block...

> + if (!page)
> + goto error;

Unneeded goto -- why not just return?

> +
> + /* FIXME: map struct ide_taskfile on rq->cmd[] */
> + task = kzalloc(sizeof(*task), GFP_KERNEL);
> + if (!task)
> + goto free_page;
> +
> + size = ata_set_lba_range_entries(page_address(page), PAGE_SIZE,
> + bio->bi_sector, bio_sectors(bio));
> + bio->bi_size = 0;
> + if (bio_add_pc_page(q, bio, page, size, 0) < size)
> + goto free_task;
> +
> + task->tf.command = ATA_CMD_DSM;
> + task->tf.feature = ATA_DSM_TRIM;
> + task->tf.hob_feature = 0x00;
> + task->tf.nsect = size / 512;
> + task->tf.hob_nsect = (size / 512) >> 8;
> +
> + task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
> + IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |

The last 3 flags are going to be obsoleted too...

MBR, Sergei

2009-04-02 16:18:48

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 3/5] ata: Add TRIM infrastructure

On Thu, Apr 02, 2009 at 07:55:39PM +0400, Sergei Shtylyov wrote:
> I wonder where to read about the word 169 and commnand 0x06... what's
> it all about?

It's in d2015r1-ATAATAPI_Command_Set_-_2_ACS-2.pdf which can be found on
the t13 website.

2009-04-02 16:29:00

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Thu, Apr 02, 2009 at 07:58:59PM +0400, Sergei Shtylyov wrote:
>> +static int idedisk_prepare_discard(struct request_queue *q, struct
>> request *rq,
>> + struct bio *bio)
>
> Weird indentation.

Not at all. New line, so tab the next line over as far as it will go
without falling off the 80-column limit. It's fairly common.

To quote Documentation/CodingStyle:

Statements longer than 80 columns will be broken into sensible chunks.
Descendants are always substantially shorter than the parent and are placed
substantially to the right. The same applies to function headers with a long
argument list. Long strings are as well broken into shorter strings. The
only exception to this is where exceeding 80 columns significantly increases
readability and does not hide information.

>> +{
>> + ide_task_t *task;
>
> This patch is already obsolete as 'ide_task_t' is gone. Use 'struct
> ide_cmd' instead.

Thanks, fixed.

>> + unsigned size;
>> + struct page *page = alloc_page(GFP_KERNEL);
>
> Missing empty line after the declaration block...

Empty line not necessary.

>> + if (!page)
>> + goto error;
>
> Unneeded goto -- why not just return?

Because it then looks the same as the other two cases where we can error
out.

>> + task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
>> + IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
>
> The last 3 flags are going to be obsoleted too...

So if I remove them today, the command will still work?

2009-04-02 16:32:39

by Sergei Shtylyov

[permalink] [raw]
Subject: Re: [PATCH 3/5] ata: Add TRIM infrastructure

Matthew Wilcox wrote:

>> I wonder where to read about the word 169 and commnand 0x06... what's
>>it all about?

> It's in d2015r1-ATAATAPI_Command_Set_-_2_ACS-2.pdf which can be found on
> the t13 website.

But since neither ATA/PI-7 nor ATA/PI-8 mention neither the word nor
command, why your validity check requires only ATA/PI-7+ compliance?

MBR, Sergei

2009-04-02 16:37:52

by Sergei Shtylyov

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Hello.

Matthew Wilcox wrote:

>>> +static int idedisk_prepare_discard(struct request_queue *q, struct
>>>request *rq,
>>>+ struct bio *bio)

>> Weird indentation.

> Not at all. New line, so tab the next line over as far as it will go
> without falling off the 80-column limit. It's fairly common.

> To quote Documentation/CodingStyle:

> Statements longer than 80 columns will be broken into sensible chunks.
> Descendants are always substantially shorter than the parent and are placed
> substantially to the right. The same applies to function headers with a long
> argument list. Long strings are as well broken into shorter strings. The
> only exception to this is where exceeding 80 columns significantly increases
> readability and does not hide information.

That's all clear, it's just that the second line was overly indented, at
least to my taste. :-)

>>>+{
>>>+ ide_task_t *task;

>> This patch is already obsolete as 'ide_task_t' is gone. Use 'struct
>>ide_cmd' instead.

> Thanks, fixed.

>>>+ unsigned size;
>>>+ struct page *page = alloc_page(GFP_KERNEL);

>> Missing empty line after the declaration block...

> Empty line not necessary.

Usually it's there.

>>>+ task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
>>>+ IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |

>> The last 3 flags are going to be obsoleted too...

> So if I remove them today, the command will still work?

s/obsoleted/renamed and moved to another other field/ -- I'm going to
submit a patchset refactoring 'struct ide_cmd and 'struct ide-taskfile' at last...

MBR, Sergei

2009-04-02 16:49:45

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 3/5] ata: Add TRIM infrastructure

On Thu, Apr 02, 2009 at 08:32:56PM +0400, Sergei Shtylyov wrote:
> Matthew Wilcox wrote:
>
>>> I wonder where to read about the word 169 and commnand 0x06...
>>> what's it all about?
>
>> It's in d2015r1-ATAATAPI_Command_Set_-_2_ACS-2.pdf which can be found on
>> the t13 website.
>
> But since neither ATA/PI-7 nor ATA/PI-8 mention neither the word nor
> command, why your validity check requires only ATA/PI-7+ compliance?

Because there are ATA-7 devices which implement this feature.

2009-04-02 16:53:05

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Thu, Apr 02, 2009 at 08:38:03PM +0400, Sergei Shtylyov wrote:
> Hello.
>>>> + task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
>>>> + IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
>
>>> The last 3 flags are going to be obsoleted too...
>
>> So if I remove them today, the command will still work?
>
> s/obsoleted/renamed and moved to another other field/ -- I'm going to
> submit a patchset refactoring 'struct ide_cmd and 'struct ide-taskfile'
> at last...

Since I'm coding to today's kernel, not to a patch which doesn't exist
yet, taking them out won't work very well.

I've not been paying much attention to drivers/ide, so I've no idea
whether the following patch works. It does at least compile:

+++ b/drivers/ide/ide-disk.c
@@ -407,7 +407,7 @@ static void idedisk_prepare_flush(struct request_queue *q, s
static int idedisk_prepare_discard(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- ide_task_t *task;
+ struct ide_cmd *task;
unsigned size;
struct page *page = alloc_page(GFP_KERNEL);
if (!page)
@@ -432,8 +432,8 @@ static int idedisk_prepare_discard(struct request_queue *q,

task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
- IDE_TFLAG_DYN;
- task->data_phase = TASKFILE_OUT_DMA;
+ IDE_TFLAG_DYN | IDE_TFLAG_WRITE;
+ task->protocol = ATA_PROT_DMA;

rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
rq->special = task;

If I've understood 0dfb991c6943c810175376b58d1c29cfe532541b correctly,
this should be equivalent. Bart?

2009-04-02 17:20:39

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 5/5] libata: Add support for TRIM

Matthew Wilcox wrote:
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> drivers/ata/libata-scsi.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 46 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index b9747fa..d4c8b8b 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -1029,6 +1029,46 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
> desc[11] = block;
> }
>
> +static int ata_discard_fn(struct request_queue *q, struct request *req,
> + struct bio *bio)
> +{
> + unsigned size;
> + struct page *page = alloc_page(GFP_KERNEL);
> + if (!page)
> + goto error;
> +
> + size = ata_set_lba_range_entries(page_address(page), PAGE_SIZE / 8,
> + bio->bi_sector, bio_sectors(bio));
> + bio->bi_size = 0;
> + if (bio_add_pc_page(q, bio, page, size, 0) < size)
> + goto free_page;
> +
> + req->cmd_type = REQ_TYPE_BLOCK_PC;
> + req->cmd_len = 16;
> + req->cmd[0] = ATA_16;
> + req->cmd[1] = (6 << 1) | 1; /* dma, 48-bit */
> + req->cmd[2] = 0x6; /* length, direction */
> + req->cmd[3] = 0; /* feature high */
> + req->cmd[4] = ATA_DSM_TRIM; /* feature low */
> + req->cmd[5] = (size / 512) >> 8; /* nsect high */
> + req->cmd[6] = size / 512; /* nsect low */
> + req->cmd[7] = 0; /* lba */
> + req->cmd[8] = 0; /* lba */
> + req->cmd[9] = 0; /* lba */
> + req->cmd[10] = 0; /* lba */
> + req->cmd[11] = 0; /* lba */
> + req->cmd[12] = 0; /* lba */
> + req->cmd[13] = ATA_LBA; /* device */
> + req->cmd[14] = ATA_CMD_DSM; /* command */
> + req->cmd[15] = 0; /* control */
> +
> + return 0;
> + free_page:
> + __free_page(page);
> + error:
> + return -ENOMEM;
> +}
> +
> static void ata_scsi_sdev_config(struct scsi_device *sdev)
> {
> sdev->use_10_for_rw = 1;
> @@ -1077,6 +1117,9 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
> /* configure max sectors */
> blk_queue_max_sectors(sdev->request_queue, dev->max_sectors);
>
> + if (ata_id_has_trim(dev->id))
> + blk_queue_set_discard(sdev->request_queue, ata_discard_fn);
> +
> if (dev->class == ATA_DEV_ATAPI) {
> struct request_queue *q = sdev->request_queue;
> void *buf;
> @@ -2385,6 +2428,9 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
> /* sector size */
> rbuf[10] = ATA_SECT_SIZE >> 8;
> rbuf[11] = ATA_SECT_SIZE & 0xff;
> +
> + if (ata_id_has_trim(args->id))
> + rbuf[14] = 0x80;
> }
>
> return 0;
..

Now we just need an implementation of ata_discard_fn()
that issues the CFA ERASE SECTORS command instead of TRIM
when speaking to a CFA device.

Ah, life is good!

2009-04-02 17:55:55

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 5/5] libata: Add support for TRIM

On Thu, Apr 02, 2009 at 01:20:01PM -0400, Mark Lord wrote:
> Now we just need an implementation of ata_discard_fn()
> that issues the CFA ERASE SECTORS command instead of TRIM
> when speaking to a CFA device.

Yes, should be quite straightforward. The only minor wrinkle I can see
is that CFA ERASE SECTORS only allows you to erase up to 256 sectors at a
time. I think Dave's infrastructure for Discard lets us handle that ...

> Ah, life is good!

Isn't it though?

Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Thursday 02 April 2009, Matthew Wilcox wrote:
> On Thu, Apr 02, 2009 at 08:38:03PM +0400, Sergei Shtylyov wrote:
> > Hello.
> >>>> + task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
> >>>> + IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
> >
> >>> The last 3 flags are going to be obsoleted too...
> >
> >> So if I remove them today, the command will still work?
> >
> > s/obsoleted/renamed and moved to another other field/ -- I'm going to
> > submit a patchset refactoring 'struct ide_cmd and 'struct ide-taskfile'
> > at last...
>
> Since I'm coding to today's kernel, not to a patch which doesn't exist
> yet, taking them out won't work very well.
>
> I've not been paying much attention to drivers/ide, so I've no idea
> whether the following patch works. It does at least compile:
>
> +++ b/drivers/ide/ide-disk.c
> @@ -407,7 +407,7 @@ static void idedisk_prepare_flush(struct request_queue *q, s
> static int idedisk_prepare_discard(struct request_queue *q, struct request *rq,
> struct bio *bio)
> {
> - ide_task_t *task;
> + struct ide_cmd *task;
> unsigned size;
> struct page *page = alloc_page(GFP_KERNEL);
> if (!page)
> @@ -432,8 +432,8 @@ static int idedisk_prepare_discard(struct request_queue *q,
>
> task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
> IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
> - IDE_TFLAG_DYN;
> - task->data_phase = TASKFILE_OUT_DMA;
> + IDE_TFLAG_DYN | IDE_TFLAG_WRITE;
> + task->protocol = ATA_PROT_DMA;
>
> rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
> rq->special = task;
>
> If I've understood 0dfb991c6943c810175376b58d1c29cfe532541b correctly,
> this should be equivalent. Bart?

Yep. The final patch looks fine overall (thanks for adding TRIM support
also to drivers/ide), with the small exception for this chunk:

@@ -521,6 +567,9 @@ static int set_wcache(ide_drive_t *drive, int arg)

update_ordered(drive);

+ if (ata_id_has_trim(drive->id))
+ blk_queue_set_discard(drive->queue, idedisk_prepare_discard);
+
return err;
}

as it would fit better somewhere in ide_disk_setup() ISO set_wcache()...

[ When it comes to CodingStyle issues I don't care that much though I think
that applying Sergei's suggestions will make the patch more consistent with
the rest of drivers/ide code... ]

2009-04-05 12:30:44

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 1/5] Block: Discard may need to allocate pages

On 04/02/2009 05:37 PM, Matthew Wilcox wrote:
> From: Matthew Wilcox <[email protected]>
>
> SCSI and ATA both need to send data to the device. In order to do this,
> the BIO must be allocated with room for a page to be added, and the bio
> needs to be passed to the discard prep function. We also need to free
> the page attached to the BIO before we free it.
>
> init_request_from_bio() is not currently called from a context which
> forbids sleeping, and to make sure it stays that way (so we don't have
> to use GFP_ATOMIC), add a might_sleep() to it.
>

I understand you have inherited this code, but I think it is a bit of a mess
and you are only adding to the it.

> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> block/blk-barrier.c | 4 +++-
> block/blk-core.c | 4 +++-
> block/ioctl.c | 4 +++-
> drivers/mtd/mtd_blkdevs.c | 2 +-
> include/linux/blkdev.h | 3 ++-
> 5 files changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> index f7dae57..82a3035 100644
> --- a/block/blk-barrier.c
> +++ b/block/blk-barrier.c
> @@ -356,6 +356,8 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
> clear_bit(BIO_UPTODATE, &bio->bi_flags);
> }
>
> + if (bio_has_data(bio))
> + __free_page(bio_page(bio));

Page freed which was allocated by the LLD

> bio_put(bio);

OK bio was allocated by user code but shouldn't

> }
>
> @@ -387,7 +389,7 @@ int blkdev_issue_discard(struct block_device *bdev,
> return -EOPNOTSUPP;
>
> while (nr_sects && !ret) {
> - bio = bio_alloc(gfp_mask, 0);
> + bio = bio_alloc(gfp_mask, 1);

blkdev_issue_discard() and blk_ioctl_discard() has half a page
of common (and changing) code, could be done to use a common
helper that sets policy about bio allocation sizes and such.

Just my $0.017

> if (!bio)
> return -ENOMEM;
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 996ed90..7899761 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1095,6 +1095,8 @@ EXPORT_SYMBOL(blk_put_request);
>
> void init_request_from_bio(struct request *req, struct bio *bio)
> {
> + might_sleep();
> +
> req->cpu = bio->bi_comp_cpu;
> req->cmd_type = REQ_TYPE_FS;
>
> @@ -1118,7 +1120,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
> req->cmd_flags |= REQ_DISCARD;
> if (bio_barrier(bio))
> req->cmd_flags |= REQ_SOFTBARRIER;
> - req->q->prepare_discard_fn(req->q, req);
> + req->q->prepare_discard_fn(req->q, req, bio);

Allocation of bio page could be done commonly here.
The prepare_discard_fn() is made to return the needed size. It is not as if we actually
give the driver a choice about the allocation.

So now we allocate the page and free it at the same level.
And we do it only in one place.

Same common code in [PATCH 4/5] and [PATCH 4/5] is done once, here.

> } else if (unlikely(bio_barrier(bio)))
> req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
>
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 0f22e62..088a9ba 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -145,7 +145,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
> DECLARE_COMPLETION_ONSTACK(wait);
> struct bio *bio;
>
> - bio = bio_alloc(GFP_KERNEL, 0);
> + bio = bio_alloc(GFP_KERNEL, 1);

This is deja vu, don't you think ;)

> if (!bio)
> return -ENOMEM;
>
> @@ -170,6 +170,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
> ret = -EOPNOTSUPP;
> else if (!bio_flagged(bio, BIO_UPTODATE))
> ret = -EIO;
> + if (bio_has_data(bio))
> + __free_page(bio_page(bio));
> bio_put(bio);
> }
> return ret;
> diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
> index 1409f01..2b6ed4b 100644
> --- a/drivers/mtd/mtd_blkdevs.c
> +++ b/drivers/mtd/mtd_blkdevs.c
> @@ -33,7 +33,7 @@ struct mtd_blkcore_priv {
> };
>
> static int blktrans_discard_request(struct request_queue *q,
> - struct request *req)
> + struct request *req, struct bio *bio)
> {
> req->cmd_type = REQ_TYPE_LINUX_BLOCK;
> req->cmd[0] = REQ_LB_OP_DISCARD;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 465d6ba..9d9bd7b 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -260,7 +260,8 @@ typedef void (request_fn_proc) (struct request_queue *q);
> typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
> typedef int (prep_rq_fn) (struct request_queue *, struct request *);
> typedef void (unplug_fn) (struct request_queue *);
> -typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
> +typedef int (prepare_discard_fn) (struct request_queue *, struct request *,
> + struct bio *bio);
>
> struct bio_vec;
> struct bvec_merge_data {

I have one question:

At [PATCH 4/5] and [PATCH 4/5] you do:
+ struct page *page = alloc_page(GFP_KERNEL);

does that zero the alloced page? since if I understand correctly this page
will go on the wire, a SW target on the other size could snoop random Kernel
memory, is that allowed? OK I might be totally clueless here.

Have a good day
Boaz

2009-04-06 20:35:12

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/5] Block: Discard may need to allocate pages

On Sun, Apr 05, 2009 at 03:28:07PM +0300, Boaz Harrosh wrote:
> > + if (bio_has_data(bio))
> > + __free_page(bio_page(bio));
>
> Page freed which was allocated by the LLD

Not by the LLD. By the ULD.

> blkdev_issue_discard() and blk_ioctl_discard() has half a page
> of common (and changing) code, could be done to use a common
> helper that sets policy about bio allocation sizes and such.

Sure, could be done.

> > - req->q->prepare_discard_fn(req->q, req);
> > + req->q->prepare_discard_fn(req->q, req, bio);
>
> Allocation of bio page could be done commonly here.

Not all prepare_discard_fn() implementations need or want a bio page.
The CFA ERASE SECTORS command is a non-data command, for example.

> The prepare_discard_fn() is made to return the needed size. It is not as if we actually
> give the driver a choice about the allocation.

Umm ... prepare_discard_fn() needs to fill in the page. I don't
understand what code you propose here.

> > - bio = bio_alloc(GFP_KERNEL, 0);
> > + bio = bio_alloc(GFP_KERNEL, 1);
>
> This is deja vu, don't you think ;)

Yep.

> I have one question:
>
> At [PATCH 4/5] and [PATCH 4/5] you do:
> + struct page *page = alloc_page(GFP_KERNEL);
>
> does that zero the alloced page? since if I understand correctly this page
> will go on the wire, a SW target on the other size could snoop random Kernel
> memory, is that allowed? OK I might be totally clueless here.

The prepare_discard_fn() is responsible for not leaking data. The ATA
one does it via memset() to a 512 byte boundary. The SCSI one
initialises the 24 bytes that it sends on the wire. I don't think
either implementation leaks uninitialised data.

2009-04-07 00:02:53

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 3/5] ata: Add TRIM infrastructure

Matthew Wilcox wrote:
> This is common code shared between the IDE and libata implementations
>
> Signed-off-by: David Woodhouse <[email protected]>
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> include/linux/ata.h | 41 +++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 41 insertions(+), 0 deletions(-)

applied

2009-04-07 17:20:35

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Matthew Wilcox wrote:
> From: David Woodhouse <[email protected]>
>
> Signed-off-by: David Woodhouse <[email protected]>
> [modified to reduce amount of special casing needed for discard]
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> drivers/ide/ide-disk.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 49 insertions(+), 0 deletions(-)

OK, the linux/ata.h portion of TRIM support is now upstream. This can
go via Bart's tree.

2009-04-07 17:58:17

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Are there any commercially available SSDs that have this feature yet?
Anyone know which ones?

Thanks

2009-04-07 18:10:47

by Markus Trippelsdorf

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Tue, Apr 07, 2009 at 01:57:54PM -0400, Mark Lord wrote:
> Are there any commercially available SSDs that have this feature yet?
> Anyone know which ones?

OCZ will release a new firmware for their Vertex line in the coming
days, which will feature ATA TRIM support.
And since they get their firmware directly from Indilinx, all SSDs
with this controller will be TRIM ready in the future.

http://www.ocztechnologyforum.com/forum/showthread.php?t=54373

--
Markus

2009-04-07 19:59:24

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Markus Trippelsdorf wrote:
> On Tue, Apr 07, 2009 at 01:57:54PM -0400, Mark Lord wrote:
>> Are there any commercially available SSDs that have this feature yet?
>> Anyone know which ones?
>
> OCZ will release a new firmware for their Vertex line in the coming
> days, which will feature ATA TRIM support.
> And since they get their firmware directly from Indilinx, all SSDs
> with this controller will be TRIM ready in the future.
>
> http://www.ocztechnologyforum.com/forum/showthread.php?t=54373
..

Mmm.. sounds like they really don't understand "standards" there.
All of that nonsense about waiting until their proprietary "trim utility"
is working on Windows-7 before looking at a Linux implementation.

WTF? So long as they implement the required ATA opcode (correctly),
then it should "just work" fine on Linux. But that Windows-centred
discussion there really doesn't sound encouraging.

But hey, if they do get their act together, then the Vertex series
are supposedly the second-best consumer SSDs available these days,
after the highly touted/priced Intel ones. :)

Cheers

Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Thursday 02 April 2009 21:37:35 Bartlomiej Zolnierkiewicz wrote:
> On Thursday 02 April 2009, Matthew Wilcox wrote:
> > On Thu, Apr 02, 2009 at 08:38:03PM +0400, Sergei Shtylyov wrote:
> > > Hello.
> > >>>> + task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
> > >>>> + IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
> > >
> > >>> The last 3 flags are going to be obsoleted too...
> > >
> > >> So if I remove them today, the command will still work?
> > >
> > > s/obsoleted/renamed and moved to another other field/ -- I'm going to
> > > submit a patchset refactoring 'struct ide_cmd and 'struct ide-taskfile'
> > > at last...
> >
> > Since I'm coding to today's kernel, not to a patch which doesn't exist
> > yet, taking them out won't work very well.
> >
> > I've not been paying much attention to drivers/ide, so I've no idea
> > whether the following patch works. It does at least compile:
> >
> > +++ b/drivers/ide/ide-disk.c
> > @@ -407,7 +407,7 @@ static void idedisk_prepare_flush(struct request_queue *q, s
> > static int idedisk_prepare_discard(struct request_queue *q, struct request *rq,
> > struct bio *bio)
> > {
> > - ide_task_t *task;
> > + struct ide_cmd *task;
> > unsigned size;
> > struct page *page = alloc_page(GFP_KERNEL);
> > if (!page)
> > @@ -432,8 +432,8 @@ static int idedisk_prepare_discard(struct request_queue *q,
> >
> > task->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_OUT_HOB |
> > IDE_TFLAG_OUT_TF | IDE_TFLAG_OUT_DEVICE |
> > - IDE_TFLAG_DYN;
> > - task->data_phase = TASKFILE_OUT_DMA;
> > + IDE_TFLAG_DYN | IDE_TFLAG_WRITE;
> > + task->protocol = ATA_PROT_DMA;
> >
> > rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
> > rq->special = task;
> >
> > If I've understood 0dfb991c6943c810175376b58d1c29cfe532541b correctly,
> > this should be equivalent. Bart?
>
> Yep. The final patch looks fine overall (thanks for adding TRIM support
> also to drivers/ide), with the small exception for this chunk:
>
> @@ -521,6 +567,9 @@ static int set_wcache(ide_drive_t *drive, int arg)
>
> update_ordered(drive);
>
> + if (ata_id_has_trim(drive->id))
> + blk_queue_set_discard(drive->queue, idedisk_prepare_discard);
> +
> return err;
> }
>
> as it would fit better somewhere in ide_disk_setup() ISO set_wcache()...
>
> [ When it comes to CodingStyle issues I don't care that much though I think
> that applying Sergei's suggestions will make the patch more consistent with
> the rest of drivers/ide code... ]

I would like to see this merged for -rc1 so I tried to put it altogether
and then integrate it over recent IDE and block changes...

I ended up with the patch below which I'll push to Linus in a few minutes.

Once it hits Linus' tree please test that it still works. :)

Thanks.

From: David Woodhouse <[email protected]>
Subject: [PATCH 4/5] ide: Add support for TRIM

Bart: Since this is a new feature and hasn't seen much testing with
production devices it is not enabled by default yet (requires use of
"ide_core.trim=1" kernel parameter).

Signed-off-by: David Woodhouse <[email protected]>
[modified to reduce amount of special casing needed for discard]
Signed-off-by: Matthew Wilcox <[email protected]>
Cc: Jeff Garzik <[email protected]>
Cc: Sergei Shtylyov <[email protected]>
[bart: apply follow-up changes from Matthew]
[bart: apply CodingStyle fixups per Sergei's suggestion]
[bart: move blk_queue_set_discard() call to ide_disk_setup()]
[bart: port over recent IDE changes]
[bart: remove "bio" argument from idedisk_prepare_discard()]
[bart: s/PAGE_SIZE/PAGE_SIZE + 8/ + set ATA_LBA bit in tf.device]
[bart: add "ide_core.trim=" kernel parameter]
Signed-off-by: Bartlomiej Zolnierkiewicz <[email protected]>
---
drivers/ide/ide-disk.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
drivers/ide/ide.c | 6 +++++
2 files changed, 59 insertions(+)

Index: b/drivers/ide/ide-disk.c
===================================================================
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -415,6 +415,54 @@ static void idedisk_prepare_flush(struct
rq->special = cmd;
}

+static int idedisk_prepare_discard(struct request_queue *q, struct request *rq)
+{
+ struct ide_cmd *cmd;
+ struct page *page;
+ struct bio *bio = rq->bio;
+ unsigned int size;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto error;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ goto free_page;
+
+ size = ata_set_lba_range_entries(page_address(page), PAGE_SIZE / 8,
+ bio->bi_sector, bio_sectors(bio));
+ bio->bi_size = 0;
+
+ if (bio_add_pc_page(q, bio, page, size, 0) < size)
+ goto free_task;
+
+ cmd->tf.command = ATA_CMD_DSM;
+ cmd->tf.feature = ATA_DSM_TRIM;
+ cmd->tf.nsect = size / 512;
+ cmd->hob.nsect = (size / 512) >> 8;
+ cmd->tf.device = ATA_LBA;
+
+ cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+ cmd->valid.out.hob = IDE_VALID_OUT_HOB;
+
+ cmd->tf_flags = IDE_TFLAG_LBA48 | IDE_TFLAG_WRITE | IDE_TFLAG_DYN;
+ cmd->protocol = ATA_PROT_DMA;
+
+ rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
+ rq->special = cmd;
+ rq->nr_sectors = size / 512;
+
+ return 0;
+
+ free_task:
+ kfree(cmd);
+ free_page:
+ __free_page(page);
+ error:
+ return -ENOMEM;
+}
+
ide_devset_get(multcount, mult_count);

/*
@@ -606,6 +654,8 @@ static int ide_disk_check(ide_drive_t *d
return 1;
}

+extern int ide_trim;
+
static void ide_disk_setup(ide_drive_t *drive)
{
struct ide_disk_obj *idkp = drive->driver_data;
@@ -693,6 +743,9 @@ static void ide_disk_setup(ide_drive_t *

set_wcache(drive, 1);

+ if (ata_id_has_trim(id) && ide_trim)
+ blk_queue_set_discard(q, idedisk_prepare_discard);
+
if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
(drive->head == 0 || drive->head > 16)) {
printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n",
Index: b/drivers/ide/ide.c
===================================================================
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -178,6 +178,12 @@ EXPORT_SYMBOL_GPL(ide_pci_clk);
module_param_named(pci_clock, ide_pci_clk, int, 0);
MODULE_PARM_DESC(pci_clock, "PCI bus clock frequency (in MHz)");

+int ide_trim = 0;
+EXPORT_SYMBOL_GPL(ide_trim);
+
+module_param_named(trim, ide_trim, int, 0);
+MODULE_PARM_DESC(trim, "TRIM support (0=off, 1=on)");
+
static int ide_set_dev_param_mask(const char *s, struct kernel_param *kp)
{
int a, b, i, j = 1;

2009-04-07 22:16:15

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Tue, Apr 07, 2009 at 11:38:59PM +0200, Bartlomiej Zolnierkiewicz wrote:
> I would like to see this merged for -rc1 so I tried to put it altogether
> and then integrate it over recent IDE and block changes...

You're missing 1/5 and 2/5. This patch will crash horribly if you use
it with a drive which supports TRIM.

2009-04-07 22:27:25

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Matthew Wilcox wrote:
> On Tue, Apr 07, 2009 at 11:38:59PM +0200, Bartlomiej Zolnierkiewicz wrote:
>> I would like to see this merged for -rc1 so I tried to put it altogether
>> and then integrate it over recent IDE and block changes...
>
> You're missing 1/5 and 2/5. This patch will crash horribly if you use
> it with a drive which supports TRIM.

For things that need to hit along with the block layer, Bart and I could
Ack the IDE+libata patches, and Jens could merge all of them.

Or just poke Jens to review and merge the block stuff. Then the rest
can go at any time.

Jeff


Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Wednesday 08 April 2009 00:15:54 Matthew Wilcox wrote:
> On Tue, Apr 07, 2009 at 11:38:59PM +0200, Bartlomiej Zolnierkiewicz wrote:
> > I would like to see this merged for -rc1 so I tried to put it altogether
> > and then integrate it over recent IDE and block changes...
>
> You're missing 1/5 and 2/5. This patch will crash horribly if you use
> it with a drive which supports TRIM.

Heh, I knew something was out of place...

2009-04-08 07:14:23

by Markus Trippelsdorf

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Tue, Apr 07, 2009 at 03:58:41PM -0400, Mark Lord wrote:
> Markus Trippelsdorf wrote:
> > On Tue, Apr 07, 2009 at 01:57:54PM -0400, Mark Lord wrote:
> >> Are there any commercially available SSDs that have this feature yet?
> >> Anyone know which ones?
> >
> > OCZ will release a new firmware for their Vertex line in the coming
> > days, which will feature ATA TRIM support.
> > And since they get their firmware directly from Indilinx, all SSDs
> > with this controller will be TRIM ready in the future.
> >
> > http://www.ocztechnologyforum.com/forum/showthread.php?t=54373
> ..
>
> Mmm.. sounds like they really don't understand "standards" there.
> All of that nonsense about waiting until their proprietary "trim utility"
> is working on Windows-7 before looking at a Linux implementation.
>
> WTF? So long as they implement the required ATA opcode (correctly),
> then it should "just work" fine on Linux. But that Windows-centred
> discussion there really doesn't sound encouraging.
>
> But hey, if they do get their act together, then the Vertex series
> are supposedly the second-best consumer SSDs available these days,
> after the highly touted/priced Intel ones. :)

The new firmware is out and looks promising:

########## Begin SSD TRIM output ##########
word21 bit10 = 1: support for the DATA SET MANAGEMENT is changeable.
word169 bit0 = 1: device supports the Trim bit of DATA SET MANAGEMENT command.
word69 bit14 = 1: Trim function of the DATA SET MANAGEMENT command supports determinate behavior.
########## End SSD TRIM output ##########

This is the output from a modified hdparm, that checks the relevant
bits.

(I didn't run hdparm on my machine, because I'm waiting for a flasher that
doesn't require XP. I just copied the output from:
http://www.ocztechnologyforum.com/forum/showpost.php?p=371277&postcount=28 )

--
Markus

2009-04-08 14:25:52

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Markus Trippelsdorf wrote:
> ..
> The new firmware is out and looks promising:
>
> ########## Begin SSD TRIM output ##########
> word21 bit10 = 1: support for the DATA SET MANAGEMENT is changeable.
> word169 bit0 = 1: device supports the Trim bit of DATA SET MANAGEMENT command.
> word69 bit14 = 1: Trim function of the DATA SET MANAGEMENT command supports determinate behavior.
> ########## End SSD TRIM output ##########
>
> This is the output from a modified hdparm, that checks the relevant bits.
..

I'll release hdparm-9.14 shortly, with reporting for the TRIM feature in the -I output.

Cheers

2009-04-08 14:33:26

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Matthew Wilcox wrote:
> From: David Woodhouse <[email protected]>
>
> Signed-off-by: David Woodhouse <[email protected]>
> [modified to reduce amount of special casing needed for discard]
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
..
> + task->tf.command = ATA_CMD_DSM;
> + task->tf.feature = ATA_DSM_TRIM;
> + task->tf.hob_feature = 0x00;
> + task->tf.nsect = size / 512;
> + task->tf.hob_nsect = (size / 512) >> 8;
..

Matthew (or others),

Could you perhaps explain what the purpose of the data portion
of this command is for? The draft ATA spec I have here has mangled
text in the description -- like it's missing a crucial sentence there.

So it's not obvious exactly why this command even needs data to
be transfered.

Thanks

2009-04-08 14:44:23

by Dongjun Shin

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

On Wed, Apr 8, 2009 at 11:33 PM, Mark Lord <[email protected]> wrote:
>
> Matthew Wilcox wrote:
>>
>> From: David Woodhouse <[email protected]>
>>
>> Signed-off-by: David Woodhouse <[email protected]>
>> [modified to reduce amount of special casing needed for discard]
>> Signed-off-by: Matthew Wilcox <[email protected]>
>> ---
>
> ..
>>
>> + ? ? ? task->tf.command = ATA_CMD_DSM;
>> + ? ? ? task->tf.feature = ATA_DSM_TRIM;
>> + ? ? ? task->tf.hob_feature = 0x00;
>> + ? ? ? task->tf.nsect = size / 512;
>> + ? ? ? task->tf.hob_nsect = (size / 512) >> 8;
>
> ..
>
> Matthew (or others),
>
> Could you perhaps explain what the purpose of the data portion
> of this command is for? ?The draft ATA spec I have here has mangled
> text in the description -- like it's missing a crucial sentence there.
>
> So it's not obvious exactly why this command even needs data to
> be transfered.

It's for transfering list of LBA ranges which composed of
LBA offset (48bit) and length of data to be trimmed (16bit).

--
Dongjun

2009-04-08 15:00:48

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Mark Lord wrote:
> Matthew Wilcox wrote:
>> From: David Woodhouse <[email protected]>
>>
>> Signed-off-by: David Woodhouse <[email protected]>
>> [modified to reduce amount of special casing needed for discard]
>> Signed-off-by: Matthew Wilcox <[email protected]>
>> ---
> ..
>> + task->tf.command = ATA_CMD_DSM;
>> + task->tf.feature = ATA_DSM_TRIM;
>> + task->tf.hob_feature = 0x00;
>> + task->tf.nsect = size / 512;
>> + task->tf.hob_nsect = (size / 512) >> 8;
> ..
>
> Matthew (or others),
>
> Could you perhaps explain what the purpose of the data portion
> of this command is for? The draft ATA spec I have here has mangled
> text in the description -- like it's missing a crucial sentence there.
>
> So it's not obvious exactly why this command even needs data to
> be transfered.

It's a list of LBA ranges, like the NV-cache stuff recently added.

Jeff


2009-04-08 15:50:31

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 4/5] ide: Add support for TRIM

Jeff Garzik wrote:
> Mark Lord wrote:
..
>> So it's not obvious exactly why this command even needs data to
>> be transfered.
>
> It's a list of LBA ranges, like the NV-cache stuff recently added.
..

Ack. Thanks. I found the LBA range description (intact) in the drafts now.

-ml

2009-04-16 20:25:56

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 5/5] libata: Add support for TRIM

Matthew Wilcox wrote:
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---
> drivers/ata/libata-scsi.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 46 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index b9747fa..d4c8b8b 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
...

It works. :)

I'm using TRIM on a 120GB OCZ Vertex SSD here now,
and instrumented ata_discard_fn() to log what it does.
Seems to work perfectly, and the sector ranges for TRIM
match those given by "hdparm --fibmap" for the files I've tried.

Does this also TRIM metadata (eg. block bitmaps and the like) ?

Thanks Matthew (and others).

2009-04-17 19:45:01

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 5/5] libata: Add support for TRIM

Mark Lord wrote:
> Matthew Wilcox wrote:
>> Signed-off-by: Matthew Wilcox <[email protected]>
>> ---
>> drivers/ata/libata-scsi.c | 46
>> +++++++++++++++++++++++++++++++++++++++++++++
>> 1 files changed, 46 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
>> index b9747fa..d4c8b8b 100644
>> --- a/drivers/ata/libata-scsi.c
>> +++ b/drivers/ata/libata-scsi.c
> ...
>
> It works. :)
>
> I'm using TRIM on a 120GB OCZ Vertex SSD here now,
..

I take that back now. The command is apparently failing,
but we don't log the failure, and nor do we stop attempting
to continue to use it on future TRIMs.

I dug deeper into the kernel side, when my own attempts in hdparm
all failed, both using SG_IO directly and using the BLKDISCARD ioctl().

The commands are simply rejected by the drive, with status=0x51, err=0x04.

Everything *looks* okay on the Linux side, so I'm guessing that the OCZ
drive uses a vendor-unique opcode or protocol for it. They do have it
in there for a windows tool, but no further info than that.

-ml

2009-04-17 21:23:57

by Mark Lord

[permalink] [raw]
Subject: Re: [PATCH 1/5] Block: Discard may need to allocate pages

Matthew Wilcox wrote:
> From: Matthew Wilcox <[email protected]>
>
> SCSI and ATA both need to send data to the device. In order to do this,
> the BIO must be allocated with room for a page to be added, and the bio
> needs to be passed to the discard prep function. We also need to free
> the page attached to the BIO before we free it.
>
> init_request_from_bio() is not currently called from a context which
> forbids sleeping, and to make sure it stays that way (so we don't have
> to use GFP_ATOMIC), add a might_sleep() to it.
>
> Signed-off-by: Matthew Wilcox <[email protected]>
...
> @@ -1118,7 +1120,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
> req->cmd_flags |= REQ_DISCARD;
> if (bio_barrier(bio))
> req->cmd_flags |= REQ_SOFTBARRIER;
> - req->q->prepare_discard_fn(req->q, req);
> + req->q->prepare_discard_fn(req->q, req, bio);
> } else if (unlikely(bio_barrier(bio)))
> req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
>
..

Matthew, note that prepare_discardfn() may fail,
and return an error result. The above code does not
check for this or handle it very well.

Cheers