From: Saugata Das <[email protected]>
On eMMC and UFS devices there is a new feature of setting context with each
read or write. The idea is to classify the data from different files and
apply the realibility on the complete file instead of individual writes,
which helps in performance. A new address space operation has been a added
to get the context from file system and set up the bi_context field in bio.
Then we need to ensure that bio from different contexts are not merged. The
context is then passed to the underlying driver as part of the read or write
request. Since the number of MMC contexts is limited, multiple file system
contexts are mapped to single MMC context.
Signed-off-by: Saugata Das <[email protected]>
---
block/blk-core.c | 1 +
block/blk-merge.c | 3 +++
fs/mpage.c | 12 ++++++++++++
include/linux/blk_types.h | 1 +
include/linux/blkdev.h | 1 +
include/linux/buffer_head.h | 2 ++
include/linux/fs.h | 1 +
7 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 1f61b74..274e05d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1309,6 +1309,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
req->errors = 0;
req->__sector = bio->bi_sector;
req->ioprio = bio_prio(bio);
+ req->context = bio->bi_context;
blk_rq_bio_prep(req->q, req, bio);
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f..ed70d56 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -497,6 +497,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_integrity(bio) != blk_integrity_rq(rq))
return false;
+ if (bio->bi_context != rq->bio->bi_context)
+ return false;
+
return true;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c..4889842 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -293,6 +293,12 @@ alloc_new:
goto confused;
}
+ if (page && page->mapping && page->mapping->a_ops &&
+ page->mapping->a_ops->get_context)
+ bio->bi_context = page->mapping->a_ops->get_context(page);
+ else
+ bio->bi_context = 0;
+
length = first_hole << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(READ, bio);
@@ -581,6 +587,12 @@ alloc_new:
goto confused;
}
+ if (page && page->mapping && page->mapping->a_ops &&
+ page->mapping->a_ops->get_context)
+ bio->bi_context = page->mapping->a_ops->get_context(page);
+ else
+ bio->bi_context = 0;
+
/*
* Must try to add the page before marking the buffer clean or
* the confused fail path above (OOM) will be very confused when
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd..f3ac448 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -42,6 +42,7 @@ struct bio {
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
+ unsigned long bi_context; /* context of this bio */
/* Number of segments in this BIO after
* physical address coalescing is performed.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2aa2466..0dd9a08 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -167,6 +167,7 @@ struct request {
struct list_head timeout_list;
unsigned int timeout;
int retries;
+ unsigned long context; /* context of this request */
/*
* completion callback.
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 13bba17..0776564 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -72,6 +72,8 @@ struct buffer_head {
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
+ unsigned long b_context; /* context for this buffer within the
+ storage device */
atomic_t b_count; /* users using this buffer_head */
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8de6755..4b379d8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -626,6 +626,7 @@ struct address_space_operations {
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
+ int (*get_context)(struct page *);
};
extern const struct address_space_operations empty_aops;
--
1.7.4.3
From: Saugata Das <[email protected]>
On eMMC and UFS devices there is a new feature of setting context with
each read or write. The idea is to classify the data from different files
and apply the realibility on the complete file instead of individual writes.
On ext4 file system, the inode number of the file is passed as b_context
in the bh structure during write and via the get_context callback function
during read. Since number of MMC contexts is limited, multiple file system
contexts are mapped to single MMC context.
Signed-off-by: Saugata Das <[email protected]>
---
fs/ext4/inode.c | 33 +++++++++++++++++++++++++++++++++
fs/ext4/page-io.c | 1 +
2 files changed, 34 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 754fe77..2667396 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -790,6 +790,21 @@ static int do_journal_get_write_access(handle_t *handle,
return ret;
}
+/* Get the context of the buffer within the underlying storage device */
+static int ext4_get_context(struct page *page)
+{
+ if (page && page->mapping && page->mapping->host)
+ return page->mapping->host->i_ino;
+ else
+ return 0;
+}
+
+static int ext4_set_buffer_context(handle_t *handle, struct buffer_head *bh)
+{
+ bh->b_context = ext4_get_context(bh->b_page);
+ return 0;
+}
+
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
@@ -843,6 +858,11 @@ retry:
from, to, NULL, do_journal_get_write_access);
}
+ if (!ret && walk_page_buffers(NULL, page_buffers(page),
+ from, to, NULL, ext4_set_buffer_context)) {
+ ext4_warning(inode->i_sb, "Couldn't set context\n");
+ }
+
if (ret) {
unlock_page(page);
page_cache_release(page);
@@ -2394,8 +2414,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
handle_t *handle;
+ unsigned from, to;
index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
if (ext4_nonda_switch(inode->i_sb)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -2444,6 +2467,12 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+
+ if (walk_page_buffers(NULL, page_buffers(page),
+ from, to, NULL, ext4_set_buffer_context)) {
+ ext4_warning(inode->i_sb, "Couldn't set context\n");
+ }
+
out:
return ret;
}
@@ -3040,6 +3069,7 @@ static const struct address_space_operations ext4_ordered_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .get_context = ext4_get_context,
};
static const struct address_space_operations ext4_writeback_aops = {
@@ -3055,6 +3085,7 @@ static const struct address_space_operations ext4_writeback_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .get_context = ext4_get_context,
};
static const struct address_space_operations ext4_journalled_aops = {
@@ -3070,6 +3101,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .get_context = ext4_get_context,
};
static const struct address_space_operations ext4_da_aops = {
@@ -3086,6 +3118,7 @@ static const struct address_space_operations ext4_da_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .get_context = ext4_get_context,
};
void ext4_set_aops(struct inode *inode)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..bf1381e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -296,6 +296,7 @@ static int io_submit_init(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
+ bio->bi_context = bh->b_context;
bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
--
1.7.4.3
From: Saugata Das <[email protected]>
This patch implements the context ID support at MMC layer. From file system
(ext4), the context is passed in the request structure. At MMC layer the
context is retrieved from the request structure and then used in the CMD23
argument. Since number of MMC contexts is limited, multiple file system
contexts are mapped to single MMC contexts. When the REQ_SYNC or REQ_FLUSH
flag is set, the context is flushed or sync'ed, in which the context is
closed so that the data blocks are safely written out to non-volatile memory
and then the context is opened again.
Signed-off-by: Saugata Das <[email protected]>
---
drivers/mmc/card/block.c | 37 ++++++++++++++++++++++++++-
drivers/mmc/core/core.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
drivers/mmc/core/mmc.c | 25 +++++++++++++++++++
include/linux/mmc/card.h | 6 ++++
include/linux/mmc/core.h | 4 +++
include/linux/mmc/host.h | 1 +
include/linux/mmc/mmc.h | 2 +
7 files changed, 134 insertions(+), 2 deletions(-)
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index dabec55..4760d5a 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -958,6 +958,15 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
struct mmc_card *card = md->queue.card;
int ret = 0;
+ /*
+ * The flush command is a synchronization point from file system.
+ * The contexts are flushed here to ensure that the data written
+ * in the open contexts are saved reliably in non-volatile media
+ */
+ ret = mmc_flush_contexts(card);
+ if (ret)
+ ret = -EIO;
+
ret = mmc_flush_cache(card);
if (ret)
ret = -EIO;
@@ -1207,11 +1216,16 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
*/
if ((md->flags & MMC_BLK_CMD23) && mmc_op_multi(brq->cmd.opcode) &&
(do_rel_wr || !(card->quirks & MMC_QUIRK_BLK_NO_CMD23) ||
- do_data_tag)) {
+ do_data_tag || (card->ext_csd.max_context_id > 0))) {
+ int context_id = (req->context &&
+ card->ext_csd.max_context_id) ?
+ (req->context % card->ext_csd.max_context_id + 1) :
+ 0;
brq->sbc.opcode = MMC_SET_BLOCK_COUNT;
brq->sbc.arg = brq->data.blocks |
(do_rel_wr ? (1 << 31) : 0) |
- (do_data_tag ? (1 << 29) : 0);
+ (do_data_tag ? (1 << 29) : 0) |
+ (!do_data_tag ? (context_id << 25) : 0);
brq->sbc.flags = MMC_RSP_R1 | MMC_CMD_AC;
brq->mrq.sbc = &brq->sbc;
}
@@ -1440,6 +1454,25 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
mmc_blk_issue_rw_rq(mq, NULL);
ret = mmc_blk_issue_flush(mq, req);
} else {
+ if (req && (req->cmd_flags & REQ_SYNC) &&
+ req->context && card->ext_csd.max_context_id) {
+ int context_cfg_id =
+ req->context % card->ext_csd.max_context_id;
+ /*
+ * The SYNC command is a synchronization point from
+ * file system. The relevent context is sync'ed here
+ * to ensure that the data written in the open context
+ * are saved reliably in non-volatile media
+ */
+ if (card->host->areq)
+ mmc_blk_issue_rw_rq(mq, NULL);
+ mmc_sync_context(card, context_cfg_id);
+ /*
+ * This write will go without context to ensure
+ * that it is reliably written
+ */
+ req->context = 0;
+ }
ret = mmc_blk_issue_rw_rq(mq, req);
}
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index ba821fe..54857f9 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2262,6 +2262,67 @@ int mmc_cache_ctrl(struct mmc_host *host, u8 enable)
}
EXPORT_SYMBOL(mmc_cache_ctrl);
+static inline int mmc_set_context_conf(struct mmc_card *card,
+ int context_cfg_id, int context_act_dir)
+{
+ return mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+ EXT_CSD_CONTEXT_CONF + context_cfg_id,
+ context_act_dir, card->ext_csd.generic_cmd6_time);
+}
+
+/*
+ * Synchronize a context by first closing the context and then
+ * opening it
+ */
+int mmc_sync_context(struct mmc_card *card, int context_cfg_id)
+{
+ int err = 0;
+
+ err = mmc_set_context_conf(card, context_cfg_id, MMC_CONTEXT_CLOSE);
+ if (err)
+ return err;
+
+ err = mmc_set_context_conf(card, context_cfg_id, MMC_CONTEXT_ACT_RW);
+ return err;
+}
+EXPORT_SYMBOL(mmc_sync_context);
+
+int mmc_flush_contexts(struct mmc_card *card)
+{
+ int i, err = 0;
+
+ for (i = 0; i < card->ext_csd.max_context_id; i++) {
+ int err1 = mmc_sync_context(card, i);
+ err = (err1 && !err) ? err1 : err;
+ }
+ return err;
+}
+EXPORT_SYMBOL(mmc_flush_contexts);
+
+/*
+ * Initialize all the MMC contexts in read-write and non-LU mode
+ */
+int mmc_init_context(struct mmc_card *card)
+{
+ int i, err = 0;
+
+ for (i = 0; i < card->ext_csd.max_context_id; i++) {
+ err = mmc_set_context_conf(card, i, MMC_CONTEXT_ACT_RW);
+ if (err) {
+ pr_warning("%s: Activating of context %d failed [%x]\n",
+ mmc_hostname(card->host), i, err);
+ break;
+ }
+ }
+
+ if (!err)
+ return 0;
+
+ card->ext_csd.max_context_id = i;
+ return err;
+}
+EXPORT_SYMBOL(mmc_init_context);
+
#ifdef CONFIG_PM
/**
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 54df5ad..77476a3 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -533,6 +533,16 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
} else {
card->ext_csd.data_tag_unit_size = 0;
}
+
+ card->ext_csd.max_context_id =
+ ext_csd[EXT_CSD_CONTEXT_CAPABILITIES] & 0x0f;
+
+ if (card->ext_csd.max_context_id < VALID_MAX_MMC_CONTEXT_ID) {
+ pr_warning("%s: card has invalid number of contexts [%d]\n",
+ mmc_hostname(card->host),
+ card->ext_csd.max_context_id);
+ card->ext_csd.max_context_id = 0;
+ }
}
out:
@@ -1267,6 +1277,21 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
}
}
+ if (host->caps2 & MMC_CAP2_CONTEXT) {
+ if (card->ext_csd.max_context_id > 0) {
+ err = mmc_init_context(card);
+ if (err && err != -EBADMSG)
+ goto free_card;
+ if (err) {
+ pr_warning("%s: failed to activate context (%x)\n",
+ mmc_hostname(card->host), err);
+ card->ext_csd.max_context_id = 0;
+ err = 0;
+ }
+ }
+ } else
+ card->ext_csd.max_context_id = 0;
+
if (!oldcard)
host->card = card;
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 629b823..3d60849 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -74,6 +74,7 @@ struct mmc_ext_csd {
unsigned int hpi_cmd; /* cmd used as HPI */
unsigned int data_sector_size; /* 512 bytes or 4KB */
unsigned int data_tag_unit_size; /* DATA TAG UNIT size */
+ unsigned int max_context_id;
unsigned int boot_ro_lock; /* ro lock support */
bool boot_ro_lockable;
u8 raw_partition_support; /* 160 */
@@ -184,6 +185,11 @@ struct sdio_func_tuple;
#define MMC_NUM_PHY_PARTITION 6
#define MAX_MMC_PART_NAME_LEN 20
+#define MAX_MMC_CONTEXT_ID 15
+#define VALID_MAX_MMC_CONTEXT_ID 5
+#define MMC_CONTEXT_CLOSE 0
+#define MMC_CONTEXT_ACT_RW 3
+
/*
* MMC Physical partitions
*/
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 1b431c7..a4e6bc9 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -179,6 +179,10 @@ extern int mmc_try_claim_host(struct mmc_host *host);
extern int mmc_flush_cache(struct mmc_card *);
+extern int mmc_sync_context(struct mmc_card *card, int context_id);
+extern int mmc_flush_contexts(struct mmc_card *card);
+extern int mmc_init_context(struct mmc_card *card);
+
extern int mmc_detect_card_removed(struct mmc_host *host);
/**
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0707d22..688348f 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -233,6 +233,7 @@ struct mmc_host {
#define MMC_CAP2_NO_SLEEP_CMD (1 << 4) /* Don't allow sleep command */
#define MMC_CAP2_HS200_1_8V_SDR (1 << 5) /* can support */
#define MMC_CAP2_HS200_1_2V_SDR (1 << 6) /* can support */
+#define MMC_CAP2_CONTEXT (1<<7) /* Context ID supported */
#define MMC_CAP2_HS200 (MMC_CAP2_HS200_1_8V_SDR | \
MMC_CAP2_HS200_1_2V_SDR)
#define MMC_CAP2_BROKEN_VOLTAGE (1 << 7) /* Use the broken voltage */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index b822a2c..96d73aa 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -274,6 +274,7 @@ struct _mmc_csd {
#define EXT_CSD_FLUSH_CACHE 32 /* W */
#define EXT_CSD_CACHE_CTRL 33 /* R/W */
#define EXT_CSD_POWER_OFF_NOTIFICATION 34 /* R/W */
+#define EXT_CSD_CONTEXT_CONF 37 /* R/W */
#define EXT_CSD_DATA_SECTOR_SIZE 61 /* R */
#define EXT_CSD_GP_SIZE_MULT 143 /* R/W */
#define EXT_CSD_PARTITION_ATTRIBUTE 156 /* R/W */
@@ -316,6 +317,7 @@ struct _mmc_csd {
#define EXT_CSD_POWER_OFF_LONG_TIME 247 /* RO */
#define EXT_CSD_GENERIC_CMD6_TIME 248 /* RO */
#define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */
+#define EXT_CSD_CONTEXT_CAPABILITIES 496 /* RO */
#define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */
#define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */
#define EXT_CSD_HPI_FEATURES 503 /* RO */
--
1.7.4.3
On Mon, 2012-06-11 at 16:16 +0530, Saugata Das wrote:
> +/* Get the context of the buffer within the underlying storage device */
> +static int ext4_get_context(struct page *page)
> +{
> + if (page && page->mapping && page->mapping->host)
> + return page->mapping->host->i_ino;
> + else
> + return 0;
> +}
Word "context" is very generic and it is widely used various things, and
I believe we should try to avoid overloading this term and obfuscating
the I/O stack with various functions and other identifiers like
"get_context()". This would hurt readability. It is fine to use it
withing the UFS-specific code, but not globally withing the kernel code.
I do not really have good name candidates, but even "ufscontext" is
already better than just "context". Or "iocontext" ? Or just "ufsdata" ?
--
Best Regards,
Artem Bityutskiy
On Mon, Jun 11, 2012 at 02:41:31PM +0300, Artem Bityutskiy wrote:
>
> Word "context" is very generic and it is widely used various things, and
> I believe we should try to avoid overloading this term and obfuscating
> the I/O stack with various functions and other identifiers like
> "get_context()". This would hurt readability. It is fine to use it
> withing the UFS-specific code, but not globally withing the kernel code.
>
> I do not really have good name candidates, but even "ufscontext" is
> already better than just "context". Or "iocontext" ? Or just "ufsdata" ?
Before we try naming it, can we get some more details about exactly
how context in the eMMC context works?
It appears to be a way of grouping related writes together (yes?) but
at what granularity? What are the restrictions at the device level?
The proof-of-concept patches seem to use the inode number as a way of
trying to group related writes, but what about at a larger level than
that? For example, if we install a RPM or deb package where all of
the files will likely be replaced together, should that be given the
same context?
How likely does it have to be that related blocks written under the
same context must be deleted at the same time for this concept to be
helpful? If we have a context where it is the context assumption does
not hold (example: a database where you have a random access
read/write pattern with blocks updated in place) how harm will it be
to the device format if those blocks are written under the same
context?
The next set of questions we need to ask is how generalizable is this
concept to devices that might be more sophisticated than simple eMMC
devices. If we're going to expose something all the way out to the
file system layer, it would be nice if it worked on more than just
low-end flash devices, but also on more sophisticated devices as well.
Regards,
- Ted
On 11 June 2012 17:57, Ted Ts'o <[email protected]> wrote:
> On Mon, Jun 11, 2012 at 02:41:31PM +0300, Artem Bityutskiy wrote:
>>
>> Word "context" is very generic and it is widely used various things, and
>> I believe we should try to avoid overloading this term and obfuscating
>> the I/O stack with various functions and other identifiers like
>> "get_context()". This would hurt readability. It is fine to use it
>> withing the UFS-specific code, but not globally withing the kernel code.
>>
>> I do not really have good name candidates, but even "ufscontext" is
>> already better than just "context". Or "iocontext" ? Or just "ufsdata" ?
>
> Before we try naming it, can we get some more details about exactly
> how context in the eMMC context works?
>
> It appears to be a way of grouping related writes together (yes?) but
> at what granularity? ?What are the restrictions at the device level?
>
Yes, the idea is to group the read, write requests for a file to a
common context so that MMC can optimize the performance.
There is no restriction on the number of blocks which can be added in
the context. However, MMC restricts the number of contexts to 15. So,
potentially, multiple file system contexts will map to single MMC
context.
> The proof-of-concept patches seem to use the inode number as a way of
> trying to group related writes, but what about at a larger level than
> that? ?For example, if we install a RPM or deb package where all of
> the files will likely be replaced together, should that be given the
> same context?
In this patch, context is used at file level based on inode number.
So, in the above example, multiple contexts will be used for the
directory, file updates during RPM installation.
>
> How likely does it have to be that related blocks written under the
> same context must be deleted at the same time for this concept to be
> helpful?
There is no restriction that related blocks within the MMC context
needs to be deleted together
> If we have a context where it is the context assumption does
> not hold (example: a database where you have a random access
> read/write pattern with blocks updated in place) how harm will it be
> to the device format if those blocks are written under the same
> context?
>
MMC context allows the data blocks to be overwritten or randomly accessed
> The next set of questions we need to ask is how generalizable is this
> concept to devices that might be more sophisticated than simple eMMC
> devices. ?If we're going to expose something all the way out to the
> file system layer, it would be nice if it worked on more than just
> low-end flash devices, but also on more sophisticated devices as well.
>
This context mechanism will be used on both UFS and MMC devices. If
there are some alternate suggestions on what can be used as context
from file system perspective, then please suggest.
> Regards,
>
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?- Ted
On Tue, Jun 12, 2012 at 05:51:22PM +0530, Saugata Das wrote:
> > The proof-of-concept patches seem to use the inode number as a way of
> > trying to group related writes, but what about at a larger level than
> > that? ?For example, if we install a RPM or deb package where all of
> > the files will likely be replaced together, should that be given the
> > same context?
>
> In this patch, context is used at file level based on inode number.
> So, in the above example, multiple contexts will be used for the
> directory, file updates during RPM installation.
Yes --- I was trying to explore if we should try to do better than
that. For example, it would probably be a good idea (at minimum) to
hash some kind of partition id into the inode number to form the
context id. It's probably rare that a MMC card would be partitioned,
but it might be much more common for other flash devices.
I could also imagine schemes where via some fcntl, additional file
descriptors could be joined to the context of another file descriptor.
Or maybe we have a scheme whereby some high-level, abstract, context
id's could be generated, and assigned to a group of file descriptors.
A hueristic where all file creations that occur within close together
in time are considered to be related might work well for the package
installation case, but might break down in others. Which is why I
asked the question of what is the downside if the hueristics screw up,
and occasionally group together files/blocks that aren't related with
respect to when they are deallocated (since that *is* what the flash
devices care about, right?)
- Ted
On Tuesday 12 June 2012, Saugata Das wrote:
> On 11 June 2012 17:57, Ted Ts'o <[email protected]> wrote:
> > On Mon, Jun 11, 2012 at 02:41:31PM +0300, Artem Bityutskiy wrote:
> > The proof-of-concept patches seem to use the inode number as a way of
> > trying to group related writes, but what about at a larger level than
> > that? For example, if we install a RPM or deb package where all of
> > the files will likely be replaced together, should that be given the
> > same context?
>
> In this patch, context is used at file level based on inode number.
> So, in the above example, multiple contexts will be used for the
> directory, file updates during RPM installation.
>
> >
> > How likely does it have to be that related blocks written under the
> > same context must be deleted at the same time for this concept to be
> > helpful?
>
> There is no restriction that related blocks within the MMC context
> needs to be deleted together
I don't think that is correct. The most obvious implementation in eMMC
hardware for this would be to group all data from one context to be
written into the same erase block, in order to reduce the amount
of garbage collection that needs to happen at erase time. AFAICT,
the main interest here is, as Ted is guessing correctly, to make sure
that all data which gets written into one context has roughly the
same life time before it gets erased or overwritten.
> > If we have a context where it is the context assumption does
> > not hold (example: a database where you have a random access
> > read/write pattern with blocks updated in place) how harm will it be
> > to the device format if those blocks are written under the same
> > context?
> >
>
> MMC context allows the data blocks to be overwritten or randomly accessed
That is of course the defined behavior of a block device that does
not change with the use of contexts. To get the best performance,
a random-write database file would always reside in a context by itself
and not get mixed with long-lived write-once data. If we have a way
in the file system to tell whether a file is written linearly or randomly
(e.g. by looking at the O_APPEND or O_CREAT flag), it might make sense
to split the context space accordingly.
> > The next set of questions we need to ask is how generalizable is this
> > concept to devices that might be more sophisticated than simple eMMC
> > devices. If we're going to expose something all the way out to the
> > file system layer, it would be nice if it worked on more than just
> > low-end flash devices, but also on more sophisticated devices as well.
> >
>
> This context mechanism will be used on both UFS and MMC devices. If
> there are some alternate suggestions on what can be used as context
> from file system perspective, then please suggest.
One suggestion that has been made before was to base the context on
the process ID rather than the inode number, but that has many other
problems, e.g. when the same file gets written by multiple processes.
Arnd
On 12 June 2012 18:59, Arnd Bergmann <[email protected]> wrote:
> On Tuesday 12 June 2012, Saugata Das wrote:
>> On 11 June 2012 17:57, Ted Ts'o <[email protected]> wrote:
>> > On Mon, Jun 11, 2012 at 02:41:31PM +0300, Artem Bityutskiy wrote:
>> > The proof-of-concept patches seem to use the inode number as a way of
>> > trying to group related writes, but what about at a larger level than
>> > that? ?For example, if we install a RPM or deb package where all of
>> > the files will likely be replaced together, should that be given the
>> > same context?
>>
>> In this patch, context is used at file level based on inode number.
>> So, in the above example, multiple contexts will be used for the
>> directory, file updates during RPM installation.
>>
>> >
>> > How likely does it have to be that related blocks written under the
>> > same context must be deleted at the same time for this concept to be
>> > helpful?
>>
>> There is no restriction that related blocks within the MMC context
>> needs to be deleted together
>
> I don't think that is correct. The most obvious implementation in eMMC
> hardware for this would be to group all data from one context to be
> written into the same erase block, in order to reduce the amount
> of garbage collection that needs to happen at erase time. AFAICT,
> the main interest here is, as Ted is guessing correctly, to make sure
> that all data which gets written into one context has roughly the
> same life time before it gets erased or overwritten.
>
The restriction is there on "large unit" context, which prevents
trim/erase of the blocks till the context is active. But we do not
enable "large unit". On non-"large unit" context, the specification
does not restrict the trim/erase of blocks based on context.
>> > If we have a context where it is the context assumption does
>> > not hold (example: a database where you have a random access
>> > read/write pattern with blocks updated in place) how harm will it be
>> > to the device format if those blocks are written under the same
>> > context?
>> >
>>
>> MMC context allows the data blocks to be overwritten or randomly accessed
>
> That is of course the defined behavior of a block device that does
> not change with the use of contexts. To get the best performance,
> a random-write database file would always reside in a context by itself
> and not get mixed with long-lived write-once data. If we have a way
> in the file system to tell whether a file is written linearly or randomly
> (e.g. by looking at the O_APPEND or O_CREAT flag), it might make sense
> to split the context space accordingly.
>
>> > The next set of questions we need to ask is how generalizable is this
>> > concept to devices that might be more sophisticated than simple eMMC
>> > devices. ?If we're going to expose something all the way out to the
>> > file system layer, it would be nice if it worked on more than just
>> > low-end flash devices, but also on more sophisticated devices as well.
>> >
>>
>> This context mechanism will be used on both UFS and MMC devices. If
>> there are some alternate suggestions on what can be used as context
>> from file system perspective, then please ?suggest.
>
> One suggestion that has been made before was to base the context on
> the process ID rather than the inode number, but that has many other
> problems, e.g. when the same file gets written by multiple processes.
>
> ? ? ? ?Arnd
On Tuesday 12 June 2012, Saugata Das wrote:
> > I don't think that is correct. The most obvious implementation in eMMC
> > hardware for this would be to group all data from one context to be
> > written into the same erase block, in order to reduce the amount
> > of garbage collection that needs to happen at erase time. AFAICT,
> > the main interest here is, as Ted is guessing correctly, to make sure
> > that all data which gets written into one context has roughly the
> > same life time before it gets erased or overwritten.
>
> The restriction is there on "large unit" context, which prevents
> trim/erase of the blocks till the context is active. But we do not
> enable "large unit". On non-"large unit" context, the specification
> does not restrict the trim/erase of blocks based on context.
As I said, it's not a technical limitation, but a logical conclusion
from trying to use the context ID for something useful. The only
reason to use context ID in the first place is to reduce the amount
of garbage collection in the device (improving performance and expected
life of the device), so any context ID annotations we make should be
directed at giving useful information to the device to actually do that.
Arnd
On Tue, Jun 12, 2012 at 02:55:50PM +0000, Arnd Bergmann wrote:
>
> As I said, it's not a technical limitation, but a logical conclusion
> from trying to use the context ID for something useful. The only
> reason to use context ID in the first place is to reduce the amount
> of garbage collection in the device (improving performance and expected
> life of the device), so any context ID annotations we make should be
> directed at giving useful information to the device to actually do that.
... and a big part of that is knowing what is the downside if we give
incorrect information to the device. And what are the exact
implications of what it means to group a set of blocks into a
"context".
If it is fundamentally a promise that blocks in a context will be
overwritten or trimmed at the same time then is it counterproductive
to group blocks for overwrite-in-place database where the lifetimes of
the block are extremely different? Is that giving "wrong" information
going to significantly increase the write amplification factor?
It may be that the standard doesn't actually answer these questions
and even worse, SSD manufactures may be stupidly trying to keep this
stuff as a "trade securet" --- but we do need to know in order to
optimize performance on real hardware....
- Ted
On Tuesday 12 June 2012, Ted Ts'o wrote:
> On Tue, Jun 12, 2012 at 02:55:50PM +0000, Arnd Bergmann wrote:
> >
> > As I said, it's not a technical limitation, but a logical conclusion
> > from trying to use the context ID for something useful. The only
> > reason to use context ID in the first place is to reduce the amount
> > of garbage collection in the device (improving performance and expected
> > life of the device), so any context ID annotations we make should be
> > directed at giving useful information to the device to actually do that.
>
> ... and a big part of that is knowing what is the downside if we give
> incorrect information to the device. And what are the exact
> implications of what it means to group a set of blocks into a
> "context".
>
> If it is fundamentally a promise that blocks in a context will be
> overwritten or trimmed at the same time then is it counterproductive
> to group blocks for overwrite-in-place database where the lifetimes of
> the block are extremely different? Is that giving "wrong" information
> going to significantly increase the write amplification factor?
I don't think that can be derived from the definition of the context.
Instead, the important part is that we separate the data with predictable
lifetime from data with unpredictable lifetime. If we happen to be
writing both a linear file on the one hand (or multiple such files) and
at the same time updating a database, any reasonable implementation would
be able to benefit from the fact that the linear data is now in a different
erase block from the random-access data. The database file is still
screwed like it is without context support, but it no longer makes
the linear access worse.
> It may be that the standard doesn't actually answer these questions
> and even worse, SSD manufactures may be stupidly trying to keep this
> stuff as a "trade securet" --- but we do need to know in order to
> optimize performance on real hardware....
Right. The danger here is that the context support was described in
the standard first, while none of the devices seem to even be
smart enough to make use of the information we put in there. Once
operating systems start putting some data in there, at least
some manufacturers will start making use of that data to optimize
the accesses, but it's very unlikely that they will tell us exactly
what they are doing. Having code in ext4 that uses the contexts will
at least make it more likely that the firmware optimizations are
based on ext4 measurements rather than some other file system or
operating system.
>From talking with the emmc device vendors, I can tell you that ext4
is very high on the list of file systems to optimize for, because
they all target Android products.
Arnd
On Tue, Jun 12, 2012 at 08:07:28PM +0000, Arnd Bergmann wrote:
> Right. The danger here is that the context support was described in
> the standard first, while none of the devices seem to even be
> smart enough to make use of the information we put in there. Once
> operating systems start putting some data in there, at least
> some manufacturers will start making use of that data to optimize
> the accesses, but it's very unlikely that they will tell us exactly
> what they are doing. Having code in ext4 that uses the contexts will
> at least make it more likely that the firmware optimizations are
> based on ext4 measurements rather than some other file system or
> operating system.
>
> From talking with the emmc device vendors, I can tell you that ext4
> is very high on the list of file systems to optimize for, because
> they all target Android products.
Well, I have a contact at SanDisk where I can discuss things under
NDA, if that will help. He had reached out to me specifically because
of ext4 and Android --- he's the guy that I invited to give a talk at
the LSF workshop last year.
- Ted
On Tuesday 12 June 2012, Ted Ts'o wrote:
> On Tue, Jun 12, 2012 at 08:07:28PM +0000, Arnd Bergmann wrote:
> > Right. The danger here is that the context support was described in
> > the standard first, while none of the devices seem to even be
> > smart enough to make use of the information we put in there. Once
> > operating systems start putting some data in there, at least
> > some manufacturers will start making use of that data to optimize
> > the accesses, but it's very unlikely that they will tell us exactly
> > what they are doing. Having code in ext4 that uses the contexts will
> > at least make it more likely that the firmware optimizations are
> > based on ext4 measurements rather than some other file system or
> > operating system.
> >
> > From talking with the emmc device vendors, I can tell you that ext4
> > is very high on the list of file systems to optimize for, because
> > they all target Android products.
>
> Well, I have a contact at SanDisk where I can discuss things under
> NDA, if that will help. He had reached out to me specifically because
> of ext4 and Android --- he's the guy that I invited to give a talk at
> the LSF workshop last year.
Well, the Linaro storage team is in close contact with Alex Lemberg
from Sandisk, Luca Porzio from Micron and Hyojin Jeong from Samsung,
and we discussed this patch in our meeting two weeks ago and on
our Linaro mailing lists before that.
I have a good feeling about that work relationship, and they
all understand the needs of the Linux file systems, but my impression
is also that with an NDA in place we would not be able to put any
better implementation into the Linux kernel that makes use of hw
details of one of the manufacturers. Also note that the eMMC standard
is intentionally written in an abstract way to give the hardware
manufacturers the option to provide better implementations over time,
e.g. when new devices start using large amounts of cache, or replace
NAND flash with phase change memory or other technologies.
That said, I think it is rather clear what the authors of the spec
had in mind, and there is only one reasonable implementation given
current flash technology: You get something like a log structured
file system with 15 contexts, where each context writes to exactly
one erase block at a given time. This is not all that different
from how eMMC/SD/USB works already without context support, the main
difference being that the context normally gets picked based on the
LBA of the write in segments between 512KB and 16MB. Because the number
of active contexts is smaller than the number of total segments in
the device, the device keeps an LRU list of something between 5 and
30 segments.
Letting the file system pick the context number based on information
it has about the contents rather than the LBA should reduce the amount
of garbage collection if there is a stronger correlation between life
times of data written to the same context than there is between
life times of data written to adjacent LBA numbers.
The trouble with this is of course that getting the file system to
do a really good job at picking the context numbers is a harder
task than coming up with a block allocation scheme that just gets
it right for devices without context ID support ;-).
I think using the inode number is a reasonable fit. Using the
inode number of the parent directory might be more appropriate
but it breaks with hard links and cross-directory renames (we
must not use the same LBA with conflicting context numbers,
or flush the old context inbetween).
Arnd
On Wed, Jun 13, 2012 at 07:44:35PM +0000, Arnd Bergmann wrote:
>
> I think using the inode number is a reasonable fit. Using the
> inode number of the parent directory might be more appropriate
> but it breaks with hard links and cross-directory renames (we
> must not use the same LBA with conflicting context numbers,
> or flush the old context inbetween).
I think the inode number of the parent directory by itself is actually
*not* a good idea, because there are plenty of cases where files in
the same directory do not have the same life time. For example,
consider your openoffice files in ~/Documents, for example. Or worse,
the files in ~/Downloads written by your web browser.
It might be worth considering the hueristic of a series of files
written by a single process close together in time as belonging to a
single context. That still might not be quite right in the case of a
git checkout for example, most of the time I think that hueristic
would be quite valid.
One thing that *would* be worth consider when trying to decide the
right granularity for a context would be the size of the erase block.
If the erase block is 2 megs, and we are writing a lot of 8 meg files,
a per-inode context granularity probably makes a lot of sense.
OTOH, if the erase block size is 8mb, and we are writing a whole bunch
of small files, we probably want to use a much more aggressive way of
aggregating relating blocks than just "inodes" that average in size of
say, 32k or 128k. Getting this information may requiring leaning
rather hard on the eMMC manufacturers, since they (irrationally, in my
opinion) think this should be trade secret information. :-(
- Ted
On Wednesday 13 June 2012, Ted Ts'o wrote:
> On Wed, Jun 13, 2012 at 07:44:35PM +0000, Arnd Bergmann wrote:
> >
> > I think using the inode number is a reasonable fit. Using the
> > inode number of the parent directory might be more appropriate
> > but it breaks with hard links and cross-directory renames (we
> > must not use the same LBA with conflicting context numbers,
> > or flush the old context inbetween).
>
> I think the inode number of the parent directory by itself is actually
> not a good idea, because there are plenty of cases where files in
> the same directory do not have the same life time. For example,
> consider your openoffice files in ~/Documents, for example. Or worse,
> the files in ~/Downloads written by your web browser.
Well, using the lower 4 bits of the inode number has an even higher chance
of putting stuff in the same category that does not belong there.
E.g. if you write 1000 small files in a row, they are likely to be
in just one directory, or a small number of directories, but using the
inode number as the context ID, we end up spreading them over all 15
contexts even though it would be appropriate to have them all in the
same one.
> It might be worth considering the hueristic of a series of files
> written by a single process close together in time as belonging to a
> single context. That still might not be quite right in the case of a
> git checkout for example, most of the time I think that hueristic
> would be quite valid.
I agree that using the process as an indication would be nice, but
I could not come up with a way to ensure that we use the same
context ID if two processes are writing to the same file.
> One thing that would be worth consider when trying to decide the
> right granularity for a context would be the size of the erase block.
> If the erase block is 2 megs, and we are writing a lot of 8 meg files,
> a per-inode context granularity probably makes a lot of sense.
>
> OTOH, if the erase block size is 8mb, and we are writing a whole bunch
> of small files, we probably want to use a much more aggressive way of
> aggregating relating blocks than just "inodes" that average in size of
> say, 32k or 128k.
I think ideally we would also want to write small files separately from
large files in the file system, and that would also make support for
contexts less useful.
For any large (sufficiently larger than erasesize) files, it would also
be nice if the extents were aligned on erase block boundaries. Again,
if we do this, using context annotations should have no benefit over
just using the default context.
> Getting this information may requiring leaning
> rather hard on the eMMC manufacturers, since they (irrationally, in my
> opinion) think this should be trade secret information. :-(
For eMMC at least the erase block size is information that we should
be able to figure out. While I've seen devices that are lying there,
the representatives of the eMMC manufactures that I talked to basically
agreed that we should take the provided information to be correct
and if it happens to be wrong, that should be considered a firmware
bug that may result in bad performance and should be fixed in the
next version.
For SD cards, almost everyone is lying and we cannot trust the
information, and for USB flash, there is no way to ask the device.
In both of these cases, we probably want to detect the erase block
size at mkfs time using some timing attack that I worked on before.
Note that those devices also do not offer support for context IDs.
Arnd
On Wed, Jun 13, 2012 at 08:43:47PM +0000, Arnd Bergmann wrote:
> > It might be worth considering the hueristic of a series of files
> > written by a single process close together in time as belonging to a
> > single context. That still might not be quite right in the case of a
> > git checkout for example, most of the time I think that hueristic
> > would be quite valid.
>
> I agree that using the process as an indication would be nice, but
> I could not come up with a way to ensure that we use the same
> context ID if two processes are writing to the same file.
Oh, well *that's* easy. Whichever process opens the file drops a
context ID into fs-specific inode structure (for ext4, that would be
struct ext4_inode_info), and if a second process opens the file, we
use the same context ID. When the last file descriptor for the inode
is closed, we zap the context ID.
It also occurs to me that if a file is being written to by two
processes, it's likely that it's a update-in-place database, and we
want to treat those special; no matter what the size, we probably
don't want to group that file into the same context as the others.
More generally, if a file is opened without O_CREAT, it's probably a
good bet that it wants to either be in a context by itself, or not
part of any context.
The files which we would probably find most interesting is the files
which are created from scratch, and more specifically, for files which
are dumped out all at once: i.e., open w/O_CREAT, optional fallocate,
write, optional fsync, and close. If we can detect a series of file
operations with this characteristic originating from the same process,
when we detect a second open w/O_CREAT very shortly after the first
O_CREAT in the same directory from the same process, we simply reuse
the context ID for the second and subsequent files.
> I think ideally we would also want to write small files separately from
> large files in the file system, and that would also make support for
> contexts less useful.
Well, for file systems with delayed allocation, this is actually
pretty easy. By the time we do the writeback for a file with delayed
allocation, if it's substantially bigger than the erase block size and
we haven't yet written any blocks for the file, we should give it a
new context ID. And furthermore, your idea that we should try to
align the file on an erase block boundary would be a great thing to
do.
> For eMMC at least the erase block size is information that we should
> be able to figure out. While I've seen devices that are lying there,
> the representatives of the eMMC manufactures that I talked to basically
> agreed that we should take the provided information to be correct
> and if it happens to be wrong, that should be considered a firmware
> bug that may result in bad performance and should be fixed in the
> next version.
What would be *great* is if the erase block size were exposed in
sysfs, and that the blockid library (which is how mke2fs and other
similar mkfs programs get other storage device parameters) were
enhanced to return this information.
> For SD cards, almost everyone is lying and we cannot trust the
> information, and for USB flash, there is no way to ask the device.
> In both of these cases, we probably want to detect the erase block
> size at mkfs time using some timing attack that I worked on before.
> Note that those devices also do not offer support for context IDs.
Yes, although presumably aligning large files to erase block
boundaries would still be useful, yes?
So adding an erase block size to the ext2/3/4 superblock sounds like a
first great step. By making it be a superblock field, that way it's
possible to override the value returned by the hardware if it turns
out to be a lie, and we can also use programs like flashbench to
figure out the erase block size and populate the superblock value via
some userspace process. (Possibly called out of mke2fs directly if we
can automate it completely, and make it dead reliable.)
- Ted
On Wed, 13 Jun 2012, Ted Ts'o wrote:
> On Wed, Jun 13, 2012 at 08:43:47PM +0000, Arnd Bergmann wrote:
>
> > For SD cards, almost everyone is lying and we cannot trust the
> > information, and for USB flash, there is no way to ask the device.
> > In both of these cases, we probably want to detect the erase block
> > size at mkfs time using some timing attack that I worked on before.
> > Note that those devices also do not offer support for context IDs.
>
> Yes, although presumably aligning large files to erase block
> boundaries would still be useful, yes?
>
> So adding an erase block size to the ext2/3/4 superblock sounds like a
> first great step. By making it be a superblock field, that way it's
> possible to override the value returned by the hardware if it turns
> out to be a lie, and we can also use programs like flashbench to
> figure out the erase block size and populate the superblock value via
> some userspace process. (Possibly called out of mke2fs directly if we
> can automate it completely, and make it dead reliable.)
Let's not forget that, in almost all cases, filesystem images are not
created live on the final medium. Factories are picking a batch of
flash devices and a pre-built filesystem image is stamped on them, and
there might not even be a guarantee that those flash devices will all
have the same characteristics. So to say that making this tuning at
mkfs time is probably not the best strategy.
Nicolas
On Thu, 2012-06-14 at 12:14 -0400, Nicolas Pitre wrote:
> Let's not forget that, in almost all cases, filesystem images are not
> created live on the final medium. Factories are picking a batch of
> flash devices and a pre-built filesystem image is stamped on them, and
> there might not even be a guarantee that those flash devices will all
> have the same characteristics. So to say that making this tuning at
> mkfs time is probably not the best strategy.
Good point. But we can always set the eraseblock size in the superblock
to -1 which would mean "unknown".
--
Best Regards,
Artem Bityutskiy
On Thu, Jun 14, 2012 at 12:14:13PM -0400, Nicolas Pitre wrote:
> Let's not forget that, in almost all cases, filesystem images are not
> created live on the final medium. Factories are picking a batch of
> flash devices and a pre-built filesystem image is stamped on them, and
> there might not even be a guarantee that those flash devices will all
> have the same characteristics. So to say that making this tuning at
> mkfs time is probably not the best strategy.
Sure, that just means there needs to be a way of overriding the values
used by mke2fs. (Because as you create the fs image, the storage
device parameters may make a difference to how blocks get allocated.)
The reason why I talk about making it work automatically at mke2fs
time is that the vast majority of created file systems (where a
specially created fs by a handset vendor counts as "one", even if it
then gets stamped on millions of devices), the end user is someone
naive/oblivious, so the right thing *has* to happen by default in the
common case of running mke2fs on the storage device where the file
system gets used.
- Ted
On Thu, 14 Jun 2012, Ted Ts'o wrote:
> On Thu, Jun 14, 2012 at 12:14:13PM -0400, Nicolas Pitre wrote:
> > Let's not forget that, in almost all cases, filesystem images are not
> > created live on the final medium. Factories are picking a batch of
> > flash devices and a pre-built filesystem image is stamped on them, and
> > there might not even be a guarantee that those flash devices will all
> > have the same characteristics. So to say that making this tuning at
> > mkfs time is probably not the best strategy.
>
> Sure, that just means there needs to be a way of overriding the values
> used by mke2fs. (Because as you create the fs image, the storage
> device parameters may make a difference to how blocks get allocated.)
>
> The reason why I talk about making it work automatically at mke2fs
> time is that the vast majority of created file systems (where a
> specially created fs by a handset vendor counts as "one", even if it
> then gets stamped on millions of devices), the end user is someone
> naive/oblivious, so the right thing *has* to happen by default in the
> common case of running mke2fs on the storage device where the file
> system gets used.
Absolutely. However it is fair to say that less than 0.01% of total end
users will even think of running mke2fs on their device. So another
strategy that can be executed at run time when the fs is live would be
required too.
Nicolas
On Thursday 14 June 2012, Ted Ts'o wrote:
> On Wed, Jun 13, 2012 at 08:43:47PM +0000, Arnd Bergmann wrote:
> > > It might be worth considering the hueristic of a series of files
> > > written by a single process close together in time as belonging to a
> > > single context. That still might not be quite right in the case of a
> > > git checkout for example, most of the time I think that hueristic
> > > would be quite valid.
> >
> > I agree that using the process as an indication would be nice, but
> > I could not come up with a way to ensure that we use the same
> > context ID if two processes are writing to the same file.
>
> Oh, well *that's* easy. Whichever process opens the file drops a
> context ID into fs-specific inode structure (for ext4, that would be
> struct ext4_inode_info), and if a second process opens the file, we
> use the same context ID. When the last file descriptor for the inode
> is closed, we zap the context ID.
Right, that would work.
> It also occurs to me that if a file is being written to by two
> processes, it's likely that it's a update-in-place database, and we
> want to treat those special; no matter what the size, we probably
> don't want to group that file into the same context as the others.
> More generally, if a file is opened without O_CREAT, it's probably a
> good bet that it wants to either be in a context by itself, or not
> part of any context.
I think in the latter case, we actually want the database file to
be in its own context as well, to let the device know that it's
different from the other data that we send without a context.
Saugata just proposed on IRC that we could split the available
set of contexts into some that are used for linear access and
others that are used for random access. We can also make use
of POSIX_FADV_SEQUENTIAL/POSIX_FADV_RANDOM in an application
to put a file into one of these categories.
As soon as we get into the territory of the file system being
smart about keeping separate contexts for some files rather than
just using the low bits of the inode number or the pid, we get
more problems:
* The block device needs to communicate the number of available
contexts to the file system
* We have to arbitrate between contexts used on different partitions
of the same device
> The files which we would probably find most interesting is the files
> which are created from scratch, and more specifically, for files which
> are dumped out all at once: i.e., open w/O_CREAT, optional fallocate,
> write, optional fsync, and close. If we can detect a series of file
> operations with this characteristic originating from the same process,
> when we detect a second open w/O_CREAT very shortly after the first
> O_CREAT in the same directory from the same process, we simply reuse
> the context ID for the second and subsequent files.
Yes, makes sense.
> > I think ideally we would also want to write small files separately from
> > large files in the file system, and that would also make support for
> > contexts less useful.
>
> Well, for file systems with delayed allocation, this is actually
> pretty easy. By the time we do the writeback for a file with delayed
> allocation, if it's substantially bigger than the erase block size and
> we haven't yet written any blocks for the file, we should give it a
> new context ID. And furthermore, your idea that we should try to
> align the file on an erase block boundary would be a great thing to
> do.
My feeling is that we would actually benefit much more from the
erase block alignment than from the context for the large files.
There is one more option we have to give the best possible performance,
although that would be a huge amount of work to implement:
Any large file gets put into its own context, and we mark that
context "write-only" "unreliable" and "large-unit". This means the
file system has to write the file sequentially, filling one erase
block at a time, writing only "superpage" units (e.g. 16KB) or
multiples of that at once. We can neither overwrite nor read back
any of the data in that context until it is closed, and there is
no guarantee that any of the data has made it to the physical medium
before the context is closed. We are allowed to do read and write
accesses to any other context between superpage writes though.
After closing the context, the data will be just like any other
block again.
Right now, there is no support for large-unit context and also not for
read-only or write-only contexts, which means we don't have to
enforce strict policies and can basically treat the context ID
as a hint. Using the advanced features would require that we
keep track of the context IDs across partitions and have to flush
write-only contexts before reading the data again. If we want to
do that, we can probably discard the patch series and start over.
> > For eMMC at least the erase block size is information that we should
> > be able to figure out. While I've seen devices that are lying there,
> > the representatives of the eMMC manufactures that I talked to basically
> > agreed that we should take the provided information to be correct
> > and if it happens to be wrong, that should be considered a firmware
> > bug that may result in bad performance and should be fixed in the
> > next version.
>
> What would be *great* is if the erase block size were exposed in
> sysfs, and that the blockid library (which is how mke2fs and other
> similar mkfs programs get other storage device parameters) were
> enhanced to return this information.
For eMMC and SD devices, it's available in the preferred_erase_size
sysfs attribute, but other devices don't have that. What we've also
discussed in the past is to make that size available to the
I/O scheduler in order to implement a way to flush out all writes
for a given erase block at once, because that essentially comes
for free once we do the first write into that erase block.
That value would have to be user-selectable though, and we need
to come up with a way to do that for partitioned devices. While it
would be nice for ext4 to be able to set the property of the
block device based on the superblock data, that would fail as soon
as we have multiple partitions with conflicting settings.
> > For SD cards, almost everyone is lying and we cannot trust the
> > information, and for USB flash, there is no way to ask the device.
> > In both of these cases, we probably want to detect the erase block
> > size at mkfs time using some timing attack that I worked on before.
> > Note that those devices also do not offer support for context IDs.
>
> Yes, although presumably aligning large files to erase block
> boundaries would still be useful, yes?
Yes, very much so.
> So adding an erase block size to the ext2/3/4 superblock sounds like a
> first great step. By making it be a superblock field, that way it's
> possible to override the value returned by the hardware if it turns
> out to be a lie, and we can also use programs like flashbench to
> figure out the erase block size and populate the superblock value via
> some userspace process. (Possibly called out of mke2fs directly if we
> can automate it completely, and make it dead reliable.)
I think this is something we can do in the Linaro storage team.
We actually have plans to also put the erase block size in the swap
header, so we should be able to use the same code in mke2fs and mkswap,
and potentially others. What we discussed in the storage team meeting
today is that we start out by making ext4 aware of the erase block
size through the superblock and aligning extents for large files to
erase block boundaries.
If that works out well, the second step would be to detect which small
files are use a random-write pattern and group them in erase blocks
that are distinct from erase blocks for linear-write files.
Arnd
On 2012-06-14, at 3:55 PM, Arnd Bergmann wrote:
> My feeling is that we would actually benefit much more from the
> erase block alignment than from the context for the large files.
>
> I think this is something we can do in the Linaro storage team.
> We actually have plans to also put the erase block size in the swap
> header, so we should be able to use the same code in mke2fs and mkswap,
> and potentially others. What we discussed in the storage team meeting
> today is that we start out by making ext4 aware of the erase block
> size through the superblock and aligning extents for large files to
> erase block boundaries.
Note that there are already the s_raid_stride and s_raid_stripe_width,
used by the ext4 allocator to align the start and size of allocations
on RAID systems. The erase block size would be like s_raid_stride
(the minimum amount of data to allocate and write contiguously).
I don't know that there is a benefit to having a separate erase block
size, since in the end it means the same as s_raid_stride to the
allocator - make sure allocations/writes are aligned and sized on
multiples of this.
Cheers, Andreas
On Thursday 14 June 2012, Nicolas Pitre wrote:
> On Thu, 14 Jun 2012, Ted Ts'o wrote:
>
> > The reason why I talk about making it work automatically at mke2fs
> > time is that the vast majority of created file systems (where a
> > specially created fs by a handset vendor counts as "one", even if it
> > then gets stamped on millions of devices), the end user is someone
> > naive/oblivious, so the right thing has to happen by default in the
> > common case of running mke2fs on the storage device where the file
> > system gets used.
>
> Absolutely. However it is fair to say that less than 0.01% of total end
> users will even think of running mke2fs on their device. So another
> strategy that can be executed at run time when the fs is live would be
> required too.
The trouble is that detecting the erase block size requires us to
write specific patterns to the device, which is generally a bad
idea after the file system has been created.
I think the best we can do is
* default to "unspecified" as before
* if "unspecified", make the file system ask the block device. in
case of eMMC, that will usually be reliable
* Add an option to mkfs and tunefs to hardcode a specific size for
users that know the size and can't rely on the blockdev reporting
it correctly to the file system.
* Add an option to mkfs to autodetect the size for the drive it's
run on.
Arnd
On Friday 15 June 2012, Andreas Dilger wrote:
> On 2012-06-14, at 3:55 PM, Arnd Bergmann wrote:
> > My feeling is that we would actually benefit much more from the
> > erase block alignment than from the context for the large files.
> >
> > I think this is something we can do in the Linaro storage team.
> > We actually have plans to also put the erase block size in the swap
> > header, so we should be able to use the same code in mke2fs and mkswap,
> > and potentially others. What we discussed in the storage team meeting
> > today is that we start out by making ext4 aware of the erase block
> > size through the superblock and aligning extents for large files to
> > erase block boundaries.
>
> Note that there are already the s_raid_stride and s_raid_stripe_width,
> used by the ext4 allocator to align the start and size of allocations
> on RAID systems. The erase block size would be like s_raid_stride
> (the minimum amount of data to allocate and write contiguously).
>
> I don't know that there is a benefit to having a separate erase block
> size, since in the end it means the same as s_raid_stride to the
> allocator - make sure allocations/writes are aligned and sized on
> multiples of this.
Good point. For flash drives, the specific optimizations we do might
be different from what we do on RAID, but they have enough in common
that we could use the same mechanism to detect them.
Is ext4 able to cope well with stride sizes between 512KB and 24MB?
Arnd
On 2012-06-15, at 3:25 AM, Arnd Bergmann wrote:
> On Friday 15 June 2012, Andreas Dilger wrote:
>> On 2012-06-14, at 3:55 PM, Arnd Bergmann wrote:
>>> My feeling is that we would actually benefit much more from the
>>> erase block alignment than from the context for the large files.
>>>
>>> I think this is something we can do in the Linaro storage team.
>>> We actually have plans to also put the erase block size in the swap
>>> header, so we should be able to use the same code in mke2fs and mkswap,
>>> and potentially others. What we discussed in the storage team meeting
>>> today is that we start out by making ext4 aware of the erase block
>>> size through the superblock and aligning extents for large files to
>>> erase block boundaries.
>>
>> Note that there are already the s_raid_stride and s_raid_stripe_width,
>> used by the ext4 allocator to align the start and size of allocations
>> on RAID systems. The erase block size would be like s_raid_stride
>> (the minimum amount of data to allocate and write contiguously).
>>
>> I don't know that there is a benefit to having a separate erase block
>> size, since in the end it means the same as s_raid_stride to the
>> allocator - make sure allocations/writes are aligned and sized on
>> multiples of this.
>
> Good point. For flash drives, the specific optimizations we do might
> be different from what we do on RAID, but they have enough in common
> that we could use the same mechanism to detect them.
>
> Is ext4 able to cope well with stride sizes between 512KB and 24MB?
It is typically used on RAID arrays with 1MB or 4MB alignment. It is
considerably more CPU efficient to use power-of-two alignment, but it
is also possible to use non-power-of-two values if needed, so long as
they are at least a multiple of the block size.
Cheers, Andreas
On Friday 15 June 2012, Andreas Dilger wrote:
>
> On 2012-06-15, at 3:25 AM, Arnd Bergmann wrote:
> >
> > Good point. For flash drives, the specific optimizations we do might
> > be different from what we do on RAID, but they have enough in common
> > that we could use the same mechanism to detect them.
> >
> > Is ext4 able to cope well with stride sizes between 512KB and 24MB?
>
> It is typically used on RAID arrays with 1MB or 4MB alignment.
Ok, that sounds like it's the same order of magnitued, which definitely
helps. The most common erase block sizes today are 4 MB and 8 MB,
though they tend to double every one or two years.
> It is considerably more CPU efficient to use power-of-two alignment,
> but it is also possible to use non-power-of-two values if needed, so
> long as they are at least a multiple of the block size.
I see. It's quite common to have a multiple of three nowadays (1.5M B,
3 MB, 6 MB, 12 MB) because of the way that TLC flash is getting used,
but I've also seen TLC based devices that cut a 4 MB erase block in
three parts, rounded to the next superpage size (1376+1376+1344 KB).
In the former case, we might represent that as a stride=2^n and
stripe-width=3*stride if that helps, in the latter case it sounds
like it has to be stride=stripe-width=2^n anyway.
Usually we're interested in the larger of these two sizes anyway.
Arnd
On Fri, Jun 15, 2012 at 09:19:23AM +0000, Arnd Bergmann wrote:
>
> The trouble is that detecting the erase block size requires us to
> write specific patterns to the device, which is generally a bad
> idea after the file system has been created.
How much space do you need? It's not hard to allocate a bunch of
space, in a file, use FIEMAP ioctl to verify that you have a
contiguous range of blocks, and then do direct I/O into that region.
> I think the best we can do is
>
> * default to "unspecified" as before
> * if "unspecified", make the file system ask the block device. in
> case of eMMC, that will usually be reliable
> * Add an option to mkfs and tunefs to hardcode a specific size for
> users that know the size and can't rely on the blockdev reporting
> it correctly to the file system.
> * Add an option to mkfs to autodetect the size for the drive it's
> run on.
Well, I think we can do better; the question is whether or not it's
worth the effort. It may not be....
- Ted
On Thu, Jun 14, 2012 at 09:55:31PM +0000, Arnd Bergmann wrote:
>
> As soon as we get into the territory of the file system being
> smart about keeping separate contexts for some files rather than
> just using the low bits of the inode number or the pid, we get
> more problems:
>
> * The block device needs to communicate the number of available
> contexts to the file system
> * We have to arbitrate between contexts used on different partitions
> of the same device
Can't we virtualize this? Would this work?
The file system can simply create as many virtual contexts as it
likes; if there are no more contexts available, the block device
simply closes the least recently used context (no matter what
partition). If the file system tries to use a virtual context where
the underlying physical context has been closed, the block device will
simply open a new physical context (possibly closing some other old
context).
> There is one more option we have to give the best possible performance,
> although that would be a huge amount of work to implement:
>
> Any large file gets put into its own context, and we mark that
> context "write-only" "unreliable" and "large-unit". This means the
> file system has to write the file sequentially, filling one erase
> block at a time, writing only "superpage" units (e.g. 16KB) or
> multiples of that at once. We can neither overwrite nor read back
> any of the data in that context until it is closed, and there is
> no guarantee that any of the data has made it to the physical medium
> before the context is closed. We are allowed to do read and write
> accesses to any other context between superpage writes though.
> After closing the context, the data will be just like any other
> block again.
Oh, that's cool. And I don't think that's hard to do. We could just
keep a flag in the in-core inode indicating whether it is in "large
unit" mode. If it is in large unit mode, we can make the fs writeback
function make sure that we adhere to the restrictions of the large
unit mode, and if at any point we need to do something that might
violate the constraints, the file system would simply close the
context.
The only reason I can think of why this might be problematic is if
there is a substantial performance cost involved with opening and
closing contexts on eMMC devices. Is that an issue we need to be
worried about?
> Right now, there is no support for large-unit context and also not for
> read-only or write-only contexts, which means we don't have to
> enforce strict policies and can basically treat the context ID
> as a hint. Using the advanced features would require that we
> keep track of the context IDs across partitions and have to flush
> write-only contexts before reading the data again. If we want to
> do that, we can probably discard the patch series and start over.
Well, I'm interested in getting something upstream, which is useful
not just for the consumer-grade eMMC devices in handsets, but which
might also be extensible to SSD's, and all the way up to PCIe-attached
flash devices that might be used in large data centers.
I think if we do things right, it should be possible to do something
which would accomodate a large range of devices (which is why I
brought up the concept of exposing virtualized contexts to the file
system layer).
Regards,
- Ted
On 2012-06-15, at 4:04 PM, Ted Ts'o wrote:
> On Thu, Jun 14, 2012 at 09:55:31PM +0000, Arnd Bergmann wrote:
>> There is one more option we have to give the best possible performance,
>> although that would be a huge amount of work to implement:
>>
>> Any large file gets put into its own context, and we mark that
>> context "write-only" "unreliable" and "large-unit". This means the
>> file system has to write the file sequentially, filling one erase
>> block at a time, writing only "superpage" units (e.g. 16KB) or
>> multiples of that at once. We can neither overwrite nor read back
>> any of the data in that context until it is closed, and there is
>> no guarantee that any of the data has made it to the physical medium
>> before the context is closed. We are allowed to do read and write
>> accesses to any other context between superpage writes though.
>> After closing the context, the data will be just like any other
>> block again.
>
> Oh, that's cool. And I don't think that's hard to do. We could just
> keep a flag in the in-core inode indicating whether it is in "large
> unit" mode. If it is in large unit mode, we can make the fs writeback
> function make sure that we adhere to the restrictions of the large
> unit mode, and if at any point we need to do something that might
> violate the constraints, the file system would simply close the
> context.
This is very similar to what was implemented in mballoc preallocation.
Large files will get their own preallocation context, while small files
would share a context (i.e. an 8MB extent) and be packed densely into
this extent to avoid seeking. It wouldn't be unreasonable to just give
each mballoc context a different eMMC context.
> The only reason I can think of why this might be problematic is if
> there is a substantial performance cost involved with opening and
> closing contexts on eMMC devices. Is that an issue we need to be
> worried about?
>
>> Right now, there is no support for large-unit context and also not for
>> read-only or write-only contexts, which means we don't have to
>> enforce strict policies and can basically treat the context ID
>> as a hint. Using the advanced features would require that we
>> keep track of the context IDs across partitions and have to flush
>> write-only contexts before reading the data again. If we want to
>> do that, we can probably discard the patch series and start over.
>
> Well, I'm interested in getting something upstream, which is useful
> not just for the consumer-grade eMMC devices in handsets, but which
> might also be extensible to SSD's, and all the way up to PCIe-attached
> flash devices that might be used in large data centers.
>
> I think if we do things right, it should be possible to do something
> which would accomodate a large range of devices (which is why I
> brought up the concept of exposing virtualized contexts to the file
> system layer).
>
> Regards,
>
> - Ted
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Cheers, Andreas
On Friday 15 June 2012, Ted Ts'o wrote:
> On Fri, Jun 15, 2012 at 09:19:23AM +0000, Arnd Bergmann wrote:
> >
> > The trouble is that detecting the erase block size requires us to
> > write specific patterns to the device, which is generally a bad
> > idea after the file system has been created.
>
> How much space do you need? It's not hard to allocate a bunch of
> space, in a file, use FIEMAP ioctl to verify that you have a
> contiguous range of blocks, and then do direct I/O into that region.
We need a few erase blocks, spaced apart by a few erase blocks each.
Since we don't have to detect the number of erase blocks that the
device can handle, a small number would be ok I guess, so in order
to detect an 8 MB erase block correctly, we might use 3 erase blocks
that are spaced apart by 5 erase blocks, for a total of 104MB.
Once we figure out the erase block size, it would also help to
verify that we can write to at least e.g. 5 blocks concurrently
without triggering garbage collection, so we can print a warning
if it doesn't.
Arnd
On Friday 15 June 2012, Andreas Dilger wrote:
> > Oh, that's cool. And I don't think that's hard to do. We could just
> > keep a flag in the in-core inode indicating whether it is in "large
> > unit" mode. If it is in large unit mode, we can make the fs writeback
> > function make sure that we adhere to the restrictions of the large
> > unit mode, and if at any point we need to do something that might
> > violate the constraints, the file system would simply close the
> > context.
>
> This is very similar to what was implemented in mballoc preallocation.
> Large files will get their own preallocation context, while small files
> would share a context (i.e. an 8MB extent) and be packed densely into
> this extent to avoid seeking. It wouldn't be unreasonable to just give
> each mballoc context a different eMMC context.
My understanding is that once we do that, we have already won much more
than we can by using contexts, because we get perfect write patterns.
The only thing that contexts would still buy us is that the device has
more freedom to cache things separately in each context if we write
with less than superpage alignment.
Is the mballoc algorithm you describe something that is already implemented
with the semantics you describe, or is there something we need to change
still, e.g. making sure that all allocations are aligned to the stripe-width?
Arnd
On Friday 15 June 2012, Ted Ts'o wrote:
> On Thu, Jun 14, 2012 at 09:55:31PM +0000, Arnd Bergmann wrote:
> >
> > As soon as we get into the territory of the file system being
> > smart about keeping separate contexts for some files rather than
> > just using the low bits of the inode number or the pid, we get
> > more problems:
> >
> > * The block device needs to communicate the number of available
> > contexts to the file system
> > * We have to arbitrate between contexts used on different partitions
> > of the same device
>
> Can't we virtualize this? Would this work?
>
> The file system can simply create as many virtual contexts as it
> likes; if there are no more contexts available, the block device
> simply closes the least recently used context (no matter what
> partition). If the file system tries to use a virtual context where
> the underlying physical context has been closed, the block device will
> simply open a new physical context (possibly closing some other old
> context).
Yes, that sounds like a useful thing to do. It just means that we
have to throw away and redo all the patches, but I think that's ok.
> > There is one more option we have to give the best possible performance,
> > although that would be a huge amount of work to implement:
> >
> > Any large file gets put into its own context, and we mark that
> > context "write-only" "unreliable" and "large-unit". This means the
> > file system has to write the file sequentially, filling one erase
> > block at a time, writing only "superpage" units (e.g. 16KB) or
> > multiples of that at once. We can neither overwrite nor read back
> > any of the data in that context until it is closed, and there is
> > no guarantee that any of the data has made it to the physical medium
> > before the context is closed. We are allowed to do read and write
> > accesses to any other context between superpage writes though.
> > After closing the context, the data will be just like any other
> > block again.
>
> Oh, that's cool. And I don't think that's hard to do. We could just
> keep a flag in the in-core inode indicating whether it is in "large
> unit" mode. If it is in large unit mode, we can make the fs writeback
> function make sure that we adhere to the restrictions of the large
> unit mode, and if at any point we need to do something that might
> violate the constraints, the file system would simply close the
> context.
Really? I actually had expected this to be a major issue, to the
point that I thought we would only ever do large contexts in
special emmc-optimized file sytems.
> The only reason I can think of why this might be problematic is if
> there is a substantial performance cost involved with opening and
> closing contexts on eMMC devices. Is that an issue we need to be
> worried about?
I don't think so. Opening a context should basically be free, and
while closing a context can take some time, my understanding is
that in a sensible implementation that time would never be more
than the time we saved in the first place by using the context:
With a write-only context, the device does not actually have to
write all the data (it may have to write some of it, depending
on the exact mode the context is put into) until the context gets
closed, so it can take advantage of smarter allocation and batched
writes at close time.
> > Right now, there is no support for large-unit context and also not for
> > read-only or write-only contexts, which means we don't have to
> > enforce strict policies and can basically treat the context ID
> > as a hint. Using the advanced features would require that we
> > keep track of the context IDs across partitions and have to flush
> > write-only contexts before reading the data again. If we want to
> > do that, we can probably discard the patch series and start over.
>
> Well, I'm interested in getting something upstream, which is useful
> not just for the consumer-grade eMMC devices in handsets, but which
> might also be extensible to SSD's, and all the way up to PCIe-attached
> flash devices that might be used in large data centers.
>
> I think if we do things right, it should be possible to do something
> which would accomodate a large range of devices (which is why I
> brought up the concept of exposing virtualized contexts to the file
> system layer).
I am not aware of any actual SSD technology that would take advantage
of it, but at least the upcoming UFS standard that is supposed to
replace eMMC should do it, and it's somewhere inbetween an eMMC and
an SSD in many ways.
Arnd
On Saturday 16 June 2012, Arnd Bergmann wrote:
> On Friday 15 June 2012, Andreas Dilger wrote:
> > > Oh, that's cool. And I don't think that's hard to do. We could just
> > > keep a flag in the in-core inode indicating whether it is in "large
> > > unit" mode. If it is in large unit mode, we can make the fs writeback
> > > function make sure that we adhere to the restrictions of the large
> > > unit mode, and if at any point we need to do something that might
> > > violate the constraints, the file system would simply close the
> > > context.
> >
> > This is very similar to what was implemented in mballoc preallocation.
> > Large files will get their own preallocation context, while small files
> > would share a context (i.e. an 8MB extent) and be packed densely into
> > this extent to avoid seeking. It wouldn't be unreasonable to just give
> > each mballoc context a different eMMC context.
>
> My understanding is that once we do that, we have already won much more
> than we can by using contexts, because we get perfect write patterns.
> The only thing that contexts would still buy us is that the device has
> more freedom to cache things separately in each context if we write
> with less than superpage alignment.
Sorry, I replied in the wrong order and had not actually read what Ted
said about actually being able to use the large-unit contexts. If we use
large-unit contexts in write-only mode, that would indeed be a way for
the device to get significantly better than if we just do the alignment.
Arnd
On Sat, Jun 16, 2012 at 07:26:07AM +0000, Arnd Bergmann wrote:
> > Oh, that's cool. And I don't think that's hard to do. We could just
> > keep a flag in the in-core inode indicating whether it is in "large
> > unit" mode. If it is in large unit mode, we can make the fs writeback
> > function make sure that we adhere to the restrictions of the large
> > unit mode, and if at any point we need to do something that might
> > violate the constraints, the file system would simply close the
> > context.
>
> Really? I actually had expected this to be a major issue, to the
> point that I thought we would only ever do large contexts in
> special emmc-optimized file sytems.
Yeah, it's easy, for file systems (like ext4) which have delayed
allocation. It's always faster to write in large contiguous chunks,
so we do a lot of work to make sure we can make that happen. Take a
look of a blktrace of ext4 when writing large set of files; most of
the I/O will be in contiguous, large chunks. So it's just a matter of
telling the block device layer when we are about to do that large
write. We could probably do some tuning to make the chunks be larger
and adjust some parameters in the block allocation, but that's easy.
One thing which is going to be tricky is that ext4 currently uses a
buddy allocator, so it will work well for erase blocks of two. You
mentioned some devices might have erase block sizes of 3*2**N, so that
might require reworking the block allocator some, if we need to align
writes on erase block boundaries.
> > Well, I'm interested in getting something upstream, which is useful
> > not just for the consumer-grade eMMC devices in handsets, but which
> > might also be extensible to SSD's, and all the way up to PCIe-attached
> > flash devices that might be used in large data centers.
> >
>
> I am not aware of any actual SSD technology that would take advantage
> of it, but at least the upcoming UFS standard that is supposed to
> replace eMMC should do it, and it's somewhere inbetween an eMMC and
> an SSD in many ways.
I'm not aware that anything has been announced, but this is one of
those things which the high end folks have *got* to be thinking about.
The issues involved aren't only just for eMMC, you know... :-)
- Ted
On Saturday 16 June 2012, Ted Ts'o wrote:
> On Sat, Jun 16, 2012 at 07:26:07AM +0000, Arnd Bergmann wrote:
> > > Oh, that's cool. And I don't think that's hard to do. We could just
> > > keep a flag in the in-core inode indicating whether it is in "large
> > > unit" mode. If it is in large unit mode, we can make the fs writeback
> > > function make sure that we adhere to the restrictions of the large
> > > unit mode, and if at any point we need to do something that might
> > > violate the constraints, the file system would simply close the
> > > context.
> >
> > Really? I actually had expected this to be a major issue, to the
> > point that I thought we would only ever do large contexts in
> > special emmc-optimized file sytems.
>
> Yeah, it's easy, for file systems (like ext4) which have delayed
> allocation. It's always faster to write in large contiguous chunks,
> so we do a lot of work to make sure we can make that happen. Take a
> look of a blktrace of ext4 when writing large set of files; most of
> the I/O will be in contiguous, large chunks. So it's just a matter of
> telling the block device layer when we are about to do that large
> write. We could probably do some tuning to make the chunks be larger
> and adjust some parameters in the block allocation, but that's easy.
>
> One thing which is going to be tricky is that ext4 currently uses a
> buddy allocator, so it will work well for erase blocks of two. You
> mentioned some devices might have erase block sizes of 3*2**N, so that
> might require reworking the block allocator some, if we need to align
> writes on erase block boundaries.
What about the other restrictions I mentioned though? If we use large-unit
read-only contexts, it's not just about writing the entire erase block
from start to end, we have to make sure we follow other rules:
* We cannot read from write-only large-unit context, so we have to
do one of these:
a) ensure we never drop any pages from page-cache between writing
them to the large context and closing that context
b) if we need to read some data that we have just written to the
large-unit context, close that context and open a new rw-context
without the large-unit flag set (or write in the default context)
* All writes to the large-unit context have to be done in superpage
size, which means something between 8 and 32 kb typically, so more
than the underlying fs block size
* We can only start the large unit at the start of an erase block. If
we unmount the drive and later continue writing, it has to continue
without the large-unit flag at first until we hit an erase block
boundary.
* If we run out of contexts in the block device, we might have to
close a large-unit context before getting to the end of it.
> > > Well, I'm interested in getting something upstream, which is useful
> > > not just for the consumer-grade eMMC devices in handsets, but which
> > > might also be extensible to SSD's, and all the way up to PCIe-attached
> > > flash devices that might be used in large data centers.
> > >
> >
> > I am not aware of any actual SSD technology that would take advantage
> > of it, but at least the upcoming UFS standard that is supposed to
> > replace eMMC should do it, and it's somewhere inbetween an eMMC and
> > an SSD in many ways.
>
> I'm not aware that anything has been announced, but this is one of
> those things which the high end folks have *got* to be thinking about.
> The issues involved aren't only just for eMMC, you know... :-)
My impression was always that the high-end storage folks try to make
everything behave nicely whatever the access patterns are, and they
can do it because an SSD controllers has vast amounts of cache (megabytes,
not kilobytes) and processing power (e.g. 1Ghz ARMv5 instead of 50 Mhz
8051) to handle it, and they also make use of tagged command queuing to
let the device have multiple outstanding requests.
Arnd
On Sat, Jun 16, 2012 at 05:41:23PM +0000, Arnd Bergmann wrote:
>
> * We cannot read from write-only large-unit context, so we have to
> do one of these:
> a) ensure we never drop any pages from page-cache between writing
> them to the large context and closing that context
> b) if we need to read some data that we have just written to the
> large-unit context, close that context and open a new rw-context
> without the large-unit flag set (or write in the default context)
If we ever a read on the inode in question, we close the large-unit
context. That's the simplest thing to do, since we then don't need to
track which blocks had been written from the inode. And in general,
if you have a random read/write workload, large-unit contexts probably
won't help you. We mainly would need this when the workload is doing
large sequential writes, which is *easy* to optimize for.
> * All writes to the large-unit context have to be done in superpage
> size, which means something between 8 and 32 kb typically, so more
> than the underlying fs block size
Right, so we only enable the large-unit context when we are in
ext4_da_writepages() and we can do the first write in a way that meets
the requirements (i.e., the write starts aligned on the erase block,
and is a multiple of the superpage size). The moment we need to do a
read (see above) or a write which doesn't meet the large-unit
restrictions, we close the large-unit context.
(This is why I asked the question about whether there are performance
penalties for opening and closing contexts. If it requires flushing
the NCQ queues, ala the trim request, then we might need to be more
careful.)
> * We can only start the large unit at the start of an erase block. If
> we unmount the drive and later continue writing, it has to continue
> without the large-unit flag at first until we hit an erase block
> boundary.
My assumption was that when you umount the drive, the file system
would close all of the contexts.
> * If we run out of contexts in the block device, we might have to
> close a large-unit context before getting to the end of it.
Yep.
> My impression was always that the high-end storage folks try to make
> everything behave nicely whatever the access patterns are, and they
> can do it because an SSD controllers has vast amounts of cache (megabytes,
> not kilobytes) and processing power (e.g. 1Ghz ARMv5 instead of 50 Mhz
> 8051) to handle it, and they also make use of tagged command queuing to
> let the device have multiple outstanding requests.
Well, the high-end stoarge folks still would need to know if a set of
blocks being written are related. The large-unit contexts might not
matter as much, but knowing that a set of writes *are* related is
something that would help them.
- Ted
On Monday 18 June 2012, Ted Ts'o wrote:
> On Sat, Jun 16, 2012 at 05:41:23PM +0000, Arnd Bergmann wrote:
> >
> > * We cannot read from write-only large-unit context, so we have to
> > do one of these:
> > a) ensure we never drop any pages from page-cache between writing
> > them to the large context and closing that context
> > b) if we need to read some data that we have just written to the
> > large-unit context, close that context and open a new rw-context
> > without the large-unit flag set (or write in the default context)
>
> If we ever a read on the inode in question, we close the large-unit
> context. That's the simplest thing to do, since we then don't need to
> track which blocks had been written from the inode. And in general,
> if you have a random read/write workload, large-unit contexts probably
> won't help you. We mainly would need this when the workload is doing
> large sequential writes, which is *easy* to optimize for.
right.
> > * All writes to the large-unit context have to be done in superpage
> > size, which means something between 8 and 32 kb typically, so more
> > than the underlying fs block size
>
> Right, so we only enable the large-unit context when we are in
> ext4_da_writepages() and we can do the first write in a way that meets
> the requirements (i.e., the write starts aligned on the erase block,
> and is a multiple of the superpage size). The moment we need to do a
> read (see above) or a write which doesn't meet the large-unit
> restrictions, we close the large-unit context.
>
> (This is why I asked the question about whether there are performance
> penalties for opening and closing contexts. If it requires flushing
> the NCQ queues, ala the trim request, then we might need to be more
> careful.)
I believe it should only require flushing that one context, although
a specific hardware implementation might be worse than that. Maybe
Luca or Alex can comment on this.
> > * We can only start the large unit at the start of an erase block. If
> > we unmount the drive and later continue writing, it has to continue
> > without the large-unit flag at first until we hit an erase block
> > boundary.
>
> My assumption was that when you umount the drive, the file system
> would close all of the contexts.
Yes, makes sense. This is probably required to ensure that the data
has made to the drive, at least for the large contexts, but it is
definitely required for housekeeping of contexts if we manage them
from the block layer.
Arnd
Hi,
Some feedbacks inlined below.
Moreover some generic comments of mine hereafter.
First of all I agree with a previous comment from Arnd that a FS aware of virtual page / erase block is much better than using contexts especially if this requires low effort by re-using similar concepts as in the scsi stripe example.
My opinion on contexts is:
- A wrong context (context used not in the expected way) can cause much impact on performances than not using it at all
- Fewer contexts are better: eMMC have limited resources. You can expect performance benefit when opening few (3~4) contexts but opening many contexts can be critical.
You can imagine an eMMC like an Observer who tries to dispatch contents based on what "he perceives" as the traffic flow (how much sequential is this data? How much randomic? How much likely is it to be rewritten? etc). ContextIDs are the attempt to move part of the burden from the internal observer to an external observer called FileSystem.
Given the story above, to give their best I strongly agree that having an open discussion on how to best dispatch the burden between internal and external observer is key to the success of this feature.
Cheers,
Luca
> -----Original Message-----
> From: Arnd Bergmann [mailto:[email protected]]
> Sent: Tuesday, June 19, 2012 5:17 PM
> To: Ted Ts'o
> Cc: Alex Lemberg; HYOJIN JEONG; Saugata Das; Artem Bityutskiy; Saugata Das;
> [email protected]; [email protected]; linux-
> [email protected]; [email protected]; [email protected]; Luca Porzio
> (lporzio)
> Subject: Re: [PATCH 2/3] ext4: Context support
>
> On Monday 18 June 2012, Ted Ts'o wrote:
> > On Sat, Jun 16, 2012 at 05:41:23PM +0000, Arnd Bergmann wrote:
> > >
> > > * We cannot read from write-only large-unit context, so we have to
> > > do one of these:
> > > a) ensure we never drop any pages from page-cache between writing
> > > them to the large context and closing that context
> > > b) if we need to read some data that we have just written to the
> > > large-unit context, close that context and open a new rw-context
> > > without the large-unit flag set (or write in the default context)
> >
> > If we ever a read on the inode in question, we close the large-unit
> > context. That's the simplest thing to do, since we then don't need to
> > track which blocks had been written from the inode. And in general,
> > if you have a random read/write workload, large-unit contexts probably
> > won't help you. We mainly would need this when the workload is doing
> > large sequential writes, which is *easy* to optimize for.
>
> right.
>
I agree. Also you can open the large unit context in read/write mode so that you don't need to close the context if you just want to read while writing.
Again I would suggest not to use context unless absolutely sure that the context will be used in the right way.
With the latter, I am not worried about the closing cost but more on the performance impact.
> > > * All writes to the large-unit context have to be done in superpage
> > > size, which means something between 8 and 32 kb typically, so more
> > > than the underlying fs block size
> >
I would expect even larger numbers than 32KB.
> > Right, so we only enable the large-unit context when we are in
> > ext4_da_writepages() and we can do the first write in a way that meets
> > the requirements (i.e., the write starts aligned on the erase block,
> > and is a multiple of the superpage size). The moment we need to do a
> > read (see above) or a write which doesn't meet the large-unit
> > restrictions, we close the large-unit context.
> >
> > (This is why I asked the question about whether there are performance
> > penalties for opening and closing contexts. If it requires flushing
> > the NCQ queues, ala the trim request, then we might need to be more
> > careful.)
>
> I believe it should only require flushing that one context, although
> a specific hardware implementation might be worse than that. Maybe
> Luca or Alex can comment on this.
>
That's an interesting question.
The short answer is that unless we define a use case, it is hard for me to give you meaningful numbers.
> > > * We can only start the large unit at the start of an erase block. If
> > > we unmount the drive and later continue writing, it has to continue
> > > without the large-unit flag at first until we hit an erase block
> > > boundary.
> >
> > My assumption was that when you umount the drive, the file system
> > would close all of the contexts.
>
> Yes, makes sense. This is probably required to ensure that the data
> has made to the drive, at least for the large contexts, but it is
> definitely required for housekeeping of contexts if we manage them
> from the block layer.
>
One comment here, large unit contexts (according to spec) are not bounded to erase blocks. They can span one or more blocks, actually they are not related to block size at all (just virtual page size of the device which can be read from the EXT_CSD configuration registers for eMMC).
> Arnd
On Wednesday 20 June 2012, Luca Porzio (lporzio) wrote:
> > > > * We can only start the large unit at the start of an erase block. If
> > > > we unmount the drive and later continue writing, it has to continue
> > > > without the large-unit flag at first until we hit an erase block
> > > > boundary.
> > >
> > > My assumption was that when you umount the drive, the file system
> > > would close all of the contexts.
> >
> > Yes, makes sense. This is probably required to ensure that the data
> > has made to the drive, at least for the large contexts, but it is
> > definitely required for housekeeping of contexts if we manage them
> > from the block layer.
> >
>
> One comment here, large unit contexts (according to spec) are not bounded
> to erase blocks. They can span one or more blocks, actually they are not
> related to block size at all (just virtual page size of the device which
> can be read from the EXT_CSD configuration registers for eMMC).
Well, when I say erase block, I really mean the "large unit", which is some
multiple of entire megabytes, because the only reasonable way to use this
is to define this to the size of the erase block or a very small multiple
of that. When detecting the "erase block size" of an eMMC, we should report
the smallest multiple of the ERASE_GPR_SIZE, HC_ERASE_GRP_SIZE and
LARGE_UNIT_SIZE_M1.
Arnd
On Wed, 2012-06-13 at 19:44 +0000, Arnd Bergmann wrote:
> I think using the inode number is a reasonable fit. Using the
> inode number of the parent directory might be more appropriate
> but it breaks with hard links and cross-directory renames (we
> must not use the same LBA with conflicting context numbers,
> or flush the old context inbetween).
I would put it this way.
1. contex = inode number for data blocks.
2. context = parent directory's inode number for stat data (times,
permissions, etc) blocks and directory entry blocks. Should help things
like readdir and readdir + stat. Besides, this stuff tend to change more
often than the data, so mixing it with the data in the same eraseblock
is not smart.
3. context = parent inode number for all the stuff belonging to xattrs.
We do something similar in UBIFS.
--
Best Regards,
Artem Bityutskiy
Hallo,
> -----Original Message-----
> From: [email protected] [mailto:[email protected]]
> On Behalf Of Artem Bityutskiy
> Sent: Friday, June 22, 2012 3:29 PM
> To: Arnd Bergmann
> Cc: Ted Ts'o; Alex Lemberg; HYOJIN JEONG; Saugata Das; Saugata Das; linux-
> [email protected]; [email protected]; linux-
> [email protected]; [email protected]; [email protected]; Luca Porzio
> (lporzio)
> Subject: Re: [PATCH 2/3] ext4: Context support
>
> On Wed, 2012-06-13 at 19:44 +0000, Arnd Bergmann wrote:
> > I think using the inode number is a reasonable fit. Using the inode
> > number of the parent directory might be more appropriate but it breaks
> > with hard links and cross-directory renames (we must not use the same
> > LBA with conflicting context numbers, or flush the old context
> > inbetween).
>
> I would put it this way.
>
> 1. contex = inode number for data blocks.
> 2. context = parent directory's inode number for stat data (times,
> permissions, etc) blocks and directory entry blocks. Should help things like
> readdir and readdir + stat. Besides, this stuff tend to change more often than
> the data, so mixing it with the data in the same eraseblock is not smart.
> 3. context = parent inode number for all the stuff belonging to xattrs.
>
> We do something similar in UBIFS.
>
Doesn't this end up using too many contexts?
Opening one contexts per inode would end up in opening more contexts than available.
The eMMC spec forbids more than 15 contexts for the whole device.
> --
> Best Regards,
> Artem Bityutskiy
Cheers,
Luca