2008-11-25 21:49:12

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 6/9] swapfile: swapon use discard (trim)

When adding swap, all the old data on swap can be forgotten: sys_swapon()
discard all but the header page of the swap partition (or every extent
but the header of the swap file), to give a solidstate swap device the
opportunity to optimize its wear-levelling.

If that succeeds, note SWP_DISCARDABLE for later use, and report it
with a "D" at the right end of the kernel's "Adding ... swap" message.
Perhaps something should be shown in /proc/swaps (swapon -s), but we
have to be more cautious before making any addition to that format.

Signed-off-by: Hugh Dickins <[email protected]>
---
swapfile.c cleanup patches 0-5 just went to linux-mm: patches 6-9
may be of wider interest, so I'm extending the Cc list for them.

include/linux/swap.h | 1 +
mm/swapfile.c | 39 +++++++++++++++++++++++++++++++++++++--
2 files changed, 38 insertions(+), 2 deletions(-)

--- swapfile5/include/linux/swap.h 2008-11-25 12:41:31.000000000 +0000
+++ swapfile6/include/linux/swap.h 2008-11-25 12:41:34.000000000 +0000
@@ -120,6 +120,7 @@ struct swap_extent {
enum {
SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
+ SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
--- swapfile5/mm/swapfile.c 2008-11-25 12:41:31.000000000 +0000
+++ swapfile6/mm/swapfile.c 2008-11-25 12:41:34.000000000 +0000
@@ -84,6 +84,37 @@ void swap_unplug_io_fn(struct backing_de
up_read(&swap_unplug_sem);
}

+/*
+ * swapon tell device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static int discard_swap(struct swap_info_struct *si)
+{
+ struct swap_extent *se;
+ int err = 0;
+
+ list_for_each_entry(se, &si->extent_list, list) {
+ sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
+ pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
+
+ if (se->start_page == 0) {
+ /* Do not discard the swap header page! */
+ start_block += 1 << (PAGE_SHIFT - 9);
+ nr_blocks -= 1 << (PAGE_SHIFT - 9);
+ if (!nr_blocks)
+ continue;
+ }
+
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL);
+ if (err)
+ break;
+
+ cond_resched();
+ }
+ return err; /* That will often be -EOPNOTSUPP */
+}
+
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256

@@ -1649,6 +1680,9 @@ asmlinkage long sys_swapon(const char __
goto bad_swap;
}

+ if (discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
if (swap_flags & SWAP_FLAG_PREFER)
@@ -1662,9 +1696,10 @@ asmlinkage long sys_swapon(const char __
total_swap_pages += nr_good_pages;

printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk\n",
+ "Priority:%d extents:%d across:%lluk%s\n",
nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
+ nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ (p->flags & SWP_DISCARDABLE) ? " D" : "");

/* insert swap space into swap_list: */
prev = -1;


2008-11-25 21:49:47

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 7/9] swapfile: swap allocation use discard

When scan_swap_map() finds a free cluster of swap pages to allocate,
discard the old contents of the cluster if the device supports discard.
But don't bother when swap is so fragmented that we allocate single pages.

Be careful about racing allocations made while we're scanning for
a cluster; and hold up allocations made while we're discarding.

Signed-off-by: Hugh Dickins <[email protected]>
---

include/linux/swap.h | 3 +
mm/swapfile.c | 119 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 121 insertions(+), 1 deletion(-)

--- swapfile6/include/linux/swap.h 2008-11-25 12:41:34.000000000 +0000
+++ swapfile7/include/linux/swap.h 2008-11-25 12:41:40.000000000 +0000
@@ -121,6 +121,7 @@ enum {
SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
+ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
@@ -144,6 +145,8 @@ struct swap_info_struct {
unsigned short *swap_map;
unsigned int lowest_bit;
unsigned int highest_bit;
+ unsigned int lowest_alloc; /* while preparing discard cluster */
+ unsigned int highest_alloc; /* while preparing discard cluster */
unsigned int cluster_next;
unsigned int cluster_nr;
unsigned int pages;
--- swapfile6/mm/swapfile.c 2008-11-25 12:41:34.000000000 +0000
+++ swapfile7/mm/swapfile.c 2008-11-25 12:41:40.000000000 +0000
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info
return err; /* That will often be -EOPNOTSUPP */
}

+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+ pgoff_t start_page, pgoff_t nr_pages)
+{
+ struct swap_extent *se = si->curr_swap_extent;
+ int found_extent = 0;
+
+ while (nr_pages) {
+ struct list_head *lh;
+
+ if (se->start_page <= start_page &&
+ start_page < se->start_page + se->nr_pages) {
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ pgoff_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ if (!found_extent++)
+ si->curr_swap_extent = se;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO))
+ break;
+ }
+
+ lh = se->list.next;
+ if (lh == &si->extent_list)
+ lh = lh->next;
+ se = list_entry(lh, struct swap_extent, list);
+ }
+}
+
+static int wait_for_discard(void *word)
+{
+ schedule();
+ return 0;
+}
+
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256

static inline unsigned long scan_swap_map(struct swap_info_struct *si)
{
unsigned long offset;
- unsigned long last_in_cluster;
+ unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ int found_free_cluster = 0;

/*
* We try to cluster swap pages by allocating them sequentially
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_ma
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
}
+ if (si->flags & SWP_DISCARDABLE) {
+ /*
+ * Start range check on racing allocations, in case
+ * they overlap the cluster we eventually decide on
+ * (we scan without swap_lock to allow preemption).
+ * It's hardly conceivable that cluster_nr could be
+ * wrapped during our scan, but don't depend on it.
+ */
+ if (si->lowest_alloc)
+ goto checks;
+ si->lowest_alloc = si->max;
+ si->highest_alloc = 0;
+ }
spin_unlock(&swap_lock);

offset = si->lowest_bit;
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_ma
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
goto checks;
}
if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_ma
offset = si->lowest_bit;
spin_lock(&swap_lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ si->lowest_alloc = 0;
}

checks:
@@ -191,6 +254,60 @@ checks:
si->swap_map[offset] = 1;
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
+
+ if (si->lowest_alloc) {
+ /*
+ * Only set when SWP_DISCARDABLE, and there's a scan
+ * for a free cluster in progress or just completed.
+ */
+ if (found_free_cluster) {
+ /*
+ * To optimize wear-levelling, discard the
+ * old data of the cluster, taking care not to
+ * discard any of its pages that have already
+ * been allocated by racing tasks (offset has
+ * already stepped over any at the beginning).
+ */
+ if (offset < si->highest_alloc &&
+ si->lowest_alloc <= last_in_cluster)
+ last_in_cluster = si->lowest_alloc - 1;
+ si->flags |= SWP_DISCARDING;
+ spin_unlock(&swap_lock);
+
+ if (offset < last_in_cluster)
+ discard_swap_cluster(si, offset,
+ last_in_cluster - offset + 1);
+
+ spin_lock(&swap_lock);
+ si->lowest_alloc = 0;
+ si->flags &= ~SWP_DISCARDING;
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+ } else if (si->flags & SWP_DISCARDING) {
+ /*
+ * Delay using pages allocated by racing tasks
+ * until the whole discard has been issued. We
+ * could defer that delay until swap_writepage,
+ * but it's easier to keep this self-contained.
+ */
+ spin_unlock(&swap_lock);
+ wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+ wait_for_discard, TASK_UNINTERRUPTIBLE);
+ spin_lock(&swap_lock);
+ } else {
+ /*
+ * Note pages allocated by racing tasks while
+ * scan for a free cluster is in progress, so
+ * that its final discard can exclude them.
+ */
+ if (offset < si->lowest_alloc)
+ si->lowest_alloc = offset;
+ if (offset > si->highest_alloc)
+ si->highest_alloc = offset;
+ }
+ }
return offset;

scan:

2008-11-25 21:50:14

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 8/9] swapfile: swapon randomize if nonrot

Swap allocation has always started from the beginning of the swap area;
but if we're dealing with a solidstate swap device which can only remap
blocks within limited zones, that would sooner wear out the first zone.

Therefore sys_swapon() test whether blk_queue is non-rotational,
and if so randomize the cluster_next starting position for allocation.

If blk_queue is nonrot, note SWP_SOLIDSTATE for later use, and report it
with an "SS" at the right end of the kernel's "Adding ... swap" message
(so that if it's both nonrot and discardable, "SSD" will be shown there).
Perhaps something should be shown in /proc/swaps (swapon -s), but we
have to be more cautious before making any addition to that format.

Signed-off-by: Hugh Dickins <[email protected]>
---
But how to get my SD card, accessed by USB card reader, reported as NONROT?

include/linux/swap.h | 1 +
mm/swapfile.c | 11 +++++++++--
2 files changed, 10 insertions(+), 2 deletions(-)

--- swapfile7/include/linux/swap.h 2008-11-25 12:41:40.000000000 +0000
+++ swapfile8/include/linux/swap.h 2008-11-25 12:41:42.000000000 +0000
@@ -122,6 +122,7 @@ enum {
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
+ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
/* add others here before... */
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
--- swapfile7/mm/swapfile.c 2008-11-25 12:41:40.000000000 +0000
+++ swapfile8/mm/swapfile.c 2008-11-25 12:41:42.000000000 +0000
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
+#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -1797,6 +1798,11 @@ asmlinkage long sys_swapon(const char __
goto bad_swap;
}

+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ srandom32((u32)get_seconds());
+ p->cluster_next = 1 + (random32() % p->highest_bit);
+ }
if (discard_swap(p) == 0)
p->flags |= SWP_DISCARDABLE;

@@ -1813,10 +1819,11 @@ asmlinkage long sys_swapon(const char __
total_swap_pages += nr_good_pages;

printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s\n",
nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
- (p->flags & SWP_DISCARDABLE) ? " D" : "");
+ (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
+ (p->flags & SWP_DISCARDABLE) ? "D" : "");

/* insert swap space into swap_list: */
prev = -1;

2008-11-25 21:50:56

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 9/9] swapfile: swap allocation cycle if nonrot

Though attempting to find free clusters (Andrea), swap allocation has
always restarted its searches from the beginning of the swap area (sct),
to reduce seek times between swap pages, by not scattering them all over
the partition.

But on a solidstate swap device, seeks are cheap, and block remapping
to level the wear may be limited by zones: in that case it's better to
cycle around the whole partition.

Signed-off-by: Hugh Dickins <[email protected]>
---

mm/swapfile.c | 50 ++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 46 insertions(+), 4 deletions(-)

--- swapfile8/mm/swapfile.c 2008-11-25 12:41:42.000000000 +0000
+++ swapfile9/mm/swapfile.c 2008-11-25 12:41:44.000000000 +0000
@@ -169,6 +169,7 @@ static int wait_for_discard(void *word)
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
{
unsigned long offset;
+ unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
int found_free_cluster = 0;
@@ -181,10 +182,11 @@ static inline unsigned long scan_swap_ma
* all over the entire swap partition, so that we reduce
* overall disk seek times between swap pages. -- sct
* But we do now try to find an empty cluster. -Andrea
+ * And we let swap pages go all over an SSD partition. Hugh
*/

si->flags += SWP_SCANNING;
- offset = si->cluster_next;
+ scan_base = offset = si->cluster_next;

if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
@@ -206,7 +208,16 @@ static inline unsigned long scan_swap_ma
}
spin_unlock(&swap_lock);

- offset = si->lowest_bit;
+ /*
+ * If seek is expensive, start searching for new cluster from
+ * start of partition, to minimize the span of allocated swap.
+ * But if seek is cheap, search from our current position, so
+ * that swap is allocated from all over the partition: if the
+ * Flash Translation Layer only remaps within limited zones,
+ * we don't want to wear out the first zone too quickly.
+ */
+ if (!(si->flags & SWP_SOLIDSTATE))
+ scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

/* Locate the first empty (unaligned) cluster */
@@ -228,6 +239,27 @@ static inline unsigned long scan_swap_ma
}

offset = si->lowest_bit;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+ /* Locate the first empty (unaligned) cluster */
+ for (; last_in_cluster < scan_base; offset++) {
+ if (si->swap_map[offset])
+ last_in_cluster = offset + SWAPFILE_CLUSTER;
+ else if (offset == last_in_cluster) {
+ spin_lock(&swap_lock);
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+
+ offset = scan_base;
spin_lock(&swap_lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
si->lowest_alloc = 0;
@@ -239,7 +271,7 @@ checks:
if (!si->highest_bit)
goto no_page;
if (offset > si->highest_bit)
- offset = si->lowest_bit;
+ scan_base = offset = si->lowest_bit;
if (si->swap_map[offset])
goto scan;

@@ -323,8 +355,18 @@ scan:
latency_ration = LATENCY_LIMIT;
}
}
+ offset = si->lowest_bit;
+ while (++offset < scan_base) {
+ if (!si->swap_map[offset]) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
spin_lock(&swap_lock);
- goto checks;

no_page:
si->flags -= SWP_SCANNING;

2008-11-26 01:19:27

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 6/9] swapfile: swapon use discard (trim)

On Tue, 25 Nov 2008 21:44:34 +0000 (GMT)
Hugh Dickins <[email protected]> wrote:

> When adding swap, all the old data on swap can be forgotten: sys_swapon()
> discard all but the header page of the swap partition (or every extent
> but the header of the swap file), to give a solidstate swap device the
> opportunity to optimize its wear-levelling.
>
> If that succeeds, note SWP_DISCARDABLE for later use, and report it
> with a "D" at the right end of the kernel's "Adding ... swap" message.
> Perhaps something should be shown in /proc/swaps (swapon -s), but we
> have to be more cautious before making any addition to that format.
>


When reading the above text it's a bit hard to tell whether it's
talking about "this is how things are at present" or "this is how
things are after the patch". This is fairly common with Hugh
changelogs.

> ---
> swapfile.c cleanup patches 0-5 just went to linux-mm: patches 6-9
> may be of wider interest, so I'm extending the Cc list for them.
>
> include/linux/swap.h | 1 +
> mm/swapfile.c | 39 +++++++++++++++++++++++++++++++++++++--
> 2 files changed, 38 insertions(+), 2 deletions(-)
>
> --- swapfile5/include/linux/swap.h 2008-11-25 12:41:31.000000000 +0000
> +++ swapfile6/include/linux/swap.h 2008-11-25 12:41:34.000000000 +0000
> @@ -120,6 +120,7 @@ struct swap_extent {
> enum {
> SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
> SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
> + SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
> /* add others here before... */
> SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
> };
> --- swapfile5/mm/swapfile.c 2008-11-25 12:41:31.000000000 +0000
> +++ swapfile6/mm/swapfile.c 2008-11-25 12:41:34.000000000 +0000
> @@ -84,6 +84,37 @@ void swap_unplug_io_fn(struct backing_de
> up_read(&swap_unplug_sem);
> }
>
> +/*
> + * swapon tell device that all the old swap contents can be discarded,
> + * to allow the swap device to optimize its wear-levelling.
> + */
> +static int discard_swap(struct swap_info_struct *si)
> +{
> + struct swap_extent *se;
> + int err = 0;
> +
> + list_for_each_entry(se, &si->extent_list, list) {
> + sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
> + pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);

I trust we don't have any shift overflows here.

It's a bit dissonant to see a pgoff_t with "blocks" in its name. But
swap is like that..


> + if (se->start_page == 0) {
> + /* Do not discard the swap header page! */
> + start_block += 1 << (PAGE_SHIFT - 9);
> + nr_blocks -= 1 << (PAGE_SHIFT - 9);
> + if (!nr_blocks)
> + continue;
> + }
> +
> + err = blkdev_issue_discard(si->bdev, start_block,
> + nr_blocks, GFP_KERNEL);
> + if (err)
> + break;
> +
> + cond_resched();
> + }
> + return err; /* That will often be -EOPNOTSUPP */
> +}
> +
> #define SWAPFILE_CLUSTER 256
> #define LATENCY_LIMIT 256
>
> @@ -1649,6 +1680,9 @@ asmlinkage long sys_swapon(const char __
> goto bad_swap;
> }
>
> + if (discard_swap(p) == 0)
> + p->flags |= SWP_DISCARDABLE;
> +
> mutex_lock(&swapon_mutex);
> spin_lock(&swap_lock);
> if (swap_flags & SWAP_FLAG_PREFER)
> @@ -1662,9 +1696,10 @@ asmlinkage long sys_swapon(const char __
> total_swap_pages += nr_good_pages;
>
> printk(KERN_INFO "Adding %uk swap on %s. "
> - "Priority:%d extents:%d across:%lluk\n",
> + "Priority:%d extents:%d across:%lluk%s\n",
> nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
> - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
> + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
> + (p->flags & SWP_DISCARDABLE) ? " D" : "");
>
> /* insert swap space into swap_list: */
> prev = -1;

2008-11-26 01:21:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 8/9] swapfile: swapon randomize if nonrot

On Tue, 25 Nov 2008 21:46:56 +0000 (GMT)
Hugh Dickins <[email protected]> wrote:

> But how to get my SD card, accessed by USB card reader, reported as NONROT?

Dunno. udev rules, perhaps?

2008-11-26 03:38:29

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 8/9] swapfile: swapon randomize if nonrot

On Tue, Nov 25, 2008 at 05:20:39PM -0800, Andrew Morton wrote:
> On Tue, 25 Nov 2008 21:46:56 +0000 (GMT)
> Hugh Dickins <[email protected]> wrote:
>
> > But how to get my SD card, accessed by USB card reader, reported as NONROT?
>
> Dunno. udev rules, perhaps?

I didn't see patch 8/9, but the 'non-rotating' bit is in word 217 of the
inquiry data. Unfortunately, Jeff insisted that we only report the
contents of that bit for devices claiming ATA-8 support, which is
ridiculous as even the Intel SSDs only claim conformance to ATA-7.

I notice that Jens was allowed to ignore Jeff's insane requirement and
doesn't have to check ATA revision at all.

--
Matthew Wilcox Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-11-26 06:03:31

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 6/9] swapfile: swapon use discard (trim)

On Tue, 25 Nov 2008, Andrew Morton wrote:
> On Tue, 25 Nov 2008 21:44:34 +0000 (GMT)
> Hugh Dickins <[email protected]> wrote:
>
> > When adding swap, all the old data on swap can be forgotten: sys_swapon()
> > discard all but the header page of the swap partition (or every extent
> > but the header of the swap file), to give a solidstate swap device the
> > opportunity to optimize its wear-levelling.
> >
> > If that succeeds, note SWP_DISCARDABLE for later use, and report it
> > with a "D" at the right end of the kernel's "Adding ... swap" message.
> > Perhaps something should be shown in /proc/swaps (swapon -s), but we
> > have to be more cautious before making any addition to that format.
>
> When reading the above text it's a bit hard to tell whether it's
> talking about "this is how things are at present" or "this is how
> things are after the patch". This is fairly common with Hugh
> changelogs.

;) Sorry about that - yes, that's often true. In this case, it's
all talking about how things are after the patch. I think it's that
first sentence which bothers you - "all the old data on swap can be
forgotten". In this case, I'm meaning "it's a good idea to let the
device know that it can forget about all the old data"; but it's easy
to imagine another patch coming from me in which the same sentence
would mean "we've got a terrible data-loss bug, such that all the
data already written to swap gets erased". Let's hope I didn't
implement the latter.

> > +static int discard_swap(struct swap_info_struct *si)
> > +{
> > + struct swap_extent *se;
> > + int err = 0;
> > +
> > + list_for_each_entry(se, &si->extent_list, list) {
> > + sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
> > + pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
>
> I trust we don't have any shift overflows here.
>
> It's a bit dissonant to see a pgoff_t with "blocks" in its name. But
> swap is like that..

In fact we don't have a shift overflow there, but you've such a good eye.

I noticed that "pgoff_t nr_blocks" line just as I was about to send off
the patches, and had a little worry about it. By that time I was at
the stage that if I went into the patch and changed a few pgoff_ts
to sector_ts at the last minute, likelihood was I'd screw something
up badly, in one CONFIG combination or another, and if I delayed
it'd be tomorrow.

It would be good to make that change when built and tested,
just for reassurance. There isn't a shift overflow as it stands,
but the reasons are too contingent for my liking: on 64-bit there
isn't an issue because pgoff_t is as big as sector_t; on 32-bit,
it's because a swp_entry_t is an unsigned long, and it has to
contain five bits for the "type" (which of the 30 or 32 swapfiles
is addressed), and the pages-to-sectors shift is less than 5 on
all 32-bit machines for the foreseeable future. Oh, and it also
relies on the fact that by the time we're setting up swap extents,
we've already curtailed the size to what's usable by a swp_entry_t,
if that's less than the size given in the swap header.

So, not actually a bug there, but certainly a source of anxiety,
better eliminated.

Hugh

2008-12-01 00:31:31

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 10/9] swapfile: change discard pgoff_t to sector_t

Change pgoff_t nr_blocks in discard_swap() and discard_swap_cluster() to
sector_t: given the constraints on swap offsets (in particular, the 5 bits
of swap type accommodated in the same unsigned long), pgoff_t was actually
safe as is, but it certainly looked worrying when shifted left.

Signed-off-by: Hugh Dickins <[email protected]>
---
To follow 9/9 swapfile-swap-allocation-cycle-if-nonrot.patch

mm/swapfile.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

--- swapfile9/mm/swapfile.c 2008-11-26 12:19:00.000000000 +0000
+++ swapfile10/mm/swapfile.c 2008-11-28 20:36:44.000000000 +0000
@@ -96,7 +96,7 @@ static int discard_swap(struct swap_info

list_for_each_entry(se, &si->extent_list, list) {
sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
- pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
+ sector_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);

if (se->start_page == 0) {
/* Do not discard the swap header page! */
@@ -133,7 +133,7 @@ static void discard_swap_cluster(struct
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page;
sector_t start_block = se->start_block + offset;
- pgoff_t nr_blocks = se->nr_pages - offset;
+ sector_t nr_blocks = se->nr_pages - offset;

if (nr_blocks > nr_pages)
nr_blocks = nr_pages;

2008-12-01 00:32:26

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 11/9] swapfile: let others seed random

Remove the srandom32((u32)get_seconds()) from non-rotational swapon:
there's been a coincidental discussion of earlier randomization, assume
that goes ahead, let swapon be a client rather than stirring for itself.

Signed-off-by: Hugh Dickins <[email protected]>
---
To follow 10/9

mm/swapfile.c | 1 -
1 file changed, 1 deletion(-)

--- swapfile10/mm/swapfile.c 2008-11-28 20:36:44.000000000 +0000
+++ swapfile11/mm/swapfile.c 2008-11-28 20:37:16.000000000 +0000
@@ -1842,7 +1842,6 @@ asmlinkage long sys_swapon(const char __

if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
p->flags |= SWP_SOLIDSTATE;
- srandom32((u32)get_seconds());
p->cluster_next = 1 + (random32() % p->highest_bit);
}
if (discard_swap(p) == 0)

2008-12-03 00:48:35

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 10/9] swapfile: change discard pgoff_t to sector_t

On Mon, 1 Dec 2008 00:29:41 +0000 (GMT)
Hugh Dickins <[email protected]> wrote:

> Change pgoff_t nr_blocks in discard_swap() and discard_swap_cluster() to
> sector_t: given the constraints on swap offsets (in particular, the 5 bits
> of swap type accommodated in the same unsigned long), pgoff_t was actually
> safe as is, but it certainly looked worrying when shifted left.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> ---
> To follow 9/9 swapfile-swap-allocation-cycle-if-nonrot.patch
>
> mm/swapfile.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> --- swapfile9/mm/swapfile.c 2008-11-26 12:19:00.000000000 +0000
> +++ swapfile10/mm/swapfile.c 2008-11-28 20:36:44.000000000 +0000
> @@ -96,7 +96,7 @@ static int discard_swap(struct swap_info
>
> list_for_each_entry(se, &si->extent_list, list) {
> sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
> - pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
> + sector_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);

but, but, that didn't change anything? se->nr_pages must be cast to
sector_t?

> if (se->start_page == 0) {
> /* Do not discard the swap header page! */
> @@ -133,7 +133,7 @@ static void discard_swap_cluster(struct
> start_page < se->start_page + se->nr_pages) {
> pgoff_t offset = start_page - se->start_page;
> sector_t start_block = se->start_block + offset;
> - pgoff_t nr_blocks = se->nr_pages - offset;
> + sector_t nr_blocks = se->nr_pages - offset;
>
> if (nr_blocks > nr_pages)
> nr_blocks = nr_pages;

2008-12-03 12:53:06

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 10/9] swapfile: change discard pgoff_t to sector_t

On Tue, 2 Dec 2008, Andrew Morton wrote:
> On Mon, 1 Dec 2008 00:29:41 +0000 (GMT)
> Hugh Dickins <[email protected]> wrote:
>
> > - pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
> > + sector_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9);
>
> but, but, that didn't change anything? se->nr_pages must be cast to
> sector_t?

I'm squirming, you are right, thanks for fixing it up.

Hugh