2024-03-22 16:50:17

by Johannes Weiner

[permalink] [raw]
Subject: [RFC PATCH] mm: swapfile: fix SSD detection with swapfile on btrfs

btrfs sets si->bdev during swap activation, which currently happens
after swapon's SSD detection and cluster setup. Thus none of the SSD
optimizations and cluster lock splitting are enabled for btrfs swap.

Rearrange the swapon sequence so that filesystem activation happens
before determining swap behavior based on the backing device.

Afterwards, the nonrotational drive is detected correctly:

- Adding 2097148k swap on /mnt/swapfile. Priority:-3 extents:1 across:2097148k
+ Adding 2097148k swap on /mnt/swapfile. Priority:-3 extents:1 across:2097148k SS

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/swapfile.c | 141 +++++++++++++++++++++++++-------------------------
1 file changed, 70 insertions(+), 71 deletions(-)

This survives a swapping smoketest, but I would really appreciate more
eyes on the swap and fs implications of this change.

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4919423cce76..4dd5f2e8190d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2919,22 +2919,15 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
unsigned long maxpages,
sector_t *span)
{
- unsigned int j, k;
unsigned int nr_good_pages;
+ unsigned long i;
int nr_extents;
- unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
- unsigned long i, idx;

nr_good_pages = maxpages - 1; /* omit header page */

- cluster_list_init(&p->free_clusters);
- cluster_list_init(&p->discard_clusters);
-
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2942,25 +2935,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
- /*
- * Haven't marked the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, page_nr);
}
}

- /* Haven't marked the cluster free yet, no list operation involved */
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(p, cluster_info, i);
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
- /*
- * Not mark the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, 0);
p->max = maxpages;
p->pages = nr_good_pages;
nr_extents = setup_swap_extents(p, span);
@@ -2973,9 +2952,55 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
return -EINVAL;
}

+ return nr_extents;
+}
+
+static struct swap_cluster_info *setup_clusters(struct swap_info_struct *p,
+ unsigned char *swap_map)
+{
+ unsigned long nr_clusters = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
+ unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+ struct swap_cluster_info *cluster_info;
+ unsigned long i, j, k, idx;
+ int cpu, err = -ENOMEM;
+
+ cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
- return nr_extents;
+ goto err;
+
+ for (i = 0; i < nr_clusters; i++)
+ spin_lock_init(&cluster_info[i].lock);

+ p->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!p->cluster_next_cpu)
+ goto err_free;
+
+ /* Random start position to help with wear leveling */
+ for_each_possible_cpu(cpu)
+ per_cpu(*p->cluster_next_cpu, cpu) =
+ get_random_u32_inclusive(1, p->highest_bit);
+
+ p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!p->percpu_cluster)
+ goto err_free;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+ cluster_set_null(&cluster->index);
+ }
+
+ /*
+ * Mark unusable pages as unavailable. The clusters aren't
+ * marked free yet, so no list operations are involved yet.
+ */
+ for (i = 0; i < round_up(p->max, SWAPFILE_CLUSTER); i++)
+ if (i >= p->max || swap_map[i] == SWAP_MAP_BAD)
+ inc_cluster_info_page(p, cluster_info, i);
+
+ cluster_list_init(&p->free_clusters);
+ cluster_list_init(&p->discard_clusters);

/*
* Reduce false cache line sharing between cluster_info and
@@ -2994,7 +3019,13 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
idx);
}
}
- return nr_extents;
+
+ return cluster_info;
+
+err_free:
+ kvfree(cluster_info);
+err:
+ return ERR_PTR(err);
}

SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
@@ -3090,6 +3121,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}

+ error = swap_cgroup_swapon(p->type, maxpages);
+ if (error)
+ goto bad_swap_unlock_inode;
+
+ nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+ maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+
if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;

@@ -3097,61 +3139,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SYNCHRONOUS_IO;

if (p->bdev && bdev_nonrot(p->bdev)) {
- int cpu;
- unsigned long ci, nr_cluster;
-
p->flags |= SWP_SOLIDSTATE;
- p->cluster_next_cpu = alloc_percpu(unsigned int);
- if (!p->cluster_next_cpu) {
- error = -ENOMEM;
- goto bad_swap_unlock_inode;
- }
- /*
- * select a random position to start with to help wear leveling
- * SSD
- */
- for_each_possible_cpu(cpu) {
- per_cpu(*p->cluster_next_cpu, cpu) =
- get_random_u32_inclusive(1, p->highest_bit);
- }
- nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-
- cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
- GFP_KERNEL);
- if (!cluster_info) {
- error = -ENOMEM;
- goto bad_swap_unlock_inode;
- }
-
- for (ci = 0; ci < nr_cluster; ci++)
- spin_lock_init(&((cluster_info + ci)->lock));
-
- p->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!p->percpu_cluster) {
- error = -ENOMEM;
+ cluster_info = setup_clusters(p, swap_map);
+ if (IS_ERR(cluster_info)) {
+ error = PTR_ERR(cluster_info);
+ cluster_info = NULL;
goto bad_swap_unlock_inode;
}
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
- cluster = per_cpu_ptr(p->percpu_cluster, cpu);
- cluster_set_null(&cluster->index);
- }
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}

- error = swap_cgroup_swapon(p->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
- nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
- cluster_info, maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
- goto bad_swap_unlock_inode;
- }
-
if ((swap_flags & SWAP_FLAG_DISCARD) &&
p->bdev && bdev_max_discard_sectors(p->bdev)) {
/*
--
2.44.0



2024-03-26 05:49:53

by Huang, Ying

[permalink] [raw]
Subject: Re: [RFC PATCH] mm: swapfile: fix SSD detection with swapfile on btrfs

Hi, Johannes,

Johannes Weiner <[email protected]> writes:

> +static struct swap_cluster_info *setup_clusters(struct swap_info_struct *p,
> + unsigned char *swap_map)
> +{
> + unsigned long nr_clusters = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
> + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
> + struct swap_cluster_info *cluster_info;
> + unsigned long i, j, k, idx;
> + int cpu, err = -ENOMEM;
> +
> + cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
> if (!cluster_info)
> - return nr_extents;
> + goto err;
> +
> + for (i = 0; i < nr_clusters; i++)
> + spin_lock_init(&cluster_info[i].lock);
>
> + p->cluster_next_cpu = alloc_percpu(unsigned int);
> + if (!p->cluster_next_cpu)
> + goto err_free;
> +
> + /* Random start position to help with wear leveling */
> + for_each_possible_cpu(cpu)
> + per_cpu(*p->cluster_next_cpu, cpu) =
> + get_random_u32_inclusive(1, p->highest_bit);
> +
> + p->percpu_cluster = alloc_percpu(struct percpu_cluster);
> + if (!p->percpu_cluster)
> + goto err_free;
> +
> + for_each_possible_cpu(cpu) {
> + struct percpu_cluster *cluster;
> +
> + cluster = per_cpu_ptr(p->percpu_cluster, cpu);
> + cluster_set_null(&cluster->index);
> + }
> +
> + /*
> + * Mark unusable pages as unavailable. The clusters aren't
> + * marked free yet, so no list operations are involved yet.
> + */
> + for (i = 0; i < round_up(p->max, SWAPFILE_CLUSTER); i++)
> + if (i >= p->max || swap_map[i] == SWAP_MAP_BAD)
> + inc_cluster_info_page(p, cluster_info, i);

If p->max is large, it seems better to use an loop like below?

for (i = 0; i < swap_header->info.nr_badpages; i++) {
/* check i and inc_cluster_info_page() */
}

in most cases, swap_header->info.nr_badpages should be much smaller than
p->max.

> +
> + cluster_list_init(&p->free_clusters);
> + cluster_list_init(&p->discard_clusters);
>
> /*
> * Reduce false cache line sharing between cluster_info and
> @@ -2994,7 +3019,13 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> idx);
> }
> }
> - return nr_extents;
> +
> + return cluster_info;
> +
> +err_free:
> + kvfree(cluster_info);
> +err:
> + return ERR_PTR(err);
> }
>

[snip]

--
Best Regards,
Huang, Ying

2024-03-26 12:52:12

by Johannes Weiner

[permalink] [raw]
Subject: Re: [RFC PATCH] mm: swapfile: fix SSD detection with swapfile on btrfs

Hi Ying,

Thanks for taking a look!

On Tue, Mar 26, 2024 at 01:47:45PM +0800, Huang, Ying wrote:
> Johannes Weiner <[email protected]> writes:
> > +static struct swap_cluster_info *setup_clusters(struct swap_info_struct *p,
> > + unsigned char *swap_map)
> > +{
> > + unsigned long nr_clusters = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
> > + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
> > + struct swap_cluster_info *cluster_info;
> > + unsigned long i, j, k, idx;
> > + int cpu, err = -ENOMEM;
> > +
> > + cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
> > if (!cluster_info)
> > - return nr_extents;
> > + goto err;
> > +
> > + for (i = 0; i < nr_clusters; i++)
> > + spin_lock_init(&cluster_info[i].lock);
> >
> > + p->cluster_next_cpu = alloc_percpu(unsigned int);
> > + if (!p->cluster_next_cpu)
> > + goto err_free;
> > +
> > + /* Random start position to help with wear leveling */
> > + for_each_possible_cpu(cpu)
> > + per_cpu(*p->cluster_next_cpu, cpu) =
> > + get_random_u32_inclusive(1, p->highest_bit);
> > +
> > + p->percpu_cluster = alloc_percpu(struct percpu_cluster);
> > + if (!p->percpu_cluster)
> > + goto err_free;
> > +
> > + for_each_possible_cpu(cpu) {
> > + struct percpu_cluster *cluster;
> > +
> > + cluster = per_cpu_ptr(p->percpu_cluster, cpu);
> > + cluster_set_null(&cluster->index);
> > + }
> > +
> > + /*
> > + * Mark unusable pages as unavailable. The clusters aren't
> > + * marked free yet, so no list operations are involved yet.
> > + */
> > + for (i = 0; i < round_up(p->max, SWAPFILE_CLUSTER); i++)
> > + if (i >= p->max || swap_map[i] == SWAP_MAP_BAD)
> > + inc_cluster_info_page(p, cluster_info, i);
>
> If p->max is large, it seems better to use an loop like below?
>
> for (i = 0; i < swap_header->info.nr_badpages; i++) {
> /* check i and inc_cluster_info_page() */
> }
>
> in most cases, swap_header->info.nr_badpages should be much smaller than
> p->max.

Yes, it's a little crappy. I've tried to not duplicate the smarts from
setup_swap_map_and_extents() to avoid bugs if they go out of
sync. Consulting the map directly is a bit more robust. Right now it's
the badpages, but also the header at map[0], that needs to be marked.

But you're right this could be slow with big files. I can send an
update and add a comment to keep the functions in sync.