Some small buddy allocator/struct page patches
The first two patches are small buddy allocator cleanups. They were meant as
prep work for replacing buddy allocator freelists with radix trees - I'm not
sure that's actually going to be feasible due to highmem - but the patches
are still improvements on their own, so I wanted to send them out.
The third patch, to mm/page_reporting, also came about because that code walks
buddy allocator freelists, but it is a much more significant cleanup. I have no
idea how to test page reporting, though, so it's only Correct By Obviousness -
hopefully Alexander can help us out with that.
The last two patches are important for the struct page cleanups currently
underway. We have a lot of code using page->index and page->mapping in ad-hoc
ways, which is a real problem given our goal of cutting struct page up into
different types that each have a well-defined purpose - and it turns out that a
lot of that code is using those fields for very minor conveniences. We still
need a lot more cleanups like this; I've only done two of the easier ones.
Kent Overstreet (5):
mm: Make free_area->nr_free per migratetype
mm: Introduce struct page_free_list
mm/page_reporting: Improve control flow
md: Kill usage of page->index
brd: Kill usage of page->index
drivers/block/brd.c | 4 --
drivers/md/md-bitmap.c | 44 ++++++------
include/linux/mmzone.h | 22 ++++--
kernel/crash_core.c | 4 +-
mm/compaction.c | 20 +++---
mm/page_alloc.c | 50 +++++++------
mm/page_reporting.c | 158 +++++++++++++++--------------------------
mm/vmstat.c | 28 +-------
8 files changed, 138 insertions(+), 192 deletions(-)
--
2.33.0
As part of the struct page cleanups underway, we want to remove as much
usage of page->mapping and page->index as possible, as frequently they
are known from context - as they are here in the md bitmap code.
Signed-off-by: Kent Overstreet <[email protected]>
---
drivers/md/md-bitmap.c | 44 ++++++++++++++++++++----------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e29c6298ef..dcdb4597c5 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, REQ_OP_READ, 0, true)) {
- page->index = index;
+ page, REQ_OP_READ, 0, true))
return 0;
- }
}
return -EIO;
}
@@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
return NULL;
}
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static int write_sb_page(struct bitmap *bitmap, struct page *page,
+ unsigned long index, int wait)
{
struct md_rdev *rdev;
struct block_device *bdev;
@@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
- if (page->index == store->file_pages-1) {
+ if (index == store->file_pages-1) {
int last_page_size = store->bytes & (PAGE_SIZE-1);
if (last_page_size == 0)
last_page_size = PAGE_SIZE;
@@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
*/
if (mddev->external) {
/* Bitmap could be anywhere. */
- if (rdev->sb_start + offset + (page->index
- * (PAGE_SIZE/512))
+ if (rdev->sb_start + offset + index * PAGE_SECTORS
> rdev->data_offset
&&
rdev->sb_start + offset
@@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
} else if (offset < 0) {
/* DATA BITMAP METADATA */
if (offset
- + (long)(page->index * (PAGE_SIZE/512))
+ + (long)(index * PAGE_SECTORS)
+ size/512 > 0)
/* bitmap runs in to metadata */
goto bad_alignment;
@@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
/* METADATA BITMAP DATA */
if (rdev->sb_start
+ offset
- + page->index*(PAGE_SIZE/512) + size/512
+ + index * PAGE_SECTORS + size/512
> rdev->data_offset)
/* bitmap runs in to data */
goto bad_alignment;
@@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
}
md_super_write(mddev, rdev,
rdev->sb_start + offset
- + page->index * (PAGE_SIZE/512),
+ + index * PAGE_SECTORS,
size,
page);
}
@@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
/*
* write out a page to a file
*/
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_page(struct bitmap *bitmap, struct page *page,
+ unsigned long index, int wait)
{
struct buffer_head *bh;
if (bitmap->storage.file == NULL) {
- switch (write_sb_page(bitmap, page, wait)) {
+ switch (write_sb_page(bitmap, page, index, wait)) {
case -EINVAL:
set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
}
@@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
blk_cur++;
bh = bh->b_this_page;
}
- page->index = index;
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
@@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb);
- write_page(bitmap, bitmap->storage.sb_page, 1);
+ write_page(bitmap, bitmap->storage.sb_page, 0, 1);
}
EXPORT_SYMBOL(md_bitmap_update_sb);
@@ -524,7 +522,6 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL)
return -ENOMEM;
- bitmap->storage.sb_page->index = 0;
sb = kmap_atomic(bitmap->storage.sb_page);
@@ -802,7 +799,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
- store->sb_page->index = offset;
}
for ( ; pnum < num_pages; pnum++) {
@@ -929,6 +925,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long node_offset = 0;
+ unsigned long index = file_page_index(store, chunk);
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -945,9 +942,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
else
set_bit_le(bit, kaddr);
kunmap_atomic(kaddr);
- pr_debug("set file bit %lu page %lu\n", bit, page->index);
+ pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}
static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -958,6 +955,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long node_offset = 0;
+ unsigned long index = file_page_index(store, chunk);
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -972,8 +970,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
else
clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
- if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
+ if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
}
@@ -1027,7 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
- write_page(bitmap, bitmap->storage.filemap[i], 0);
+ write_page(bitmap, bitmap->storage.filemap[i], i, 0);
writing = 1;
}
}
@@ -1137,7 +1135,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
kunmap_atomic(paddr);
- write_page(bitmap, page, 1);
+ write_page(bitmap, page, index, 1);
ret = -EIO;
if (test_bit(BITMAP_WRITE_ERROR,
@@ -1336,7 +1334,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
- write_page(bitmap, bitmap->storage.filemap[j], 0);
+ write_page(bitmap, bitmap->storage.filemap[j], j, 0);
}
}
--
2.33.0
As part of the struct page cleanups underway, we want to remove as much
usage of page->mapping and page->index as possible, as frequently they
are known from context.
In the brd code, we're never actually reading from page->index except in
assertions, so references to it can be safely deleted.
Signed-off-by: Kent Overstreet <[email protected]>
---
drivers/block/brd.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 58ec167aa0..0a55aed832 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -72,8 +72,6 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
page = radix_tree_lookup(&brd->brd_pages, idx);
rcu_read_unlock();
- BUG_ON(page && page->index != idx);
-
return page;
}
@@ -108,12 +106,10 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT;
- page->index = idx;
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
__free_page(page);
page = radix_tree_lookup(&brd->brd_pages, idx);
BUG_ON(!page);
- BUG_ON(page->index != idx);
} else {
brd->brd_nr_pages++;
}
--
2.33.0
This splits out page_reporting_get_pages() from page_reporting_cycle(),
which is a considerable simplification and lets us delete some
duplicated code. We're cleaning up code that touches page freelists as
prep work for possibly converting them to radix trees, but this is a
worthy cleanup on its own.
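In outline, and simplifying slightly from the diff below, the reporting loop
in page_reporting_cycle() becomes:

	while (budget > 0 && !err) {
		sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

		nr_pages = page_reporting_get_pages(prdev, zone, order, mt, sgl);
		if (!nr_pages)
			break;

		budget -= nr_pages;
		err = prdev->report(prdev, sgl, nr_pages);

		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, !err);
		spin_unlock_irq(&zone->lock);
	}

with all the freelist walking, page isolation and list rotation moved into
page_reporting_get_pages(), and the old *offset/leftover bookkeeping gone.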
Signed-off-by: Kent Overstreet <[email protected]>
---
mm/page_reporting.c | 154 ++++++++++++++++----------------------------
1 file changed, 54 insertions(+), 100 deletions(-)
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index c4362b4b0c..ab2be13d8e 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -71,10 +71,8 @@ void __page_reporting_notify(void)
static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
- struct scatterlist *sgl, unsigned int nents, bool reported)
+ struct scatterlist *sg, bool reported)
{
- struct scatterlist *sg = sgl;
-
/*
* Drain the now reported pages back into their respective
* free lists/areas. We assume at least one page is populated.
@@ -100,9 +98,45 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
+}
+
+static int
+page_reporting_get_pages(struct page_reporting_dev_info *prdev, struct zone *zone,
+ unsigned int order, unsigned int mt,
+ struct scatterlist *sgl)
+{
+ struct page_free_list *list = &zone->free_area[order].free[mt];
+ unsigned int page_len = PAGE_SIZE << order;
+ struct page *page, *next;
+ unsigned nr_got = 0;
+
+ spin_lock_irq(&zone->lock);
+
+ /* loop through free list adding unreported pages to sg list */
+ list_for_each_entry_safe(page, next, &list->list, lru) {
+ /* We are going to skip over the reported pages. */
+ if (PageReported(page))
+ continue;
+
+ /* Attempt to pull page from list and place in scatterlist */
+ if (!__isolate_free_page(page, order)) {
+ next = page;
+ break;
+ }
+
+ sg_set_page(&sgl[nr_got++], page, page_len, 0);
+ if (nr_got == PAGE_REPORTING_CAPACITY)
+ break;
+ }
+
+ /* Rotate any leftover pages to the head of the freelist */
+ if (!list_entry_is_head(next, &list->list, lru) &&
+ !list_is_first(&next->lru, &list->list))
+ list_rotate_to_front(&next->lru, &list->list);
+
+ spin_unlock_irq(&zone->lock);
- /* reinitialize scatterlist now that it is empty */
- sg_init_table(sgl, nents);
+ return nr_got;
}
/*
@@ -113,23 +147,13 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
unsigned int order, unsigned int mt,
- struct scatterlist *sgl, unsigned int *offset)
+ struct scatterlist *sgl)
{
struct page_free_list *list = &zone->free_area[order].free[mt];
- unsigned int page_len = PAGE_SIZE << order;
- struct page *page, *next;
+ unsigned nr_pages;
long budget;
int err = 0;
- /*
- * Perform early check, if free area is empty there is
- * nothing to process so we can skip this free_list.
- */
- if (list_empty(&list->list))
- return err;
-
- spin_lock_irq(&zone->lock);
-
/*
* Limit how many calls we will be making to the page reporting
* device for this list. By doing this we avoid processing any
@@ -146,80 +170,25 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
*/
budget = DIV_ROUND_UP(list->nr, PAGE_REPORTING_CAPACITY * 16);
- /* loop through free list adding unreported pages to sg list */
- list_for_each_entry_safe(page, next, &list->list, lru) {
- /* We are going to skip over the reported pages. */
- if (PageReported(page))
- continue;
+ while (budget > 0 && !err) {
+ sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
- /*
- * If we fully consumed our budget then update our
- * state to indicate that we are requesting additional
- * processing and exit this list.
- */
- if (budget < 0) {
- atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
- next = page;
+ nr_pages = page_reporting_get_pages(prdev, zone, order, mt, sgl);
+ if (!nr_pages)
break;
- }
-
- /* Attempt to pull page from list and place in scatterlist */
- if (*offset) {
- if (!__isolate_free_page(page, order)) {
- next = page;
- break;
- }
-
- /* Add page to scatter list */
- --(*offset);
- sg_set_page(&sgl[*offset], page, page_len, 0);
-
- continue;
- }
-
- /*
- * Make the first non-reported page in the free list
- * the new head of the free list before we release the
- * zone lock.
- */
- if (!list_is_first(&page->lru, &list->list))
- list_rotate_to_front(&page->lru, &list->list);
-
- /* release lock before waiting on report processing */
- spin_unlock_irq(&zone->lock);
-
- /* begin processing pages in local list */
- err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
- /* reset offset since the full list was reported */
- *offset = PAGE_REPORTING_CAPACITY;
+ sg_mark_end(sgl + nr_pages);
- /* update budget to reflect call to report function */
- budget--;
+ budget -= nr_pages;
+ err = prdev->report(prdev, sgl, nr_pages);
- /* reacquire zone lock and resume processing */
spin_lock_irq(&zone->lock);
-
- /* flush reported pages from the sg list */
- page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
-
- /*
- * Reset next to first entry, the old next isn't valid
- * since we dropped the lock to report the pages
- */
- next = list_first_entry(&list->list, struct page, lru);
-
- /* exit on error */
- if (err)
- break;
+ page_reporting_drain(prdev, sgl, !err);
+ spin_unlock_irq(&zone->lock);
}
- /* Rotate any leftover pages to the head of the freelist */
- if (!list_entry_is_head(next, &list->list, lru) &&
- !list_is_first(&next->lru, &list->list))
- list_rotate_to_front(&next->lru, &list->list);
-
- spin_unlock_irq(&zone->lock);
+ if (budget <= 0 && list->nr)
+ atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
return err;
}
@@ -228,7 +197,7 @@ static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
struct scatterlist *sgl, struct zone *zone)
{
- unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+ unsigned int order, mt;
unsigned long watermark;
int err = 0;
@@ -250,25 +219,12 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
if (is_migrate_isolate(mt))
continue;
- err = page_reporting_cycle(prdev, zone, order, mt,
- sgl, &offset);
+ err = page_reporting_cycle(prdev, zone, order, mt, sgl);
if (err)
return err;
}
}
- /* report the leftover pages before going idle */
- leftover = PAGE_REPORTING_CAPACITY - offset;
- if (leftover) {
- sgl = &sgl[offset];
- err = prdev->report(prdev, sgl, leftover);
-
- /* flush any remaining pages out from the last report */
- spin_lock_irq(&zone->lock);
- page_reporting_drain(prdev, sgl, leftover, !err);
- spin_unlock_irq(&zone->lock);
- }
-
return err;
}
@@ -294,8 +250,6 @@ static void page_reporting_process(struct work_struct *work)
if (!sgl)
goto err_out;
- sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
-
for_each_zone(zone) {
err = page_reporting_process_zone(prdev, sgl, zone);
if (err)
--
2.33.0
This is prep work for introducing a struct page_free_list, which will have a
list head and nr_free. It turns out a fair amount of the code looking at
free_area->nr_free actually wants the number of elements on a particular
freelist.
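A sketch of that struct, as it's used by the later patches in this series:

	struct page_free_list {
		struct list_head	list;
		unsigned long		nr;
	};

	struct free_area {
		struct page_free_list	free[MIGRATE_TYPES];
	};

Making nr_free per migratetype first keeps that later conversion purely
mechanical.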
Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/mmzone.h | 14 ++++++++++++--
mm/page_alloc.c | 30 +++++++++++++++++-------------
mm/page_reporting.c | 2 +-
mm/vmstat.c | 28 +++-------------------------
4 files changed, 33 insertions(+), 41 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a1d79d846..089587b918 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -96,7 +96,7 @@ extern int page_group_by_mobility_disabled;
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
- unsigned long nr_free;
+ unsigned long nr_free[MIGRATE_TYPES];
};
static inline struct page *get_page_from_free_area(struct free_area *area,
@@ -108,7 +108,17 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
- return list_empty(&area->free_list[migratetype]);
+ return area->nr_free[migratetype] == 0;
+}
+
+static inline size_t free_area_nr_free(struct free_area *area)
+{
+ unsigned migratetype;
+ size_t nr_free = 0;
+
+ for (migratetype = 0; migratetype < MIGRATE_TYPES; migratetype++)
+ nr_free += area->nr_free[migratetype];
+ return nr_free;
}
struct pglist_data;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b37435c274..8918c00a91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -966,7 +966,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
struct free_area *area = &zone->free_area[order];
list_add(&page->lru, &area->free_list[migratetype]);
- area->nr_free++;
+ area->nr_free[migratetype]++;
}
/* Used for pages not on another list */
@@ -976,7 +976,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
struct free_area *area = &zone->free_area[order];
list_add_tail(&page->lru, &area->free_list[migratetype]);
- area->nr_free++;
+ area->nr_free[migratetype]++;
}
/*
@@ -993,7 +993,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
- unsigned int order)
+ unsigned int order, int migratetype)
{
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -1002,7 +1002,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
list_del(&page->lru);
__ClearPageBuddy(page);
set_page_private(page, 0);
- zone->free_area[order].nr_free--;
+ zone->free_area[order].nr_free[migratetype]--;
}
/*
@@ -1098,7 +1098,7 @@ static inline void __free_one_page(struct page *page,
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
- del_page_from_free_list(buddy, zone, order);
+ del_page_from_free_list(buddy, zone, order, migratetype);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -2456,7 +2456,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_list(page, zone, current_order);
+ del_page_from_free_list(page, zone, current_order, migratetype);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -3525,7 +3525,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
/* Remove page from free list */
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order, mt);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -6038,14 +6038,16 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
struct free_area *area = &zone->free_area[order];
int type;
- nr[order] = area->nr_free;
- total += nr[order] << order;
+ nr[order] = 0;
+ types[order] = 0;
- types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
if (!free_area_empty(area, type))
types[order] |= 1 << type;
+ nr[order] += area->nr_free[type];
}
+
+ total += nr[order] << order;
}
spin_unlock_irqrestore(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
@@ -6623,7 +6625,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
- zone->free_area[order].nr_free = 0;
+ zone->free_area[order].nr_free[t] = 0;
}
}
@@ -9317,6 +9319,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
struct page *page;
struct zone *zone;
unsigned int order;
+ unsigned int migratetype;
unsigned long flags;
offline_mem_sections(pfn, end_pfn);
@@ -9346,7 +9349,8 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = buddy_order(page);
- del_page_from_free_list(page, zone, order);
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ del_page_from_free_list(page, zone, order, migratetype);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -9428,7 +9432,7 @@ bool take_page_off_buddy(struct page *page)
int migratetype = get_pfnblock_migratetype(page_head,
pfn_head);
- del_page_from_free_list(page_head, zone, page_order);
+ del_page_from_free_list(page_head, zone, page_order, migratetype);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
if (!is_migrate_isolate(migratetype))
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 382958eef8..4e45ae95db 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -145,7 +145,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* The division here should be cheap since PAGE_REPORTING_CAPACITY
* should always be a power of 2.
*/
- budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+ budget = DIV_ROUND_UP(area->nr_free[mt], PAGE_REPORTING_CAPACITY * 16);
/* loop through free list adding unreported pages to sg list */
list_for_each_entry_safe(page, next, list, lru) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce2620344..eb46f99c72 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1071,7 +1071,7 @@ static void fill_contig_page_info(struct zone *zone,
unsigned long blocks;
/* Count number of free blocks */
- blocks = zone->free_area[order].nr_free;
+ blocks = free_area_nr_free(&zone->free_area[order]);
info->free_blocks_total += blocks;
/* Count free base pages */
@@ -1445,7 +1445,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_printf(m, "%6zu ", free_area_nr_free(&zone->free_area[order]));
seq_putc(m, '\n');
}
@@ -1470,29 +1470,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
zone->name,
migratetype_names[mtype]);
for (order = 0; order < MAX_ORDER; ++order) {
- unsigned long freecount = 0;
- struct free_area *area;
- struct list_head *curr;
- bool overflow = false;
-
- area = &(zone->free_area[order]);
-
- list_for_each(curr, &area->free_list[mtype]) {
- /*
- * Cap the free_list iteration because it might
- * be really large and we are under a spinlock
- * so a long time spent here could trigger a
- * hard lockup detector. Anyway this is a
- * debugging tool so knowing there is a handful
- * of pages of this order should be more than
- * sufficient.
- */
- if (++freecount >= 100000) {
- overflow = true;
- break;
- }
- }
- seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
+ seq_printf(m, "%6zu ", zone->free_area[order].nr_free[mtype]);
spin_unlock_irq(&zone->lock);
cond_resched();
spin_lock_irq(&zone->lock);
--
2.33.0
Mostly LGTM. I recall that in some corner cases the migratetype stored
for a pcppage does not correspond to the pagetype of the pfnblock ... I
do wonder if that can trick us here into doing some accounting wrong, now
that we account free pages per migratetype.
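(For reference, this is the pcppage caching in mm/page_alloc.c that I mean:
the migratetype is cached in the page when it goes onto the pcp lists, while
the pageblock type lives in the zone's pageblock bitmap, so the two can go
out of sync:

	static inline int get_pcppage_migratetype(struct page *page)
	{
		return page->index;
	}

	static inline void set_pcppage_migratetype(struct page *page, int migratetype)
	{
		page->index = migratetype;
	}

)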
> /*
> * Set the pageblock if the isolated page is at least half of a
> @@ -6038,14 +6038,16 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
> struct free_area *area = &zone->free_area[order];
> int type;
>
> - nr[order] = area->nr_free;
> - total += nr[order] << order;
> + nr[order] = 0;
> + types[order] = 0;
Why the indentation change? Looks unrelated to me.
>
> - types[order] = 0;
> for (type = 0; type < MIGRATE_TYPES; type++) {
> if (!free_area_empty(area, type))
> types[order] |= 1 << type;
> + nr[order] += area->nr_free[type];
> }
> +
> + total += nr[order] << order;
> }
> spin_unlock_irqrestore(&zone->lock, flags);
> for (order = 0; order < MAX_ORDER; order++) {
> @@ -6623,7 +6625,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
> unsigned int order, t;
> for_each_migratetype_order(order, t) {
> INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
> - zone->free_area[order].nr_free = 0;
> + zone->free_area[order].nr_free[t] = 0;
> }
> }
>
> @@ -9317,6 +9319,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> struct page *page;
> struct zone *zone;
> unsigned int order;
> + unsigned int migratetype;
> unsigned long flags;
>
> offline_mem_sections(pfn, end_pfn);
> @@ -9346,7 +9349,8 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> BUG_ON(page_count(page));
> BUG_ON(!PageBuddy(page));
> order = buddy_order(page);
> - del_page_from_free_list(page, zone, order);
> + migratetype = get_pfnblock_migratetype(page, pfn);
As the free pages are isolated, theoretically this should be
MIGRATE_ISOLATE.
> + del_page_from_free_list(page, zone, order, migratetype);
> pfn += (1 << order);
> }
> spin_unlock_irqrestore(&zone->lock, flags);
> @@ -9428,7 +9432,7 @@ bool take_page_off_buddy(struct page *page)
> int migratetype = get_pfnblock_migratetype(page_head,
> pfn_head);
>
> - del_page_from_free_list(page_head, zone, page_order);
> + del_page_from_free_list(page_head, zone, page_order, migratetype);
> break_down_buddy_pages(zone, page_head, page, 0,
> page_order, migratetype);
> if (!is_migrate_isolate(migratetype))
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 382958eef8..4e45ae95db 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -145,7 +145,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
> * The division here should be cheap since PAGE_REPORTING_CAPACITY
> * should always be a power of 2.
> */
> - budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
> + budget = DIV_ROUND_UP(area->nr_free[mt], PAGE_REPORTING_CAPACITY * 16);
>
I think we might want the total free pages here. If we want to change
the behavior, we should do it in a separate patch.
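I.e., keep the old behavior by using the total via the new helper, something
like:

	budget = DIV_ROUND_UP(free_area_nr_free(area), PAGE_REPORTING_CAPACITY * 16);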
--
Thanks,
David / dhildenb
On 10/14/21 12:00 AM, Kent Overstreet wrote:
> As part of the struct page cleanups underway, we want to remove as much
> usage of page->mapping and page->index as possible, as frequently they
> are known from context - as they are here in the md bitmap code.
>
> Signed-off-by: Kent Overstreet <[email protected]>
> ---
> drivers/md/md-bitmap.c | 44 ++++++++++++++++++++----------------------
> 1 file changed, 21 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index e29c6298ef..dcdb4597c5 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
>
> if (sync_page_io(rdev, target,
> roundup(size, bdev_logical_block_size(rdev->bdev)),
> - page, REQ_OP_READ, 0, true)) {
> - page->index = index;
> + page, REQ_OP_READ, 0, true))
> return 0;
> - }
> }
> return -EIO;
> }
> @@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
> return NULL;
> }
>
> -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> +static int write_sb_page(struct bitmap *bitmap, struct page *page,
> + unsigned long index, int wait)
> {
> struct md_rdev *rdev;
> struct block_device *bdev;
> @@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>
> bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>
> - if (page->index == store->file_pages-1) {
> + if (index == store->file_pages-1) {
> int last_page_size = store->bytes & (PAGE_SIZE-1);
> if (last_page_size == 0)
> last_page_size = PAGE_SIZE;
> @@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> */
> if (mddev->external) {
> /* Bitmap could be anywhere. */
> - if (rdev->sb_start + offset + (page->index
> - * (PAGE_SIZE/512))
> + if (rdev->sb_start + offset + index * PAGE_SECTORS
> > rdev->data_offset
> &&
> rdev->sb_start + offset
> @@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> } else if (offset < 0) {
> /* DATA BITMAP METADATA */
> if (offset
> - + (long)(page->index * (PAGE_SIZE/512))
> + + (long)(index * PAGE_SECTORS)
> + size/512 > 0)
> /* bitmap runs in to metadata */
> goto bad_alignment;
> @@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> /* METADATA BITMAP DATA */
> if (rdev->sb_start
> + offset
> - + page->index*(PAGE_SIZE/512) + size/512
> + + index * PAGE_SECTORS + size/512
> > rdev->data_offset)
> /* bitmap runs in to data */
> goto bad_alignment;
> @@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> }
> md_super_write(mddev, rdev,
> rdev->sb_start + offset
> - + page->index * (PAGE_SIZE/512),
> + + index * PAGE_SECTORS,
> size,
> page);
> }
> @@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
> /*
> * write out a page to a file
> */
> -static void write_page(struct bitmap *bitmap, struct page *page, int wait)
> +static void write_page(struct bitmap *bitmap, struct page *page,
> + unsigned long index, int wait)
> {
> struct buffer_head *bh;
>
> if (bitmap->storage.file == NULL) {
> - switch (write_sb_page(bitmap, page, wait)) {
> + switch (write_sb_page(bitmap, page, index, wait)) {
> case -EINVAL:
> set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
> }
> @@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
> blk_cur++;
> bh = bh->b_this_page;
> }
> - page->index = index;
>
> wait_event(bitmap->write_wait,
> atomic_read(&bitmap->pending_writes)==0);
> @@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
> sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
> bitmap_info.space);
> kunmap_atomic(sb);
> - write_page(bitmap, bitmap->storage.sb_page, 1);
> + write_page(bitmap, bitmap->storage.sb_page, 0, 1);
> }
> EXPORT_SYMBOL(md_bitmap_update_sb);
>
> @@ -524,7 +522,6 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
> bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> if (bitmap->storage.sb_page == NULL)
> return -ENOMEM;
> - bitmap->storage.sb_page->index = 0;
>
> sb = kmap_atomic(bitmap->storage.sb_page);
>
> @@ -802,7 +799,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
> if (store->sb_page) {
> store->filemap[0] = store->sb_page;
> pnum = 1;
> - store->sb_page->index = offset;
The offset is related to the slot number, so it would be better to verify the
change with clustered raid.
@Heming
Thanks,
Guoqing
Hello all,
page->index plays an important role for the cluster-md module.
E.g., in a two-node HA env, node A's bitmap may be managed in the first 4K
area, while node B's bitmap is in the 8K area (the second page). This patch
removes the index and hardcodes it to 0, which will only operate on the first
node's bitmap.
If this series of patches is important and must be merged into mainline, we
should redesign the logic of the related code.
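Concretely, in md_bitmap_storage_alloc() the index encodes the cluster slot:

	offset = slot_number * num_pages;
	...
	store->sb_page->index = offset;
	...
	store->filemap[pnum]->index = pnum + offset;

so slot 0 (node A) gets the first num_pages pages and slot 1 (node B) the
next ones; hardcoding the index to 0 always hits node A's bitmap area.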
Thanks,
Heming
On 10/14/21 16:02, Guoqing Jiang wrote:
>
>
> On 10/14/21 12:00 AM, Kent Overstreet wrote:
>> As part of the struct page cleanups underway, we want to remove as much
>> usage of page->mapping and page->index as possible, as frequently they
>> are known from context - as they are here in the md bitmap code.
>>
>> Signed-off-by: Kent Overstreet <[email protected]>
>> ---
>> drivers/md/md-bitmap.c | 44 ++++++++++++++++++++----------------------
>> 1 file changed, 21 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>> index e29c6298ef..dcdb4597c5 100644
>> --- a/drivers/md/md-bitmap.c
>> +++ b/drivers/md/md-bitmap.c
>> @@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
>> if (sync_page_io(rdev, target,
>> roundup(size, bdev_logical_block_size(rdev->bdev)),
>> - page, REQ_OP_READ, 0, true)) {
>> - page->index = index;
>> + page, REQ_OP_READ, 0, true))
>> return 0;
>> - }
>> }
>> return -EIO;
>> }
>> @@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
>> return NULL;
>> }
>> -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> +static int write_sb_page(struct bitmap *bitmap, struct page *page,
>> + unsigned long index, int wait)
>> {
>> struct md_rdev *rdev;
>> struct block_device *bdev;
>> @@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>> - if (page->index == store->file_pages-1) {
>> + if (index == store->file_pages-1) {
>> int last_page_size = store->bytes & (PAGE_SIZE-1);
>> if (last_page_size == 0)
>> last_page_size = PAGE_SIZE;
>> @@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> */
>> if (mddev->external) {
>> /* Bitmap could be anywhere. */
>> - if (rdev->sb_start + offset + (page->index
>> - * (PAGE_SIZE/512))
>> + if (rdev->sb_start + offset + index * PAGE_SECTORS
>> > rdev->data_offset
>> &&
>> rdev->sb_start + offset
>> @@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> } else if (offset < 0) {
>> /* DATA BITMAP METADATA */
>> if (offset
>> - + (long)(page->index * (PAGE_SIZE/512))
>> + + (long)(index * PAGE_SECTORS)
>> + size/512 > 0)
>> /* bitmap runs in to metadata */
>> goto bad_alignment;
>> @@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> /* METADATA BITMAP DATA */
>> if (rdev->sb_start
>> + offset
>> - + page->index*(PAGE_SIZE/512) + size/512
>> + + index * PAGE_SECTORS + size/512
>> > rdev->data_offset)
>> /* bitmap runs in to data */
>> goto bad_alignment;
>> @@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> }
>> md_super_write(mddev, rdev,
>> rdev->sb_start + offset
>> - + page->index * (PAGE_SIZE/512),
>> + + index * PAGE_SECTORS,
>> size,
>> page);
>> }
>> @@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
>> /*
>> * write out a page to a file
>> */
>> -static void write_page(struct bitmap *bitmap, struct page *page, int wait)
>> +static void write_page(struct bitmap *bitmap, struct page *page,
>> + unsigned long index, int wait)
>> {
>> struct buffer_head *bh;
>> if (bitmap->storage.file == NULL) {
>> - switch (write_sb_page(bitmap, page, wait)) {
>> + switch (write_sb_page(bitmap, page, index, wait)) {
>> case -EINVAL:
>> set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
>> }
>> @@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
>> blk_cur++;
>> bh = bh->b_this_page;
>> }
>> - page->index = index;
>> wait_event(bitmap->write_wait,
>> atomic_read(&bitmap->pending_writes)==0);
>> @@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
>> sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
>> bitmap_info.space);
>> kunmap_atomic(sb);
>> - write_page(bitmap, bitmap->storage.sb_page, 1);
>> + write_page(bitmap, bitmap->storage.sb_page, 0, 1);
>> }
>> EXPORT_SYMBOL(md_bitmap_update_sb);
>> @@ -524,7 +522,6 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
>> bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>> if (bitmap->storage.sb_page == NULL)
>> return -ENOMEM;
>> - bitmap->storage.sb_page->index = 0;
>> sb = kmap_atomic(bitmap->storage.sb_page);
>> @@ -802,7 +799,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
>> if (store->sb_page) {
>> store->filemap[0] = store->sb_page;
>> pnum = 1;
>> - store->sb_page->index = offset;
>
> The offset is related to the slot number, so it would be better to verify the change with clustered raid.
>
> @Heming
>
>
> Thanks,
> Guoqing
>
On Thu, Oct 14, 2021 at 04:58:46PM +0800, [email protected] wrote:
> Hello all,
>
> page->index plays an important role for the cluster-md module.
> E.g., in a two-node HA env, node A's bitmap may be managed in the first 4K
> area, while node B's bitmap is in the 8K area (the second page). This patch
> removes the index and hardcodes it to 0, which will only operate on the
> first node's bitmap.
>
> If this series of patches is important and must be merged into mainline, we
> should redesign the logic of the related code.
>
> Thanks,
> Heming
Can you look at and test the updated patch below? The more I look at the md
bitmap code the more it scares me.
-- >8 --
Subject: [PATCH] md: Kill usage of page->index
As part of the struct page cleanups underway, we want to remove as much
usage of page->mapping and page->index as possible, as frequently they
are known from context - as they are here in the md bitmap code.
Signed-off-by: Kent Overstreet <[email protected]>
---
drivers/md/md-bitmap.c | 49 ++++++++++++++++++++----------------------
drivers/md/md-bitmap.h | 1 +
2 files changed, 24 insertions(+), 26 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e29c6298ef..316e4cd5a7 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, REQ_OP_READ, 0, true)) {
- page->index = index;
+ page, REQ_OP_READ, 0, true))
return 0;
- }
}
return -EIO;
}
@@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
return NULL;
}
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static int write_sb_page(struct bitmap *bitmap, struct page *page,
+ unsigned long index, int wait)
{
struct md_rdev *rdev;
struct block_device *bdev;
@@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
- if (page->index == store->file_pages-1) {
+ if (index == store->file_pages-1) {
int last_page_size = store->bytes & (PAGE_SIZE-1);
if (last_page_size == 0)
last_page_size = PAGE_SIZE;
@@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
*/
if (mddev->external) {
/* Bitmap could be anywhere. */
- if (rdev->sb_start + offset + (page->index
- * (PAGE_SIZE/512))
+ if (rdev->sb_start + offset + index * PAGE_SECTORS
> rdev->data_offset
&&
rdev->sb_start + offset
@@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
} else if (offset < 0) {
/* DATA BITMAP METADATA */
if (offset
- + (long)(page->index * (PAGE_SIZE/512))
+ + (long)(index * PAGE_SECTORS)
+ size/512 > 0)
/* bitmap runs in to metadata */
goto bad_alignment;
@@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
/* METADATA BITMAP DATA */
if (rdev->sb_start
+ offset
- + page->index*(PAGE_SIZE/512) + size/512
+ + index * PAGE_SECTORS + size/512
> rdev->data_offset)
/* bitmap runs in to data */
goto bad_alignment;
@@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
}
md_super_write(mddev, rdev,
rdev->sb_start + offset
- + page->index * (PAGE_SIZE/512),
+ + index * PAGE_SECTORS,
size,
page);
}
@@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
/*
* write out a page to a file
*/
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_page(struct bitmap *bitmap, struct page *page,
+ unsigned long index, int wait)
{
struct buffer_head *bh;
if (bitmap->storage.file == NULL) {
- switch (write_sb_page(bitmap, page, wait)) {
+ switch (write_sb_page(bitmap, page, index, wait)) {
case -EINVAL:
set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
}
@@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
blk_cur++;
bh = bh->b_this_page;
}
- page->index = index;
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
@@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb);
- write_page(bitmap, bitmap->storage.sb_page, 1);
+ write_page(bitmap, bitmap->storage.sb_page, bitmap->storage.sb_index, 1);
}
EXPORT_SYMBOL(md_bitmap_update_sb);
@@ -524,7 +522,6 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL)
return -ENOMEM;
- bitmap->storage.sb_page->index = 0;
sb = kmap_atomic(bitmap->storage.sb_page);
@@ -776,7 +773,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
unsigned long chunks, int with_super,
int slot_number)
{
- int pnum, offset = 0;
+ int pnum;
unsigned long num_pages;
unsigned long bytes;
@@ -785,7 +782,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
bytes += sizeof(bitmap_super_t);
num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
- offset = slot_number * num_pages;
+ store->sb_index = slot_number * num_pages;
store->filemap = kmalloc_array(num_pages, sizeof(struct page *),
GFP_KERNEL);
@@ -802,7 +799,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
- store->sb_page->index = offset;
}
for ( ; pnum < num_pages; pnum++) {
@@ -811,7 +807,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
store->file_pages = pnum;
return -ENOMEM;
}
- store->filemap[pnum]->index = pnum + offset;
}
store->file_pages = pnum;
@@ -929,6 +924,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long node_offset = 0;
+ unsigned long index = file_page_index(store, chunk);
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -945,9 +941,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
else
set_bit_le(bit, kaddr);
kunmap_atomic(kaddr);
- pr_debug("set file bit %lu page %lu\n", bit, page->index);
+ pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}
static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -958,6 +954,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long node_offset = 0;
+ unsigned long index = file_page_index(store, chunk);
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -972,8 +969,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
else
clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
- if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
+ if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
}
@@ -1027,7 +1024,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
- write_page(bitmap, bitmap->storage.filemap[i], 0);
+ write_page(bitmap, bitmap->storage.filemap[i], i, 0);
writing = 1;
}
}
@@ -1137,7 +1134,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
kunmap_atomic(paddr);
- write_page(bitmap, page, 1);
+ write_page(bitmap, page, index, 1);
ret = -EIO;
if (test_bit(BITMAP_WRITE_ERROR,
@@ -1336,7 +1333,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
- write_page(bitmap, bitmap->storage.filemap[j], 0);
+ write_page(bitmap, bitmap->storage.filemap[j], j, 0);
}
}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index cfd7395de8..6e820eea32 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -207,6 +207,7 @@ struct bitmap {
* w/ filemap pages */
unsigned long file_pages; /* number of pages in the file*/
unsigned long bytes; /* total bytes in the bitmap */
+ unsigned long sb_index; /* sb page index */
} storage;
unsigned long flags;
--
2.33.0
On 13.10.21 18:00, Kent Overstreet wrote:
> As part of the struct page cleanups underway, we want to remove as much
> usage of page->mapping and page->index as possible, as frequently they
> are known from context.
>
> In the brd code, we're never actually reading from page->index except in
> assertions, so references to it can be safely deleted.
>
> Signed-off-by: Kent Overstreet <[email protected]>
> ---
> drivers/block/brd.c | 4 ----
> 1 file changed, 4 deletions(-)
>
> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
> index 58ec167aa0..0a55aed832 100644
> --- a/drivers/block/brd.c
> +++ b/drivers/block/brd.c
> @@ -72,8 +72,6 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
> page = radix_tree_lookup(&brd->brd_pages, idx);
> rcu_read_unlock();
>
> - BUG_ON(page && page->index != idx);
> -
> return page;
> }
>
> @@ -108,12 +106,10 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
>
> spin_lock(&brd->brd_lock);
> idx = sector >> PAGE_SECTORS_SHIFT;
> - page->index = idx;
> if (radix_tree_insert(&brd->brd_pages, idx, page)) {
> __free_page(page);
> page = radix_tree_lookup(&brd->brd_pages, idx);
> BUG_ON(!page);
> - BUG_ON(page->index != idx);
> } else {
> brd->brd_nr_pages++;
> }
>
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On Wed, Oct 13, 2021 at 12:00:34PM -0400, Kent Overstreet wrote:
> As part of the struct page cleanups underway, we want to remove as much
> usage of page->mapping and page->index as possible, as frequently they
> are known from context.
>
> In the brd code, we're never actually reading from page->index except in
> assertions, so references to it can be safely deleted.
>
> Signed-off-by: Kent Overstreet <[email protected]>
More than that ... this is essentially asserting that the radix tree
code works, and we have a test suite to ensure that.
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
On Wed, Oct 13, 2021 at 06:33:06PM +0200, David Hildenbrand wrote:
> > @@ -9317,6 +9319,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> > struct page *page;
> > struct zone *zone;
> > unsigned int order;
> > + unsigned int migratetype;
> > unsigned long flags;
> >
> > offline_mem_sections(pfn, end_pfn);
> > @@ -9346,7 +9349,8 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> > BUG_ON(page_count(page));
> > BUG_ON(!PageBuddy(page));
> > order = buddy_order(page);
> > - del_page_from_free_list(page, zone, order);
> > + migratetype = get_pfnblock_migratetype(page, pfn);
>
> As the free pages are isolated, theoretically this should be
> MIGRATE_ISOLATE.
Thanks for noticing that - I somehow missed the fact that pageblock
migratetypes change at runtime, so my patch is wrong. I'm going to have to
rework it to store the migratetype of free pages in the page itself.
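Likely by reusing the same trick the pcp lists already use - a sketch of the
idea only, not a tested patch:

	/* when adding to a freelist, remember which list the page is on: */
	set_pcppage_migratetype(page, migratetype);

	/*
	 * when deleting, read it back instead of re-reading the pageblock
	 * bitmap, which may have changed in the meantime:
	 */
	migratetype = get_pcppage_migratetype(page);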
On 10/14/21 10:30 PM, Kent Overstreet wrote:
> On Thu, Oct 14, 2021 at 04:58:46PM +0800, [email protected] wrote:
>> Hello all,
>>
>> page->index plays an important role for the cluster-md module.
>> E.g., in a two-node HA env, node A's bitmap may be managed in the first 4K
>> area, while node B's bitmap is in the 8K area (the second page). This patch
>> removes the index and hardcodes it to 0, which will only operate on the
>> first node's bitmap.
>>
>> If this series of patches is important and must be merged into mainline, we
>> should redesign the logic of the related code.
>>
>> Thanks,
>> Heming
> Can you look at and test the updated patch below? The more I look at the md
> bitmap code the more it scares me.
>
> -- >8 --
> Subject: [PATCH] md: Kill usage of page->index
>
> As part of the struct page cleanups underway, we want to remove as much
> usage of page->mapping and page->index as possible, as frequently they
> are known from context - as they are here in the md bitmap code.
>
> Signed-off-by: Kent Overstreet<[email protected]>
> ---
> drivers/md/md-bitmap.c | 49 ++++++++++++++++++++----------------------
> drivers/md/md-bitmap.h | 1 +
> 2 files changed, 24 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index e29c6298ef..316e4cd5a7 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
>
> if (sync_page_io(rdev, target,
> roundup(size, bdev_logical_block_size(rdev->bdev)),
> - page, REQ_OP_READ, 0, true)) {
> - page->index = index;
> + page, REQ_OP_READ, 0, true))
> return 0;
> - }
> }
> return -EIO;
> }
> @@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
> return NULL;
> }
>
> -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> +static int write_sb_page(struct bitmap *bitmap, struct page *page,
> + unsigned long index, int wait)
> {
> struct md_rdev *rdev;
> struct block_device *bdev;
> @@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>
> bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>
> - if (page->index == store->file_pages-1) {
> + if (index == store->file_pages-1) {
> int last_page_size = store->bytes & (PAGE_SIZE-1);
> if (last_page_size == 0)
> last_page_size = PAGE_SIZE;
> @@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> */
> if (mddev->external) {
> /* Bitmap could be anywhere. */
> - if (rdev->sb_start + offset + (page->index
> - * (PAGE_SIZE/512))
> + if (rdev->sb_start + offset + index * PAGE_SECTORS
> > rdev->data_offset
> &&
> rdev->sb_start + offset
> @@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> } else if (offset < 0) {
> /* DATA BITMAP METADATA */
> if (offset
> - + (long)(page->index * (PAGE_SIZE/512))
> + + (long)(index * PAGE_SECTORS)
> + size/512 > 0)
> /* bitmap runs in to metadata */
> goto bad_alignment;
> @@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> /* METADATA BITMAP DATA */
> if (rdev->sb_start
> + offset
> - + page->index*(PAGE_SIZE/512) + size/512
> + + index * PAGE_SECTORS + size/512
> > rdev->data_offset)
> /* bitmap runs in to data */
> goto bad_alignment;
> @@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
> }
> md_super_write(mddev, rdev,
> rdev->sb_start + offset
> - + page->index * (PAGE_SIZE/512),
> + + index * PAGE_SECTORS,
> size,
> page);
> }
> @@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
> /*
> * write out a page to a file
> */
> -static void write_page(struct bitmap *bitmap, struct page *page, int wait)
> +static void write_page(struct bitmap *bitmap, struct page *page,
> + unsigned long index, int wait)
> {
> struct buffer_head *bh;
>
> if (bitmap->storage.file == NULL) {
> - switch (write_sb_page(bitmap, page, wait)) {
> + switch (write_sb_page(bitmap, page, index, wait)) {
> case -EINVAL:
> set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
> }
> @@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
> blk_cur++;
> bh = bh->b_this_page;
> }
> - page->index = index;
>
> wait_event(bitmap->write_wait,
> atomic_read(&bitmap->pending_writes)==0);
> @@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
> sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
> bitmap_info.space);
> kunmap_atomic(sb);
> - write_page(bitmap, bitmap->storage.sb_page, 1);
> + write_page(bitmap, bitmap->storage.sb_page, bitmap->storage.sb_index, 1);
I guess it is fine for sb_page now.
[...]
> @@ -1027,7 +1024,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
> "md bitmap_unplug");
> }
> clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
> - write_page(bitmap, bitmap->storage.filemap[i], 0);
> + write_page(bitmap, bitmap->storage.filemap[i], i, 0);
But for the filemap pages, I am not sure the above is correct.
> writing = 1;
> }
> }
> @@ -1137,7 +1134,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
> memset(paddr + offset, 0xff,
> PAGE_SIZE - offset);
> kunmap_atomic(paddr);
> - write_page(bitmap, page, 1);
> + write_page(bitmap, page, index, 1);
Ditto.
>
> ret = -EIO;
> if (test_bit(BITMAP_WRITE_ERROR,
> @@ -1336,7 +1333,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
> if (bitmap->storage.filemap &&
> test_and_clear_page_attr(bitmap, j,
> BITMAP_PAGE_NEEDWRITE)) {
> - write_page(bitmap, bitmap->storage.filemap[j], 0);
> + write_page(bitmap, bitmap->storage.filemap[j], j, 0);
Ditto.
> }
> }
>
> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> index cfd7395de8..6e820eea32 100644
> --- a/drivers/md/md-bitmap.h
> +++ b/drivers/md/md-bitmap.h
> @@ -207,6 +207,7 @@ struct bitmap {
> * w/ filemap pages */
> unsigned long file_pages; /* number of pages in the file*/
> unsigned long bytes; /* total bytes in the bitmap */
> + unsigned long sb_index; /* sb page index */
I guess it resolves the issue for sb_page, and we might need to do
similar things for the filemap as well, if I am not mistaken.
Thanks,
Guoqing
Greeting,
FYI, we noticed the following commit (built with gcc-9):
commit: 1609369623c4b6fe45602ee0d8192f6df9d4b1fe ("[PATCH 1/5] mm: Make free_area->nr_free per migratetype")
url: https://github.com/0day-ci/linux/commits/Kent-Overstreet/Minor-mm-struct-page-work/20211014-000511
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 5816b3e6577eaa676ceb00a848f0fd65fe2adc29
in testcase: xfstests
version: xfstests-x86_64-99bc497-1_20211014
with following parameters:
disk: 4HDD
fs: xfs
test: xfs-reflink-25
ucode: 0x28
test-description: xfstests is a regression test suite for xfs and other filesystems.
test-url: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
on test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-4790 v3 @ 3.60GHz with 6G memory
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+-------------------------------------------------------------+-----------+------------+
| | v5.15-rc3 | 1609369623 |
+-------------------------------------------------------------+-----------+------------+
| boot_successes | 121 | 0 |
| BUG:kernel_NULL_pointer_dereference,address | 0 | 6 |
| Oops:#[##] | 0 | 6 |
| RIP:steal_suitable_fallback | 0 | 6 |
| Kernel_panic-not_syncing:Fatal_exception | 0 | 6 |
+-------------------------------------------------------------+-----------+------------+
If you fix the issue, kindly add the following tag
Reported-by: kernel test robot <[email protected]>
[ 1.331661][ T0] BUG: kernel NULL pointer dereference, address: 0000000000000028
[ 1.339184][ T0] #PF: supervisor read access in kernel mode
[ 1.344956][ T0] #PF: error_code(0x0000) - not-present page
[ 1.350724][ T0] PGD 0 P4D 0
[ 1.353914][ T0] Oops: 0000 [#1] SMP PTI
[ 1.358047][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.15.0-rc3-00001-g1609369623c4 #1
[ 1.366659][ T0] Hardware name: Dell Inc. OptiPlex 9020/03CPWF, BIOS A11 04/01/2015
[ 1.374497][ T0] RIP: 0010:steal_suitable_fallback+0x2a/0x240
[ 1.380438][ T0] Code: 0f 1f 44 00 00 41 57 41 89 d7 ba 07 00 00 00 41 56 41 55 49 89 fd 41 54 49 89 f4 55 4c 89 e7 48 63 e9 53 44 89 c3 48 83 ec 18 <4c> 8b 76 28 48 2b 35 1b 7a 32 01 65 48 8b 04 25 28 00 00 00 48 89
[ 1.399730][ T0] RSP: 0000:ffffffff828039d8 EFLAGS: 00010096
[ 1.405587][ T0] RAX: ffff88819edd6720 RBX: 0000000000000001 RCX: 0000000000000002
[ 1.413338][ T0] RDX: 0000000000000007 RSI: 0000000000000000 RDI: 0000000000000000
[ 1.421088][ T0] RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000000
[ 1.428837][ T0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 1.436585][ T0] R13: ffff88819edd6080 R14: 0000000000000000 R15: 0000000000000101
[ 1.444338][ T0] FS: 0000000000000000(0000) GS:ffff88817de00000(0000) knlGS:0000000000000000
[ 1.453034][ T0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.459405][ T0] CR2: 0000000000000028 CR3: 000000019da10001 CR4: 00000000000606b0
[ 1.467155][ T0] Call Trace:
[ 1.470255][ T0] rmqueue_bulk+0x882/0x980
[ 1.474559][ T0] rmqueue+0x551/0xe00
[ 1.478434][ T0] ? rmqueue_bulk+0x331/0x980
[ 1.482911][ T0] ? rmqueue_bulk+0x331/0x980
[ 1.487389][ T0] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e
[ 1.494535][ T0] get_page_from_freelist+0xde/0x400
[ 1.499615][ T0] __alloc_pages+0x197/0x380
[ 1.504008][ T0] allocate_slab+0x2f8/0x440
[ 1.508399][ T0] ___slab_alloc+0x6aa/0x800
[ 1.512790][ T0] ? radix_tree_node_alloc+0x40/0xc0
[ 1.518901][ T0] ? pcpu_alloc_area+0x1d8/0x300
[ 1.523636][ T0] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e
[ 1.530786][ T0] ? radix_tree_node_alloc+0x40/0xc0
[ 1.536897][ T0] __slab_alloc+0x1c/0x40
[ 1.541636][ T0] kmem_cache_alloc+0x382/0x400
[ 1.546287][ T0] radix_tree_node_alloc+0x40/0xc0
[ 1.552230][ T0] idr_get_free+0x1e1/0x300
[ 1.556537][ T0] idr_alloc_u32+0x5f/0xc0
[ 1.560758][ T0] idr_alloc+0x39/0x80
[ 1.564632][ T0] workqueue_init_early+0x17d/0x33b
[ 1.569627][ T0] start_kernel+0x378/0x5fb
[ 1.573933][ T0] ? load_ucode_intel_bsp+0x21/0x52
[ 1.578927][ T0] secondary_startup_64_no_verify+0xc2/0xcb
[ 1.584611][ T0] Modules linked in:
[ 1.588313][ T0] CR2: 0000000000000028
[ 1.592275][ T0] ---[ end trace 1ab7942b05fdb9ba ]---
[ 1.597526][ T0] RIP: 0010:steal_suitable_fallback+0x2a/0x240
[ 1.603469][ T0] Code: 0f 1f 44 00 00 41 57 41 89 d7 ba 07 00 00 00 41 56 41 55 49 89 fd 41 54 49 89 f4 55 4c 89 e7 48 63 e9 53 44 89 c3 48 83 ec 18 <4c> 8b 76 28 48 2b 35 1b 7a 32 01 65 48 8b 04 25 28 00 00 00 48 89
[ 1.622760][ T0] RSP: 0000:ffffffff828039d8 EFLAGS: 00010096
[ 1.628614][ T0] RAX: ffff88819edd6720 RBX: 0000000000000001 RCX: 0000000000000002
[ 1.636365][ T0] RDX: 0000000000000007 RSI: 0000000000000000 RDI: 0000000000000000
[ 1.644115][ T0] RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000000
[ 1.651867][ T0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 1.659616][ T0] R13: ffff88819edd6080 R14: 0000000000000000 R15: 0000000000000101
[ 1.667368][ T0] FS: 0000000000000000(0000) GS:ffff88817de00000(0000) knlGS:0000000000000000
[ 1.676065][ T0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.682438][ T0] CR2: 0000000000000028 CR3: 000000019da10001 CR4: 00000000000606b0
[ 1.690190][ T0] Kernel panic - not syncing: Fatal exception
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if you come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
---
0DAY/LKP+ Test Infrastructure                      Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected]        Intel Corporation
Thanks,
Oliver Sang
I agree with Guoqing's comments.
In Documentation/driver-api/md/md-cluster.rst there is a picture in "1. On-disk
format" which describes the layout of the "sb+bitmap" area.
And even in a non-cluster array, a bitmap area spanning more than one page also
needs an index/offset.
After this patch series removes page->index, the md layer should create a
similar struct to manage it, something like the sketch below.
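Purely illustrative - the struct and field names here are made up, just to
show the shape of it:

/*
 * Sketch only: pair each cached bitmap page with an explicit on-disk
 * index instead of relying on page->index.  For cluster-md the index
 * would account for the node's slot (node A -> page 0, node B -> page 1).
 */
struct md_bitmap_page {
	struct page	*page;	/* cached bitmap page */
	unsigned long	index;	/* on-disk page index within the bitmap area */
};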
- Heming
On 10/15/21 11:01, Guoqing Jiang wrote:
>
>
> On 10/14/21 10:30 PM, Kent Overstreet wrote:
>> On Thu, Oct 14, 2021 at 04:58:46PM +0800, [email protected] wrote:
>>> Hello all,
>>>
>>> page->index plays an important role for the cluster-md module.
>>> E.g., in a two-node HA env, node A's bitmap may be managed in the first 4K
>>> area, while node B's bitmap is in the 8K area (the second page). This patch
>>> removes the index and hardcodes it to 0, which will only operate on the
>>> first node's bitmap.
>>>
>>> If this patch series is important and must be merged into mainline, we
>>> should redesign the related code logic.
>>>
>>> Thanks,
>>> Heming
>> Can you look at and test the updated patch below? The more I look at the md
>> bitmap code, the more it scares me.
>>
>> -- >8 --
>> Subject: [PATCH] md: Kill usage of page->index
>>
>> As part of the struct page cleanups underway, we want to remove as much
>> usage of page->mapping and page->index as possible, as frequently they
>> are known from context - as they are here in the md bitmap code.
>>
>> Signed-off-by: Kent Overstreet <[email protected]>
>> ---
>> drivers/md/md-bitmap.c | 49 ++++++++++++++++++++----------------------
>> drivers/md/md-bitmap.h | 1 +
>> 2 files changed, 24 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>> index e29c6298ef..316e4cd5a7 100644
>> --- a/drivers/md/md-bitmap.c
>> +++ b/drivers/md/md-bitmap.c
>> @@ -165,10 +165,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
>> if (sync_page_io(rdev, target,
>> roundup(size, bdev_logical_block_size(rdev->bdev)),
>> - page, REQ_OP_READ, 0, true)) {
>> - page->index = index;
>> + page, REQ_OP_READ, 0, true))
>> return 0;
>> - }
>> }
>> return -EIO;
>> }
>> @@ -209,7 +207,8 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
>> return NULL;
>> }
>> -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> +static int write_sb_page(struct bitmap *bitmap, struct page *page,
>> + unsigned long index, int wait)
>> {
>> struct md_rdev *rdev;
>> struct block_device *bdev;
>> @@ -224,7 +223,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>> - if (page->index == store->file_pages-1) {
>> + if (index == store->file_pages-1) {
>> int last_page_size = store->bytes & (PAGE_SIZE-1);
>> if (last_page_size == 0)
>> last_page_size = PAGE_SIZE;
>> @@ -236,8 +235,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> */
>> if (mddev->external) {
>> /* Bitmap could be anywhere. */
>> - if (rdev->sb_start + offset + (page->index
>> - * (PAGE_SIZE/512))
>> + if (rdev->sb_start + offset + index * PAGE_SECTORS
>> > rdev->data_offset
>> &&
>> rdev->sb_start + offset
>> @@ -247,7 +245,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> } else if (offset < 0) {
>> /* DATA BITMAP METADATA */
>> if (offset
>> - + (long)(page->index * (PAGE_SIZE/512))
>> + + (long)(index * PAGE_SECTORS)
>> + size/512 > 0)
>> /* bitmap runs in to metadata */
>> goto bad_alignment;
>> @@ -259,7 +257,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> /* METADATA BITMAP DATA */
>> if (rdev->sb_start
>> + offset
>> - + page->index*(PAGE_SIZE/512) + size/512
>> + + index * PAGE_SECTORS + size/512
>> > rdev->data_offset)
>> /* bitmap runs in to data */
>> goto bad_alignment;
>> @@ -268,7 +266,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
>> }
>> md_super_write(mddev, rdev,
>> rdev->sb_start + offset
>> - + page->index * (PAGE_SIZE/512),
>> + + index * PAGE_SECTORS,
>> size,
>> page);
>> }
>> @@ -285,12 +283,13 @@ static void md_bitmap_file_kick(struct bitmap *bitmap);
>> /*
>> * write out a page to a file
>> */
>> -static void write_page(struct bitmap *bitmap, struct page *page, int wait)
>> +static void write_page(struct bitmap *bitmap, struct page *page,
>> + unsigned long index, int wait)
>> {
>> struct buffer_head *bh;
>> if (bitmap->storage.file == NULL) {
>> - switch (write_sb_page(bitmap, page, wait)) {
>> + switch (write_sb_page(bitmap, page, index, wait)) {
>> case -EINVAL:
>> set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
>> }
>> @@ -399,7 +398,6 @@ static int read_page(struct file *file, unsigned long index,
>> blk_cur++;
>> bh = bh->b_this_page;
>> }
>> - page->index = index;
>> wait_event(bitmap->write_wait,
>> atomic_read(&bitmap->pending_writes)==0);
>> @@ -472,7 +470,7 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
>> sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
>> bitmap_info.space);
>> kunmap_atomic(sb);
>> - write_page(bitmap, bitmap->storage.sb_page, 1);
>> + write_page(bitmap, bitmap->storage.sb_page, bitmap->storage.sb_index, 1);
>
> I guess it is fine for sb_page now.
>
> [...]
>
>> @@ -1027,7 +1024,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
>> "md bitmap_unplug");
>> }
>> clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
>> - write_page(bitmap, bitmap->storage.filemap[i], 0);
>> + write_page(bitmap, bitmap->storage.filemap[i], i, 0);
>
> But for the filemap pages, I am not sure the above is correct.
>
>> writing = 1;
>> }
>> }
>> @@ -1137,7 +1134,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
>> memset(paddr + offset, 0xff,
>> PAGE_SIZE - offset);
>> kunmap_atomic(paddr);
>> - write_page(bitmap, page, 1);
>> + write_page(bitmap, page, index, 1);
>
> Ditto.
>
>> ret = -EIO;
>> if (test_bit(BITMAP_WRITE_ERROR,
>> @@ -1336,7 +1333,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
>> if (bitmap->storage.filemap &&
>> test_and_clear_page_attr(bitmap, j,
>> BITMAP_PAGE_NEEDWRITE)) {
>> - write_page(bitmap, bitmap->storage.filemap[j], 0);
>> + write_page(bitmap, bitmap->storage.filemap[j], j, 0);
>
> Ditto.
>
>
>> }
>> }
>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>> index cfd7395de8..6e820eea32 100644
>> --- a/drivers/md/md-bitmap.h
>> +++ b/drivers/md/md-bitmap.h
>> @@ -207,6 +207,7 @@ struct bitmap {
>> * w/ filemap pages */
>> unsigned long file_pages; /* number of pages in the file*/
>> unsigned long bytes; /* total bytes in the bitmap */
>> + unsigned long sb_index; /* sb page index */
>
> I guess it resolves the issue for sb_page, and we might need to do similar
> things for filemap as well, if I am not mistaken.
>
> Thanks,
> Guoqing
>
On 10/14/21 16:45, Kent Overstreet wrote:
> On Wed, Oct 13, 2021 at 06:33:06PM +0200, David Hildenbrand wrote:
>> > @@ -9317,6 +9319,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
>> > struct page *page;
>> > struct zone *zone;
>> > unsigned int order;
>> > + unsigned int migratetype;
>> > unsigned long flags;
>> >
>> > offline_mem_sections(pfn, end_pfn);
>> > @@ -9346,7 +9349,8 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
>> > BUG_ON(page_count(page));
>> > BUG_ON(!PageBuddy(page));
>> > order = buddy_order(page);
>> > - del_page_from_free_list(page, zone, order);
>> > + migratetype = get_pfnblock_migratetype(page, pfn);
>>
>> As the free pages are isolated, theoretically this should be
>> MIGRATE_ISOLATE.
>
> Thanks for noticing that - I somehow missed the fact that pageblock migratetypes
> change at runtime,
Not only that. Buddy merging will also merge pages from different
migratetypes. I think that's the main reason why nr_free is not per
migratetype. Your patch has been attempted a few times already, e.g. here [1].
[1]
https://lore.kernel.org/all/[email protected]/
> so my patch is wrong. I'm going to have to rework my patch to
> store the migratetype of free pages in the page itself.
Yeah, that would be the solution. It will likely bring some overhead to the
alloc/free fastpaths, which would have to be worth it - presumably why nobody
has attempted it so far.
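FWIW the pcp lists already stash the migratetype of a free page in the
otherwise-unused ->index field (see set_pcppage_migratetype() in
mm/page_alloc.c). Doing the same for pages on the buddy freelists might look
roughly like this - sketch only, these helper names don't exist:

/*
 * Sketch only: remember the migratetype a page was freed under, so
 * del_page_from_free_list() can decrement the right per-migratetype
 * nr_free even after the pageblock's migratetype has changed or
 * buddies from differently-typed pageblocks have merged.
 */
static inline void set_buddy_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;	/* page is free, ->index is unused */
}

static inline int get_buddy_migratetype(struct page *page)
{
	return page->index;
}

Free pages are exactly the case where ->index carries no meaning, so this at
least wouldn't fight the struct page cleanups.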