From: Radosław Smogura
Subject: [PATCH 08/18] Generic routines for defragmenting pagecache.
Date: Thu, 16 Feb 2012 15:31:35 +0100
Message-ID: <1329402705-25454-8-git-send-email-mail@smogura.eu>
In-Reply-To: <1329402705-25454-1-git-send-email-mail@smogura.eu>
References: <1329402705-25454-1-git-send-email-mail@smogura.eu>
To: linux-mm@kvack.org
Cc: Yongqiang Yang, mail@smogura.eu, linux-ext4@vger.kernel.org

These are generic routines with support for SHMFS (tmpfs).

Signed-off-by: Radosław Smogura
---
 include/linux/defrag-pagecache.h |   62 +++++
 include/linux/fs.h               |   23 ++
 mm/Makefile                      |    1 +
 mm/defrag-pagecache.c            |  489 ++++++++++++++++++++++++++++++++++++
 4 files changed, 575 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/defrag-pagecache.h
 create mode 100644 mm/defrag-pagecache.c

diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
new file mode 100644
index 0000000..46793de
--- /dev/null
+++ b/include/linux/defrag-pagecache.h
@@ -0,0 +1,62 @@
+/*
+ * linux/include/linux/defrag-pagecache.h
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+
+#ifndef DEFRAG_PAGECACHE_H
+#define DEFRAG_PAGECACHE_H
+#include
+
+/* XXX Split this file into two parts, public and protected - see comments
+ * below. The protected part will contain declarations of generic and helper
+ * methods for file system developers, the public part just general
+ * structures and controls.
+ */
+struct file;
+struct inode;
+struct defrag_pagecache_ctl;
+struct address_space;
+
+typedef struct page *defrag_generic_get_page(
+	const struct defrag_pagecache_ctl *ctl, struct inode *inode,
+	pgoff_t pageIndex);
+
+/** Passes additional information and controls to page defragmentation. */
+struct defrag_pagecache_ctl {
+	/** If set, defragmentation will try to fill the page cache. */
+	char fillPages:1;
+
+	/** If filling a page fails, defragmentation will fail too. Setting
+	 * this requires {@link #fillPages} to be set as well.
+	 */
+	char requireFillPages:1;
+
+	/** If set, defragmentation will try to force progress in many
+	 * aspects; this may cause the operation to run longer, but with a
+	 * greater probability of success. */
+	char force:1;
+};
+
+/** Defragments the page cache of the given file and migrates it to huge pages.
+ *
+ * @param f
+ * @param offset
+ * @param size
+ * @return
+ */
+extern int defragPageCache(struct file *f, unsigned long offset,
+	unsigned long size, const struct defrag_pagecache_ctl *defragCtl);
+
+/** Tries to fix up huge page mappings, by walking through the given
+ * Transparent Huge Page. */
+extern int thpFixMappings(struct page *hugePage);
+
+extern int defrag_generic_shm(struct file *file, struct address_space *mapping,
+	loff_t pos,
+	struct page **pagep,
+	struct defrag_pagecache_ctl *ctl);
+#endif /* DEFRAG_PAGECACHE_H */
+
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 386da09..bfd9122 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -11,6 +11,10 @@
 #include
 #include
 
+#ifdef CONFIG_HUGEPAGECACHE
+#include
+#endif
+
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
  * the file limit at runtime and only root can increase the per-process
@@ -602,6 +606,25 @@ struct address_space_operations {
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 
+#ifdef CONFIG_HUGEPAGECACHE
+	/** Used to defrag (migrate) pages at position {@code pos}
+	 * to huge pages. Having this not {@code NULL} indicates that the
+	 * address space, generally, supports huge pages (a transparent
+	 * huge page may be established).
+	 *
+	 * It's like migrate pages, but different :)
+	 *
+	 * @param pagep on success will be set to the established huge page
+	 *
+	 * @returns TODO What to return?
+	 *	{@code 0} on success, a value less than {@code 0} on error
+	 */
+	int (*defragpage) (struct file *, struct address_space *mapping,
+			loff_t pos,
+			struct page **pagep,
+			const struct defrag_pagecache_ctl *ctl);
+#endif
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..75389c8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_HUGEPAGECACHE) += defrag-pagecache.o
\ No newline at end of file
diff --git a/mm/defrag-pagecache.c b/mm/defrag-pagecache.c
new file mode 100644
index 0000000..5a14fe8
--- /dev/null
+++ b/mm/defrag-pagecache.c
@@ -0,0 +1,489 @@
+/*
+ * linux/mm/defrag-pagecache.c
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "internal.h"
+/*#include */
+
+struct migration_private {
+	loff_t startIndex;
+	pgoff_t nextIndex;
+	pgoff_t pagesToMigrateCount;
+
+	struct page *hugePage;
+	struct inode *inode;
+
+	const struct defrag_pagecache_ctl *defragCtl;
+
+	int stop;
+	int result;
+	int stoppedCompoundFound;
+
+	/** Callback method used to obtain the next page. */
+	defrag_generic_get_page *getNextPage;
+};
+
+static const struct defrag_pagecache_ctl defaultDefragCtl = {
+	.fillPages = 0,
+	.requireFillPages = 0,
+	.force = 0
+};
+
+#define HUGEPAGE_ALLOC_GFP	(GFP_HIGHUSER | __GFP_COMP \
+	| __GFP_REPEAT | __GFP_NOWARN | __GFP_WAIT)
+
+static int defrageOneHugePage(struct file *file, loff_t offset,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *defragCtl,
+	defrag_generic_get_page *getPage);
+
+int defragPageCache(struct file *f, unsigned long offset, unsigned long size,
+	const struct defrag_pagecache_ctl *defragCtl)
+{
+	/* Calculate the requested huge page order.
+	 * XXX Is the calculation below portable across platforms?
+	 */
+	const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+	const int chunkSize = 1 << hugePageOrder;
+	unsigned long offsetIdx = offset;
+	unsigned long chunksToProceed;
+
+	struct inode *inode = f->f_path.dentry->d_inode;
+
+	const struct address_space_operations *aops =
+		inode->i_mapping->a_ops;
+
+	/* TODO: Use hugepage state or something better instead of a
+	 * hardcoded value. */
+	if ((offset != ((offset >> hugePageOrder) << hugePageOrder) ||
+		size != ((size >> hugePageOrder) << hugePageOrder))
+		/* && (size != (1 << hugePageOrder))*/) {
+		/* Start and length must be huge page "aligned". */
+		return -EINVAL;
+	}
+
+	offsetIdx = offset;
+	chunksToProceed = size >> hugePageOrder;
+	for (; chunksToProceed; chunksToProceed--, offsetIdx += chunkSize) {
+		struct page *pagep;
+		int result = aops->defragpage(f, inode->i_mapping, offsetIdx,
+			&pagep,
+			defragCtl);
+		if (result)
+			return result;
+	}
+
+	return 0;
+}
+
+/** Callback for getting a page for tmpfs.
+ * Tmpfs uses the {@link shmem_read_mapping_page_gfp} function to read a
+ * page from the page cache.
+ */
+struct page *shmem_defrag_get_page(const struct defrag_pagecache_ctl *ctl,
+	struct inode *inode, pgoff_t pageIndex)
+{
+	return shmem_read_mapping_page_gfp(
+		inode->i_mapping, pageIndex,
+		mapping_gfp_mask(inode->i_mapping));
+}
+
+static void defrag_generic_mig_result(struct page *oldPage,
+	struct page *newPage, struct migration_ctl *ctl, int result)
+{
+	struct migration_private *prv =
+		(struct migration_private *) ctl->privateData;
+
+	if (!result) {
+		/* Update the index only on success; on failure, the index
+		 * will be used to clean up. */
+		prv->nextIndex++;
+
+		if (!PageTail(newPage))
+			putback_lru_page(newPage);
+		else
+			put_page(newPage);
+	} else {
+		prv->stop = 1;
+	}
+
+	/* XXX No isolated zone status update! */
+	putback_lru_page(oldPage);
+	put_page(oldPage);
+/*
+	unlock_page(oldPage);
+*/
+
+	prv->result = result;
+}
+
+static struct page *defrag_generic_mig_page_new(struct page *oldPage,
+	struct migration_ctl *ctl)
+{
+	struct migration_private *prv =
+		(struct migration_private *) ctl->privateData;
+
+	return prv->hugePage + prv->nextIndex;
+}
+
+static struct page *defrag_generic_mig_page_next(struct migration_ctl *ctl,
+	page_mode *mode)
+{
+	struct migration_private *prv =
+		(struct migration_private *) ctl->privateData;
+	const struct defrag_pagecache_ctl *defragCtl;
+
+	/** Holds the current page cache page we are going to migrate. */
+	struct page *filePage;
+
+	struct inode *inode;
+
+	pgoff_t pageIndex;
+
+	if (!(prv->nextIndex < prv->pagesToMigrateCount))
+		return NULL;
+
+	if (prv->result || prv->stop)
+		return NULL;
+
+	inode = prv->inode;
+	pageIndex = prv->startIndex + prv->nextIndex;
+	defragCtl = prv->defragCtl;
+
+repeat_find:
+	filePage = find_lock_page(inode->i_mapping, pageIndex);
+
+	if (filePage)
+		if (PageUptodate(filePage))
+			goto skip_fill_pages;
+
+	/* Try to read the page in, if this was the intention of the caller;
+	 * we don't need to check if the page is under writeback, migrate
+	 * pages does it. */
+	if (!defragCtl->fillPages) {
+		prv->result = 0;
+		prv->stop = 1;
+		return NULL;
+	}
+
+	filePage = prv->getNextPage(prv->defragCtl, inode, pageIndex);
+
+	if (IS_ERR(filePage)) {
+		prv->result = PTR_ERR(filePage);
+		prv->stop = 1;
+		return NULL;
+	}
+
+	lock_page(filePage);
+	/* Validate the page. */
+	if (!filePage->mapping
+		|| filePage->index != pageIndex
+		|| !PageUptodate(filePage)) {
+		unlock_page(filePage);
+		goto repeat_find;
+	}
+
+skip_fill_pages:
+	if (/* ??? !defragCtl->fillPages && */ PageCompound(filePage)) {
+		/* Here I considered supporting the case where the page cache
+		 * already contains a huge page that is not up to date as a
+		 * whole.
+		 *
+		 * Currently this idea is suspended, due to many
+		 * complications.
+		 */
+		prv->stoppedCompoundFound = 1;
+		goto out_unlock_and_stop;
+	}
+
+	/* Prepare the page for isolation, check if it can be isolated. */
+	if (!PageLRU(filePage)) {
+		if (defragCtl->force) {
+			/* Isolation requires the page to be on the LRU; we
+			 * may need to drain it if the page is not there. */
+			lru_add_drain();
+			if (!PageLRU(filePage)) {
+				lru_add_drain_all();
+				if (!PageLRU(filePage)) {
+					prv->result = -EBUSY;
+					goto out_unlock_and_stop;
+				}
+			}
+		} else {
+			prv->result = -EBUSY;
+			goto out_unlock_and_stop;
+		}
+	}
+
+	/* Isolate the page. */
+	if (isolate_lru_page(filePage)) {
+		prv->result = -EBUSY;
+		goto putback_page_and_stop;
+	}
+
+	*mode = PAGE_LOCKED;
+	return filePage;
+
+putback_page_and_stop:
+	putback_lru_page(filePage);
+
+out_unlock_and_stop:
+	unlock_page(filePage);
+	put_page(filePage);
+
+	return NULL;
+
+}
+
+int defrag_generic_shm(struct file *file, struct address_space *mapping,
+	loff_t pos,
+	struct page **pagep,
+	struct defrag_pagecache_ctl *ctl)
+{
+	return defrageOneHugePage(file, pos, pagep, ctl, shmem_defrag_get_page);
+}
+EXPORT_SYMBOL(defrag_generic_shm);
+
+int defrag_generic_pagecache(struct file *file,
+	struct address_space *mapping,
+	loff_t pos,
+	struct page **pagep,
+	struct defrag_pagecache_ctl *ctl)
+{
+	/* We do not support generic page cache defragmentation yet. */
+	BUG();
+	return 0;
+}
+/** Internal method for defragmenting one chunk of the page cache.
+ *
+ * This is, in some ways, common logic for operating on the page cache. It is
+ * highly probable that this method will be exposed as "generic" to add
+ * support for transparent huge pages for the page cache.
+ */
+static int defrageOneHugePage(struct file *file, loff_t offset,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *defragCtl,
+	defrag_generic_get_page *getPage)
+{
+	const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+
+	/** Huge page we migrate to. */
+	struct page *hugePage;
+
+	/** Private migration data. */
+	struct migration_private migrationPrv;
+
+	struct migration_ctl migration_ctl;
+
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	const int size = 1 << hugePageOrder;
+
+	/** Helpers */
+	pgoff_t i;
+
+	/* Here we do callback based migration. */
+	/* READ.
+	 *
+	 * This code is in the development stage, and the following problems
+	 * must be resolved:
+	 * - the page is read from the page cache, but the lock is dropped;
+	 *   in the meantime the page may no longer be up to date, or may be
+	 *   removed from the page cache. This will be resolved by changing
+	 *   the migrate function.
+	 */
+	/* Allocate one huge page. */
+	hugePage = alloc_pages(HUGEPAGE_ALLOC_GFP, hugePageOrder);
+	if (!hugePage)
+		return -ENOMEM;
+
+	migrationPrv.nextIndex = 0;
+	migrationPrv.pagesToMigrateCount = size;
+	migrationPrv.hugePage = hugePage;
+	migrationPrv.stop = 0;
+	migrationPrv.result = 0;
+	migrationPrv.stoppedCompoundFound = 0;
+	migrationPrv.getNextPage = getPage;
+	migrationPrv.startIndex = offset;
+	migrationPrv.inode = inode;
+	migrationPrv.defragCtl =
+		(const struct defrag_pagecache_ctl *) defragCtl;
+	/* Elevate page counters of the tail pages. */
+	for (i = 1; i < size; i++) {
+		struct page *p = hugePage + i;
+		get_page(p);
+	}
+
+	migration_ctl.getNextPage = defrag_generic_mig_page_next;
+	migration_ctl.getNewPage = defrag_generic_mig_page_new;
+	migration_ctl.notifyResult = defrag_generic_mig_result;
+	migration_ctl.privateData = (unsigned long) &migrationPrv;
+
+	/* Acquire the compound lock. */
+	compound_lock(hugePage);
+
+	/* Migrate pages. Currently page migration will automatically put
+	 * back pages, and may fail and retry; we need an array of pages to
+	 * match each subpage. This behaviour isn't good.
+	 */
+	migrate_pages_cb(&migration_ctl, true,
+		MIGRATE_SYNC | MIGRATE_SRC_GETTED);
+	if (migrationPrv.nextIndex < migrationPrv.pagesToMigrateCount) {
+		/* XXX Simulate various bugs, at least do it hardcoded. */
+		/* XXX Everything here is a BUG, because splitting needs to
+		 * be open-coded.
+		 */
+		if (migrationPrv.stoppedCompoundFound) {
+			/* If any page has been migrated it's a BUG. */
+			BUG_ON(migrationPrv.nextIndex);
+			goto compound_unlock_end;
+		}
+		/* Not all pages have been migrated, split the target page. */
+		/* Downgrading counts of tail pages may cause a deadlock. */
+		VM_BUG_ON(1);
+	} else {
+		goto compound_unlock_end;
+	}
+
+compound_unlock_end:
+	compound_unlock(hugePage);
+/*
+	put_page(hugePage);
+*/
+
+	/* All file pages are unlocked, and should be freed. The huge page
+	 * should be on the unevictable list.
+	 */
+	return migrationPrv.result;
+}
+
+static int thpFixMappingsRmapWalk(struct page *page, struct vm_area_struct *vma,
+	unsigned long addr, void *prvData) {
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+
+	int i;
+
+/*
+	printk(KERN_INFO "Starting address is %lx", addr);
+*/
+	if (vma->vm_flags & VM_NONLINEAR || (addr & ~HPAGE_PMD_MASK)) {
+		/* Skip nonlinear VMAs, and not aligned addresses. */
+		return SWAP_AGAIN;
+	}
+
+	/* We will set the pmd only if all tail pages meet the following
+	 * requirements:
+	 * - all pages are up to date
+	 * - all pages have the same protection bits
+	 * - ???
+	 */
+	pgd = pgd_offset(vma->vm_mm, addr);
+	if (!pgd_present(*pgd))
+		return SWAP_AGAIN;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return SWAP_AGAIN;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return SWAP_AGAIN;
+
+	pte = (pte_t *) pmd;
+	if (pte_huge(*pte))
+		return SWAP_AGAIN;
+
+	/*printk(KERN_INFO "Checking head flags"); */
+	pte = pte_offset_map(pmd, addr);
+	if (!pte_present(*pte)) {
+		/* printk(KERN_INFO "Pte not present."); */
+		pte_unmap(pte);
+		return SWAP_AGAIN;
+	}
+
+	for (i = 1; i < HPAGE_PMD_NR; i++) {
+		struct page *tail_page;
+		int i1, i2;
+
+		addr += PAGE_SIZE;
+
+		pte = pte_offset_map(pmd, addr);
+		if (!pte_present(*pte)) {
+			/*
+			 * printk(KERN_INFO "No %d pte returning.", i);
+			 */
+			pte_unmap(pte);
+			return SWAP_AGAIN;
+		}
+
+		tail_page = pte_page(*pte);
+		if (!tail_page) {
+			/* printk(KERN_INFO "Page +%d not present.", i); */
+			goto unmap_out;
+		}
+
+		/* We check the index; however, we do not allow non-linear
+		 * mappings :)
+		 */
+		/* smp_mb(); */
+		i1 = tail_page->mapping == page->mapping;
+		i2 = tail_page->index == (page->index + i);
+		if (i1 && i2) {
+			/*
+			printk(KERN_INFO "Page +%d present, mappings and"
+				" indices ok", i);
+			*/
+		} else {
+			printk(KERN_INFO "Page +%d has good mapping %d, and"
+				" good index %d (%lu, %lu).",
+				i,
+				i1,
+				i2,
+				tail_page->index,
+				page->index);
+			goto unmap_out;
+		}
+		pte_unmap(pte);
+	}
+	_pmd = pmd_mkhuge(pmd_modify(*pmd, vma->vm_page_prot));
+	pmd_clear(pmd);
+
+	set_pmd_at(vma->vm_mm, addr, pmd, _pmd);
+	/* Everything is ok. */
+
+	/* TODO Do not flush everything :) */
+	flush_tlb_mm(vma->vm_mm);
+	printk(KERN_INFO "Replaced by pmd");
+	return SWAP_AGAIN;
+unmap_out:
+	pte_unmap(pte);
+
+	return SWAP_AGAIN;
+}
+
+int thpFixMappings(struct page *hugePage)
+{
+	BUG_ON(PageAnon(hugePage));
+	/* lock_page(hugePage); */
+	BUG_ON(!PageTransHuge(hugePage));
+	rmap_walk(hugePage, thpFixMappingsRmapWalk, NULL);
+	/* unlock_page(hugePage); */
+
+	return 0;
+}
--
1.7.3.4
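
A minimal usage sketch of the interface added above; it is not part of the patch. It assumes CONFIG_HUGEPAGECACHE is enabled and that the declarations from include/linux/defrag-pagecache.h are available; the helper name example_defrag_range() is hypothetical. Judging by the alignment check in defragPageCache() and its use of the offset as a page index, offset and size are in units of page-cache pages and unaligned ranges are rejected with -EINVAL, so a caller would check or round the range first.

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/defrag-pagecache.h>

/* Hypothetical caller (illustration only): defragment one huge-page
 * aligned range of an open file's page cache into compound pages. */
static int example_defrag_range(struct file *filp, unsigned long offset,
	unsigned long size)
{
	const struct defrag_pagecache_ctl ctl = {
		.fillPages = 1,		/* read missing pages into the cache */
		.requireFillPages = 1,	/* fail if a page cannot be read */
		.force = 0,		/* do not drain the LRU aggressively */
	};
	const unsigned long hp_nr = 1UL << (PMD_SHIFT - PAGE_SHIFT);

	/* defragPageCache() returns -EINVAL for unaligned ranges. */
	if ((offset | size) & (hp_nr - 1))
		return -EINVAL;

	return defragPageCache(filp, offset, size, &ctl);
}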