Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
been somewhat painful with getting the flags set and removed at the
correct locations. More than one kernel oops was introduced due to
difficulties of getting the placement correctly. Removing the flag
values and introducing an input parameter to huge_fault that indicates
the size of the page entry. This makes the code easier to trace and
should avoid the issues we see with the fault flags where removal of the
flag was necessary in the fallback paths.
Signed-off-by: Dave Jiang <[email protected]>
---
drivers/dax/dax.c | 18 ++++++++++++------
fs/dax.c | 9 +++++----
fs/ext2/file.c | 2 +-
fs/ext4/file.c | 12 +++++++++---
fs/xfs/xfs_file.c | 9 +++++----
include/linux/dax.h | 3 ++-
include/linux/mm.h | 14 ++++++++------
mm/memory.c | 17 ++++-------------
8 files changed, 46 insertions(+), 38 deletions(-)
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b90bb30..b75c772 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -538,7 +538,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static int dax_dev_fault(struct vm_fault *vmf)
+static int dax_dev_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
{
int rc;
struct file *filp = vmf->vma->vm_file;
@@ -550,14 +551,14 @@ static int dax_dev_fault(struct vm_fault *vmf)
vmf->vma->vm_start, vmf->vma->vm_end);
rcu_read_lock();
- switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
- case FAULT_FLAG_SIZE_PTE:
+ switch (pe_size) {
+ case PE_SIZE_PTE:
rc = __dax_dev_pte_fault(dax_dev, vmf);
break;
- case FAULT_FLAG_SIZE_PMD:
+ case PE_SIZE_PMD:
rc = __dax_dev_pmd_fault(dax_dev, vmf);
break;
- case FAULT_FLAG_SIZE_PUD:
+ case PE_SIZE_PUD:
rc = __dax_dev_pud_fault(dax_dev, vmf);
break;
default:
@@ -568,9 +569,14 @@ static int dax_dev_fault(struct vm_fault *vmf)
return rc;
}
+static int dax_dev_fault(struct vm_fault *vmf)
+{
+ return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
+}
+
static const struct vm_operations_struct dax_dev_vm_ops = {
.fault = dax_dev_fault,
- .huge_fault = dax_dev_fault,
+ .huge_fault = dax_dev_huge_fault,
};
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/dax.c b/fs/dax.c
index 25f791d..97b8ecb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1446,12 +1446,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
* has done all the necessary locking for page fault to proceed
* successfully.
*/
-int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+ struct iomap_ops *ops)
{
- switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
- case FAULT_FLAG_SIZE_PTE:
+ switch (pe_size) {
+ case PE_SIZE_PTE:
return dax_iomap_pte_fault(vmf, ops);
- case FAULT_FLAG_SIZE_PMD:
+ case PE_SIZE_PMD:
return dax_iomap_pmd_fault(vmf, ops);
default:
return VM_FAULT_FALLBACK;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 6873883..b21891a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = dax_iomap_fault(vmf, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 51d7155..e8ab46e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -255,7 +255,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
#ifdef CONFIG_FS_DAX
-static int ext4_dax_fault(struct vm_fault *vmf)
+static int ext4_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
{
int result;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -267,7 +268,7 @@ static int ext4_dax_fault(struct vm_fault *vmf)
file_update_time(vmf->vma->vm_file);
}
down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_fault(vmf, &ext4_iomap_ops);
+ result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
up_read(&EXT4_I(inode)->i_mmap_sem);
if (write)
sb_end_pagefault(sb);
@@ -275,6 +276,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
return result;
}
+static int ext4_dax_fault(struct vm_fault *vmf)
+{
+ return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
/*
* Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
* handler we check for races agaist truncate. Note that since we cycle through
@@ -307,7 +313,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
- .huge_fault = ext4_dax_fault,
+ .huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4fe261..c37f435 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1385,7 +1385,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
@@ -1412,7 +1412,7 @@ xfs_filemap_fault(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
else
ret = filemap_fault(vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1429,7 +1429,8 @@ xfs_filemap_fault(
*/
STATIC int
xfs_filemap_huge_fault(
- struct vm_fault *vmf)
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1446,7 +1447,7 @@ xfs_filemap_huge_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a3bfa26..df63730 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,7 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops);
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+ struct iomap_ops *ops);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7646ae5..7b11431 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,11 +285,6 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
-#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */
-#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */
-#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */
-#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */
-
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
{ FAULT_FLAG_MKWRITE, "MKWRITE" }, \
@@ -349,6 +344,13 @@ struct vm_fault {
*/
};
+/* page entry size for vm->huge_fault() */
+enum page_entry_size {
+ PE_SIZE_PTE = 0,
+ PE_SIZE_PMD,
+ PE_SIZE_PUD,
+};
+
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -359,7 +361,7 @@ struct vm_operations_struct {
void (*close)(struct vm_area_struct * area);
int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_fault *vmf);
- int (*huge_fault)(struct vm_fault *vmf);
+ int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
void (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
diff --git a/mm/memory.c b/mm/memory.c
index 41e2a2d..6040b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3489,7 +3489,7 @@ static int create_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf);
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
}
@@ -3498,7 +3498,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf);
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3519,7 +3519,7 @@ static int create_huge_pud(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf);
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
@@ -3531,7 +3531,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf);
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
@@ -3659,7 +3659,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (!vmf.pud)
return VM_FAULT_OOM;
if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
- vmf.flags |= FAULT_FLAG_SIZE_PUD;
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3670,8 +3669,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- vmf.flags |= FAULT_FLAG_SIZE_PUD;
-
/* NUMA case for anonymous PUDs would go here */
if (dirty && !pud_write(orig_pud)) {
@@ -3689,18 +3686,14 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (!vmf.pmd)
return VM_FAULT_OOM;
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
- vmf.flags |= FAULT_FLAG_SIZE_PMD;
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
- /* fall through path, remove PMD flag */
- vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
} else {
pmd_t orig_pmd = *vmf.pmd;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
- vmf.flags |= FAULT_FLAG_SIZE_PMD;
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
@@ -3709,8 +3702,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
- /* fall through path, remove PUD flag */
- vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
} else {
huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
On Fri, Feb 3, 2017 at 1:31 PM, Dave Jiang <[email protected]> wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.
>
> Signed-off-by: Dave Jiang <[email protected]>
Tested-by: Dan Williams <[email protected]>
This fixes a crash I can produce with the existing ndctl unit tests
[1] on next-20170202. Now to go extend the tests to go after the PUD
case...
[1]: https://github.com/pmem/ndctl
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
Hi Dave,
[auto build test ERROR on mmotm/master]
[cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
base: git://git.cmpxchg.org/linux-mmotm.git master
config: i386-randconfig-x004-201705 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
>> fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
ext4_dax_huge_fault(struct vm_fault *vmf)
^~~~~~~~~~~~~~~~~~~
fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
static int ext4_dax_huge_fault(struct vm_fault *vmf,
^~~~~~~~~~~~~~~~~~~
fs/ext4/file.c: In function 'ext4_dax_huge_fault':
>> fs/ext4/file.c:292:32: error: incompatible type for argument 2 of 'dax_iomap_fault'
result = dax_iomap_fault(vmf, &ext4_iomap_ops);
^
In file included from fs/ext4/file.c:25:0:
include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
^~~~~~~~~~~~~~~
>> fs/ext4/file.c:292:11: error: too few arguments to function 'dax_iomap_fault'
result = dax_iomap_fault(vmf, &ext4_iomap_ops);
^~~~~~~~~~~~~~~
In file included from fs/ext4/file.c:25:0:
include/linux/dax.h:41:5: note: declared here
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
^~~~~~~~~~~~~~~
fs/ext4/file.c: In function 'ext4_dax_fault':
>> fs/ext4/file.c:302:9: error: too many arguments to function 'ext4_dax_huge_fault'
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
^~~~~~~~~~~~~~~~~~~
fs/ext4/file.c:280:1: note: declared here
ext4_dax_huge_fault(struct vm_fault *vmf)
^~~~~~~~~~~~~~~~~~~
fs/ext4/file.c: At top level:
>> fs/ext4/file.c:337:16: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
.huge_fault = ext4_dax_huge_fault,
^~~~~~~~~~~~~~~~~~~
fs/ext4/file.c:337:16: note: (near initialization for 'ext4_dax_vm_ops.huge_fault')
fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]
static int ext4_dax_huge_fault(struct vm_fault *vmf,
^~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/ext4_dax_huge_fault +280 fs/ext4/file.c
01a33b4ac Matthew Wilcox 2015-09-08 274 sb_end_pagefault(sb);
01a33b4ac Matthew Wilcox 2015-09-08 275
01a33b4ac Matthew Wilcox 2015-09-08 276 return result;
923ae0ff9 Ross Zwisler 2015-02-16 277 }
923ae0ff9 Ross Zwisler 2015-02-16 278
c6da0697e Dave Jiang 2017-02-02 279 static int
30599588c Dave Jiang 2017-02-02 @280 ext4_dax_huge_fault(struct vm_fault *vmf)
11bd1a9ec Matthew Wilcox 2015-09-08 281 {
01a33b4ac Matthew Wilcox 2015-09-08 282 int result;
e6ae40ec2 Dave Jiang 2017-02-02 283 struct inode *inode = file_inode(vmf->vma->vm_file);
01a33b4ac Matthew Wilcox 2015-09-08 284 struct super_block *sb = inode->i_sb;
c6da0697e Dave Jiang 2017-02-02 285 bool write = vmf->flags & FAULT_FLAG_WRITE;
01a33b4ac Matthew Wilcox 2015-09-08 286
01a33b4ac Matthew Wilcox 2015-09-08 287 if (write) {
01a33b4ac Matthew Wilcox 2015-09-08 288 sb_start_pagefault(sb);
e6ae40ec2 Dave Jiang 2017-02-02 289 file_update_time(vmf->vma->vm_file);
1db175428 Jan Kara 2016-10-21 290 }
ea3d7209c Jan Kara 2015-12-07 291 down_read(&EXT4_I(inode)->i_mmap_sem);
30599588c Dave Jiang 2017-02-02 @292 result = dax_iomap_fault(vmf, &ext4_iomap_ops);
ea3d7209c Jan Kara 2015-12-07 293 up_read(&EXT4_I(inode)->i_mmap_sem);
1db175428 Jan Kara 2016-10-21 294 if (write)
01a33b4ac Matthew Wilcox 2015-09-08 295 sb_end_pagefault(sb);
01a33b4ac Matthew Wilcox 2015-09-08 296
01a33b4ac Matthew Wilcox 2015-09-08 297 return result;
11bd1a9ec Matthew Wilcox 2015-09-08 298 }
11bd1a9ec Matthew Wilcox 2015-09-08 299
22711acc4 Dave Jiang 2017-02-03 300 static int ext4_dax_fault(struct vm_fault *vmf)
22711acc4 Dave Jiang 2017-02-03 301 {
22711acc4 Dave Jiang 2017-02-03 @302 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
22711acc4 Dave Jiang 2017-02-03 303 }
22711acc4 Dave Jiang 2017-02-03 304
ea3d7209c Jan Kara 2015-12-07 305 /*
1e9d180ba Ross Zwisler 2016-02-27 306 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
ea3d7209c Jan Kara 2015-12-07 307 * handler we check for races agaist truncate. Note that since we cycle through
ea3d7209c Jan Kara 2015-12-07 308 * i_mmap_sem, we are sure that also any hole punching that began before we
ea3d7209c Jan Kara 2015-12-07 309 * were called is finished by now and so if it included part of the file we
ea3d7209c Jan Kara 2015-12-07 310 * are working on, our pte will get unmapped and the check for pte_same() in
ea3d7209c Jan Kara 2015-12-07 311 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
ea3d7209c Jan Kara 2015-12-07 312 * desired.
ea3d7209c Jan Kara 2015-12-07 313 */
1ebf3e0da Dave Jiang 2017-02-02 314 static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
ea3d7209c Jan Kara 2015-12-07 315 {
1ebf3e0da Dave Jiang 2017-02-02 316 struct inode *inode = file_inode(vmf->vma->vm_file);
ea3d7209c Jan Kara 2015-12-07 317 struct super_block *sb = inode->i_sb;
ea3d7209c Jan Kara 2015-12-07 318 loff_t size;
d5be7a03b Ross Zwisler 2016-01-22 319 int ret;
ea3d7209c Jan Kara 2015-12-07 320
ea3d7209c Jan Kara 2015-12-07 321 sb_start_pagefault(sb);
1ebf3e0da Dave Jiang 2017-02-02 322 file_update_time(vmf->vma->vm_file);
ea3d7209c Jan Kara 2015-12-07 323 down_read(&EXT4_I(inode)->i_mmap_sem);
ea3d7209c Jan Kara 2015-12-07 324 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
ea3d7209c Jan Kara 2015-12-07 325 if (vmf->pgoff >= size)
ea3d7209c Jan Kara 2015-12-07 326 ret = VM_FAULT_SIGBUS;
d5be7a03b Ross Zwisler 2016-01-22 327 else
1ebf3e0da Dave Jiang 2017-02-02 328 ret = dax_pfn_mkwrite(vmf);
ea3d7209c Jan Kara 2015-12-07 329 up_read(&EXT4_I(inode)->i_mmap_sem);
ea3d7209c Jan Kara 2015-12-07 330 sb_end_pagefault(sb);
ea3d7209c Jan Kara 2015-12-07 331
ea3d7209c Jan Kara 2015-12-07 332 return ret;
923ae0ff9 Ross Zwisler 2015-02-16 333 }
923ae0ff9 Ross Zwisler 2015-02-16 334
923ae0ff9 Ross Zwisler 2015-02-16 335 static const struct vm_operations_struct ext4_dax_vm_ops = {
923ae0ff9 Ross Zwisler 2015-02-16 336 .fault = ext4_dax_fault,
22711acc4 Dave Jiang 2017-02-03 @337 .huge_fault = ext4_dax_huge_fault,
1e9d180ba Ross Zwisler 2016-02-27 338 .page_mkwrite = ext4_dax_fault,
ea3d7209c Jan Kara 2015-12-07 339 .pfn_mkwrite = ext4_dax_pfn_mkwrite,
923ae0ff9 Ross Zwisler 2015-02-16 340 };
:::::: The code at line 280 was first introduced by commit
:::::: 30599588c9eaccc211d383c9974a3a88dfa6e7d5 mm,fs,dax: change ->pmd_fault to ->huge_fault
:::::: TO: Dave Jiang <[email protected]>
:::::: CC: Johannes Weiner <[email protected]>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On 02/03/2017 03:56 PM, kbuild test robot wrote:
> Hi Dave,
>
> [auto build test ERROR on mmotm/master]
> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
This one is a bit odd. I just pulled mmotm tree master branch and built
with the attached .config and it passed for me (and I don't see this
commit in the master branch). I also built linux-next with this patch on
top and it also passes with attached .config. Looking at the err log
below it seems the code has a mix of partial from before and after the
patch. I'm rather confused about it....
>
> url: https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
> base: git://git.cmpxchg.org/linux-mmotm.git master
> config: i386-randconfig-x004-201705 (attached as .config)
> compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=i386
>
> All errors (new ones prefixed by >>):
>
>>> fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
> ext4_dax_huge_fault(struct vm_fault *vmf)
> ^~~~~~~~~~~~~~~~~~~
> fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
> static int ext4_dax_huge_fault(struct vm_fault *vmf,
> ^~~~~~~~~~~~~~~~~~~
> fs/ext4/file.c: In function 'ext4_dax_huge_fault':
>>> fs/ext4/file.c:292:32: error: incompatible type for argument 2 of 'dax_iomap_fault'
> result = dax_iomap_fault(vmf, &ext4_iomap_ops);
> ^
> In file included from fs/ext4/file.c:25:0:
> include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
> int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
> ^~~~~~~~~~~~~~~
>>> fs/ext4/file.c:292:11: error: too few arguments to function 'dax_iomap_fault'
> result = dax_iomap_fault(vmf, &ext4_iomap_ops);
> ^~~~~~~~~~~~~~~
> In file included from fs/ext4/file.c:25:0:
> include/linux/dax.h:41:5: note: declared here
> int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
> ^~~~~~~~~~~~~~~
> fs/ext4/file.c: In function 'ext4_dax_fault':
>>> fs/ext4/file.c:302:9: error: too many arguments to function 'ext4_dax_huge_fault'
> return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
> ^~~~~~~~~~~~~~~~~~~
> fs/ext4/file.c:280:1: note: declared here
> ext4_dax_huge_fault(struct vm_fault *vmf)
> ^~~~~~~~~~~~~~~~~~~
> fs/ext4/file.c: At top level:
>>> fs/ext4/file.c:337:16: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
> .huge_fault = ext4_dax_huge_fault,
> ^~~~~~~~~~~~~~~~~~~
> fs/ext4/file.c:337:16: note: (near initialization for 'ext4_dax_vm_ops.huge_fault')
> fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]
> static int ext4_dax_huge_fault(struct vm_fault *vmf,
> ^~~~~~~~~~~~~~~~~~~
> cc1: some warnings being treated as errors
>
> vim +/ext4_dax_huge_fault +280 fs/ext4/file.c
>
> 01a33b4ac Matthew Wilcox 2015-09-08 274 sb_end_pagefault(sb);
> 01a33b4ac Matthew Wilcox 2015-09-08 275
> 01a33b4ac Matthew Wilcox 2015-09-08 276 return result;
> 923ae0ff9 Ross Zwisler 2015-02-16 277 }
> 923ae0ff9 Ross Zwisler 2015-02-16 278
> c6da0697e Dave Jiang 2017-02-02 279 static int
> 30599588c Dave Jiang 2017-02-02 @280 ext4_dax_huge_fault(struct vm_fault *vmf)
> 11bd1a9ec Matthew Wilcox 2015-09-08 281 {
> 01a33b4ac Matthew Wilcox 2015-09-08 282 int result;
> e6ae40ec2 Dave Jiang 2017-02-02 283 struct inode *inode = file_inode(vmf->vma->vm_file);
> 01a33b4ac Matthew Wilcox 2015-09-08 284 struct super_block *sb = inode->i_sb;
> c6da0697e Dave Jiang 2017-02-02 285 bool write = vmf->flags & FAULT_FLAG_WRITE;
> 01a33b4ac Matthew Wilcox 2015-09-08 286
> 01a33b4ac Matthew Wilcox 2015-09-08 287 if (write) {
> 01a33b4ac Matthew Wilcox 2015-09-08 288 sb_start_pagefault(sb);
> e6ae40ec2 Dave Jiang 2017-02-02 289 file_update_time(vmf->vma->vm_file);
> 1db175428 Jan Kara 2016-10-21 290 }
> ea3d7209c Jan Kara 2015-12-07 291 down_read(&EXT4_I(inode)->i_mmap_sem);
> 30599588c Dave Jiang 2017-02-02 @292 result = dax_iomap_fault(vmf, &ext4_iomap_ops);
> ea3d7209c Jan Kara 2015-12-07 293 up_read(&EXT4_I(inode)->i_mmap_sem);
> 1db175428 Jan Kara 2016-10-21 294 if (write)
> 01a33b4ac Matthew Wilcox 2015-09-08 295 sb_end_pagefault(sb);
> 01a33b4ac Matthew Wilcox 2015-09-08 296
> 01a33b4ac Matthew Wilcox 2015-09-08 297 return result;
> 11bd1a9ec Matthew Wilcox 2015-09-08 298 }
> 11bd1a9ec Matthew Wilcox 2015-09-08 299
> 22711acc4 Dave Jiang 2017-02-03 300 static int ext4_dax_fault(struct vm_fault *vmf)
> 22711acc4 Dave Jiang 2017-02-03 301 {
> 22711acc4 Dave Jiang 2017-02-03 @302 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
> 22711acc4 Dave Jiang 2017-02-03 303 }
> 22711acc4 Dave Jiang 2017-02-03 304
> ea3d7209c Jan Kara 2015-12-07 305 /*
> 1e9d180ba Ross Zwisler 2016-02-27 306 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
> ea3d7209c Jan Kara 2015-12-07 307 * handler we check for races agaist truncate. Note that since we cycle through
> ea3d7209c Jan Kara 2015-12-07 308 * i_mmap_sem, we are sure that also any hole punching that began before we
> ea3d7209c Jan Kara 2015-12-07 309 * were called is finished by now and so if it included part of the file we
> ea3d7209c Jan Kara 2015-12-07 310 * are working on, our pte will get unmapped and the check for pte_same() in
> ea3d7209c Jan Kara 2015-12-07 311 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
> ea3d7209c Jan Kara 2015-12-07 312 * desired.
> ea3d7209c Jan Kara 2015-12-07 313 */
> 1ebf3e0da Dave Jiang 2017-02-02 314 static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
> ea3d7209c Jan Kara 2015-12-07 315 {
> 1ebf3e0da Dave Jiang 2017-02-02 316 struct inode *inode = file_inode(vmf->vma->vm_file);
> ea3d7209c Jan Kara 2015-12-07 317 struct super_block *sb = inode->i_sb;
> ea3d7209c Jan Kara 2015-12-07 318 loff_t size;
> d5be7a03b Ross Zwisler 2016-01-22 319 int ret;
> ea3d7209c Jan Kara 2015-12-07 320
> ea3d7209c Jan Kara 2015-12-07 321 sb_start_pagefault(sb);
> 1ebf3e0da Dave Jiang 2017-02-02 322 file_update_time(vmf->vma->vm_file);
> ea3d7209c Jan Kara 2015-12-07 323 down_read(&EXT4_I(inode)->i_mmap_sem);
> ea3d7209c Jan Kara 2015-12-07 324 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> ea3d7209c Jan Kara 2015-12-07 325 if (vmf->pgoff >= size)
> ea3d7209c Jan Kara 2015-12-07 326 ret = VM_FAULT_SIGBUS;
> d5be7a03b Ross Zwisler 2016-01-22 327 else
> 1ebf3e0da Dave Jiang 2017-02-02 328 ret = dax_pfn_mkwrite(vmf);
> ea3d7209c Jan Kara 2015-12-07 329 up_read(&EXT4_I(inode)->i_mmap_sem);
> ea3d7209c Jan Kara 2015-12-07 330 sb_end_pagefault(sb);
> ea3d7209c Jan Kara 2015-12-07 331
> ea3d7209c Jan Kara 2015-12-07 332 return ret;
> 923ae0ff9 Ross Zwisler 2015-02-16 333 }
> 923ae0ff9 Ross Zwisler 2015-02-16 334
> 923ae0ff9 Ross Zwisler 2015-02-16 335 static const struct vm_operations_struct ext4_dax_vm_ops = {
> 923ae0ff9 Ross Zwisler 2015-02-16 336 .fault = ext4_dax_fault,
> 22711acc4 Dave Jiang 2017-02-03 @337 .huge_fault = ext4_dax_huge_fault,
> 1e9d180ba Ross Zwisler 2016-02-27 338 .page_mkwrite = ext4_dax_fault,
> ea3d7209c Jan Kara 2015-12-07 339 .pfn_mkwrite = ext4_dax_pfn_mkwrite,
> 923ae0ff9 Ross Zwisler 2015-02-16 340 };
>
> :::::: The code at line 280 was first introduced by commit
> :::::: 30599588c9eaccc211d383c9974a3a88dfa6e7d5 mm,fs,dax: change ->pmd_fault to ->huge_fault
>
> :::::: TO: Dave Jiang <[email protected]>
> :::::: CC: Johannes Weiner <[email protected]>
>
> ---
> 0-DAY kernel test infrastructure Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all Intel Corporation
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Fri, Feb 3, 2017 at 3:25 PM, Dave Jiang <[email protected]> wrote:
> On 02/03/2017 03:56 PM, kbuild test robot wrote:
>> Hi Dave,
>>
>> [auto build test ERROR on mmotm/master]
>> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
>> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>
> This one is a bit odd. I just pulled mmotm tree master branch and built
> with the attached .config and it passed for me (and I don't see this
> commit in the master branch). I also built linux-next with this patch on
> top and it also passes with attached .config. Looking at the err log
> below it seems the code has a mix of partial from before and after the
> patch. I'm rather confused about it....
This is a false positive. It tried to build it against latest mainline
instead of linux-next.
On Fri, Feb 3, 2017 at 3:26 PM, Dan Williams <[email protected]> wrote:
> On Fri, Feb 3, 2017 at 3:25 PM, Dave Jiang <[email protected]> wrote:
>> On 02/03/2017 03:56 PM, kbuild test robot wrote:
>>> Hi Dave,
>>>
>>> [auto build test ERROR on mmotm/master]
>>> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
>>> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>>
>> This one is a bit odd. I just pulled mmotm tree master branch and built
>> with the attached .config and it passed for me (and I don't see this
>> commit in the master branch). I also built linux-next with this patch on
>> top and it also passes with attached .config. Looking at the err log
>> below it seems the code has a mix of partial from before and after the
>> patch. I'm rather confused about it....
>
> This is a false positive. It tried to build it against latest mainline
> instead of linux-next.
On second look it seems I ended up with a duplicate
ext4_huge_dax_fault after "git am" when I apply this on top of
next-20170202. The following fixes it up for me and tests fine:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f8f4f6d068e5..e8ab46efc4f9 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -276,27 +276,6 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
return result;
}
-static int
-ext4_dax_huge_fault(struct vm_fault *vmf)
-{
- int result;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
-
- if (write) {
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- }
- down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_fault(vmf, &ext4_iomap_ops);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- if (write)
- sb_end_pagefault(sb);
-
- return result;
-}
-
static int ext4_dax_fault(struct vm_fault *vmf)
{
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On 02/03/2017 05:00 PM, Dan Williams wrote:
> On Fri, Feb 3, 2017 at 3:26 PM, Dan Williams <[email protected]> wrote:
>> On Fri, Feb 3, 2017 at 3:25 PM, Dave Jiang <[email protected]> wrote:
>>> On 02/03/2017 03:56 PM, kbuild test robot wrote:
>>>> Hi Dave,
>>>>
>>>> [auto build test ERROR on mmotm/master]
>>>> [cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
>>>> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>>>
>>> This one is a bit odd. I just pulled mmotm tree master branch and built
>>> with the attached .config and it passed for me (and I don't see this
>>> commit in the master branch). I also built linux-next with this patch on
>>> top and it also passes with attached .config. Looking at the err log
>>> below it seems the code has a mix of partial from before and after the
>>> patch. I'm rather confused about it....
>>
>> This is a false positive. It tried to build it against latest mainline
>> instead of linux-next.
>
> On second look it seems I ended up with a duplicate
> ext4_huge_dax_fault after "git am" when I apply this on top of
> next-20170202. The following fixes it up for me and tests fine:
I think it's missing this patch from Ross
http://marc.info/?l=linux-mm&m=148581319303697&w=2
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index f8f4f6d068e5..e8ab46efc4f9 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -276,27 +276,6 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
> return result;
> }
>
> -static int
> -ext4_dax_huge_fault(struct vm_fault *vmf)
> -{
> - int result;
> - struct inode *inode = file_inode(vmf->vma->vm_file);
> - struct super_block *sb = inode->i_sb;
> - bool write = vmf->flags & FAULT_FLAG_WRITE;
> -
> - if (write) {
> - sb_start_pagefault(sb);
> - file_update_time(vmf->vma->vm_file);
> - }
> - down_read(&EXT4_I(inode)->i_mmap_sem);
> - result = dax_iomap_fault(vmf, &ext4_iomap_ops);
> - up_read(&EXT4_I(inode)->i_mmap_sem);
> - if (write)
> - sb_end_pagefault(sb);
> -
> - return result;
> -}
> -
> static int ext4_dax_fault(struct vm_fault *vmf)
> {
> return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
>
Hi Dave,
[auto build test WARNING on mmotm/master]
[cannot apply to linus/master linux/master v4.10-rc6 next-20170203]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Dave-Jiang/mm-replace-FAULT_FLAG_SIZE-with-parameter-to-huge_fault/20170204-053548
base: git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-in0-02040556 (attached as .config)
compiler: gcc-4.6 (Debian 4.6.4-7) 4.6.4
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All warnings (new ones prefixed by >>):
fs/ext4/file.c:280:1: error: conflicting types for 'ext4_dax_huge_fault'
fs/ext4/file.c:258:12: note: previous definition of 'ext4_dax_huge_fault' was here
fs/ext4/file.c: In function 'ext4_dax_huge_fault':
fs/ext4/file.c:292:2: error: incompatible type for argument 2 of 'dax_iomap_fault'
include/linux/dax.h:41:5: note: expected 'enum page_entry_size' but argument is of type 'struct iomap_ops *'
fs/ext4/file.c:292:2: error: too few arguments to function 'dax_iomap_fault'
include/linux/dax.h:41:5: note: declared here
fs/ext4/file.c: In function 'ext4_dax_fault':
fs/ext4/file.c:302:2: error: too many arguments to function 'ext4_dax_huge_fault'
fs/ext4/file.c:280:1: note: declared here
fs/ext4/file.c: At top level:
>> fs/ext4/file.c:337:2: warning: initialization from incompatible pointer type [enabled by default]
fs/ext4/file.c:337:2: warning: (near initialization for 'ext4_dax_vm_ops.huge_fault') [enabled by default]
fs/ext4/file.c:258:12: warning: 'ext4_dax_huge_fault' defined but not used [-Wunused-function]
vim +337 fs/ext4/file.c
286
287 if (write) {
288 sb_start_pagefault(sb);
289 file_update_time(vmf->vma->vm_file);
290 }
291 down_read(&EXT4_I(inode)->i_mmap_sem);
> 292 result = dax_iomap_fault(vmf, &ext4_iomap_ops);
293 up_read(&EXT4_I(inode)->i_mmap_sem);
294 if (write)
295 sb_end_pagefault(sb);
296
297 return result;
298 }
299
300 static int ext4_dax_fault(struct vm_fault *vmf)
301 {
302 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
303 }
304
305 /*
306 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
307 * handler we check for races agaist truncate. Note that since we cycle through
308 * i_mmap_sem, we are sure that also any hole punching that began before we
309 * were called is finished by now and so if it included part of the file we
310 * are working on, our pte will get unmapped and the check for pte_same() in
311 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
312 * desired.
313 */
314 static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
315 {
316 struct inode *inode = file_inode(vmf->vma->vm_file);
317 struct super_block *sb = inode->i_sb;
318 loff_t size;
319 int ret;
320
321 sb_start_pagefault(sb);
322 file_update_time(vmf->vma->vm_file);
323 down_read(&EXT4_I(inode)->i_mmap_sem);
324 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
325 if (vmf->pgoff >= size)
326 ret = VM_FAULT_SIGBUS;
327 else
328 ret = dax_pfn_mkwrite(vmf);
329 up_read(&EXT4_I(inode)->i_mmap_sem);
330 sb_end_pagefault(sb);
331
332 return ret;
333 }
334
335 static const struct vm_operations_struct ext4_dax_vm_ops = {
336 .fault = ext4_dax_fault,
> 337 .huge_fault = ext4_dax_huge_fault,
338 .page_mkwrite = ext4_dax_fault,
339 .pfn_mkwrite = ext4_dax_pfn_mkwrite,
340 };
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Fri 03-02-17 14:31:22, Dave Jiang wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.
>
> Signed-off-by: Dave Jiang <[email protected]>
Yeah, the code looks less error-prone this way. I like it. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
--
Jan Kara <jack-IBi9RG/[email protected]>
SUSE Labs, CR
On Fri, Feb 03, 2017 at 02:31:22PM -0700, Dave Jiang wrote:
> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
> been somewhat painful with getting the flags set and removed at the
> correct locations. More than one kernel oops was introduced due to
> difficulties of getting the placement correctly. Removing the flag
> values and introducing an input parameter to huge_fault that indicates
> the size of the page entry. This makes the code easier to trace and
> should avoid the issues we see with the fault flags where removal of the
> flag was necessary in the fallback paths.
Why is this not in struct vm_fault? Also can be use this opportunity
to fold ->huge_fault into ->fault?
On Mon, Feb 6, 2017 at 6:36 AM, Christoph Hellwig <[email protected]> wrote:
> On Fri, Feb 03, 2017 at 02:31:22PM -0700, Dave Jiang wrote:
>> Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has
>> been somewhat painful with getting the flags set and removed at the
>> correct locations. More than one kernel oops was introduced due to
>> difficulties of getting the placement correctly. Removing the flag
>> values and introducing an input parameter to huge_fault that indicates
>> the size of the page entry. This makes the code easier to trace and
>> should avoid the issues we see with the fault flags where removal of the
>> flag was necessary in the fallback paths.
>
> Why is this not in struct vm_fault?
Because this is easier to read and harder to get wrong. Same arguments
as getting rid of struct blk_dax_ctl.
> Also can be use this opportunity
> to fold ->huge_fault into ->fault?
Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> > Also can be use this opportunity
> > to fold ->huge_fault into ->fault?
>
> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
> On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> > Also can be use this opportunity
>> > to fold ->huge_fault into ->fault?
>>
>> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>
> Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
s/VM_HUGETLB/VM_HUGEPAGE/
...but yes as long as we specify that a VM_HUGEPAGE handler must
minimally handle pud and pmd.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> > Also can be use this opportunity
> >> > to fold ->huge_fault into ->fault?
BTW, for tmpfs we already use ->fault for both small and huge pages.
If ->fault returned THP, core mm look if it's possible to map the page as
huge in this particular VMA (due to size/alignment). If yes mm maps the
page with PMD, if not fallback to PTE.
I think it would be nice to do the same for DAX: filesystem provides core
mm with largest page this part of file can be mapped with (base aligned
address + lenght for DAX) and core mm sort out the rest.
> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
> >
> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
>
> s/VM_HUGETLB/VM_HUGEPAGE/
>
> ...but yes as long as we specify that a VM_HUGEPAGE handler must
> minimally handle pud and pmd.
VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
the VMA.
--
Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
<[email protected]> wrote:
> On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
>> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
>> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> >> > Also can be use this opportunity
>> >> > to fold ->huge_fault into ->fault?
>
> BTW, for tmpfs we already use ->fault for both small and huge pages.
> If ->fault returned THP, core mm look if it's possible to map the page as
> huge in this particular VMA (due to size/alignment). If yes mm maps the
> page with PMD, if not fallback to PTE.
>
> I think it would be nice to do the same for DAX: filesystem provides core
> mm with largest page this part of file can be mapped with (base aligned
> address + lenght for DAX) and core mm sort out the rest.
For DAX we would need plumb pfn_t into the core mm so that we have the
PFN_DEV and PFN_MAP flags beyond the raw pfn.
>
>> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>> >
>> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
>>
>> s/VM_HUGETLB/VM_HUGEPAGE/
>>
>> ...but yes as long as we specify that a VM_HUGEPAGE handler must
>> minimally handle pud and pmd.
>
> VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
> the VMA.
Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Tue, Feb 07, 2017 at 08:56:56AM -0800, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
> <[email protected]> wrote:
> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> >> > Also can be use this opportunity
> >> >> > to fold ->huge_fault into ->fault?
> >
> > BTW, for tmpfs we already use ->fault for both small and huge pages.
> > If ->fault returned THP, core mm look if it's possible to map the page as
> > huge in this particular VMA (due to size/alignment). If yes mm maps the
> > page with PMD, if not fallback to PTE.
> >
> > I think it would be nice to do the same for DAX: filesystem provides core
> > mm with largest page this part of file can be mapped with (base aligned
> > address + lenght for DAX) and core mm sort out the rest.
>
> For DAX we would need plumb pfn_t into the core mm so that we have the
> PFN_DEV and PFN_MAP flags beyond the raw pfn.
Sounds good to me.
> >> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
> >> >
> >> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
> >>
> >> s/VM_HUGETLB/VM_HUGEPAGE/
> >>
> >> ...but yes as long as we specify that a VM_HUGEPAGE handler must
> >> minimally handle pud and pmd.
> >
> > VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
> > the VMA.
>
> Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.
But why? Looks like abuse of the flag.
--
Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
On Tue, Feb 7, 2017 at 9:40 AM, Kirill A. Shutemov <[email protected]> wrote:
> On Tue, Feb 07, 2017 at 08:56:56AM -0800, Dan Williams wrote:
>> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
>> <[email protected]> wrote:
>> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
>> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
>> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
>> >> >> > Also can be use this opportunity
>> >> >> > to fold ->huge_fault into ->fault?
>> >
>> > BTW, for tmpfs we already use ->fault for both small and huge pages.
>> > If ->fault returned THP, core mm look if it's possible to map the page as
>> > huge in this particular VMA (due to size/alignment). If yes mm maps the
>> > page with PMD, if not fallback to PTE.
>> >
>> > I think it would be nice to do the same for DAX: filesystem provides core
>> > mm with largest page this part of file can be mapped with (base aligned
>> > address + lenght for DAX) and core mm sort out the rest.
>>
>> For DAX we would need plumb pfn_t into the core mm so that we have the
>> PFN_DEV and PFN_MAP flags beyond the raw pfn.
>
> Sounds good to me.
>
>> >> >> Hmm, yes, just need a scheme to not attempt huge_faults on pte-only handlers.
>> >> >
>> >> > Do we need anything more than checking vma->vm_flags for VM_HUGETLB?
>> >>
>> >> s/VM_HUGETLB/VM_HUGEPAGE/
>> >>
>> >> ...but yes as long as we specify that a VM_HUGEPAGE handler must
>> >> minimally handle pud and pmd.
>> >
>> > VM_HUGEPAGE is result of MADV_HUGEPAGE. It's not required to have THP in
>> > the VMA.
>>
>> Filesystem-DAX and Device-DAX specify VM_HUGEPAGE by default.
>
> But why? Looks like abuse of the flag.
Good question, that's been there since DAX was initially added and I
don't see a good reason for it currently. I'll take a look at what you
have for huge-tmpfs support.
On Tue 07-02-17 08:56:56, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 12:44 AM, Kirill A. Shutemov
> <[email protected]> wrote:
> > On Mon, Feb 06, 2017 at 09:30:22AM -0800, Dan Williams wrote:
> >> On Mon, Feb 6, 2017 at 9:27 AM, Christoph Hellwig <[email protected]> wrote:
> >> > On Mon, Feb 06, 2017 at 08:24:48AM -0800, Dan Williams wrote:
> >> >> > Also can be use this opportunity
> >> >> > to fold ->huge_fault into ->fault?
> >
> > BTW, for tmpfs we already use ->fault for both small and huge pages.
> > If ->fault returned THP, core mm look if it's possible to map the page as
> > huge in this particular VMA (due to size/alignment). If yes mm maps the
> > page with PMD, if not fallback to PTE.
> >
> > I think it would be nice to do the same for DAX: filesystem provides core
> > mm with largest page this part of file can be mapped with (base aligned
> > address + lenght for DAX) and core mm sort out the rest.
>
> For DAX we would need plumb pfn_t into the core mm so that we have the
> PFN_DEV and PFN_MAP flags beyond the raw pfn.
So we can pass necessary information through struct vm_fault rather easily.
However due to DAX locking we cannot really "return" pfn for generic code
to install (we need to unlock radix tree locks after modifying page
tables). So if we want generic code to handle PFNs what needs to be done is
to teach finish_fault() to handle pfn_t which is passed to it and install
it in page tables.
Long term we could transition all page fault handlers (at least the
non-trivial ones) to using finish_fault() which would IMO make the code
flow easier to follow and export less of MM internals into drivers. However
there's so many fault handlers that I didn't have a good motivation to do
that yet.
Honza
--
Jan Kara <jack-IBi9RG/[email protected]>
SUSE Labs, CR