Tracepoints are the standard way to capture debugging and tracing
information in many parts of the kernel, including the XFS and ext4
filesystems. This series creates a tracepoint header for FS DAX and add
the first few DAX tracepoints to the PMD fault handler. This allows the
tracing for DAX to be done in the same way as the filesystem tracing so
that developers can look at them together and get a coherent idea of what
the system is doing.
I do intend to add tracepoints to the normal 4k DAX fault path and to the
DAX I/O path, but I wanted to get feedback on the PMD tracepoints before I
went any further.
This series is based on Jan Kara's "dax: Clear dirty bits after flushing
caches" series:
https://lists.01.org/pipermail/linux-nvdimm/2016-November/007864.html
I've pushed a git tree with this work here:
https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_tracepoints
Ross Zwisler (6):
dax: fix build breakage with ext4, dax and !iomap
dax: remove leading space from labels
dax: add tracepoint infrastructure, PMD tracing
dax: update MAINTAINERS entries for FS DAX
dax: add tracepoints to dax_pmd_load_hole()
dax: add tracepoints to dax_pmd_insert_mapping()
MAINTAINERS | 4 +-
fs/Kconfig | 1 +
fs/dax.c | 78 ++++++++++++++----------
fs/ext2/Kconfig | 1 -
include/linux/mm.h | 14 +++++
include/linux/pfn_t.h | 6 ++
include/trace/events/fs_dax.h | 135 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 206 insertions(+), 33 deletions(-)
create mode 100644 include/trace/events/fs_dax.h
--
2.7.4
No functional change.
As of this commit:
commit 218dd85887da (".gitattributes: set git diff driver for C source code
files")
git-diff and git-format-patch both generate diffs whose hunks are correctly
prefixed by function names instead of labels, even if those labels aren't
indented with spaces.
Signed-off-by: Ross Zwisler <[email protected]>
---
fs/dax.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index d8fe3eb..cc8a069 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -422,7 +422,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
return page;
}
entry = lock_slot(mapping, slot);
- out_unlock:
+out_unlock:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
@@ -557,7 +557,7 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
vmf->gfp_mask | __GFP_ZERO);
if (!page)
return VM_FAULT_OOM;
- out:
+out:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -659,7 +659,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+unlock:
spin_unlock_irq(&mapping->tree_lock);
if (hole_fill) {
radix_tree_preload_end();
@@ -812,12 +812,12 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
- unmap:
+unmap:
dax_unmap_atomic(bdev, &dax);
put_locked_mapping_entry(mapping, index, entry);
return ret;
- put_unlocked:
+put_unlocked:
put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock);
return ret;
@@ -1193,11 +1193,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break;
}
- error_unlock_entry:
+error_unlock_entry:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
+unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- finish_iomap:
+finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1254,7 +1254,7 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
- unmap_fallback:
+unmap_fallback:
dax_unmap_atomic(bdev, &dax);
return VM_FAULT_FALLBACK;
}
@@ -1378,9 +1378,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
break;
}
- unlock_entry:
+unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
- finish_iomap:
+finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1395,7 +1395,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
- fallback:
+fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, pmd, address);
count_vm_event(THP_FAULT_FALLBACK);
--
2.7.4
Add tracepoints to dax_pmd_load_hole(), following the same logging
conventions as the tracepoints in dax_iomap_pmd_fault().
Here is an example PMD fault showing the new tracepoints:
read_big-1393 [007] .... 32.133809: dax_pmd_fault: shared mapping read
address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200
max_pgoff 0x1400
read_big-1393 [007] .... 32.134067: dax_pmd_load_hole: shared mapping
read address 0x10400000 zero_page ffffea0002b98000 radix_entry 0x1e
read_big-1393 [007] .... 32.134069: dax_pmd_fault_done: shared mapping
read address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200
max_pgoff 0x1400 NOPAGE
Signed-off-by: Ross Zwisler <[email protected]>
---
fs/dax.c | 13 +++++++++----
include/trace/events/fs_dax.h | 32 ++++++++++++++++++++++++++++++++
2 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 1aa7616..2824414 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1269,32 +1269,37 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long pmd_addr = address & PMD_MASK;
struct page *zero_page;
+ void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
- void *ret;
zero_page = mm_get_huge_zero_page(vma->vm_mm);
if (unlikely(!zero_page))
- return VM_FAULT_FALLBACK;
+ goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
RADIX_DAX_PMD | RADIX_DAX_HZP);
if (IS_ERR(ret))
- return VM_FAULT_FALLBACK;
+ goto fallback;
*entryp = ret;
ptl = pmd_lock(vma->vm_mm, pmd);
if (!pmd_none(*pmd)) {
spin_unlock(ptl);
- return VM_FAULT_FALLBACK;
+ goto fallback;
}
pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
spin_unlock(ptl);
+ trace_dax_pmd_load_hole(vma, address, zero_page, ret);
return VM_FAULT_NOPAGE;
+
+fallback:
+ trace_dax_pmd_load_hole_fallback(vma, address, zero_page, ret);
+ return VM_FAULT_FALLBACK;
}
int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index f9ed4eb..8814b1a 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -54,6 +54,38 @@ DEFINE_EVENT(dax_pmd_fault_class, name, \
DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
+DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address,
+ struct page *zero_page, void *radix_entry),
+ TP_ARGS(vma, address, zero_page, radix_entry),
+ TP_STRUCT__entry(
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(struct page *, zero_page)
+ __field(void *, radix_entry)
+ ),
+ TP_fast_assign(
+ __entry->vm_flags = vma->vm_flags;
+ __entry->address = address;
+ __entry->zero_page = zero_page;
+ __entry->radix_entry = radix_entry;
+ ),
+ TP_printk("%s mapping read address %#lx zero_page %p radix_entry %#lx",
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __entry->address,
+ __entry->zero_page,
+ (unsigned long)__entry->radix_entry
+ )
+)
+
+#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
+DEFINE_EVENT(dax_pmd_load_hole_class, name, \
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address, \
+ struct page *zero_page, void *radix_entry), \
+ TP_ARGS(vma, address, zero_page, radix_entry))
+
+DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
+DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
#endif /* _TRACE_FS_DAX_H */
--
2.7.4
Add tracepoints to dax_pmd_insert_mapping(), following the same logging
conventions as the tracepoints in dax_iomap_pmd_fault().
Here is an example PMD fault showing the new tracepoints:
big-1544 [006] .... 48.153479: dax_pmd_fault: shared mapping write
address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
max_pgoff 0x1400
big-1544 [006] .... 48.155230: dax_pmd_insert_mapping: shared mapping
write address 0x10505000 length 0x200000 pfn 0x100600 DEV|MAP radix_entry
0xc000e
big-1544 [006] .... 48.155266: dax_pmd_fault_done: shared mapping write
address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
max_pgoff 0x1400 NOPAGE
Signed-off-by: Ross Zwisler <[email protected]>
---
fs/dax.c | 10 +++++++---
include/linux/pfn_t.h | 6 ++++++
include/trace/events/fs_dax.h | 42 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 55 insertions(+), 3 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 2824414..d6ba4a3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1236,10 +1236,10 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
.size = PMD_SIZE,
};
long length = dax_map_atomic(bdev, &dax);
- void *ret;
+ void *ret = NULL;
if (length < 0) /* dax_map_atomic() failed */
- return VM_FAULT_FALLBACK;
+ goto fallback;
if (length < PMD_SIZE)
goto unmap_fallback;
if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
@@ -1252,13 +1252,17 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
- return VM_FAULT_FALLBACK;
+ goto fallback;
*entryp = ret;
+ trace_dax_pmd_insert_mapping(vma, address, write, length, dax.pfn, ret);
return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
unmap_fallback:
dax_unmap_atomic(bdev, &dax);
+fallback:
+ trace_dax_pmd_insert_mapping_fallback(vma, address, write, length,
+ dax.pfn, ret);
return VM_FAULT_FALLBACK;
}
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index a3d90b9..033fc7b 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -15,6 +15,12 @@
#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
+#define PFN_FLAGS_TRACE \
+ { PFN_SG_CHAIN, "SG_CHAIN" }, \
+ { PFN_SG_LAST, "SG_LAST" }, \
+ { PFN_DEV, "DEV" }, \
+ { PFN_MAP, "MAP" }
+
static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
{
pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 8814b1a..a03f820 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -87,6 +87,48 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \
DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
+DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address, int write,
+ long length, pfn_t pfn, void *radix_entry),
+ TP_ARGS(vma, address, write, length, pfn, radix_entry),
+ TP_STRUCT__entry(
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(int, write)
+ __field(long, length)
+ __field(u64, pfn_val)
+ __field(void *, radix_entry)
+ ),
+ TP_fast_assign(
+ __entry->vm_flags = vma->vm_flags;
+ __entry->address = address;
+ __entry->write = write;
+ __entry->length = length;
+ __entry->pfn_val = pfn.val;
+ __entry->radix_entry = radix_entry;
+ ),
+ TP_printk("%s mapping %s address %#lx length %#lx pfn %#llx %s"
+ " radix_entry %#lx",
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __entry->write ? "write" : "read",
+ __entry->address,
+ __entry->length,
+ __entry->pfn_val & ~PFN_FLAGS_MASK,
+ __print_flags(__entry->pfn_val & PFN_FLAGS_MASK, "|",
+ PFN_FLAGS_TRACE),
+ (unsigned long)__entry->radix_entry
+ )
+)
+
+#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
+DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address, \
+ int write, long length, pfn_t pfn, void *radix_entry), \
+ TP_ARGS(vma, address, write, length, pfn, radix_entry))
+
+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
+
#endif /* _TRACE_FS_DAX_H */
/* This part must be outside protection */
--
2.7.4
Add the new include/trace/events/fs_dax.h tracepoint header, update
Matthew's email address and add myself as a maintainer for filesystem DAX.
Signed-off-by: Ross Zwisler <[email protected]>
Suggested-by: Matthew Wilcox <[email protected]>
---
MAINTAINERS | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 2a58eea..8fef4bf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3855,10 +3855,12 @@ S: Maintained
F: drivers/i2c/busses/i2c-diolan-u2c.c
DIRECT ACCESS (DAX)
-M: Matthew Wilcox <[email protected]>
+M: Matthew Wilcox <[email protected]>
+M: Ross Zwisler <[email protected]>
L: [email protected]
S: Supported
F: fs/dax.c
+F: include/trace/events/fs_dax.h
DIRECTORY NOTIFICATION (DNOTIFY)
M: Eric Paris <[email protected]>
--
2.7.4
Tracepoints are the standard way to capture debugging and tracing
information in many parts of the kernel, including the XFS and ext4
filesystems. Create a tracepoint header for FS DAX and add the first DAX
tracepoints to the PMD fault handler. This allows the tracing for DAX to
be done in the same way as the filesystem tracing so that developers can
look at them together and get a coherent idea of what the system is doing.
I added both an entry and exit tracepoint because future patches will add
tracepoints to child functions of dax_iomap_pmd_fault() like
dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
be wrapped by the parent function tracepoints so the code flow is more
easily understood. Having entry and exit tracepoints for faults also
allows us to easily see what filesystems functions were called during the
fault. These filesystem functions get executed via iomap_begin() and
iomap_end() calls, for example, and will have their own tracepoints.
For PMD faults we primarily want to understand the faulting address and
whether it fell back to 4k faults. If it fell back to 4k faults the
tracepoints should let us understand why.
I named the new tracepoint header file "fs_dax.h" to allow for device DAX
to have its own separate tracing header in the same directory at some
point.
Here is an example output for these events from a successful PMD fault:
big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
max_pgoff 0x1400
big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
max_pgoff 0x1400 NOPAGE
Signed-off-by: Ross Zwisler <[email protected]>
Suggested-by: Dave Chinner <[email protected]>
---
fs/dax.c | 29 +++++++++++++-------
include/linux/mm.h | 14 ++++++++++
include/trace/events/fs_dax.h | 61 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 10 deletions(-)
create mode 100644 include/trace/events/fs_dax.h
diff --git a/fs/dax.c b/fs/dax.c
index cc8a069..1aa7616 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,6 +35,9 @@
#include <linux/iomap.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -1310,6 +1313,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
loff_t pos;
int error;
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is
+ * supposed to hold locks serializing us with truncate / punch hole so
+ * this is a reliable test.
+ */
+ pgoff = linear_page_index(vma, pmd_addr);
+ max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ trace_dax_pmd_fault(vma, address, flags, pgoff, max_pgoff, 0);
+
/* Fall back to PTEs if we're going to COW */
if (write && !(vma->vm_flags & VM_SHARED))
goto fallback;
@@ -1320,16 +1333,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- /*
- * Check whether offset isn't beyond end of file now. Caller is
- * supposed to hold locks serializing us with truncate / punch hole so
- * this is a reliable test.
- */
- pgoff = linear_page_index(vma, pmd_addr);
- max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
-
- if (pgoff > max_pgoff)
- return VM_FAULT_SIGBUS;
+ if (pgoff > max_pgoff) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
/* If the PMD would extend beyond the file size */
if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
@@ -1400,6 +1407,8 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
split_huge_pmd(vma, pmd, address);
count_vm_event(THP_FAULT_FALLBACK);
}
+out:
+ trace_dax_pmd_fault_done(vma, address, flags, pgoff, max_pgoff, result);
return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a5f52c0..e373917 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1107,6 +1107,20 @@ static inline void clear_page_pfmemalloc(struct page *page)
VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
VM_FAULT_FALLBACK)
+#define VM_FAULT_RESULT_TRACE \
+ { VM_FAULT_OOM, "OOM" }, \
+ { VM_FAULT_SIGBUS, "SIGBUS" }, \
+ { VM_FAULT_MAJOR, "MAJOR" }, \
+ { VM_FAULT_WRITE, "WRITE" }, \
+ { VM_FAULT_HWPOISON, "HWPOISON" }, \
+ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
+ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
+ { VM_FAULT_NOPAGE, "NOPAGE" }, \
+ { VM_FAULT_LOCKED, "LOCKED" }, \
+ { VM_FAULT_RETRY, "RETRY" }, \
+ { VM_FAULT_FALLBACK, "FALLBACK" }, \
+ { VM_FAULT_DONE_COW, "DONE_COW" }
+
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
new file mode 100644
index 0000000..f9ed4eb
--- /dev/null
+++ b/include/trace/events/fs_dax.h
@@ -0,0 +1,61 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs_dax
+
+#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_DAX_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(dax_pmd_fault_class,
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, pgoff_t pgoff, pgoff_t max_pgoff,
+ int result),
+ TP_ARGS(vma, address, flags, pgoff, max_pgoff, result),
+ TP_STRUCT__entry(
+ __field(unsigned long, vm_start)
+ __field(unsigned long, vm_end)
+ __field(unsigned long, vm_flags)
+ __field(unsigned long, address)
+ __field(unsigned int, flags)
+ __field(pgoff_t, pgoff)
+ __field(pgoff_t, max_pgoff)
+ __field(int, result)
+ ),
+ TP_fast_assign(
+ __entry->vm_start = vma->vm_start;
+ __entry->vm_end = vma->vm_end;
+ __entry->vm_flags = vma->vm_flags;
+ __entry->address = address;
+ __entry->flags = flags;
+ __entry->pgoff = pgoff;
+ __entry->max_pgoff = max_pgoff;
+ __entry->result = result;
+ ),
+ TP_printk("%s mapping %s address %#lx vm_start %#lx vm_end %#lx "
+ "pgoff %#lx max_pgoff %#lx %s",
+ __entry->vm_flags & VM_SHARED ? "shared" : "private",
+ __entry->flags & FAULT_FLAG_WRITE ? "write" : "read",
+ __entry->address,
+ __entry->vm_start,
+ __entry->vm_end,
+ __entry->pgoff,
+ __entry->max_pgoff,
+ __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
+ )
+)
+
+#define DEFINE_PMD_FAULT_EVENT(name) \
+DEFINE_EVENT(dax_pmd_fault_class, name, \
+ TP_PROTO(struct vm_area_struct *vma, unsigned long address, \
+ unsigned int flags, pgoff_t pgoff, pgoff_t max_pgoff, \
+ int result), \
+ TP_ARGS(vma, address, flags, pgoff, max_pgoff, result))
+
+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
+
+
+#endif /* _TRACE_FS_DAX_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
2.7.4
With the current Kconfig setup it is possible to have the following:
CONFIG_EXT4_FS=y
CONFIG_FS_DAX=y
CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
With this config we get build failures in ext4_dax_fault() because the
iomap functions in fs/dax.c are missing:
fs/built-in.o: In function `ext4_dax_fault':
file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
fs/built-in.o: In function `ext4_file_read_iter':
file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
fs/built-in.o: In function `ext4_file_write_iter':
file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
fs/built-in.o: In function `ext4_block_zero_page_range':
inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
Now that the struct buffer_head based DAX fault paths and I/O path have
been removed we really depend on iomap support being present for DAX. Make
this explicit by selecting FS_IOMAP if we compile in DAX support.
Signed-off-by: Ross Zwisler <[email protected]>
---
fs/Kconfig | 1 +
fs/dax.c | 2 --
fs/ext2/Kconfig | 1 -
3 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 8e9e5f41..18024bf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
+ select FS_IOMAP
help
Direct Access (DAX) can be used on memory-backed block devices.
If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/dax.c b/fs/dax.c
index be39633..d8fe3eb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -968,7 +968,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-#ifdef CONFIG_FS_IOMAP
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
@@ -1405,4 +1404,3 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
#endif /* CONFIG_FS_DAX_PMD */
-#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 36bea5a..c634874e 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,5 @@
config EXT2_FS
tristate "Second extended fs support"
- select FS_IOMAP if FS_DAX
help
Ext2 is a standard Linux file system for hard disks.
--
2.7.4
On Wed 23-11-16 11:44:18, Ross Zwisler wrote:
> No functional change.
>
> As of this commit:
>
> commit 218dd85887da (".gitattributes: set git diff driver for C source code
> files")
>
> git-diff and git-format-patch both generate diffs whose hunks are correctly
> prefixed by function names instead of labels, even if those labels aren't
> indented with spaces.
Fine by me. I just have some 4 remaining DAX patches (will send them out
today) and they will clash with this. So I'd prefer if this happened after
they are merged...
Honza
>
> Signed-off-by: Ross Zwisler <[email protected]>
> ---
> fs/dax.c | 24 ++++++++++++------------
> 1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index d8fe3eb..cc8a069 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -422,7 +422,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
> return page;
> }
> entry = lock_slot(mapping, slot);
> - out_unlock:
> +out_unlock:
> spin_unlock_irq(&mapping->tree_lock);
> return entry;
> }
> @@ -557,7 +557,7 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
> vmf->gfp_mask | __GFP_ZERO);
> if (!page)
> return VM_FAULT_OOM;
> - out:
> +out:
> vmf->page = page;
> ret = finish_fault(vmf);
> vmf->page = NULL;
> @@ -659,7 +659,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
> }
> if (vmf->flags & FAULT_FLAG_WRITE)
> radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> - unlock:
> +unlock:
> spin_unlock_irq(&mapping->tree_lock);
> if (hole_fill) {
> radix_tree_preload_end();
> @@ -812,12 +812,12 @@ static int dax_writeback_one(struct block_device *bdev,
> spin_lock_irq(&mapping->tree_lock);
> radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
> spin_unlock_irq(&mapping->tree_lock);
> - unmap:
> +unmap:
> dax_unmap_atomic(bdev, &dax);
> put_locked_mapping_entry(mapping, index, entry);
> return ret;
>
> - put_unlocked:
> +put_unlocked:
> put_unlocked_mapping_entry(mapping, index, entry2);
> spin_unlock_irq(&mapping->tree_lock);
> return ret;
> @@ -1193,11 +1193,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> break;
> }
>
> - error_unlock_entry:
> +error_unlock_entry:
> vmf_ret = dax_fault_return(error) | major;
> - unlock_entry:
> +unlock_entry:
> put_locked_mapping_entry(mapping, vmf->pgoff, entry);
> - finish_iomap:
> +finish_iomap:
> if (ops->iomap_end) {
> int copied = PAGE_SIZE;
>
> @@ -1254,7 +1254,7 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
>
> return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
>
> - unmap_fallback:
> +unmap_fallback:
> dax_unmap_atomic(bdev, &dax);
> return VM_FAULT_FALLBACK;
> }
> @@ -1378,9 +1378,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> break;
> }
>
> - unlock_entry:
> +unlock_entry:
> put_locked_mapping_entry(mapping, pgoff, entry);
> - finish_iomap:
> +finish_iomap:
> if (ops->iomap_end) {
> int copied = PMD_SIZE;
>
> @@ -1395,7 +1395,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
> &iomap);
> }
> - fallback:
> +fallback:
> if (result == VM_FAULT_FALLBACK) {
> split_huge_pmd(vma, pmd, address);
> count_vm_event(THP_FAULT_FALLBACK);
> --
> 2.7.4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 23-11-16 11:44:17, Ross Zwisler wrote:
> With the current Kconfig setup it is possible to have the following:
>
> CONFIG_EXT4_FS=y
> CONFIG_FS_DAX=y
> CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
>
> With this config we get build failures in ext4_dax_fault() because the
> iomap functions in fs/dax.c are missing:
>
> fs/built-in.o: In function `ext4_dax_fault':
> file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
> file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
> fs/built-in.o: In function `ext4_file_read_iter':
> file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
> fs/built-in.o: In function `ext4_file_write_iter':
> file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
> file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
> fs/built-in.o: In function `ext4_block_zero_page_range':
> inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
>
> Now that the struct buffer_head based DAX fault paths and I/O path have
> been removed we really depend on iomap support being present for DAX. Make
> this explicit by selecting FS_IOMAP if we compile in DAX support.
>
> Signed-off-by: Ross Zwisler <[email protected]>
I've sent the same patch to Ted yesterday and he will probably queue it on
top of ext4 iomap patches. If it doesn't happen for some reason, feel free
to add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/Kconfig | 1 +
> fs/dax.c | 2 --
> fs/ext2/Kconfig | 1 -
> 3 files changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/fs/Kconfig b/fs/Kconfig
> index 8e9e5f41..18024bf 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -38,6 +38,7 @@ config FS_DAX
> bool "Direct Access (DAX) support"
> depends on MMU
> depends on !(ARM || MIPS || SPARC)
> + select FS_IOMAP
> help
> Direct Access (DAX) can be used on memory-backed block devices.
> If the block device supports DAX and the filesystem supports DAX,
> diff --git a/fs/dax.c b/fs/dax.c
> index be39633..d8fe3eb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -968,7 +968,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
> }
> EXPORT_SYMBOL_GPL(__dax_zero_page_range);
>
> -#ifdef CONFIG_FS_IOMAP
> static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
> {
> return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
> @@ -1405,4 +1404,3 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> }
> EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
> #endif /* CONFIG_FS_DAX_PMD */
> -#endif /* CONFIG_FS_IOMAP */
> diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
> index 36bea5a..c634874e 100644
> --- a/fs/ext2/Kconfig
> +++ b/fs/ext2/Kconfig
> @@ -1,6 +1,5 @@
> config EXT2_FS
> tristate "Second extended fs support"
> - select FS_IOMAP if FS_DAX
> help
> Ext2 is a standard Linux file system for hard disks.
>
> --
> 2.7.4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 23-11-16 11:44:19, Ross Zwisler wrote:
> Tracepoints are the standard way to capture debugging and tracing
> information in many parts of the kernel, including the XFS and ext4
> filesystems. Create a tracepoint header for FS DAX and add the first DAX
> tracepoints to the PMD fault handler. This allows the tracing for DAX to
> be done in the same way as the filesystem tracing so that developers can
> look at them together and get a coherent idea of what the system is doing.
>
> I added both an entry and exit tracepoint because future patches will add
> tracepoints to child functions of dax_iomap_pmd_fault() like
> dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
> be wrapped by the parent function tracepoints so the code flow is more
> easily understood. Having entry and exit tracepoints for faults also
> allows us to easily see what filesystems functions were called during the
> fault. These filesystem functions get executed via iomap_begin() and
> iomap_end() calls, for example, and will have their own tracepoints.
>
> For PMD faults we primarily want to understand the faulting address and
> whether it fell back to 4k faults. If it fell back to 4k faults the
> tracepoints should let us understand why.
>
> I named the new tracepoint header file "fs_dax.h" to allow for device DAX
> to have its own separate tracing header in the same directory at some
> point.
>
> Here is an example output for these events from a successful PMD fault:
>
> big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400
>
> big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400 NOPAGE
>
> Signed-off-by: Ross Zwisler <[email protected]>
> Suggested-by: Dave Chinner <[email protected]>
Looks good. Just one minor comment:
> + TP_printk("%s mapping %s address %#lx vm_start %#lx vm_end %#lx "
> + "pgoff %#lx max_pgoff %#lx %s",
> + __entry->vm_flags & VM_SHARED ? "shared" : "private",
> + __entry->flags & FAULT_FLAG_WRITE ? "write" : "read",
> + __entry->address,
> + __entry->vm_start,
> + __entry->vm_end,
> + __entry->pgoff,
> + __entry->max_pgoff,
> + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
> + )
> +)
I think it may be useful to dump full 'flags', not just FAULT_FLAG_WRITE...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 23-11-16 11:44:22, Ross Zwisler wrote:
> Add tracepoints to dax_pmd_insert_mapping(), following the same logging
> conventions as the tracepoints in dax_iomap_pmd_fault().
>
> Here is an example PMD fault showing the new tracepoints:
>
> big-1544 [006] .... 48.153479: dax_pmd_fault: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400
>
> big-1544 [006] .... 48.155230: dax_pmd_insert_mapping: shared mapping
> write address 0x10505000 length 0x200000 pfn 0x100600 DEV|MAP radix_entry
> 0xc000e
>
> big-1544 [006] .... 48.155266: dax_pmd_fault_done: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400 NOPAGE
>
> Signed-off-by: Ross Zwisler <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/dax.c | 10 +++++++---
> include/linux/pfn_t.h | 6 ++++++
> include/trace/events/fs_dax.h | 42 ++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 2824414..d6ba4a3 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1236,10 +1236,10 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
> .size = PMD_SIZE,
> };
> long length = dax_map_atomic(bdev, &dax);
> - void *ret;
> + void *ret = NULL;
>
> if (length < 0) /* dax_map_atomic() failed */
> - return VM_FAULT_FALLBACK;
> + goto fallback;
> if (length < PMD_SIZE)
> goto unmap_fallback;
> if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
> @@ -1252,13 +1252,17 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
> ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
> RADIX_DAX_PMD);
> if (IS_ERR(ret))
> - return VM_FAULT_FALLBACK;
> + goto fallback;
> *entryp = ret;
>
> + trace_dax_pmd_insert_mapping(vma, address, write, length, dax.pfn, ret);
> return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
>
> unmap_fallback:
> dax_unmap_atomic(bdev, &dax);
> +fallback:
> + trace_dax_pmd_insert_mapping_fallback(vma, address, write, length,
> + dax.pfn, ret);
> return VM_FAULT_FALLBACK;
> }
>
> diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
> index a3d90b9..033fc7b 100644
> --- a/include/linux/pfn_t.h
> +++ b/include/linux/pfn_t.h
> @@ -15,6 +15,12 @@
> #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
> #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
>
> +#define PFN_FLAGS_TRACE \
> + { PFN_SG_CHAIN, "SG_CHAIN" }, \
> + { PFN_SG_LAST, "SG_LAST" }, \
> + { PFN_DEV, "DEV" }, \
> + { PFN_MAP, "MAP" }
> +
> static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
> {
> pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
> index 8814b1a..a03f820 100644
> --- a/include/trace/events/fs_dax.h
> +++ b/include/trace/events/fs_dax.h
> @@ -87,6 +87,48 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \
> DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
> DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
>
> +DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
> + TP_PROTO(struct vm_area_struct *vma, unsigned long address, int write,
> + long length, pfn_t pfn, void *radix_entry),
> + TP_ARGS(vma, address, write, length, pfn, radix_entry),
> + TP_STRUCT__entry(
> + __field(unsigned long, vm_flags)
> + __field(unsigned long, address)
> + __field(int, write)
> + __field(long, length)
> + __field(u64, pfn_val)
> + __field(void *, radix_entry)
> + ),
> + TP_fast_assign(
> + __entry->vm_flags = vma->vm_flags;
> + __entry->address = address;
> + __entry->write = write;
> + __entry->length = length;
> + __entry->pfn_val = pfn.val;
> + __entry->radix_entry = radix_entry;
> + ),
> + TP_printk("%s mapping %s address %#lx length %#lx pfn %#llx %s"
> + " radix_entry %#lx",
> + __entry->vm_flags & VM_SHARED ? "shared" : "private",
> + __entry->write ? "write" : "read",
> + __entry->address,
> + __entry->length,
> + __entry->pfn_val & ~PFN_FLAGS_MASK,
> + __print_flags(__entry->pfn_val & PFN_FLAGS_MASK, "|",
> + PFN_FLAGS_TRACE),
> + (unsigned long)__entry->radix_entry
> + )
> +)
> +
> +#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
> +DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
> + TP_PROTO(struct vm_area_struct *vma, unsigned long address, \
> + int write, long length, pfn_t pfn, void *radix_entry), \
> + TP_ARGS(vma, address, write, length, pfn, radix_entry))
> +
> +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
> +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
> +
> #endif /* _TRACE_FS_DAX_H */
>
> /* This part must be outside protection */
> --
> 2.7.4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 23-11-16 11:44:21, Ross Zwisler wrote:
> Add tracepoints to dax_pmd_load_hole(), following the same logging
> conventions as the tracepoints in dax_iomap_pmd_fault().
>
> Here is an example PMD fault showing the new tracepoints:
>
> read_big-1393 [007] .... 32.133809: dax_pmd_fault: shared mapping read
> address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200
> max_pgoff 0x1400
>
> read_big-1393 [007] .... 32.134067: dax_pmd_load_hole: shared mapping
> read address 0x10400000 zero_page ffffea0002b98000 radix_entry 0x1e
>
> read_big-1393 [007] .... 32.134069: dax_pmd_fault_done: shared mapping
> read address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200
> max_pgoff 0x1400 NOPAGE
>
> Signed-off-by: Ross Zwisler <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/dax.c | 13 +++++++++----
> include/trace/events/fs_dax.h | 32 ++++++++++++++++++++++++++++++++
> 2 files changed, 41 insertions(+), 4 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 1aa7616..2824414 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1269,32 +1269,37 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
> struct address_space *mapping = vma->vm_file->f_mapping;
> unsigned long pmd_addr = address & PMD_MASK;
> struct page *zero_page;
> + void *ret = NULL;
> spinlock_t *ptl;
> pmd_t pmd_entry;
> - void *ret;
>
> zero_page = mm_get_huge_zero_page(vma->vm_mm);
>
> if (unlikely(!zero_page))
> - return VM_FAULT_FALLBACK;
> + goto fallback;
>
> ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
> RADIX_DAX_PMD | RADIX_DAX_HZP);
> if (IS_ERR(ret))
> - return VM_FAULT_FALLBACK;
> + goto fallback;
> *entryp = ret;
>
> ptl = pmd_lock(vma->vm_mm, pmd);
> if (!pmd_none(*pmd)) {
> spin_unlock(ptl);
> - return VM_FAULT_FALLBACK;
> + goto fallback;
> }
>
> pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
> pmd_entry = pmd_mkhuge(pmd_entry);
> set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
> spin_unlock(ptl);
> + trace_dax_pmd_load_hole(vma, address, zero_page, ret);
> return VM_FAULT_NOPAGE;
> +
> +fallback:
> + trace_dax_pmd_load_hole_fallback(vma, address, zero_page, ret);
> + return VM_FAULT_FALLBACK;
> }
>
> int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
> index f9ed4eb..8814b1a 100644
> --- a/include/trace/events/fs_dax.h
> +++ b/include/trace/events/fs_dax.h
> @@ -54,6 +54,38 @@ DEFINE_EVENT(dax_pmd_fault_class, name, \
> DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
> DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
>
> +DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
> + TP_PROTO(struct vm_area_struct *vma, unsigned long address,
> + struct page *zero_page, void *radix_entry),
> + TP_ARGS(vma, address, zero_page, radix_entry),
> + TP_STRUCT__entry(
> + __field(unsigned long, vm_flags)
> + __field(unsigned long, address)
> + __field(struct page *, zero_page)
> + __field(void *, radix_entry)
> + ),
> + TP_fast_assign(
> + __entry->vm_flags = vma->vm_flags;
> + __entry->address = address;
> + __entry->zero_page = zero_page;
> + __entry->radix_entry = radix_entry;
> + ),
> + TP_printk("%s mapping read address %#lx zero_page %p radix_entry %#lx",
> + __entry->vm_flags & VM_SHARED ? "shared" : "private",
> + __entry->address,
> + __entry->zero_page,
> + (unsigned long)__entry->radix_entry
> + )
> +)
> +
> +#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
> +DEFINE_EVENT(dax_pmd_load_hole_class, name, \
> + TP_PROTO(struct vm_area_struct *vma, unsigned long address, \
> + struct page *zero_page, void *radix_entry), \
> + TP_ARGS(vma, address, zero_page, radix_entry))
> +
> +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
> +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
>
> #endif /* _TRACE_FS_DAX_H */
>
> --
> 2.7.4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
> Tracepoints are the standard way to capture debugging and tracing
> information in many parts of the kernel, including the XFS and ext4
> filesystems. Create a tracepoint header for FS DAX and add the first DAX
> tracepoints to the PMD fault handler. This allows the tracing for DAX to
> be done in the same way as the filesystem tracing so that developers can
> look at them together and get a coherent idea of what the system is doing.
It also has one hell of potential for becoming a massive nuisance.
Keep in mind that if any userland code becomes dependent on those - that's it,
they have become parts of stable userland ABI and are to be maintained
indefinitely. Don't expect "tracepoints are special case" to prevent that.
So treat anything you add in that manner as potential stable ABI
you might have to keep around forever. It's *not* a glorified debugging
printk.
On Thu, Nov 24, 2016 at 1:11 AM, Jan Kara <[email protected]> wrote:
> On Wed 23-11-16 11:44:18, Ross Zwisler wrote:
>> No functional change.
>>
>> As of this commit:
>>
>> commit 218dd85887da (".gitattributes: set git diff driver for C source code
>> files")
>>
>> git-diff and git-format-patch both generate diffs whose hunks are correctly
>> prefixed by function names instead of labels, even if those labels aren't
>> indented with spaces.
>
> Fine by me. I just have some 4 remaining DAX patches (will send them out
> today) and they will clash with this. So I'd prefer if this happened after
> they are merged...
Let's just leave them alone, it's not like this thrash buys us
anything at this point. We can just stop including spaces in new
code.
On Thu, Nov 24, 2016 at 05:32:20PM +0000, Al Viro wrote:
> On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
> > Tracepoints are the standard way to capture debugging and tracing
> > information in many parts of the kernel, including the XFS and ext4
> > filesystems. Create a tracepoint header for FS DAX and add the first DAX
> > tracepoints to the PMD fault handler. This allows the tracing for DAX to
> > be done in the same way as the filesystem tracing so that developers can
> > look at them together and get a coherent idea of what the system is doing.
>
> It also has one hell of potential for becoming a massive nuisance.
> Keep in mind that if any userland code becomes dependent on those - that's it,
> they have become parts of stable userland ABI and are to be maintained
> indefinitely. Don't expect "tracepoints are special case" to prevent that.
I call bullshit just like I always do when someone spouts this
"tracepoints are stable ABI" garbage.
If we want to provide stable tracepoints, then we need to /create a
stable tracepoint API/ and convert all the tracepoints that /need
to be stable/ to use it. Then developers only need to be careful
about modifying code around the /explicitly stable/ tracepoints and
we avoid retrospectively locking the kernel implementation into a
KABI so tight we can't do anything anymore....
Quite frankly, anyone that wants to stop us from
adding/removing/changing tracepoints or the code that they are
reporting information about "because ABI" can go take a long walk
off a short cliff. Diagnostic tracepoints are not part of the
stable ABI. End of story.
> So treat anything you add in that manner as potential stable ABI
> you might have to keep around forever. It's *not* a glorified debugging
> printk.
trace_printk() is the glorified debugging printk for tracing, not
trace events.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
> Tracepoints are the standard way to capture debugging and tracing
> information in many parts of the kernel, including the XFS and ext4
> filesystems. Create a tracepoint header for FS DAX and add the first DAX
> tracepoints to the PMD fault handler. This allows the tracing for DAX to
> be done in the same way as the filesystem tracing so that developers can
> look at them together and get a coherent idea of what the system is doing.
>
> I added both an entry and exit tracepoint because future patches will add
> tracepoints to child functions of dax_iomap_pmd_fault() like
> dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
> be wrapped by the parent function tracepoints so the code flow is more
> easily understood. Having entry and exit tracepoints for faults also
> allows us to easily see what filesystems functions were called during the
> fault. These filesystem functions get executed via iomap_begin() and
> iomap_end() calls, for example, and will have their own tracepoints.
>
> For PMD faults we primarily want to understand the faulting address and
> whether it fell back to 4k faults. If it fell back to 4k faults the
> tracepoints should let us understand why.
>
> I named the new tracepoint header file "fs_dax.h" to allow for device DAX
> to have its own separate tracing header in the same directory at some
> point.
>
> Here is an example output for these events from a successful PMD fault:
>
> big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400
>
> big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
> address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> max_pgoff 0x1400 NOPAGE
Can we make the output use the same format as most of the filesystem
code? i.e. the output starts with backing device + inode number like
so:
xfs_ilock: dev 8:96 ino 0x493 flags ILOCK_EXCL....
This way we can filter the output easily across both dax and
filesystem tracepoints with 'grep "ino 0x493"'...
Cheers,
Dave.
--
Dave Chinner
[email protected]
[Linus Cc'd]
On Fri, Nov 25, 2016 at 01:49:18PM +1100, Dave Chinner wrote:
> > they have become parts of stable userland ABI and are to be maintained
> > indefinitely. Don't expect "tracepoints are special case" to prevent that.
>
> I call bullshit just like I always do when someone spouts this
> "tracepoints are stable ABI" garbage.
> Quite frankly, anyone that wants to stop us from
> adding/removing/changing tracepoints or the code that they are
> reporting information about "because ABI" can go take a long walk
> off a short cliff. Diagnostic tracepoints are not part of the
> stable ABI. End of story.
Tell that to Linus. You had been in the room, IIRC, when that had been
brought up this year in Santa Fe. "End of story" is not going to be
yours (or mine, for that matter) to declare - Linus is the only one who
can do that. If he says "if userland code relies upon it, so that
userland code needs to be fixed" - I'm very happy (and everyone involved
can count upon quite a few free drinks from me at the next summit). If
it's "that userland code really shouldn't have relied upon it, and it's
real unfortunate that it does, but we still get to keep it working" -
too bad, "because ABI" is the reality and we will be the ones to take
that long walk.
What I heard from Linus sounded a lot closer to the second variant.
_IF_ I have misinterpreted that, I'd love to hear that. Linus?
PS: I'm dead serious about large amounts of booze of choice at LSFS 2017.
Bribery or shared celebration - call it whatever you like; I really,
really want to have tracepoints free from ABIfication concerns. They
certainly are useful for debugging purposes - no arguments here.
On Fri, Nov 25, 2016 at 04:14:19AM +0000, Al Viro wrote:
> [Linus Cc'd]
>
> On Fri, Nov 25, 2016 at 01:49:18PM +1100, Dave Chinner wrote:
> > > they have become parts of stable userland ABI and are to be maintained
> > > indefinitely. Don't expect "tracepoints are special case" to prevent that.
> >
> > I call bullshit just like I always do when someone spouts this
> > "tracepoints are stable ABI" garbage.
>
> > Quite frankly, anyone that wants to stop us from
> > adding/removing/changing tracepoints or the code that they are
> > reporting information about "because ABI" can go take a long walk
> > off a short cliff. Diagnostic tracepoints are not part of the
> > stable ABI. End of story.
>
> Tell that to Linus. You had been in the room, IIRC, when that had been
> brought up this year in Santa Fe.
No, I wasn't at KS or plumbers, so this is all news to me. Beleive
me, if I was in the room when this discussion was in progress, you'd
remember it /very clearly/.
> "End of story" is not going to be
> yours (or mine, for that matter) to declare - Linus is the only one who
> can do that. If he says "if userland code relies upon it, so that
> userland code needs to be fixed" - I'm very happy (and everyone involved
> can count upon quite a few free drinks from me at the next summit). If
> it's "that userland code really shouldn't have relied upon it, and it's
> real unfortunate that it does, but we still get to keep it working" -
> too bad, "because ABI" is the reality and we will be the ones to take
> that long walk.
When the tracepoint infrastructure was added it was considered a
debugging tool and not stable - it was even exposed through
/sys/kernel/debug! We connected up the ~280 /debug/ tracepoints we
had in XFS at the time with the understanding it was a /diagnostic
tool/. We exposed all sorts of internal details we'd previously been
exposing with tracing through lcrash and kdb (and Irix before that)
so we could diagnose problems quickly on a running kernel.
The scope of tracepoints may have grown since then, but it does not
change the fact that many of the tracepoints that were added years
ago were done under the understanding that it was a mutable
interface and nobody could rely on any specific tracepoint detail
remaining unchanged.
We're still treating then as mutable diagnostic and debugging aids
across the kernel. In XFS, We've now got over *500* unique trace
events and *650* tracepoints; ignoring comments, *4%* of the entire
XFS kernel code base is tracing code. We expose structure contents,
transaction states, locking algorithms, object life cycles, journal
operations, etc. All the new reverse mapping and shared data extent
code that has been merged in 4.8 and 4.9 has been extensively
exposed by tracepoints - these changes also modified a significant
number of existing tracepoints.
Put simply: every major and most minor pieces of functionality in
XFS are exposed via tracepoints.
Hence if the stable ABI tracepoint rules you've just described are
going to enforced, it will mean we will not be able to change
anything signficant in XFS because almost everything significant we
do involves changing tracepoints in some way. This leaves us with
three unacceptable choices:
1. stop developing XFS so we can maintain the stable
tracepoint ABI;
2. ignore the ABI rules and hope that Linus keeps pulling
code that obviously ignores the ABI rules; or
3. screw over our upstream/vanilla kernel users by removing
the tracepoints from Linus' tree and suck up the pain of
maintaining an out of tree patch for XFS developers and
distros so kernel tracepoint ABI rules can be ignored.
Nobody wins if these are the only choices we are being given.
I understand why there is a desire for stable tracepoints, and
that's why I suggested that there should be an in-kernel API to
declare stable tracepoints. That way we can have the best of both
worlds - tracepoints that applications need to be stable can be
declared, reviewed and explicitly marked as stable in full knowledge
of what that implies. The rest of the vast body of tracepoints can
be left as mutable with no stability or existence guarantees so that
developers can continue to treat them in a way that best suits
problem diagnosis without compromising the future development of the
code being traced. If userspace finds some of those tracepoints
useful, then they can be taken through the process of making them
into a maintainable stable form and being marked as such.
We already have distros mounting the tracing subsystem on
/sys/kernel/tracing. Expose all the stable tracepoints there, and
leave all the other tracepoints under /sys/kernel/debug/tracing.
Simple, clear separation between stable and mutable diagnostic
tracepoints for users, combined with a simple, clear in-kernel API
and process for making tracepoints stable....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Fri, Nov 25, 2016 at 06:06:42PM +1100, Dave Chinner wrote:
> > Tell that to Linus. You had been in the room, IIRC, when that had been
> > brought up this year in Santa Fe.
>
> No, I wasn't at KS or plumbers, so this is all news to me.
Sorry, thought you had been at KS ;-/ My apologies...
[snip bloody good points I fully agree with]
> I understand why there is a desire for stable tracepoints, and
> that's why I suggested that there should be an in-kernel API to
> declare stable tracepoints. That way we can have the best of both
> worlds - tracepoints that applications need to be stable can be
> declared, reviewed and explicitly marked as stable in full knowledge
> of what that implies. The rest of the vast body of tracepoints can
> be left as mutable with no stability or existence guarantees so that
> developers can continue to treat them in a way that best suits
> problem diagnosis without compromising the future development of the
> code being traced. If userspace finds some of those tracepoints
> useful, then they can be taken through the process of making them
> into a maintainable stable form and being marked as such.
My impression is that nobody (at least kernel-side) wants them to be
a stable ABI, so long as nobody in userland screams about their code
being broken, everything is fine. As usual, if nobody notices an ABI
change, it hasn't happened. The question is what happens when somebody
does.
> We already have distros mounting the tracing subsystem on
> /sys/kernel/tracing. Expose all the stable tracepoints there, and
> leave all the other tracepoints under /sys/kernel/debug/tracing.
> Simple, clear separation between stable and mutable diagnostic
> tracepoints for users, combined with a simple, clear in-kernel API
> and process for making tracepoints stable....
Yep. That kind of separation would be my preference as well - ideally,
with review for stable ones being a lot less casual that for unstable;
AFAICS what happens now is that we have no mechanisms for marking them as
stable or unstable and everything keeps going on hope that nobody will
cause a mess by creating such a userland dependency. So far it's been mostly
working, but as the set of tracepoints (and their use) gets wider and wider,
IMO it's only matter of time until we get seriously screwed that way.
Basically, we are gambling on the next one to be cast in stone by userland
dependency being sane enough to make it possible to maintain it indefinitely
and I don't like the odds.
On Thu, Nov 24, 2016 at 11:37 PM, Al Viro <[email protected]> wrote:
>
> My impression is that nobody (at least kernel-side) wants them to be
> a stable ABI, so long as nobody in userland screams about their code
> being broken, everything is fine. As usual, if nobody notices an ABI
> change, it hasn't happened. The question is what happens when somebody
> does.
Right. There is basically _no_ "stable API" for the kernel anywhere,
it's just an issue of "you can't break workflow for normal people".
And if somebody writes his own trace scripts, and some random trace
point goes away (or changes semantics), that's easy: he can just fix
his script. Tracepoints aren't ever going to be stable in that sense.
But when then somebody writes a trace script that is so useful that
distros pick it up, and people start using it and depending on it,
then _that_ trace point may well have become effectively locked in
stone.
That's happened once already with the whole powertop thing. It didn't
get all that widely spread, and the fix was largely to improve on
powertop to the point where it wasn't a problem any more, but we've
clearly had one case of this happening.
But I suspect that something like powertop is fairly unusual. There is
certainly room for similar things in the VFS layer (think "better
vmstat that uses tracepoints"), but I suspect the bulk of tracepoints
are going to be for random debugging (so that developers can say
"please run this script") rather than for an actual user application
kind of situation.
So I don't think we should be _too_ afraid of tracepoints either. When
being too anti-tracepoint is a bigger practical problem than the
possible problems down the line, the balance is wrong.
As long as tracepoints make sense from a higher standpoint (ie not
just random implementation detail of the day), and they aren't
everywhere, they are unlikely to cause much problem.
We do have filesystem code that is just disgusting. As an example:
fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
single function. If you want that, use the function tracer. That seems
to be just debugging code that has been left around for others to
stumble over. I do *not* believe that we should encourage that kind of
"machine gun spray" use of tracepoints.
But tracing actual high-level things like IO and faults? I think that
makes perfect sense, as long as the data that is collected is also the
actual event data, and not so much a random implementation issue of
the day.
Linus
> We do have filesystem code that is just disgusting. As an example:
> fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
> single function.
Hmmm... we have "gossip" statements in Orangefs which can be triggered with
a debugfs knob... lots of functions have a gossip statement at the
top... is that
disgusting?
-Mike
On Fri, Nov 25, 2016 at 2:51 PM, Linus Torvalds
<[email protected]> wrote:
> On Thu, Nov 24, 2016 at 11:37 PM, Al Viro <[email protected]> wrote:
>>
>> My impression is that nobody (at least kernel-side) wants them to be
>> a stable ABI, so long as nobody in userland screams about their code
>> being broken, everything is fine. As usual, if nobody notices an ABI
>> change, it hasn't happened. The question is what happens when somebody
>> does.
>
> Right. There is basically _no_ "stable API" for the kernel anywhere,
> it's just an issue of "you can't break workflow for normal people".
>
> And if somebody writes his own trace scripts, and some random trace
> point goes away (or changes semantics), that's easy: he can just fix
> his script. Tracepoints aren't ever going to be stable in that sense.
>
> But when then somebody writes a trace script that is so useful that
> distros pick it up, and people start using it and depending on it,
> then _that_ trace point may well have become effectively locked in
> stone.
>
> That's happened once already with the whole powertop thing. It didn't
> get all that widely spread, and the fix was largely to improve on
> powertop to the point where it wasn't a problem any more, but we've
> clearly had one case of this happening.
>
> But I suspect that something like powertop is fairly unusual. There is
> certainly room for similar things in the VFS layer (think "better
> vmstat that uses tracepoints"), but I suspect the bulk of tracepoints
> are going to be for random debugging (so that developers can say
> "please run this script") rather than for an actual user application
> kind of situation.
>
> So I don't think we should be _too_ afraid of tracepoints either. When
> being too anti-tracepoint is a bigger practical problem than the
> possible problems down the line, the balance is wrong.
>
> As long as tracepoints make sense from a higher standpoint (ie not
> just random implementation detail of the day), and they aren't
> everywhere, they are unlikely to cause much problem.
>
> We do have filesystem code that is just disgusting. As an example:
> fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
> single function. If you want that, use the function tracer. That seems
> to be just debugging code that has been left around for others to
> stumble over. I do *not* believe that we should encourage that kind of
> "machine gun spray" use of tracepoints.
>
> But tracing actual high-level things like IO and faults? I think that
> makes perfect sense, as long as the data that is collected is also the
> actual event data, and not so much a random implementation issue of
> the day.
>
> Linus
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 25, 2016 at 11:51:26AM -0800, Linus Torvalds wrote:
> We do have filesystem code that is just disgusting. As an example:
> fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
> single function. If you want that, use the function tracer. That seems
> to be just debugging code that has been left around for others to
> stumble over. I do *not* believe that we should encourage that kind of
> "machine gun spray" use of tracepoints.
There is a reason why people want to be able to do that, and that's
because kprobes doesn't give you access to the arguments and return
codes to the functions. Maybe there could be a way to do this more
easily using DWARF information and EBPF magic, perhaps? It won't help
for inlined functions, of course, but most of the functions where
people want to do this aren't generally functions which are going to
be inlined, but rather things like write_begin, writepages, which are
called via a struct ops table and so will never be inlined to begin
with.
And it *is* handy to be able to do this when you don't know ahead of
time that you might need to debug a production system that is
malfunctioning for some reason. This is the "S" in RAS (Reliability,
Availability, Serviceability). This is why it's nice if there were a
way to be clear that it is intended for debugging purposes only ---
and maybe kprobes with EBPF and DWARF would be the answer.
After all, we need *some* way of saying this can never be considered
stable --- what would we do if some userspace program like powertop
started depending on a function name via ktrace and that function
disappeared --- would the userspace application really be intended to
demand that we revert the recatoring, because eliminating a function
name that they were depending on via ktrace point broke them?
- Ted
On Fri, Nov 25, 2016 at 1:48 PM, Theodore Ts'o <[email protected]> wrote:
>
> There is a reason why people want to be able to do that, and that's
> because kprobes doesn't give you access to the arguments and return
> codes to the functions.
Honestly, that's simply not a good reason.
What if everybody did this? Do we pollute the whole kernel with this crap? No.
And if not, then what's so special about something like afs that it
would make sense there?
The thing is, with function tracing, you *can* get the return value
and arguments. Sure, you'll probably need to write eBPF and just
attach it to that fentry call point, and yes, if something is inlined
you're just screwed, but Christ, if you do debugging that way you
shouldn't be writing kernel code in the first place.
If you cannot do filesystem debugging without tracing every single
function entry, you are doing something seriously wrong. Add a couple
of relevant and valid trace points to get the initial arguments etc
(and perhaps to turn on the function tracing going down the stack).
> After all, we need *some* way of saying this can never be considered
> stable.
Oh, if you pollute the kernel with random idiotic trace points, not
only are they not going to be considered stable, after a while people
should stop pulling from you.
I do think we should probably add a few generic VFS level breakpoints
to make it easier for people to catch the arguments they get from the
VFS layer (not every system call - if you're a filesystem person, you
_shouldn't_ care about all the stuff that the VFS layer caches for you
so that you never even have to see it). I do think that Al's "no trace
points what-so-ever" is too strict.
But I think a lot of people add complete crap with the "maybe it's
needed some day" kind of mentality.
The tracepoints should have a good _specific_ reason, and they should
make sense. Not be randomly sprinkled "just because".
Linus
On Fri, Nov 25, 2016 at 11:51:26AM -0800, Linus Torvalds wrote:
> On Thu, Nov 24, 2016 at 11:37 PM, Al Viro <[email protected]> wrote:
> >
> > My impression is that nobody (at least kernel-side) wants them to be
> > a stable ABI, so long as nobody in userland screams about their code
> > being broken, everything is fine. As usual, if nobody notices an ABI
> > change, it hasn't happened. The question is what happens when somebody
> > does.
>
> Right. There is basically _no_ "stable API" for the kernel anywhere,
> it's just an issue of "you can't break workflow for normal people".
>
> And if somebody writes his own trace scripts, and some random trace
> point goes away (or changes semantics), that's easy: he can just fix
> his script. Tracepoints aren't ever going to be stable in that sense.
>
> But when then somebody writes a trace script that is so useful that
> distros pick it up, and people start using it and depending on it,
> then _that_ trace point may well have become effectively locked in
> stone.
And that's exactly why we need a method of marking tracepoints as
stable. How else are we going to know whether a specific tracepoint
is stable if the kernel code doesn't document that it's stable? And
how are we going to know why it's considered stable if there isn't a
commit message that explains why it was made stable?
> We do have filesystem code that is just disgusting. As an example:
> fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
> single function. If you want that, use the function tracer. That seems
> to be just debugging code that has been left around for others to
> stumble over. I do *not* believe that we should encourage that kind of
> "machine gun spray" use of tracepoints.
Inappropriate use of tracepoints is a different problem. The issue
here is getting rid of the uncertainty caused by the handwavy
"tracepoints a mutable until someone, somewhere decides to use it in
userspace" policy.
> But tracing actual high-level things like IO and faults? I think that
> makes perfect sense, as long as the data that is collected is also the
> actual event data, and not so much a random implementation issue of
> the day.
IME, a tracepoint that doesn't expose detailed context specific
information isn't really useful for complex problem diagnosis...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Sun, Nov 27, 2016 at 2:42 PM, Dave Chinner <[email protected]> wrote:
>
> And that's exactly why we need a method of marking tracepoints as
> stable. How else are we going to know whether a specific tracepoint
> is stable if the kernel code doesn't document that it's stable?
You are living in some unrealistic dream-world where you think you can
get the right tracepoint on the first try.
So there is no way in hell I would ever mark any tracepoint "stable"
until it has had a fair amount of use, and there are useful tools that
actually make use of it, and it has shown itself to be the right
trace-point.
And once that actually happens, what's the advantage of marking it
stable? None. It's a catch-22. Before it has uses and has been tested
and found to be good, it's not stable. And after, it's pointless.
So at no point does such a "stable" tracepoint marking make sense. At
most, you end up adding a comment saying "this tracepoint is used by
tools such-and-such".
Linus
On Sun, Nov 27, 2016 at 04:58:43PM -0800, Linus Torvalds wrote:
> You are living in some unrealistic dream-world where you think you can
> get the right tracepoint on the first try.
>
> So there is no way in hell I would ever mark any tracepoint "stable"
> until it has had a fair amount of use, and there are useful tools that
> actually make use of it, and it has shown itself to be the right
> trace-point.
>
> And once that actually happens, what's the advantage of marking it
> stable? None. It's a catch-22. Before it has uses and has been tested
> and found to be good, it's not stable. And after, it's pointless.
>
> So at no point does such a "stable" tracepoint marking make sense. At
> most, you end up adding a comment saying "this tracepoint is used by
> tools such-and-such".
I can't speak for Dave, but I suspect that it's more about "this, this and
that tracepoints are purely internal and we can and will change them whenever
we bloody feel like that; stick your fingers in those and they _will_ get
crushed".
Incidentally, take a look at
trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
filp->f_path.dentry->d_name.name,
to->nr_segs); /* GRRRRR */
Note that there is nothing whatsoever protecting the use of ->d_name in
there (not that poking in iov_iter guts was a good idea). Besides, suppose
something *did* grab a hold of that one a while ago. What would we have
to do to avoid stepping on its toes every time when somebody call ocfs2
->splice_read(), which has recently started to go through ->read_iter()
calls? Prepend something like if (!(to->type & ITER_PIPE)) to it?
I'm very tempted to just go and remove it, along with its analogues.
If nothing else, the use of ->d_name *is* racy, and while it might be
tolerable for occasional debugging, for anything in heavier use it's
asking for trouble...
On Fri 25-11-16 16:48:40, Ted Tso wrote:
> On Fri, Nov 25, 2016 at 11:51:26AM -0800, Linus Torvalds wrote:
> > We do have filesystem code that is just disgusting. As an example:
> > fs/afs/ tends to have these crazy "_enter()/_exit()" macros in every
> > single function. If you want that, use the function tracer. That seems
> > to be just debugging code that has been left around for others to
> > stumble over. I do *not* believe that we should encourage that kind of
> > "machine gun spray" use of tracepoints.
>
> There is a reason why people want to be able to do that, and that's
> because kprobes doesn't give you access to the arguments and return
> codes to the functions. Maybe there could be a way to do this more
> easily using DWARF information and EBPF magic, perhaps? It won't help
> for inlined functions, of course, but most of the functions where
> people want to do this aren't generally functions which are going to
> be inlined, but rather things like write_begin, writepages, which are
> called via a struct ops table and so will never be inlined to begin
> with.
Actually, you can print register & stack contents from a kprobe and you can
get a function return value from a kretprobe (see
Documentation/trace/kprobetrace.txt). Since calling convention is fixed
(arg 1 in RDI, arg 2 in RSI...) you can relatively easily dump function
arguments on entry and dump return value on return for arbitrary function
of your choice. I was already debugging issues like that several times (in
VFS actually because of missing trace points ;)). You can even create a
kprobe to dump register contents in the middle of the function (although
there it takes more effort reading the dissasembly to see what you are
interested in).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Sun, Nov 27, 2016 at 04:58:43PM -0800, Linus Torvalds wrote:
> On Sun, Nov 27, 2016 at 2:42 PM, Dave Chinner <[email protected]> wrote:
> >
> > And that's exactly why we need a method of marking tracepoints as
> > stable. How else are we going to know whether a specific tracepoint
> > is stable if the kernel code doesn't document that it's stable?
>
> You are living in some unrealistic dream-world where you think you can
> get the right tracepoint on the first try.
No, I'm under no illusions that we'd get stable tracepoints right
the first go. I don't care about how we stabilise stable
tracepoints, because nothing I maintain will use stable tracepoints.
However, I will point out that we have /already solved these ABI
development problems/.
$ ls Documentation/ABI/
obsolete README removed stable testing
$
Expands this to include stable tracepoints, not just sysfs files.
New stable stuff gets classified as "testing" meaning it is supposed
to be stable but may change before being declared officially stable.
"stable" is obvious, are "obsolete" and "removed".
> So there is no way in hell I would ever mark any tracepoint "stable"
> until it has had a fair amount of use, and there are useful tools that
> actually make use of it, and it has shown itself to be the right
> trace-point.
>
> And once that actually happens, what's the advantage of marking it
> stable? None. It's a catch-22. Before it has uses and has been tested
> and found to be good, it's not stable. And after, it's pointless.
And that "catch-22" is *precisely the problem we need to solve*.
Pointing out that there's a catch-22 doesn't help anyone - all the
developers that are telling you that they really need a way to mark
stable tracepoints already understand this catch-22 and they want a
way to avoid it. Being able to either say "this is stable and we'll
support it forever" or "this will never be stable so use at your own
risk" is a simple way of avoiding the catch-22. If an unstable
tracepoint is useful to applications *and it can be implemented in a
maintainable, stable form* then it can go through the process of
being made stable and documented in Documentation/ABI/stable.
Problem is solved, catch-22 is gone.
All we want is some method of making a clear, unambiguous statement
about the nature of a specific tracepoint and a process for
transitioning a tracepoint to a stable, maintainable form. We do it
for other ABI interfaces, so why can't we do this for tracepoints?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Thu, Nov 24, 2016 at 10:02:39AM +0100, Jan Kara wrote:
> On Wed 23-11-16 11:44:17, Ross Zwisler wrote:
> > With the current Kconfig setup it is possible to have the following:
> >
> > CONFIG_EXT4_FS=y
> > CONFIG_FS_DAX=y
> > CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
> >
> > With this config we get build failures in ext4_dax_fault() because the
> > iomap functions in fs/dax.c are missing:
> >
> > fs/built-in.o: In function `ext4_dax_fault':
> > file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
> > file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
> > fs/built-in.o: In function `ext4_file_read_iter':
> > file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
> > fs/built-in.o: In function `ext4_file_write_iter':
> > file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
> > file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
> > fs/built-in.o: In function `ext4_block_zero_page_range':
> > inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
> >
> > Now that the struct buffer_head based DAX fault paths and I/O path have
> > been removed we really depend on iomap support being present for DAX. Make
> > this explicit by selecting FS_IOMAP if we compile in DAX support.
> >
> > Signed-off-by: Ross Zwisler <[email protected]>
>
> I've sent the same patch to Ted yesterday and he will probably queue it on
> top of ext4 iomap patches. If it doesn't happen for some reason, feel free
> to add:
>
> Reviewed-by: Jan Kara <[email protected]>
Cool, looks like Ted has pulled in your patch.
I think we still eventually want this patch because it cleans up our handling
of FS_IOMAP. With your patch we select it separately in both ext4 & ext2
based on whether we include DAX, and we still have #ifdefs in fs/dax.c for
FS_IOMAP.
I'll pull your most recent patch into my baseline & rework this patch.
On Thu, Nov 24, 2016 at 11:42:38AM -0800, Dan Williams wrote:
> On Thu, Nov 24, 2016 at 1:11 AM, Jan Kara <[email protected]> wrote:
> > On Wed 23-11-16 11:44:18, Ross Zwisler wrote:
> >> No functional change.
> >>
> >> As of this commit:
> >>
> >> commit 218dd85887da (".gitattributes: set git diff driver for C source code
> >> files")
> >>
> >> git-diff and git-format-patch both generate diffs whose hunks are correctly
> >> prefixed by function names instead of labels, even if those labels aren't
> >> indented with spaces.
> >
> > Fine by me. I just have some 4 remaining DAX patches (will send them out
> > today) and they will clash with this. So I'd prefer if this happened after
> > they are merged...
>
> Let's just leave them alone, it's not like this thrash buys us
> anything at this point. We can just stop including spaces in new
> code.
Honestly I'm not sure which is better. I understand your argument about not
introducing "thrash" for cleanup like this, but at the same time knowingly
leaving inconsistencies in the code style just because seems...gross?
In any case, sure Jan, if this patch happens lets do it after your remaining
DAX patches.
On Fri, Nov 25, 2016 at 02:00:59PM +1100, Dave Chinner wrote:
> On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
> > Tracepoints are the standard way to capture debugging and tracing
> > information in many parts of the kernel, including the XFS and ext4
> > filesystems. Create a tracepoint header for FS DAX and add the first DAX
> > tracepoints to the PMD fault handler. This allows the tracing for DAX to
> > be done in the same way as the filesystem tracing so that developers can
> > look at them together and get a coherent idea of what the system is doing.
> >
> > I added both an entry and exit tracepoint because future patches will add
> > tracepoints to child functions of dax_iomap_pmd_fault() like
> > dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
> > be wrapped by the parent function tracepoints so the code flow is more
> > easily understood. Having entry and exit tracepoints for faults also
> > allows us to easily see what filesystems functions were called during the
> > fault. These filesystem functions get executed via iomap_begin() and
> > iomap_end() calls, for example, and will have their own tracepoints.
> >
> > For PMD faults we primarily want to understand the faulting address and
> > whether it fell back to 4k faults. If it fell back to 4k faults the
> > tracepoints should let us understand why.
> >
> > I named the new tracepoint header file "fs_dax.h" to allow for device DAX
> > to have its own separate tracing header in the same directory at some
> > point.
> >
> > Here is an example output for these events from a successful PMD fault:
> >
> > big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
> > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> > max_pgoff 0x1400
> >
> > big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
> > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> > max_pgoff 0x1400 NOPAGE
>
> Can we make the output use the same format as most of the filesystem
> code? i.e. the output starts with backing device + inode number like
> so:
>
> xfs_ilock: dev 8:96 ino 0x493 flags ILOCK_EXCL....
>
> This way we can filter the output easily across both dax and
> filesystem tracepoints with 'grep "ino 0x493"'...
I think I can include the inode number, which I have via mapping->host. Am I
correct in assuming "struct inode.i_ino" will always be the same as
"struct xfs_inode.i_ino"?
Unfortunately I don't have access to the major/minor (the dev_t) until I call
iomap_begin(). Currently we call iomap_begin() only after we've done most of
our sanity checking that would cause us to fall back to PTEs.
I don't think we want to reorder things so that we start with an iomap_begin()
- that would mean that we would begin by asking the filesystem for a block
allocation, when in many cases we would then do an alignment check or
something similar and then fall back to PTEs.
I'll add "ino" to the output so it looks something like this:
big-2057 [000] .... 136.397943: dax_pmd_fault_done: ino 0x493 shared
mapping write address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff
0x200 max_pgoff 0x1400 NOPAGE
On Mon, Nov 28, 2016 at 03:46:51PM -0700, Ross Zwisler wrote:
> On Fri, Nov 25, 2016 at 02:00:59PM +1100, Dave Chinner wrote:
> > On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
> > > Tracepoints are the standard way to capture debugging and tracing
> > > information in many parts of the kernel, including the XFS and ext4
> > > filesystems. Create a tracepoint header for FS DAX and add the first DAX
> > > tracepoints to the PMD fault handler. This allows the tracing for DAX to
> > > be done in the same way as the filesystem tracing so that developers can
> > > look at them together and get a coherent idea of what the system is doing.
> > >
> > > I added both an entry and exit tracepoint because future patches will add
> > > tracepoints to child functions of dax_iomap_pmd_fault() like
> > > dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
> > > be wrapped by the parent function tracepoints so the code flow is more
> > > easily understood. Having entry and exit tracepoints for faults also
> > > allows us to easily see what filesystems functions were called during the
> > > fault. These filesystem functions get executed via iomap_begin() and
> > > iomap_end() calls, for example, and will have their own tracepoints.
> > >
> > > For PMD faults we primarily want to understand the faulting address and
> > > whether it fell back to 4k faults. If it fell back to 4k faults the
> > > tracepoints should let us understand why.
> > >
> > > I named the new tracepoint header file "fs_dax.h" to allow for device DAX
> > > to have its own separate tracing header in the same directory at some
> > > point.
> > >
> > > Here is an example output for these events from a successful PMD fault:
> > >
> > > big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
> > > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> > > max_pgoff 0x1400
> > >
> > > big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
> > > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
> > > max_pgoff 0x1400 NOPAGE
> >
> > Can we make the output use the same format as most of the filesystem
> > code? i.e. the output starts with backing device + inode number like
> > so:
> >
> > xfs_ilock: dev 8:96 ino 0x493 flags ILOCK_EXCL....
> >
> > This way we can filter the output easily across both dax and
> > filesystem tracepoints with 'grep "ino 0x493"'...
>
> I think I can include the inode number, which I have via mapping->host. Am I
> correct in assuming "struct inode.i_ino" will always be the same as
> "struct xfs_inode.i_ino"?
Yes - just use inode.i_ino.
> Unfortunately I don't have access to the major/minor (the dev_t) until I call
> iomap_begin().
In general, filesystem tracing uses inode->sb->s_dev as the
identifier. NFS, gfs2, XFS, ext4 and others all use this.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Mon 28-11-16 12:15:04, Ross Zwisler wrote:
> On Thu, Nov 24, 2016 at 10:02:39AM +0100, Jan Kara wrote:
> > On Wed 23-11-16 11:44:17, Ross Zwisler wrote:
> > > With the current Kconfig setup it is possible to have the following:
> > >
> > > CONFIG_EXT4_FS=y
> > > CONFIG_FS_DAX=y
> > > CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
> > >
> > > With this config we get build failures in ext4_dax_fault() because the
> > > iomap functions in fs/dax.c are missing:
> > >
> > > fs/built-in.o: In function `ext4_dax_fault':
> > > file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
> > > file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
> > > fs/built-in.o: In function `ext4_file_read_iter':
> > > file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
> > > fs/built-in.o: In function `ext4_file_write_iter':
> > > file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
> > > file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
> > > fs/built-in.o: In function `ext4_block_zero_page_range':
> > > inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
> > >
> > > Now that the struct buffer_head based DAX fault paths and I/O path have
> > > been removed we really depend on iomap support being present for DAX. Make
> > > this explicit by selecting FS_IOMAP if we compile in DAX support.
> > >
> > > Signed-off-by: Ross Zwisler <[email protected]>
> >
> > I've sent the same patch to Ted yesterday and he will probably queue it on
> > top of ext4 iomap patches. If it doesn't happen for some reason, feel free
> > to add:
> >
> > Reviewed-by: Jan Kara <[email protected]>
>
> Cool, looks like Ted has pulled in your patch.
>
> I think we still eventually want this patch because it cleans up our handling
> of FS_IOMAP. With your patch we select it separately in both ext4 & ext2
> based on whether we include DAX, and we still have #ifdefs in fs/dax.c for
> FS_IOMAP.
Actually, based on Dave's request I've also sent Ted updated version which
did select FS_IOMAP in CONFIG_DAX section. However Ted didn't pull that
patch (yet?). Anyway, I don't care whose patch gets merged, I just wanted
to notify you of possible conflict.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, Nov 29, 2016 at 09:53:03AM +0100, Jan Kara wrote:
> On Mon 28-11-16 12:15:04, Ross Zwisler wrote:
> > On Thu, Nov 24, 2016 at 10:02:39AM +0100, Jan Kara wrote:
> > > On Wed 23-11-16 11:44:17, Ross Zwisler wrote:
> > > > With the current Kconfig setup it is possible to have the following:
> > > >
> > > > CONFIG_EXT4_FS=y
> > > > CONFIG_FS_DAX=y
> > > > CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
> > > >
> > > > With this config we get build failures in ext4_dax_fault() because the
> > > > iomap functions in fs/dax.c are missing:
> > > >
> > > > fs/built-in.o: In function `ext4_dax_fault':
> > > > file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
> > > > file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
> > > > fs/built-in.o: In function `ext4_file_read_iter':
> > > > file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
> > > > fs/built-in.o: In function `ext4_file_write_iter':
> > > > file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
> > > > file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
> > > > fs/built-in.o: In function `ext4_block_zero_page_range':
> > > > inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
> > > >
> > > > Now that the struct buffer_head based DAX fault paths and I/O path have
> > > > been removed we really depend on iomap support being present for DAX. Make
> > > > this explicit by selecting FS_IOMAP if we compile in DAX support.
> > > >
> > > > Signed-off-by: Ross Zwisler <[email protected]>
> > >
> > > I've sent the same patch to Ted yesterday and he will probably queue it on
> > > top of ext4 iomap patches. If it doesn't happen for some reason, feel free
> > > to add:
> > >
> > > Reviewed-by: Jan Kara <[email protected]>
> >
> > Cool, looks like Ted has pulled in your patch.
> >
> > I think we still eventually want this patch because it cleans up our handling
> > of FS_IOMAP. With your patch we select it separately in both ext4 & ext2
> > based on whether we include DAX, and we still have #ifdefs in fs/dax.c for
> > FS_IOMAP.
>
> Actually, based on Dave's request I've also sent Ted updated version which
> did select FS_IOMAP in CONFIG_DAX section. However Ted didn't pull that
> patch (yet?). Anyway, I don't care whose patch gets merged, I just wanted
> to notify you of possible conflict.
Can you please CC me on these patches in the future? I also don't care whose
patches end up fixing this, but I want to make sure we end up in a world where
the "select FS_IOMAP" just happens directly for FS_DAX in fs/Kconfig so that
I can get rid of the unnecessary #ifdefs in fs/dax.c for CONFIG_FS_IOMAP.
On Wed 30-11-16 12:04:31, Ross Zwisler wrote:
> On Tue, Nov 29, 2016 at 09:53:03AM +0100, Jan Kara wrote:
> > On Mon 28-11-16 12:15:04, Ross Zwisler wrote:
> > > On Thu, Nov 24, 2016 at 10:02:39AM +0100, Jan Kara wrote:
> > > > On Wed 23-11-16 11:44:17, Ross Zwisler wrote:
> > > > > With the current Kconfig setup it is possible to have the following:
> > > > >
> > > > > CONFIG_EXT4_FS=y
> > > > > CONFIG_FS_DAX=y
> > > > > CONFIG_FS_IOMAP=n # this is in fs/Kconfig & isn't user accessible
> > > > >
> > > > > With this config we get build failures in ext4_dax_fault() because the
> > > > > iomap functions in fs/dax.c are missing:
> > > > >
> > > > > fs/built-in.o: In function `ext4_dax_fault':
> > > > > file.c:(.text+0x7f3ac): undefined reference to `dax_iomap_fault'
> > > > > file.c:(.text+0x7f404): undefined reference to `dax_iomap_fault'
> > > > > fs/built-in.o: In function `ext4_file_read_iter':
> > > > > file.c:(.text+0x7fc54): undefined reference to `dax_iomap_rw'
> > > > > fs/built-in.o: In function `ext4_file_write_iter':
> > > > > file.c:(.text+0x7fe9a): undefined reference to `dax_iomap_rw'
> > > > > file.c:(.text+0x7feed): undefined reference to `dax_iomap_rw'
> > > > > fs/built-in.o: In function `ext4_block_zero_page_range':
> > > > > inode.c:(.text+0x85c0d): undefined reference to `iomap_zero_range'
> > > > >
> > > > > Now that the struct buffer_head based DAX fault paths and I/O path have
> > > > > been removed we really depend on iomap support being present for DAX. Make
> > > > > this explicit by selecting FS_IOMAP if we compile in DAX support.
> > > > >
> > > > > Signed-off-by: Ross Zwisler <[email protected]>
> > > >
> > > > I've sent the same patch to Ted yesterday and he will probably queue it on
> > > > top of ext4 iomap patches. If it doesn't happen for some reason, feel free
> > > > to add:
> > > >
> > > > Reviewed-by: Jan Kara <[email protected]>
> > >
> > > Cool, looks like Ted has pulled in your patch.
> > >
> > > I think we still eventually want this patch because it cleans up our handling
> > > of FS_IOMAP. With your patch we select it separately in both ext4 & ext2
> > > based on whether we include DAX, and we still have #ifdefs in fs/dax.c for
> > > FS_IOMAP.
> >
> > Actually, based on Dave's request I've also sent Ted updated version which
> > did select FS_IOMAP in CONFIG_DAX section. However Ted didn't pull that
> > patch (yet?). Anyway, I don't care whose patch gets merged, I just wanted
> > to notify you of possible conflict.
>
> Can you please CC me on these patches in the future? I also don't care whose
> patches end up fixing this, but I want to make sure we end up in a world where
> the "select FS_IOMAP" just happens directly for FS_DAX in fs/Kconfig so that
> I can get rid of the unnecessary #ifdefs in fs/dax.c for CONFIG_FS_IOMAP.
Sure, will do.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
This is a reply to a thread from back in Nov 2016...
Linus> The thing is, with function tracing, you *can* get the return value
Linus> and arguments. Sure, you'll probably need to write eBPF and just
Linus> attach it to that fentry call point, and yes, if something is inlined
Linus> you're just screwed, but Christ, if you do debugging that way you
Linus> shouldn't be writing kernel code in the first place.
I've been thinking about this thread ever since then... lately I've been
reading about eBPF and looking at Brendan Gregg's bcc tool, and
reading about kprobes and I made some test tracepoints too...
Orangefs has a function:
orangefs_create(struct inode *dir,
struct dentry *dentry,
umode_t mode,
bool exclusive)
I found it was easy to use a kprobe to get a function's return code:
echo 'r:orangefs_create_retprobe orangefs_create $retval' >>
/sys/kernel/debug/tracing/kprobe_events
It was also easy to use a kprobe to print the arguments to a function,
but since arguments are often pointers, maybe the pointer value isn't
really what you want to see, rather you might wish you could "see into"
a structure that you have a pointer to...
Here's a kprobe for looking at the arguments to a function, the funny
di, si, dx, stack
stuff has to do with a particular architecture's parameter passing mechanism as
it relates to registers and the stack... I mention that because the
example in kprobetrace.txt
seems to be for a 32-bit computer and I'm (like most of us?) on a
64-bit computer.
echo 'p:orangefs_create_probe orangefs_create dir=%di dentry=%si
mode=%dx exclusive=+4($stack)' >
/sys/kernel/debug/tracing/kprobe_events
I wrote a bcc program to extract the name of an object from the dentry struct
passed into orangefs_create. The bcc program is kind of verbose, but is
mostly "templatey", here it is... is this the kind of thing Linus was
talking about?
Either way, I think it is kind of cool <g>...
#!/usr/bin/python
#
# Brendan Gregg's mdflush used as a template.
#
from __future__ import print_function
from bcc import BPF
from time import strftime
import ctypes as ct
# load BPF program
b = BPF(text="""
#include <uapi/linux/limits.h>
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/genhd.h>
#include <linux/dcache.h>
struct data_t {
u64 pid;
char comm[TASK_COMM_LEN];
char objname[NAME_MAX];
};
BPF_PERF_OUTPUT(events);
int kprobe__orangefs_create(struct pt_regs *ctx,
void *dir,
struct dentry *dentry)
{
struct data_t data = {};
u32 pid = bpf_get_current_pid_tgid();
data.pid = pid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
bpf_probe_read(&data.objname,
sizeof(data.objname),
(void *)dentry->d_name.name);
events.perf_submit(ctx, &data, sizeof(data));
return 0;
}
""")
# event data
TASK_COMM_LEN = 16 # linux/sched.h
NAME_MAX = 255 # uapi/linux/limits.h
class Data(ct.Structure):
_fields_ = [
("pid", ct.c_ulonglong),
("comm", ct.c_char * TASK_COMM_LEN),
("objname", ct.c_char * NAME_MAX)
]
# header
print("orangefs creates... Hit Ctrl-C to end.")
print("%-8s %-6s %-16s %s" % ("TIME", "PID", "COMM", "OBJNAME"))
# process event
# print_event is the callback invoked from open_perf_buffers...
# cpu refers to a set of per-cpu ring buffers that will receive the event data.
def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
print("%-8s %-6d %-16s %s" % (strftime("%H:%M:%S"), event.pid, event.comm,
event.objname))
# read events
b["events"].open_perf_buffer(print_event)
while 1:
b.kprobe_poll()
-Mike
On Mon, Nov 28, 2016 at 9:02 PM, Dave Chinner <[email protected]> wrote:
> On Mon, Nov 28, 2016 at 03:46:51PM -0700, Ross Zwisler wrote:
>> On Fri, Nov 25, 2016 at 02:00:59PM +1100, Dave Chinner wrote:
>> > On Wed, Nov 23, 2016 at 11:44:19AM -0700, Ross Zwisler wrote:
>> > > Tracepoints are the standard way to capture debugging and tracing
>> > > information in many parts of the kernel, including the XFS and ext4
>> > > filesystems. Create a tracepoint header for FS DAX and add the first DAX
>> > > tracepoints to the PMD fault handler. This allows the tracing for DAX to
>> > > be done in the same way as the filesystem tracing so that developers can
>> > > look at them together and get a coherent idea of what the system is doing.
>> > >
>> > > I added both an entry and exit tracepoint because future patches will add
>> > > tracepoints to child functions of dax_iomap_pmd_fault() like
>> > > dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to
>> > > be wrapped by the parent function tracepoints so the code flow is more
>> > > easily understood. Having entry and exit tracepoints for faults also
>> > > allows us to easily see what filesystems functions were called during the
>> > > fault. These filesystem functions get executed via iomap_begin() and
>> > > iomap_end() calls, for example, and will have their own tracepoints.
>> > >
>> > > For PMD faults we primarily want to understand the faulting address and
>> > > whether it fell back to 4k faults. If it fell back to 4k faults the
>> > > tracepoints should let us understand why.
>> > >
>> > > I named the new tracepoint header file "fs_dax.h" to allow for device DAX
>> > > to have its own separate tracing header in the same directory at some
>> > > point.
>> > >
>> > > Here is an example output for these events from a successful PMD fault:
>> > >
>> > > big-2057 [000] .... 136.396855: dax_pmd_fault: shared mapping write
>> > > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
>> > > max_pgoff 0x1400
>> > >
>> > > big-2057 [000] .... 136.397943: dax_pmd_fault_done: shared mapping write
>> > > address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200
>> > > max_pgoff 0x1400 NOPAGE
>> >
>> > Can we make the output use the same format as most of the filesystem
>> > code? i.e. the output starts with backing device + inode number like
>> > so:
>> >
>> > xfs_ilock: dev 8:96 ino 0x493 flags ILOCK_EXCL....
>> >
>> > This way we can filter the output easily across both dax and
>> > filesystem tracepoints with 'grep "ino 0x493"'...
>>
>> I think I can include the inode number, which I have via mapping->host. Am I
>> correct in assuming "struct inode.i_ino" will always be the same as
>> "struct xfs_inode.i_ino"?
>
> Yes - just use inode.i_ino.
>
>> Unfortunately I don't have access to the major/minor (the dev_t) until I call
>> iomap_begin().
>
> In general, filesystem tracing uses inode->sb->s_dev as the
> identifier. NFS, gfs2, XFS, ext4 and others all use this.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html