From: Andrey Vagin
To: linux-kernel@vger.kernel.org
Cc: Andrey Vagin, Oleg Nesterov, Andrew Morton, Cyrill Gorcunov,
	Pavel Emelyanov, Roger Luethi, Arnd Bergmann,
	Arnaldo Carvalho de Melo, David Ahern, Andy Lutomirski,
	Pavel Odintsov
Subject: [PATCH 06/15] task_diag: add a new group to get tasks memory mappings (v2)
Date: Mon, 11 Apr 2016 16:35:46 -0700
Message-Id: <1460417755-18201-7-git-send-email-avagin@openvz.org>
X-Mailer: git-send-email 2.5.5
In-Reply-To: <1460417755-18201-1-git-send-email-avagin@openvz.org>
References: <1460417755-18201-1-git-send-email-avagin@openvz.org>

v2: Fixes from David Ahern
* Fix 8-byte alignment
* Change implementation of the DIAG_VMA attribute:
  This patch puts the filename into the task_diag_vma struct and converts
  the TASK_DIAG_VMA attribute into a series of task_diag_vma records.
  Now there is a single TASK_DIAG_VMA attribute that is parsed as:

	| struct task_diag_vma | filename | ...
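
For illustration, a userspace reader could walk the records packed into one
TASK_DIAG_VMA attribute with the helpers added to the uapi header below.
This is only a sketch, not part of the patch itself: the function name
show_vmas() and the "attr" argument (the already-parsed TASK_DIAG_VMA
netlink attribute) are assumptions, and nla_data()/nla_len() are the usual
netlink attribute helpers.

	/* Sketch: print every mapping carried by one TASK_DIAG_VMA attribute. */
	static void show_vmas(const struct nlattr *attr)
	{
		struct task_diag_vma *vma;

		task_diag_for_each_vma(vma, attr) {
			const char *name = task_diag_vma_name(vma);
			struct task_diag_vma_stat *stat = task_diag_vma_stat(vma);

			printf("%llx-%llx %s\n",
			       (unsigned long long)vma->start,
			       (unsigned long long)vma->end,
			       name ? name : "");

			/* stat is non-NULL only when TASK_DIAG_SHOW_VMA_STAT
			 * was requested; resident is accounted in bytes. */
			if (stat)
				printf("\trss: %llu kB\n",
				       (unsigned long long)stat->resident >> 10);
		}
	}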

Cc: David Ahern
Signed-off-by: Andrey Vagin
---
 fs/proc/internal.h             |  21 ++++
 fs/proc/task_diag.c            | 279 ++++++++++++++++++++++++++++++++++++++++-
 fs/proc/task_mmu.c             |  18 +--
 include/uapi/linux/task_diag.h |  85 +++++++++++++
 4 files changed, 385 insertions(+), 18 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2a2b1e6..75b57a3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -316,3 +316,24 @@ task_next_child(struct task_struct *parent, struct task_struct *prev, unsigned i
 struct task_struct *task_first_tid(struct pid *pid, int tid, loff_t f_pos,
 			struct pid_namespace *ns);
 struct task_struct *task_next_tid(struct task_struct *start);
+
+struct mem_size_stats {
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+	unsigned long referenced;
+	unsigned long anonymous;
+	unsigned long anonymous_thp;
+	unsigned long swap;
+	unsigned long shared_hugetlb;
+	unsigned long private_hugetlb;
+	u64 pss;
+	u64 swap_pss;
+	bool check_shmem_swap;
+};
+
+struct mm_walk;
+int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+		struct mm_walk *walk);
diff --git a/fs/proc/task_diag.c b/fs/proc/task_diag.c
index fc31771..9c1ed45 100644
--- a/fs/proc/task_diag.c
+++ b/fs/proc/task_diag.c
@@ -7,6 +7,8 @@
 #include 
 #include 
 
+#include "internal.h"
+
 struct task_diag_cb {
 	struct sk_buff *req;
 	struct sk_buff *resp;
@@ -14,6 +16,11 @@ struct task_diag_cb {
 	pid_t pid;
 	int pos;
 	int attr;
+	union { /* per-attribute */
+		struct {
+			unsigned long mark;
+		} vma;
+	};
 };
 
 /*
@@ -122,6 +129,267 @@ static int fill_creds(struct task_struct *p, struct sk_buff *skb,
 	return 0;
 }
 
+static u64 get_vma_flags(struct vm_area_struct *vma)
+{
+	u64 flags = 0;
+
+	static const u64 mnemonics[BITS_PER_LONG] = {
+		/*
+		 * In case if we meet a flag we don't know about.
+		 */
+		[0 ... (BITS_PER_LONG-1)] = 0,
+
+		[ilog2(VM_READ)] = TASK_DIAG_VMA_F_READ,
+		[ilog2(VM_WRITE)] = TASK_DIAG_VMA_F_WRITE,
+		[ilog2(VM_EXEC)] = TASK_DIAG_VMA_F_EXEC,
+		[ilog2(VM_SHARED)] = TASK_DIAG_VMA_F_SHARED,
+		[ilog2(VM_MAYREAD)] = TASK_DIAG_VMA_F_MAYREAD,
+		[ilog2(VM_MAYWRITE)] = TASK_DIAG_VMA_F_MAYWRITE,
+		[ilog2(VM_MAYEXEC)] = TASK_DIAG_VMA_F_MAYEXEC,
+		[ilog2(VM_MAYSHARE)] = TASK_DIAG_VMA_F_MAYSHARE,
+		[ilog2(VM_GROWSDOWN)] = TASK_DIAG_VMA_F_GROWSDOWN,
+		[ilog2(VM_PFNMAP)] = TASK_DIAG_VMA_F_PFNMAP,
+		[ilog2(VM_DENYWRITE)] = TASK_DIAG_VMA_F_DENYWRITE,
+#ifdef CONFIG_X86_INTEL_MPX
+		[ilog2(VM_MPX)] = TASK_DIAG_VMA_F_MPX,
+#endif
+		[ilog2(VM_LOCKED)] = TASK_DIAG_VMA_F_LOCKED,
+		[ilog2(VM_IO)] = TASK_DIAG_VMA_F_IO,
+		[ilog2(VM_SEQ_READ)] = TASK_DIAG_VMA_F_SEQ_READ,
+		[ilog2(VM_RAND_READ)] = TASK_DIAG_VMA_F_RAND_READ,
+		[ilog2(VM_DONTCOPY)] = TASK_DIAG_VMA_F_DONTCOPY,
+		[ilog2(VM_DONTEXPAND)] = TASK_DIAG_VMA_F_DONTEXPAND,
+		[ilog2(VM_ACCOUNT)] = TASK_DIAG_VMA_F_ACCOUNT,
+		[ilog2(VM_NORESERVE)] = TASK_DIAG_VMA_F_NORESERVE,
+		[ilog2(VM_HUGETLB)] = TASK_DIAG_VMA_F_HUGETLB,
+		[ilog2(VM_ARCH_1)] = TASK_DIAG_VMA_F_ARCH_1,
+		[ilog2(VM_DONTDUMP)] = TASK_DIAG_VMA_F_DONTDUMP,
+#ifdef CONFIG_MEM_SOFT_DIRTY
+		[ilog2(VM_SOFTDIRTY)] = TASK_DIAG_VMA_F_SOFTDIRTY,
+#endif
+		[ilog2(VM_MIXEDMAP)] = TASK_DIAG_VMA_F_MIXEDMAP,
+		[ilog2(VM_HUGEPAGE)] = TASK_DIAG_VMA_F_HUGEPAGE,
+		[ilog2(VM_NOHUGEPAGE)] = TASK_DIAG_VMA_F_NOHUGEPAGE,
+		[ilog2(VM_MERGEABLE)] = TASK_DIAG_VMA_F_MERGEABLE,
+	};
+	size_t i;
+
+	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (vma->vm_flags & (1UL << i))
+			flags |= mnemonics[i];
+	}
+
+	return flags;
+}
+
+/*
+ * use a tmp variable and copy to input arg to deal with
+ * alignment issues. diag_vma contains u64 elements which
+ * means extended load operations can be used and those can
+ * require 8-byte alignment (e.g., sparc)
+ */
+static void fill_diag_vma(struct vm_area_struct *vma,
+			  struct task_diag_vma *diag_vma)
+{
+	struct task_diag_vma tmp;
+
+	/* We don't show the stack guard page in /proc/maps */
+	tmp.start = vma->vm_start;
+	if (stack_guard_page_start(vma, tmp.start))
+		tmp.start += PAGE_SIZE;
+
+	tmp.end = vma->vm_end;
+	if (stack_guard_page_end(vma, tmp.end))
+		tmp.end -= PAGE_SIZE;
+	tmp.vm_flags = get_vma_flags(vma);
+
+	if (vma->vm_file) {
+		struct inode *inode = file_inode(vma->vm_file);
+		dev_t dev;
+
+		dev = inode->i_sb->s_dev;
+		tmp.major = MAJOR(dev);
+		tmp.minor = MINOR(dev);
+		tmp.inode = inode->i_ino;
+		tmp.generation = inode->i_generation;
+		tmp.pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+	} else {
+		tmp.major = 0;
+		tmp.minor = 0;
+		tmp.inode = 0;
+		tmp.generation = 0;
+		tmp.pgoff = 0;
+	}
+
+	memcpy(diag_vma, &tmp, sizeof(*diag_vma));
+}
+
+static const char *get_vma_name(struct vm_area_struct *vma, char *page)
+{
+	const char *name = NULL;
+
+	if (vma->vm_file) {
+		name = d_path(&vma->vm_file->f_path, page, PAGE_SIZE);
+		goto out;
+	}
+
+	if (vma->vm_ops && vma->vm_ops->name) {
+		name = vma->vm_ops->name(vma);
+		if (name)
+			goto out;
+	}
+
+	name = arch_vma_name(vma);
+
+out:
+	return name;
+}
+
+static void fill_diag_vma_stat(struct vm_area_struct *vma,
+			       struct task_diag_vma_stat *stat)
+{
+	struct task_diag_vma_stat tmp;
+	struct mem_size_stats mss;
+	struct mm_walk smaps_walk = {
+		.pmd_entry = smaps_pte_range,
+		.mm = vma->vm_mm,
+		.private = &mss,
+	};
+
+	memset(&mss, 0, sizeof(mss));
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* mmap_sem is held in m_start */
+	walk_page_vma(vma, &smaps_walk);
+
+	tmp.resident = mss.resident;
+	tmp.pss = mss.pss;
+	tmp.shared_clean = mss.shared_clean;
+	tmp.private_clean = mss.private_clean;
+	tmp.private_dirty = mss.private_dirty;
+	tmp.referenced = mss.referenced;
+	tmp.anonymous = mss.anonymous;
+	tmp.anonymous_thp = mss.anonymous_thp;
+	tmp.swap = mss.swap;
+
+	memcpy(stat, &tmp, sizeof(*stat));
+}
+
+static int fill_vma(struct task_struct *p, struct sk_buff *skb,
+		    struct task_diag_cb *cb, bool *progress, u64 show_flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct nlattr *attr = NULL;
+	struct task_diag_vma *diag_vma;
+	unsigned long mark = 0;
+	char *page;
+	int i, rc = -EMSGSIZE, size;
+
+	if (cb)
+		mark = cb->vma.mark;
+
+	mm = p->mm;
+	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+		return 0;
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page) {
+		mmput(mm);
+		return -ENOMEM;
+	}
+
+	size = NLA_ALIGN(sizeof(struct task_diag_vma));
+	if (show_flags & TASK_DIAG_SHOW_VMA_STAT)
+		size += NLA_ALIGN(sizeof(struct task_diag_vma_stat));
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next, i++) {
+		unsigned char *b = skb_tail_pointer(skb);
+		const char *name;
+		void *pfile;
+
+
+		if (mark >= vma->vm_start)
+			continue;
+
+		/* setup pointer for next map */
+		if (attr == NULL) {
+			attr = nla_reserve(skb, TASK_DIAG_VMA, size);
+			if (!attr)
+				goto err;
+
+			diag_vma = nla_data(attr);
+		} else {
+			diag_vma = nla_reserve_nohdr(skb, size);
+
+			if (diag_vma == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+		}
+
+		fill_diag_vma(vma, diag_vma);
+
+		if (show_flags & TASK_DIAG_SHOW_VMA_STAT) {
+			struct task_diag_vma_stat *stat;
+
+			stat = (void *) diag_vma + NLA_ALIGN(sizeof(*diag_vma));
+
+			fill_diag_vma_stat(vma, stat);
+			diag_vma->stat_len = sizeof(struct task_diag_vma_stat);
+			diag_vma->stat_off = (void *) stat - (void *)diag_vma;
+		} else {
+			diag_vma->stat_len = 0;
+			diag_vma->stat_off = 0;
+		}
+
+		name = get_vma_name(vma, page);
+		if (IS_ERR(name)) {
+			nlmsg_trim(skb, b);
+			rc = PTR_ERR(name);
+			goto out;
+		}
+
+		if (name) {
+			diag_vma->name_len = strlen(name) + 1;
+
+			/* reserves NLA_ALIGN(len) */
+			pfile = nla_reserve_nohdr(skb, diag_vma->name_len);
+			if (pfile == NULL) {
+				nlmsg_trim(skb, b);
+				goto out;
+			}
+			diag_vma->name_off = pfile - (void *) diag_vma;
+			memcpy(pfile, name, diag_vma->name_len);
+		} else {
+			diag_vma->name_len = 0;
+			diag_vma->name_off = 0;
+		}
+
+		mark = vma->vm_start;
+
+		diag_vma->vma_len = skb_tail_pointer(skb) - (unsigned char *) diag_vma;
+
+		*progress = true;
+	}
+
+	rc = 0;
+	mark = 0;
+out:
+	if (*progress)
+		attr->nla_len = skb_tail_pointer(skb) - (unsigned char *) attr;
+
+err:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	free_page((unsigned long) page);
+	if (cb)
+		cb->vma.mark = mark;
+
+	return rc;
+}
+
 static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
 			  struct task_diag_pid *req, struct task_diag_cb *cb,
 			  struct pid_namespace *pidns,
@@ -131,6 +399,7 @@ static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
 	struct nlmsghdr *nlh;
 	struct task_diag_msg *msg;
 	int err = 0, i = 0, n = 0;
+	bool progress = false;
 	int flags = 0;
 
 	if (cb) {
@@ -163,13 +432,21 @@ static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
 		i++;
 	}
 
+	if (show_flags & TASK_DIAG_SHOW_VMA) {
+		if (i >= n)
+			err = fill_vma(tsk, skb, cb, &progress, show_flags);
+		if (err)
+			goto err;
+		i++;
+	}
+
 	nlmsg_end(skb, nlh);
 	if (cb)
 		cb->attr = 0;
 	return 0;
 err:
-	if (err == -EMSGSIZE && (i > n)) {
+	if (err == -EMSGSIZE && (i > n || progress)) {
 		if (cb)
 			cb->attr = i;
 		nlmsg_end(skb, nlh);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 229cb54..211147e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -439,22 +439,6 @@ const struct file_operations proc_tid_maps_operations = {
 #define PSS_SHIFT 12
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
-struct mem_size_stats {
-	unsigned long resident;
-	unsigned long shared_clean;
-	unsigned long shared_dirty;
-	unsigned long private_clean;
-	unsigned long private_dirty;
-	unsigned long referenced;
-	unsigned long anonymous;
-	unsigned long anonymous_thp;
-	unsigned long swap;
-	unsigned long shared_hugetlb;
-	unsigned long private_hugetlb;
-	u64 pss;
-	u64 swap_pss;
-	bool check_shmem_swap;
-};
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		bool compound, bool young, bool dirty)
@@ -586,7 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 }
 #endif
 
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h
index ea500c6..3486f2f 100644
--- a/include/uapi/linux/task_diag.h
+++ b/include/uapi/linux/task_diag.h
@@ -16,6 +16,8 @@ struct task_diag_msg {
 enum {
 	TASK_DIAG_BASE = 0,
 	TASK_DIAG_CRED,
+	TASK_DIAG_VMA,
+	TASK_DIAG_VMA_STAT,
 
 	__TASK_DIAG_ATTR_MAX
 #define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1)
@@ -23,6 +25,8 @@ enum {
 
 #define TASK_DIAG_SHOW_BASE	(1ULL << TASK_DIAG_BASE)
 #define TASK_DIAG_SHOW_CRED	(1ULL << TASK_DIAG_CRED)
+#define TASK_DIAG_SHOW_VMA	(1ULL << TASK_DIAG_VMA)
+#define TASK_DIAG_SHOW_VMA_STAT	(1ULL << TASK_DIAG_VMA_STAT)
 
 enum {
 	TASK_DIAG_RUNNING,
@@ -66,6 +70,87 @@ struct task_diag_creds {
 	__u32 sgid;
 	__u32 fsgid;
 };
+
+#define TASK_DIAG_VMA_F_READ		(1ULL << 0)
+#define TASK_DIAG_VMA_F_WRITE		(1ULL << 1)
+#define TASK_DIAG_VMA_F_EXEC		(1ULL << 2)
+#define TASK_DIAG_VMA_F_SHARED		(1ULL << 3)
+#define TASK_DIAG_VMA_F_MAYREAD		(1ULL << 4)
+#define TASK_DIAG_VMA_F_MAYWRITE	(1ULL << 5)
+#define TASK_DIAG_VMA_F_MAYEXEC		(1ULL << 6)
+#define TASK_DIAG_VMA_F_MAYSHARE	(1ULL << 7)
+#define TASK_DIAG_VMA_F_GROWSDOWN	(1ULL << 8)
+#define TASK_DIAG_VMA_F_PFNMAP		(1ULL << 9)
+#define TASK_DIAG_VMA_F_DENYWRITE	(1ULL << 10)
+#define TASK_DIAG_VMA_F_MPX		(1ULL << 11)
+#define TASK_DIAG_VMA_F_LOCKED		(1ULL << 12)
+#define TASK_DIAG_VMA_F_IO		(1ULL << 13)
+#define TASK_DIAG_VMA_F_SEQ_READ	(1ULL << 14)
+#define TASK_DIAG_VMA_F_RAND_READ	(1ULL << 15)
+#define TASK_DIAG_VMA_F_DONTCOPY	(1ULL << 16)
+#define TASK_DIAG_VMA_F_DONTEXPAND	(1ULL << 17)
+#define TASK_DIAG_VMA_F_ACCOUNT		(1ULL << 18)
+#define TASK_DIAG_VMA_F_NORESERVE	(1ULL << 19)
+#define TASK_DIAG_VMA_F_HUGETLB		(1ULL << 20)
+#define TASK_DIAG_VMA_F_ARCH_1		(1ULL << 21)
+#define TASK_DIAG_VMA_F_DONTDUMP	(1ULL << 22)
+#define TASK_DIAG_VMA_F_SOFTDIRTY	(1ULL << 23)
+#define TASK_DIAG_VMA_F_MIXEDMAP	(1ULL << 24)
+#define TASK_DIAG_VMA_F_HUGEPAGE	(1ULL << 25)
+#define TASK_DIAG_VMA_F_NOHUGEPAGE	(1ULL << 26)
+#define TASK_DIAG_VMA_F_MERGEABLE	(1ULL << 27)
+
+struct task_diag_vma_stat {
+	__u64 resident;
+	__u64 shared_clean;
+	__u64 shared_dirty;
+	__u64 private_clean;
+	__u64 private_dirty;
+	__u64 referenced;
+	__u64 anonymous;
+	__u64 anonymous_thp;
+	__u64 swap;
+	__u64 pss;
+} __attribute__((__aligned__(NLA_ALIGNTO)));
+
+/* task_diag_vma must be NLA_ALIGN'ed */
+struct task_diag_vma {
+	__u64 start, end;
+	__u64 vm_flags;
+	__u64 pgoff;
+	__u32 major;
+	__u32 minor;
+	__u64 inode;
+	__u32 generation;
+	__u16 vma_len;
+	__u16 name_off;
+	__u16 name_len;
+	__u16 stat_off;
+	__u16 stat_len;
+} __attribute__((__aligned__(NLA_ALIGNTO)));
+
+static inline char *task_diag_vma_name(struct task_diag_vma *vma)
+{
+	if (!vma->name_len)
+		return NULL;
+
+	return ((char *)vma) + vma->name_off;
+}
+
+static inline
+struct task_diag_vma_stat *task_diag_vma_stat(struct task_diag_vma *vma)
+{
+	if (!vma->stat_len)
+		return NULL;
+
+	return ((void *)vma) + vma->stat_off;
+}
+
+#define task_diag_for_each_vma(vma, attr)				\
+	for (vma = nla_data(attr);					\
+	     (void *) vma < nla_data(attr) + nla_len(attr);		\
+	     vma = (void *) vma + vma->vma_len)
+
 #define TASK_DIAG_DUMP_ALL	0
 #define TASK_DIAG_DUMP_ONE	1
-- 
2.5.5