2013-10-30 11:30:51

by Peng Tao

[permalink] [raw]
Subject: [PATCH 1/4] staging/lustre/llite: cache jobid in lu_env

We will switch to find jobid from /proc/self/environ and we need an
extra memory copy to do it, so let's cache it in lu_env.
It is then copied from lu_env to ll_inode_info upon every read/write.

Reviewed-by: Niu Yawei <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Andreas Dilger <[email protected]>
---
drivers/staging/lustre/lustre/llite/file.c | 2 ++
.../staging/lustre/lustre/llite/llite_internal.h | 22 ++++++++++++++++++++
drivers/staging/lustre/lustre/llite/llite_mmap.c | 1 +
drivers/staging/lustre/lustre/llite/vvp_io.c | 8 -------
4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
index bc534db..2fa0107 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -846,6 +846,8 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
struct cl_io *io;
ssize_t result;

+ ll_io_set_jobid(env, lli);
+
restart:
io = ccc_env_thread_io(env);
ll_io_init(io, file, iot == CIT_WRITE);
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 47e443d..3cac141 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -968,6 +968,7 @@ struct vvp_thread_info {
struct ra_io_arg vti_ria;
struct kiocb vti_kiocb;
struct ll_cl_context vti_io_ctx;
+ char vti_jobid[JOBSTATS_JOBID_SIZE];
};

static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
@@ -980,6 +981,27 @@ static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
return info;
}

+static inline char *vvp_env_jobid(const struct lu_env *env)
+{
+ return vvp_env_info(env)->vti_jobid;
+}
+
+#define LL_JOBID_NOT_FOUND 0x1
+static inline void ll_io_set_jobid(const struct lu_env *env, struct ll_inode_info *lli)
+{
+ char *jobid = vvp_env_jobid(env);
+
+ if (jobid[0] == '\0') {
+ lustre_get_jobid(jobid);
+
+ if (jobid[0] == '\0')
+ jobid[0] = LL_JOBID_NOT_FOUND;
+ }
+
+ if (jobid[0] != LL_JOBID_NOT_FOUND)
+ memcpy(lli->lli_jobid, jobid, JOBSTATS_JOBID_SIZE);
+}
+
static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
enum vvp_io_subtype type)
{
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
index caed642..f64f915 100644
--- a/drivers/staging/lustre/lustre/llite/llite_mmap.c
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -205,6 +205,7 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
* while truncate is on-going. */
inode = ccc_object_inode(io->ci_obj);
lli = ll_i2info(inode);
+ ll_io_set_jobid(env, lli);
down_read(&lli->lli_trunc_sem);

result = cl_io_loop(env, io);
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 3ff664c..efcce29 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -1117,7 +1117,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
result = 0;
if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
size_t count;
- struct ll_inode_info *lli = ll_i2info(inode);

count = io->u.ci_rw.crw_count;
/* "If nbyte is 0, read() will return 0 and have no other
@@ -1128,13 +1127,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
cio->cui_tot_count = count;
cio->cui_tot_nrsegs = 0;
}
- /* for read/write, we store the jobid in the inode, and
- * it'll be fetched by osc when building RPC.
- *
- * it's not accurate if the file is shared by different
- * jobs.
- */
- lustre_get_jobid(lli->lli_jobid);
} else if (io->ci_type == CIT_SETATTR) {
if (!cl_io_is_trunc(io))
io->ci_lockreq = CILR_MANDATORY;
--
1.7.9.5


2013-10-30 11:30:58

by Peng Tao

[permalink] [raw]
Subject: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

so that we can get rid of cfs_get_environ() that needs access_process_vm() that
is a core mm function and is not available on some architectures.

Reviewed-by: Niu Yawei <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Andreas Dilger <[email protected]>
---
drivers/staging/lustre/lustre/obdclass/class_obd.c | 119 +++++++++++++++++++-
1 file changed, 115 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
index b1024a6..99ce863 100644
--- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -96,8 +96,117 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
EXPORT_SYMBOL(obd_jobid_var);

-/* Get jobid of current process by reading the environment variable
- * stored in between the "env_start" & "env_end" of task struct.
+static char *self_environ_file = "/proc/self/environ";
+static int obd_get_environ(const char *key, char *value, int *val_len)
+{
+ struct mm_struct *mm;
+ struct path path;
+ struct file *filp = NULL;
+ int buf_len = PAGE_CACHE_SIZE;
+ int key_len = strlen(key);
+ char *buffer = NULL;
+ loff_t pos = 0;
+ int rc;
+
+ /*
+ * test mm->mmap_sem to avoid deadlock if this ever gets called from
+ * mmap code.
+ */
+ mm = get_task_mm(current);
+ if (!mm)
+ return -EINVAL;
+ if (down_read_trylock(&mm->mmap_sem) == 0) {
+ mmput(mm);
+ return -EDEADLK;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ buffer = kmalloc(buf_len, GFP_NOIO);
+ if (!buffer)
+ return -ENOMEM;
+
+ rc = kern_path(self_environ_file, LOOKUP_FOLLOW, &path);
+ if (rc)
+ goto out;
+
+ filp = dentry_open(&path, O_RDONLY, current_cred());
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ filp = NULL;
+ goto out;
+ }
+
+ /* loop reading... */
+ while (1) {
+ int scan_len, this_len;
+ char *env_start, *env_end;
+ rc = kernel_read(filp, pos, buffer, buf_len);
+ if (rc <= 0)
+ break;
+
+ pos += rc;
+ /* Parse the buffer to find out the specified key/value pair.
+ * The "key=value" entries are separated by '\0'. */
+ env_start = buffer;
+ scan_len = this_len = rc;
+ while (scan_len) {
+ char *entry;
+ int entry_len;
+
+ env_end = memscan(env_start, '\0', scan_len);
+ LASSERT(env_end >= env_start &&
+ env_end <= env_start + scan_len);
+
+ /* The last entry of this buffer cross the buffer
+ * boundary, reread it in next cycle. */
+ if (unlikely(env_end - env_start == scan_len)) {
+ /* This entry is too large to fit in buffer */
+ if (unlikely(scan_len == this_len)) {
+ rc = -EINVAL;
+ CWARN("Environment variable '%s' too long: rc = %d\n",
+ key, rc);
+ goto out;
+ }
+ pos -= scan_len;
+ break;
+ }
+
+ entry = env_start;
+ entry_len = env_end - env_start;
+
+ /* Key length + length of '=' */
+ if (entry_len > key_len + 1 &&
+ !memcmp(entry, key, key_len)) {
+ entry += key_len + 1;
+ entry_len -= key_len + 1;
+ /* The 'value' buffer passed in is too small.*/
+ if (entry_len >= *val_len)
+ GOTO(out, rc = -EOVERFLOW);
+
+ memcpy(value, entry, entry_len);
+ *val_len = entry_len;
+ rc = 0;
+ goto out;
+ }
+
+ scan_len -= (env_end - env_start + 1);
+ env_start = env_end + 1;
+ }
+ }
+ if (rc >= 0)
+ rc = -ENOENT;
+out:
+ if (filp)
+ fput(filp);
+ if (buffer)
+ kfree(buffer);
+ return rc;
+}
+
+/*
+ * Get jobid of current process by reading the environment variable
+ * from /proc/self/environ.
*
* TODO:
* It's better to cache the jobid for later use if there is any
@@ -126,14 +235,14 @@ int lustre_get_jobid(char *jobid)
return 0;
}

- rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
+ rc = obd_get_environ(obd_jobid_var, jobid, &jobid_len);
if (rc) {
if (rc == -EOVERFLOW) {
/* For the PBS_JOBID and LOADL_STEP_ID keys (which are
* variable length strings instead of just numbers), it
* might make sense to keep the unique parts for JobID,
* instead of just returning an error. That means a
- * larger temp buffer for cfs_get_environ(), then
+ * larger temp buffer for obd_get_environ(), then
* truncating the string at some separator to fit into
* the specified jobid_len. Fix later if needed. */
static bool printed;
@@ -149,6 +258,8 @@ int lustre_get_jobid(char *jobid)
"Get jobid for (%s) failed: rc = %d\n",
obd_jobid_var, rc);
}
+ } else {
+ CDEBUG(D_INFO, "Got jobid for (%s) value (%s)\n", obd_jobid_var, jobid);
}
return rc;
}
--
1.7.9.5

2013-10-30 11:31:04

by Peng Tao

[permalink] [raw]
Subject: [PATCH 4/4] staging/lustre: enable build on MIPS/XTENSA/SUPERH

The missing symbol copy_from_user_page() is not needed now.

Cc: Guenter Roeck <[email protected]>
Reviewed-by: Niu Yawei <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Andreas Dilger <[email protected]>
---
drivers/staging/lustre/lustre/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
index 2156a44..e44965b 100644
--- a/drivers/staging/lustre/lustre/Kconfig
+++ b/drivers/staging/lustre/lustre/Kconfig
@@ -1,6 +1,6 @@
config LUSTRE_FS
tristate "Lustre file system client support"
- depends on INET && m && !MIPS && !XTENSA && !SUPERH
+ depends on INET && m
select LNET
select CRYPTO
select CRYPTO_CRC32
--
1.7.9.5

2013-10-30 11:31:25

by Peng Tao

[permalink] [raw]
Subject: [PATCH 3/4] staging/lustre: remove cfs_get_environ and cfs_access_process_vm

Reviewed-by: Niu Yawei <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Andreas Dilger <[email protected]>
---
.../staging/lustre/include/linux/libcfs/curproc.h | 1 -
.../lustre/lustre/libcfs/linux/linux-curproc.c | 150 --------------------
2 files changed, 151 deletions(-)

diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
index de8e35b..1c49ef4 100644
--- a/drivers/staging/lustre/include/linux/libcfs/curproc.h
+++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h
@@ -64,7 +64,6 @@ int cfs_curproc_groups_nr(void);
int current_is_32bit(void);
#define current_pid() (current->pid)
#define current_comm() (current->comm)
-int cfs_get_environ(const char *key, char *value, int *val_len);

typedef __u32 cfs_cap_t;

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
index ea9e949..007f197 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -152,156 +152,6 @@ int current_is_32bit(void)
return is_compat_task();
}

-static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
- void *buf, int len, int write)
-{
- /* Just copied from kernel for the kernels which doesn't
- * have access_process_vm() exported */
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- struct page *page;
- void *old_buf = buf;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
- down_read(&mm->mmap_sem);
- /* ignore errors, just check how much was sucessfully transfered */
- while (len) {
- int bytes, rc, offset;
- void *maddr;
-
- rc = get_user_pages(tsk, mm, addr, 1,
- write, 1, &page, &vma);
- if (rc <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
- } else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
- }
- kunmap(page);
- page_cache_release(page);
- len -= bytes;
- buf += bytes;
- addr += bytes;
- }
- up_read(&mm->mmap_sem);
- mmput(mm);
-
- return buf - old_buf;
-}
-
-/* Read the environment variable of current process specified by @key. */
-int cfs_get_environ(const char *key, char *value, int *val_len)
-{
- struct mm_struct *mm;
- char *buffer, *tmp_buf = NULL;
- int buf_len = PAGE_CACHE_SIZE;
- int key_len = strlen(key);
- unsigned long addr;
- int rc;
-
- buffer = kmalloc(buf_len, GFP_USER);
- if (!buffer)
- return -ENOMEM;
-
- mm = get_task_mm(current);
- if (!mm) {
- kfree(buffer);
- return -EINVAL;
- }
-
- /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
- * which is already holding mmap_sem for writes. If some other
- * thread gets the write lock in the meantime, this thread will
- * block, but at least it won't deadlock on itself. LU-1735 */
- if (down_read_trylock(&mm->mmap_sem) == 0)
- return -EDEADLK;
- up_read(&mm->mmap_sem);
-
- addr = mm->env_start;
- while (addr < mm->env_end) {
- int this_len, retval, scan_len;
- char *env_start, *env_end;
-
- memset(buffer, 0, buf_len);
-
- this_len = min_t(int, mm->env_end - addr, buf_len);
- retval = cfs_access_process_vm(current, addr, buffer,
- this_len, 0);
- if (retval != this_len)
- break;
-
- addr += retval;
-
- /* Parse the buffer to find out the specified key/value pair.
- * The "key=value" entries are separated by '\0'. */
- env_start = buffer;
- scan_len = this_len;
- while (scan_len) {
- char *entry;
- int entry_len;
-
- env_end = memscan(env_start, '\0', scan_len);
- LASSERT(env_end >= env_start &&
- env_end <= env_start + scan_len);
-
- /* The last entry of this buffer cross the buffer
- * boundary, reread it in next cycle. */
- if (unlikely(env_end - env_start == scan_len)) {
- /* This entry is too large to fit in buffer */
- if (unlikely(scan_len == this_len)) {
- CERROR("Too long env variable.\n");
- GOTO(out, rc = -EINVAL);
- }
- addr -= scan_len;
- break;
- }
-
- entry = env_start;
- entry_len = env_end - env_start;
-
- /* Key length + length of '=' */
- if (entry_len > key_len + 1 &&
- !memcmp(entry, key, key_len)) {
- entry += key_len + 1;
- entry_len -= key_len + 1;
- /* The 'value' buffer passed in is too small.*/
- if (entry_len >= *val_len)
- GOTO(out, rc = -EOVERFLOW);
-
- memcpy(value, entry, entry_len);
- *val_len = entry_len;
- GOTO(out, rc = 0);
- }
-
- scan_len -= (env_end - env_start + 1);
- env_start = env_end + 1;
- }
- }
- GOTO(out, rc = -ENOENT);
-
-out:
- mmput(mm);
- kfree((void *)buffer);
- if (tmp_buf)
- kfree((void *)tmp_buf);
- return rc;
-}
-EXPORT_SYMBOL(cfs_get_environ);
-
EXPORT_SYMBOL(cfs_curproc_groups_nr);
EXPORT_SYMBOL(cfs_cap_raise);
EXPORT_SYMBOL(cfs_cap_lower);
--
1.7.9.5

2013-10-30 13:17:44

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 1/4] staging/lustre/llite: cache jobid in lu_env

On Wed, Oct 30, 2013 at 07:30:33PM +0800, Peng Tao wrote:
> We will switch to find jobid from /proc/self/environ and we need an
> extra memory copy to do it, so let's cache it in lu_env.
> It is then copied from lu_env to ll_inode_info upon every read/write.

What does this mean?

And you better not be reading proc values from within the kernel...

greg k-h

2013-10-30 13:19:04

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

On Wed, Oct 30, 2013 at 07:30:34PM +0800, Peng Tao wrote:
> so that we can get rid of cfs_get_environ() that needs access_process_vm() that
> is a core mm function and is not available on some architectures.
>
> Reviewed-by: Niu Yawei <[email protected]>
> Signed-off-by: Peng Tao <[email protected]>
> Signed-off-by: Andreas Dilger <[email protected]>
> ---
> drivers/staging/lustre/lustre/obdclass/class_obd.c | 119 +++++++++++++++++++-
> 1 file changed, 115 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
> index b1024a6..99ce863 100644
> --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
> +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
> @@ -96,8 +96,117 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
> char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
> EXPORT_SYMBOL(obd_jobid_var);
>
> -/* Get jobid of current process by reading the environment variable
> - * stored in between the "env_start" & "env_end" of task struct.
> +static char *self_environ_file = "/proc/self/environ";

Heh, no, that's not ok at all.

This is a _huge_ sign that you are doing something wrong in your driver
if you need something that isn't exported, or that you have to dig out
of proc.

Sorry, I can't take this, please fix the underlying problems that would
even think that you need access to the environment from within a kernel
driver.

greg k-h

2013-10-30 13:31:47

by Peng Tao

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

On Wed, Oct 30, 2013 at 9:21 PM, Greg Kroah-Hartman
<[email protected]> wrote:
> On Wed, Oct 30, 2013 at 07:30:34PM +0800, Peng Tao wrote:
>> so that we can get rid of cfs_get_environ() that needs access_process_vm() that
>> is a core mm function and is not available on some architectures.
>>
>> Reviewed-by: Niu Yawei <[email protected]>
>> Signed-off-by: Peng Tao <[email protected]>
>> Signed-off-by: Andreas Dilger <[email protected]>
>> ---
>> drivers/staging/lustre/lustre/obdclass/class_obd.c | 119 +++++++++++++++++++-
>> 1 file changed, 115 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> index b1024a6..99ce863 100644
>> --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> @@ -96,8 +96,117 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
>> char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
>> EXPORT_SYMBOL(obd_jobid_var);
>>
>> -/* Get jobid of current process by reading the environment variable
>> - * stored in between the "env_start" & "env_end" of task struct.
>> +static char *self_environ_file = "/proc/self/environ";
>
> Heh, no, that's not ok at all.
>
> This is a _huge_ sign that you are doing something wrong in your driver
> if you need something that isn't exported, or that you have to dig out
> of proc.
>
Lustre has a functionality that lets applications specify a global
jobid by setting the same environment variable across the cluster and
thus server can trace IO that is issued by different job. Currently it
is implemented by having a private version of cfs_access_process_vm
because access_process_vm is not exported. The patch intends to change
it to reading from proc. If we cannot dig it out from proc either, how
can we keep the functionality then? Please advise.

Thanks,
Tao

> Sorry, I can't take this, please fix the underlying problems that would
> even think that you need access to the environment from within a kernel
> driver.
>
> greg k-h

2013-10-30 14:18:19

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

On Wed, Oct 30, 2013 at 09:31:24PM +0800, Peng Tao wrote:
> On Wed, Oct 30, 2013 at 9:21 PM, Greg Kroah-Hartman
> <[email protected]> wrote:
> > On Wed, Oct 30, 2013 at 07:30:34PM +0800, Peng Tao wrote:
> >> so that we can get rid of cfs_get_environ() that needs access_process_vm() that
> >> is a core mm function and is not available on some architectures.
> >>
> >> Reviewed-by: Niu Yawei <[email protected]>
> >> Signed-off-by: Peng Tao <[email protected]>
> >> Signed-off-by: Andreas Dilger <[email protected]>
> >> ---
> >> drivers/staging/lustre/lustre/obdclass/class_obd.c | 119 +++++++++++++++++++-
> >> 1 file changed, 115 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
> >> index b1024a6..99ce863 100644
> >> --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
> >> +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
> >> @@ -96,8 +96,117 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
> >> char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
> >> EXPORT_SYMBOL(obd_jobid_var);
> >>
> >> -/* Get jobid of current process by reading the environment variable
> >> - * stored in between the "env_start" & "env_end" of task struct.
> >> +static char *self_environ_file = "/proc/self/environ";
> >
> > Heh, no, that's not ok at all.
> >
> > This is a _huge_ sign that you are doing something wrong in your driver
> > if you need something that isn't exported, or that you have to dig out
> > of proc.
> >
> Lustre has a functionality that lets applications specify a global
> jobid by setting the same environment variable across the cluster and
> thus server can trace IO that is issued by different job. Currently it
> is implemented by having a private version of cfs_access_process_vm
> because access_process_vm is not exported. The patch intends to change
> it to reading from proc. If we cannot dig it out from proc either, how
> can we keep the functionality then? Please advise.

Kernel code should not care about environment variables. Just because
you created a crazy requirement, does not mean it is a valid one, nor
that I have to be the one to tell you how to implement it.

I'm never going to take code that parses proc files, you all know better
than that.

greg k-h

2013-10-30 15:15:57

by Peng Tao

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

On Wed, Oct 30, 2013 at 10:20 PM, Greg Kroah-Hartman
<[email protected]> wrote:
> On Wed, Oct 30, 2013 at 09:31:24PM +0800, Peng Tao wrote:
>> On Wed, Oct 30, 2013 at 9:21 PM, Greg Kroah-Hartman
>> <[email protected]> wrote:
>> > On Wed, Oct 30, 2013 at 07:30:34PM +0800, Peng Tao wrote:
>> >> so that we can get rid of cfs_get_environ() that needs access_process_vm() that
>> >> is a core mm function and is not available on some architectures.
>> >>
>> >> Reviewed-by: Niu Yawei <[email protected]>
>> >> Signed-off-by: Peng Tao <[email protected]>
>> >> Signed-off-by: Andreas Dilger <[email protected]>
>> >> ---
>> >> drivers/staging/lustre/lustre/obdclass/class_obd.c | 119 +++++++++++++++++++-
>> >> 1 file changed, 115 insertions(+), 4 deletions(-)
>> >>
>> >> diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> >> index b1024a6..99ce863 100644
>> >> --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> >> +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
>> >> @@ -96,8 +96,117 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
>> >> char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
>> >> EXPORT_SYMBOL(obd_jobid_var);
>> >>
>> >> -/* Get jobid of current process by reading the environment variable
>> >> - * stored in between the "env_start" & "env_end" of task struct.
>> >> +static char *self_environ_file = "/proc/self/environ";
>> >
>> > Heh, no, that's not ok at all.
>> >
>> > This is a _huge_ sign that you are doing something wrong in your driver
>> > if you need something that isn't exported, or that you have to dig out
>> > of proc.
>> >
>> Lustre has a functionality that lets applications specify a global
>> jobid by setting the same environment variable across the cluster and
>> thus server can trace IO that is issued by different job. Currently it
>> is implemented by having a private version of cfs_access_process_vm
>> because access_process_vm is not exported. The patch intends to change
>> it to reading from proc. If we cannot dig it out from proc either, how
>> can we keep the functionality then? Please advise.
>
> Kernel code should not care about environment variables. Just because
> you created a crazy requirement, does not mean it is a valid one, nor
> that I have to be the one to tell you how to implement it.
>
> I'm never going to take code that parses proc files, you all know better
> than that.
I see. I'll drop the patchset and keep the cfs_access_process_vm code
until better option is found.

Thanks,
Tao

2014-02-04 06:33:49

by Oleg Drokin

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

Hello!

On Wed, Oct 30, 2013 at 06:21:01AM -0700, Greg Kroah-Hartman wrote:
> > - * stored in between the "env_start" & "env_end" of task struct.
> > +static char *self_environ_file = "/proc/self/environ";
>
> Heh, no, that's not ok at all.
>
> This is a _huge_ sign that you are doing something wrong in your driver
> if you need something that isn't exported, or that you have to dig out
> of proc.
>
> Sorry, I can't take this, please fix the underlying problems that would
> even think that you need access to the environment from within a kernel
> driver.

I took a stab at this.
This is not a final patch, I know there's still some number of checkpatch
warnings and the proc layout is not finalized yet for example.

But before I spend any more time in polishing this, can you please take a look
and advise if this direction would be acceptable for you when driven to
completion?

Thanks.


>From 6a5b58657cc32163738d4a8c210e8683159b582f Mon Sep 17 00:00:00 2001
From: Oleg Drokin <[email protected]>
Date: Tue, 4 Feb 2014 00:32:12 -0500
Subject: [PATCH] staging/lustre: Obtain jobid invormation via upcall

Replace lustre jobid information fetching directly from
process env variable with either node-wide jobid obtained via
a proc file, or through an upcall that would provide the jobid
if more fine-grained operations are necessary.

Signed-off-by: Oleg Drokin <[email protected]>
---
.../staging/lustre/include/linux/libcfs/curproc.h | 1 -
.../staging/lustre/include/linux/libcfs/lucache.h | 6 +
.../staging/lustre/lustre/include/lprocfs_status.h | 1 +
.../lustre/lustre/libcfs/linux/linux-curproc.c | 152 -----------------
drivers/staging/lustre/lustre/obdclass/Makefile | 2 +-
drivers/staging/lustre/lustre/obdclass/class_obd.c | 67 ++++----
.../staging/lustre/lustre/obdclass/jobid_cache.c | 181 +++++++++++++++++++++
.../lustre/lustre/obdclass/jobid_internal.h | 10 ++
.../lustre/lustre/obdclass/linux/linux-module.c | 114 +++++++++++++
9 files changed, 345 insertions(+), 189 deletions(-)
create mode 100644 drivers/staging/lustre/lustre/obdclass/jobid_cache.c
create mode 100644 drivers/staging/lustre/lustre/obdclass/jobid_internal.h

diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
index 507d16b..cf1f26b 100644
--- a/drivers/staging/lustre/include/linux/libcfs/curproc.h
+++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h
@@ -63,7 +63,6 @@ int cfs_curproc_groups_nr(void);
/* check if task is running in compat mode.*/
#define current_pid() (current->pid)
#define current_comm() (current->comm)
-int cfs_get_environ(const char *key, char *value, int *val_len);

typedef __u32 cfs_cap_t;

diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h
index 9668b39..f8361d7 100644
--- a/drivers/staging/lustre/include/linux/libcfs/lucache.h
+++ b/drivers/staging/lustre/include/linux/libcfs/lucache.h
@@ -82,6 +82,11 @@ struct md_identity {
struct md_perm *mi_perms;
};

+struct jobid_cache_entry {
+ struct upcall_cache_entry *jce_uc_entry;
+ char *jce_jobid;
+};
+
struct upcall_cache_entry {
struct list_head ue_hash;
__u64 ue_key;
@@ -92,6 +97,7 @@ struct upcall_cache_entry {
cfs_time_t ue_expire;
union {
struct md_identity identity;
+ struct jobid_cache_entry jobid;
} u;
};

diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
index 428e3e4..3c99dcf 100644
--- a/drivers/staging/lustre/lustre/include/lprocfs_status.h
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h
@@ -369,6 +369,7 @@ static inline void s2dhms(struct dhms *ts, time_t secs)
#define JOBSTATS_JOBID_VAR_MAX_LEN 20
#define JOBSTATS_DISABLE "disable"
#define JOBSTATS_PROCNAME_UID "procname_uid"
+#define JOBSTATS_NODELOCAL "nodelocal"

extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
int *val, int mult);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
index a2ef64c..7c48601 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -140,158 +140,6 @@ int cfs_capable(cfs_cap_t cap)
return capable(cfs_cap_unpack(cap));
}

-static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
- void *buf, int len, int write)
-{
- /* Just copied from kernel for the kernels which doesn't
- * have access_process_vm() exported */
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- struct page *page;
- void *old_buf = buf;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
- down_read(&mm->mmap_sem);
- /* ignore errors, just check how much was successfully transferred */
- while (len) {
- int bytes, rc, offset;
- void *maddr;
-
- rc = get_user_pages(tsk, mm, addr, 1,
- write, 1, &page, &vma);
- if (rc <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
- } else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
- }
- kunmap(page);
- page_cache_release(page);
- len -= bytes;
- buf += bytes;
- addr += bytes;
- }
- up_read(&mm->mmap_sem);
- mmput(mm);
-
- return buf - old_buf;
-}
-
-/* Read the environment variable of current process specified by @key. */
-int cfs_get_environ(const char *key, char *value, int *val_len)
-{
- struct mm_struct *mm;
- char *buffer, *tmp_buf = NULL;
- int buf_len = PAGE_CACHE_SIZE;
- int key_len = strlen(key);
- unsigned long addr;
- int rc;
-
- buffer = kmalloc(buf_len, GFP_USER);
- if (!buffer)
- return -ENOMEM;
-
- mm = get_task_mm(current);
- if (!mm) {
- kfree(buffer);
- return -EINVAL;
- }
-
- /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
- * which is already holding mmap_sem for writes. If some other
- * thread gets the write lock in the meantime, this thread will
- * block, but at least it won't deadlock on itself. LU-1735 */
- if (down_read_trylock(&mm->mmap_sem) == 0) {
- kfree(buffer);
- return -EDEADLK;
- }
- up_read(&mm->mmap_sem);
-
- addr = mm->env_start;
- while (addr < mm->env_end) {
- int this_len, retval, scan_len;
- char *env_start, *env_end;
-
- memset(buffer, 0, buf_len);
-
- this_len = min_t(int, mm->env_end - addr, buf_len);
- retval = cfs_access_process_vm(current, addr, buffer,
- this_len, 0);
- if (retval != this_len)
- break;
-
- addr += retval;
-
- /* Parse the buffer to find out the specified key/value pair.
- * The "key=value" entries are separated by '\0'. */
- env_start = buffer;
- scan_len = this_len;
- while (scan_len) {
- char *entry;
- int entry_len;
-
- env_end = memscan(env_start, '\0', scan_len);
- LASSERT(env_end >= env_start &&
- env_end <= env_start + scan_len);
-
- /* The last entry of this buffer cross the buffer
- * boundary, reread it in next cycle. */
- if (unlikely(env_end - env_start == scan_len)) {
- /* This entry is too large to fit in buffer */
- if (unlikely(scan_len == this_len)) {
- CERROR("Too long env variable.\n");
- GOTO(out, rc = -EINVAL);
- }
- addr -= scan_len;
- break;
- }
-
- entry = env_start;
- entry_len = env_end - env_start;
-
- /* Key length + length of '=' */
- if (entry_len > key_len + 1 &&
- !memcmp(entry, key, key_len)) {
- entry += key_len + 1;
- entry_len -= key_len + 1;
- /* The 'value' buffer passed in is too small.*/
- if (entry_len >= *val_len)
- GOTO(out, rc = -EOVERFLOW);
-
- memcpy(value, entry, entry_len);
- *val_len = entry_len;
- GOTO(out, rc = 0);
- }
-
- scan_len -= (env_end - env_start + 1);
- env_start = env_end + 1;
- }
- }
- GOTO(out, rc = -ENOENT);
-
-out:
- mmput(mm);
- kfree((void *)buffer);
- if (tmp_buf)
- kfree((void *)tmp_buf);
- return rc;
-}
-EXPORT_SYMBOL(cfs_get_environ);
-
EXPORT_SYMBOL(cfs_curproc_groups_nr);
EXPORT_SYMBOL(cfs_cap_raise);
EXPORT_SYMBOL(cfs_cap_lower);
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
index 8a0e08c..7949838 100644
--- a/drivers/staging/lustre/lustre/obdclass/Makefile
+++ b/drivers/staging/lustre/lustre/obdclass/Makefile
@@ -7,7 +7,7 @@ obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
mea.o lu_object.o dt_object.o capa.o cl_object.o \
cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o \
- lu_ucred.o
+ lu_ucred.o jobid_cache.o


ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
index c93131e..e4fdd73 100644
--- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -45,7 +45,9 @@
#include <lustre/lustre_build_version.h>
#include <linux/list.h>
#include <cl_object.h>
+#include <linux/libcfs/lucache.h>
#include "llog_internal.h"
+#include "jobid_internal.h"


struct obd_device *obd_devs[MAX_OBD_DEVICES];
@@ -102,22 +104,18 @@ EXPORT_SYMBOL(obd_dirty_transit_pages);
char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
EXPORT_SYMBOL(obd_jobid_var);

-/* Get jobid of current process by reading the environment variable
- * stored in between the "env_start" & "env_end" of task struct.
- *
- * TODO:
- * It's better to cache the jobid for later use if there is any
- * efficient way, the cl_env code probably could be reused for this
- * purpose.
+char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1];
+
+/* Get jobid of current process from stored variable or from upcall.
+ * jobid obtained from upcall would be cached in upcall cache.
*
- * If some job scheduler doesn't store jobid in the "env_start/end",
- * then an upcall could be issued here to get the jobid by utilizing
- * the userspace tools/api. Then, the jobid must be cached.
+ * Historically this was also done by reading the environment variable
+ * stored in between the "env_start" & "env_end" of task struct.
+ * This is now deprecated and slated for removal at a later date, though.
*/
int lustre_get_jobid(char *jobid)
{
int jobid_len = JOBSTATS_JOBID_SIZE;
- int rc = 0;

memset(jobid, 0, JOBSTATS_JOBID_SIZE);
/* Jobstats isn't enabled */
@@ -132,31 +130,27 @@ int lustre_get_jobid(char *jobid)
return 0;
}

- rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
- if (rc) {
- if (rc == -EOVERFLOW) {
- /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
- * variable length strings instead of just numbers), it
- * might make sense to keep the unique parts for JobID,
- * instead of just returning an error. That means a
- * larger temp buffer for cfs_get_environ(), then
- * truncating the string at some separator to fit into
- * the specified jobid_len. Fix later if needed. */
- static bool printed;
- if (unlikely(!printed)) {
- LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
- "for JobID buffer (%d)\n",
- obd_jobid_var, jobid_len);
- printed = true;
- }
- } else {
- CDEBUG((rc == -ENOENT || rc == -EINVAL ||
- rc == -EDEADLK) ? D_INFO : D_ERROR,
- "Get jobid for (%s) failed: rc = %d\n",
- obd_jobid_var, rc);
- }
+ /* Whole node dedicated to single job */
+ if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
+ strcpy(jobid, obd_jobid_node);
+ return 0;
}
- return rc;
+
+ /* If there's an upcall defined, let's try that */
+ if (obd_jobid_upcall != NULL) {
+ struct jobid_cache_entry *entry;
+
+ entry = jobid_cache_get(current_pid());
+
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ strncpy(jobid, entry->jce_jobid, JOBSTATS_JOBID_SIZE);
+ jobid_cache_put(entry);
+ return 0;
+ }
+
+ return -ENOENT;
}
EXPORT_SYMBOL(lustre_get_jobid);

@@ -674,6 +668,9 @@ static void cleanup_obdclass(void)

class_procfs_clean();

+ if (obd_jobid_upcall)
+ upcall_cache_cleanup(obd_jobid_upcall);
+
class_handle_cleanup();
class_exit_uuidlist();
obd_zombie_impexp_stop();
diff --git a/drivers/staging/lustre/lustre/obdclass/jobid_cache.c b/drivers/staging/lustre/lustre/obdclass/jobid_cache.c
new file mode 100644
index 0000000..4c99ff0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/jobid_cache.c
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit http://www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/jobid_cache.c
+ *
+ * Author: Lai Siyao <[email protected]>
+ * Author: Fan Yong <[email protected]>
+ * Author: Oleg Drokin <[email protected]>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/version.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/lucache.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "jobid_internal.h"
+
+struct upcall_cache *obd_jobid_upcall;
+
+static void jobid_cache_entry_init(struct upcall_cache_entry *entry,
+ void *unused)
+{
+ entry->u.jobid.jce_uc_entry = entry;
+}
+
+static void jobid_cache_entry_free(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry)
+{
+ if (entry->u.jobid.jce_jobid)
+ OBD_FREE(entry->u.jobid.jce_jobid, JOBSTATS_JOBID_SIZE);
+}
+
+static int jobid_cache_do_upcall(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry)
+{
+ char keystr[32];
+ char *argv[] = {
+ [0] = cache->uc_upcall,
+ [1] = cache->uc_name,
+ [2] = keystr,
+ [3] = NULL
+ };
+ char *envp[] = {
+ [0] = "HOME=/",
+ [1] = "PATH=/sbin:/usr/sbin",
+ [2] = NULL
+ };
+ struct timeval start, end;
+ int rc;
+
+ read_lock(&cache->uc_upcall_rwlock);
+ CDEBUG(D_INFO, "The upcall is: '%s'\n", cache->uc_upcall);
+
+ if (unlikely(!strcmp(cache->uc_upcall, "/NONE")))
+ GOTO(out, rc = -ENOENT);
+
+ argv[0] = cache->uc_upcall;
+ snprintf(keystr, sizeof(keystr), LPU64, entry->ue_key);
+
+ do_gettimeofday(&start);
+ rc = call_usermodehelper(argv[0], argv, envp, 1);
+ do_gettimeofday(&end);
+ if (rc < 0) {
+ CERROR("%s: error invoking upcall %s %s %s: rc %d; "
+ "check /proc/fs/lustre/jobid_upcall, "
+ "time %ldus\n",
+ cache->uc_name, argv[0], argv[1], argv[2], rc,
+ cfs_timeval_sub(&end, &start, NULL));
+ } else {
+ CDEBUG(D_CACHE, "%s: invoked upcall %s %s %s, time %ldus\n",
+ cache->uc_name, argv[0], argv[1], argv[2],
+ cfs_timeval_sub(&end, &start, NULL));
+ rc = 0;
+ }
+out:
+ read_unlock(&cache->uc_upcall_rwlock);
+ return rc;
+}
+
+static int jobid_cache_parse_downcall(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry,
+ void *args)
+{
+ struct jobid_cache_entry *jobid = &entry->u.jobid;
+ char *val = args;
+
+ if (jobid == NULL)
+ return -ENOENT;
+
+ if (jobid->jce_jobid == NULL)
+ OBD_ALLOC(jobid->jce_jobid, JOBSTATS_JOBID_SIZE);
+
+ strncpy(jobid->jce_jobid, val, JOBSTATS_JOBID_SIZE);
+
+ return 0;
+}
+
+struct jobid_cache_entry *jobid_cache_get(__u64 pid)
+{
+ struct upcall_cache_entry *entry;
+
+ if (!obd_jobid_upcall)
+ return ERR_PTR(-ENOENT);
+
+ entry = upcall_cache_get_entry(obd_jobid_upcall, pid, NULL);
+ if (IS_ERR(entry))
+ return ERR_PTR(PTR_ERR(entry));
+ else if (unlikely(!entry))
+ return ERR_PTR(-ENOENT);
+ else
+ return &entry->u.jobid;
+}
+
+void jobid_cache_put(struct jobid_cache_entry *jobid)
+{
+ if (!obd_jobid_upcall)
+ return;
+
+ LASSERT(jobid);
+ upcall_cache_put_entry(obd_jobid_upcall, jobid->jce_uc_entry);
+}
+
+struct upcall_cache_ops jobid_cache_upcall_cache_ops = {
+ .init_entry = jobid_cache_entry_init,
+ .free_entry = jobid_cache_entry_free,
+ .do_upcall = jobid_cache_do_upcall,
+ .parse_downcall = jobid_cache_parse_downcall,
+};
+
+void jobid_flush_cache(__u64 pid)
+{
+ if (!obd_jobid_upcall)
+ return;
+
+ if (pid < 0)
+ upcall_cache_flush_idle(obd_jobid_upcall);
+ else
+ upcall_cache_flush_one(obd_jobid_upcall, pid, NULL);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/jobid_internal.h b/drivers/staging/lustre/lustre/obdclass/jobid_internal.h
new file mode 100644
index 0000000..56db606
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/jobid_internal.h
@@ -0,0 +1,10 @@
+#include <linux/libcfs/lucache.h>
+
+extern struct upcall_cache_ops jobid_cache_upcall_cache_ops;
+extern char obd_jobid_node[];
+extern struct upcall_cache *obd_jobid_upcall;
+
+struct jobid_cache_entry *jobid_cache_get(__u64 pid);
+void jobid_cache_put(struct jobid_cache_entry *jobid);
+void jobid_flush_cache(__u64 pid);
+
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
index 121a856..3acdbbc 100644
--- a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
@@ -66,12 +66,14 @@
#include <linux/seq_file.h>

#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/lucache.h>
#include <obd_support.h>
#include <obd_class.h>
#include <linux/lnet/lnetctl.h>
#include <lprocfs_status.h>
#include <lustre_ver.h>
#include <lustre/lustre_build_version.h>
+#include "../jobid_internal.h"

int proc_version;

@@ -283,6 +285,114 @@ static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffe
}
LPROC_SEQ_FOPS(obd_proc_jobid_var);

+static int obd_proc_jobid_upcall_seq_show(struct seq_file *m, void *v)
+{
+
+ return seq_printf(m, "%s\n",
+ obd_jobid_upcall?obd_jobid_upcall->uc_upcall:"NONE");
+}
+
+static ssize_t
+obd_proc_jobid_upcall_seq_write(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{
+ char *kernbuf;
+ int rc = 0;
+
+ if (!count || count >= UC_CACHE_UPCALL_MAXPATH)
+ return -EINVAL;
+
+ OBD_ALLOC(kernbuf, count + 1);
+
+ if (copy_from_user(kernbuf, buffer, count)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ if (count > 10 && strcmp(kernbuf, "downcall: ") == 0) {
+ __u64 pid;
+ char *end;
+
+ if (obd_jobid_upcall == NULL) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ kernbuf += 10;
+
+ pid = simple_strtoul(kernbuf, &end, 10);
+
+ if (kernbuf == end) {
+ rc = -EINVAL;
+ } else {
+ end++;
+ if (strcmp(end, "NONE") == 0)
+ rc = -ENOENT;
+ rc = upcall_cache_downcall(obd_jobid_upcall, rc, pid,
+ end);
+ }
+ goto out;
+
+ } else if (count >= 5 && strcmp(kernbuf, "flush") == 0) {
+ jobid_flush_cache(-1);
+ goto out;
+ }
+
+ /* Somebody must be trying to setup an upcall */
+ if (kernbuf[0] != '/') {
+ /* Not an absolute pathname? Ignore. */
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* If there's no cache yet, need to create it */
+ if (obd_jobid_upcall == NULL) {
+ obd_jobid_upcall = upcall_cache_init("jobid", "",
+ &jobid_cache_upcall_cache_ops);
+ if (IS_ERR(obd_jobid_upcall)) {
+ rc = PTR_ERR(obd_jobid_upcall);
+ obd_jobid_upcall = NULL;
+
+ goto out;
+ }
+ }
+
+ /* Remove any extraneous bits from the upcall (e.g. linefeeds) */
+ write_lock(&obd_jobid_upcall->uc_upcall_rwlock);
+ sscanf(kernbuf, "%s", obd_jobid_upcall->uc_upcall);
+ write_unlock(&obd_jobid_upcall->uc_upcall_rwlock);
+
+out:
+ OBD_FREE(kernbuf, count + 1);
+ return rc?rc:count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_upcall);
+
+static int obd_proc_jobid_name_seq_show(struct seq_file *m, void *v)
+{
+ return seq_printf(m, "%s\n", obd_jobid_var);
+}
+
+static ssize_t
+obd_proc_jobid_name_seq_write(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{
+ if (!count || count > JOBSTATS_JOBID_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(obd_jobid_node, buffer, count))
+ return -EFAULT;
+
+ obd_jobid_node[count] = 0;
+
+ /* Trim the trailing '\n' if any */
+ if (obd_jobid_node[count - 1] == '\n')
+ obd_jobid_node[count - 1] = 0;
+
+ return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_name);
+
/* Root for /proc/fs/lustre */
struct proc_dir_entry *proc_lustre_root = NULL;
EXPORT_SYMBOL(proc_lustre_root);
@@ -292,6 +402,10 @@ struct lprocfs_vars lprocfs_base[] = {
{ "pinger", &obd_proc_pinger_fops },
{ "health_check", &obd_proc_health_fops },
{ "jobid_var", &obd_proc_jobid_var_fops },
+ { .name = "jobid_upcall",
+ .fops = &obd_proc_jobid_upcall_fops},
+ { .name = "jobid_name",
+ .fops = &obd_proc_jobid_name_fops},
{ 0 }
};

--
1.8.5.3

2014-02-04 16:56:38

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

On Tue, Feb 04, 2014 at 10:12:10AM +0400, Oleg Drokin wrote:
> Hello!
>
> On Wed, Oct 30, 2013 at 06:21:01AM -0700, Greg Kroah-Hartman wrote:
> > > - * stored in between the "env_start" & "env_end" of task struct.
> > > +static char *self_environ_file = "/proc/self/environ";
> >
> > Heh, no, that's not ok at all.
> >
> > This is a _huge_ sign that you are doing something wrong in your driver
> > if you need something that isn't exported, or that you have to dig out
> > of proc.
> >
> > Sorry, I can't take this, please fix the underlying problems that would
> > even think that you need access to the environment from within a kernel
> > driver.
>
> I took a stab at this.
> This is not a final patch, I know there's still some number of checkpatch
> warnings and the proc layout is not finalized yet for example.
>
> But before I spend any more time in polishing this, can you please take a look
> and advise if this direction would be acceptable for you when driven to
> completion?

What exactly are you doing here? Calling out to userspace for what
information? And how are you going to handle namespaces and containers
by doing that? Are you going to block in the kernel for this
information?

What are you trying to solve with this code in the first place?

greg k-h

2014-02-04 17:28:03

by Oleg Drokin

[permalink] [raw]
Subject: Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

Hello!

On Feb 4, 2014, at 11:57 AM, Greg Kroah-Hartman wrote:

> What exactly are you doing here? Calling out to userspace for what
> information? And how are you going to handle namespaces and containers
> by doing that? Are you going to block in the kernel for this
> information?
>
> What are you trying to solve with this code in the first place?

So, as an overview of a feature:

When you have tens of thousand (and even hundreds) of nodes doing IO,
it's no longer practical to tell them apart separately or in some
network-related groups based on their address server-side
(for purposes like monitoring and load control).
Since such systems are usually managed by some sort of a job/batch scheduler,
it's much more natural to organize them into "jobs" as known to the
job scheduler instead. This job id information is harvested and added to
all RPCs so that server side can do useful things with this information
(like aggregate statistics, identify pathologically bad workcases,
QoS and so on).

Most of batch schedulers out there let userspace know their own JOBID as an
environment variable. So original implementation was just harvesting this
info directly from process environment. I certainly can see why this is not
really desirable.

So, the patch does away with the environment parsing, instead it adds two
new venues of getting this information:

1. In vast majorities of cases entire node is dedicated to a single job, so
we just create a /proc variable where you can input job id from job scheduler
prologue (and then clear it from an epilogue). Getting jobid in this case does
does not dip in userspace anymore. This also does not block anywhere.

2. In some more rare systems with lots of cores they actually seem to be subdividing
individual nodes across jobs. Additionally all systems usually have login/interactive
nodes. While these sort of interactive nodes do not have jobs scheduled on them, it
still might be useful to distinguish between different user sessions happening there.
This is where the upcalls come into play. First time a process does IO an upcall
would be called that would provide the kernel with jobid identifier (however it might
want to obtain it, we don't really care at this point). This would block (with a timeout)
for the reply. The reply is then cached and reused for subsequent io from the same process.
I did not really think about containers before, but I think it would work out anyway:
I think namespaces and containers still have non-intersecting pid space so we should be
fine in this regard. There is a "master container"/namespace of some sort I think, that is able to see
entire pid space, and that's where the upcall would be run (do I need to somehow force that?)

Bye,
Oleg