2002-11-14 23:20:30

by Timothy Hockin

[permalink] [raw]
Subject: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.853 -> 1.854
# include/linux/kernel.h 1.24 -> 1.25
# lib/Makefile 1.15 -> 1.16
# include/linux/init_task.h 1.19 -> 1.20
# include/linux/sched.h 1.103 -> 1.104
# kernel/fork.c 1.83 -> 1.84
# kernel/sys.c 1.32 -> 1.33
# include/asm-i386/param.h 1.2 -> 1.3
# kernel/uid16.c 1.2 -> 1.3
# fs/proc/array.c 1.32 -> 1.33
# kernel/exit.c 1.73 -> 1.74
# include/linux/limits.h 1.3 -> 1.4
# (new) -> 1.1 lib/bsearch.c
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/11/11 [email protected] 1.854
# Remove the limit of 32 groups. We now have a per-task, dynamic array of
# groups, which is kept sorted and refcounted.
#
# This ChangeSet incorporates all the core functionality, but does not fix up
# all the incorrect usages of groups. That is in a separate ChangeSet.
# --------------------------------------------
#
diff -Nru a/fs/proc/array.c b/fs/proc/array.c
--- a/fs/proc/array.c Mon Nov 11 17:36:55 2002
+++ b/fs/proc/array.c Mon Nov 11 17:36:55 2002
@@ -172,7 +172,7 @@
p->files ? p->files->max_fds : 0);
task_unlock(p);

- for (g = 0; g < p->ngroups; g++)
+ for (g = 0; g < min(p->ngroups, OLD_NGROUPS); g++)
buffer += sprintf(buffer, "%d ", p->groups[g]);

buffer += sprintf(buffer, "\n");
diff -Nru a/include/asm-i386/param.h b/include/asm-i386/param.h
--- a/include/asm-i386/param.h Mon Nov 11 17:36:55 2002
+++ b/include/asm-i386/param.h Mon Nov 11 17:36:55 2002
@@ -13,10 +13,6 @@

#define EXEC_PAGESIZE 4096

-#ifndef NGROUPS
-#define NGROUPS 32
-#endif
-
#ifndef NOGROUP
#define NOGROUP (-1)
#endif
diff -Nru a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h Mon Nov 11 17:36:55 2002
+++ b/include/linux/init_task.h Mon Nov 11 17:36:55 2002
@@ -80,6 +80,7 @@
.real_timer = { \
.function = it_real_fn \
}, \
+ .ngroups = 0, \
.cap_effective = CAP_INIT_EFF_SET, \
.cap_inheritable = CAP_INIT_INH_SET, \
.cap_permitted = CAP_FULL_SET, \
diff -Nru a/include/linux/kernel.h b/include/linux/kernel.h
--- a/include/linux/kernel.h Mon Nov 11 17:36:55 2002
+++ b/include/linux/kernel.h Mon Nov 11 17:36:55 2002
@@ -215,4 +215,7 @@
#define __FUNCTION__ (__func__)
#endif

+void *bsearch(const void *key, const void *base, size_t nmemb, size_t size,
+ int (*compar)(const void *, const void *));
+
#endif
diff -Nru a/include/linux/limits.h b/include/linux/limits.h
--- a/include/linux/limits.h Mon Nov 11 17:36:55 2002
+++ b/include/linux/limits.h Mon Nov 11 17:36:55 2002
@@ -3,7 +3,6 @@

#define NR_OPEN 1024

-#define NGROUPS_MAX 32 /* supplemental group IDs are available */
#define ARG_MAX 131072 /* # bytes of args + environ for exec() */
#define CHILD_MAX 999 /* no limit :-) */
#define OPEN_MAX 256 /* # open files a process may have */
@@ -18,5 +17,7 @@
#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */

#define RTSIG_MAX 32
+
+#define OLD_NGROUPS 32 /* old limit of supplemental group IDs */

#endif
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Mon Nov 11 17:36:55 2002
+++ b/include/linux/sched.h Mon Nov 11 17:36:55 2002
@@ -348,7 +348,8 @@
uid_t uid,euid,suid,fsuid;
gid_t gid,egid,sgid,fsgid;
int ngroups;
- gid_t groups[NGROUPS];
+ gid_t *groups;
+ atomic_t *groups_refcount;
kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
int keep_capabilities:1;
struct user_struct *user;
diff -Nru a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c Mon Nov 11 17:36:55 2002
+++ b/kernel/exit.c Mon Nov 11 17:36:55 2002
@@ -7,6 +7,7 @@
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
@@ -65,6 +66,11 @@

if (p != current)
wait_task_inactive(p);
+
+ if (p->ngroups && atomic_dec_and_test(p->groups_refcount)) {
+ vfree(p->groups_refcount);
+ vfree(p->groups);
+ }

atomic_dec(&p->user->processes);
security_ops->task_free_security(p);
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c Mon Nov 11 17:36:55 2002
+++ b/kernel/fork.c Mon Nov 11 17:36:55 2002
@@ -815,6 +815,10 @@
*/
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

+ /* increment the groups ref count */
+ if (p->ngroups)
+ atomic_inc(p->groups_refcount);
+
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */

diff -Nru a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c Mon Nov 11 17:36:55 2002
+++ b/kernel/sys.c Mon Nov 11 17:36:55 2002
@@ -21,6 +21,8 @@
#include <linux/times.h>
#include <linux/security.h>
#include <linux/dcookies.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -1060,42 +1062,114 @@
return i;
}

+/* a simple shell-metzner sort */
+static void groupsort(gid_t *grouplist, int gidsetsize)
+{
+ int right, left, cur, max, stride;
+
+ stride = gidsetsize / 2;
+ while (stride) {
+ max = gidsetsize - stride;
+
+ for (left = 0; left < max; left++) {
+ cur = left;
+ while (cur >= 0) {
+ right = cur + stride;
+ if (grouplist[right] < grouplist[cur]) {
+ gid_t tmp = grouplist[cur];
+ grouplist[cur] = grouplist[right];
+ grouplist[right] = tmp;
+ cur -= stride;
+ } else {
+ break;
+ }
+ }
+ }
+ stride /= 2;
+ }
+}
+
+static int gid_t_cmp(const void *a, const void *b)
+{
+ return *((gid_t *)a) - *((gid_t *)b);
+}
+
/*
- * SMP: Our groups are not shared. We can copy to/from them safely
+ * SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
*/
+int do_setgroups(int gidsetsize, gid_t *grouplist)
+{
+ atomic_t *newrefcnt = NULL;
+
+ BUG_ON(gidsetsize && !grouplist);
+ if (gidsetsize) {
+ newrefcnt = vmalloc(sizeof(*newrefcnt));
+ if (!newrefcnt) {
+ vfree(grouplist);
+ return -ENOMEM;
+ }
+
+ atomic_set(newrefcnt, 1);
+
+ /* sort the groupslist for faster searches */
+ groupsort(grouplist, gidsetsize);
+ }
+
+ /* disassociate ourselves from any shared group list */
+ if (current->ngroups
+ && atomic_dec_and_test(current->groups_refcount)) {
+ vfree(current->groups_refcount);
+ vfree(current->groups);
+ }
+
+ current->groups = grouplist;
+ current->groups_refcount = newrefcnt;
+ current->ngroups = gidsetsize;
+
+ return 0;
+}

asmlinkage long sys_setgroups(int gidsetsize, gid_t *grouplist)
{
- gid_t groups[NGROUPS];
+ gid_t *groups = NULL;
int retval;

if (!capable(CAP_SETGID))
return -EPERM;
- if ((unsigned) gidsetsize > NGROUPS)
- return -EINVAL;
- if(copy_from_user(groups, grouplist, gidsetsize * sizeof(gid_t)))
- return -EFAULT;
+ if (gidsetsize) {
+ /*
+ * make sure there is at least OLD_NGROUPS amount of space in
+ * the group list for backwards compatibility's sake.
+ */
+ int alloc_size = (gidsetsize > OLD_NGROUPS) ?
+ gidsetsize : OLD_NGROUPS;
+ groups = vmalloc(alloc_size * sizeof(gid_t));
+ if (!groups)
+ return -ENOMEM;
+
+ if (copy_from_user(groups, grouplist,
+ gidsetsize * sizeof(gid_t))) {
+ vfree(groups);
+ return -EFAULT;
+ }
+ }
+
retval = security_ops->task_setgroups(gidsetsize, groups);
- if (retval)
+ if (retval) {
+ if (groups)
+ vfree(groups);
return retval;
- memcpy(current->groups, groups, gidsetsize * sizeof(gid_t));
- current->ngroups = gidsetsize;
- return 0;
+ }
+ return do_setgroups(gidsetsize, groups);
}

static int supplemental_group_member(gid_t grp)
{
- int i = current->ngroups;
-
- if (i) {
- gid_t *groups = current->groups;
- do {
- if (*groups == grp)
- return 1;
- groups++;
- i--;
- } while (i);
+ if (current->ngroups) {
+ if (bsearch(&grp, current->groups, current->ngroups,
+ sizeof(gid_t), gid_t_cmp))
+ return 1;
}
return 0;
}
@@ -1388,3 +1462,4 @@
EXPORT_SYMBOL(unregister_reboot_notifier);
EXPORT_SYMBOL(in_group_p);
EXPORT_SYMBOL(in_egroup_p);
+EXPORT_SYMBOL(sys_setgroups);
diff -Nru a/kernel/uid16.c b/kernel/uid16.c
--- a/kernel/uid16.c Mon Nov 11 17:36:55 2002
+++ b/kernel/uid16.c Mon Nov 11 17:36:55 2002
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/security.h>
+#include <linux/vmalloc.h>

#include <asm/uaccess.h>

@@ -27,6 +28,7 @@
extern asmlinkage long sys_setresgid(gid_t, gid_t, gid_t);
extern asmlinkage long sys_setfsuid(uid_t);
extern asmlinkage long sys_setfsgid(gid_t);
+extern int do_setgroups(int gidsetsize, gid_t *grouplist);

asmlinkage long sys_chown16(const char * filename, old_uid_t user, old_gid_t group)
{
@@ -109,43 +111,74 @@

asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t *grouplist)
{
- old_gid_t groups[NGROUPS];
+ old_gid_t *groups;
int i,j;

if (gidsetsize < 0)
return -EINVAL;
i = current->ngroups;
- if (gidsetsize) {
+ if (i && gidsetsize) {
if (i > gidsetsize)
return -EINVAL;
+ groups = vmalloc(sizeof(old_gid_t) * i);
+ if (!groups)
+ return -ENOMEM;
for(j=0;j<i;j++)
groups[j] = current->groups[j];
- if (copy_to_user(grouplist, groups, sizeof(old_gid_t)*i))
+ if (copy_to_user(grouplist, groups, sizeof(old_gid_t)*i)) {
+ vfree(groups);
return -EFAULT;
+ }
+ vfree(groups);
}
return i;
}

asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t *grouplist)
{
- old_gid_t groups[NGROUPS];
- gid_t new_groups[NGROUPS];
+ old_gid_t *groups;
+ gid_t *new_groups = NULL;
int i;

if (!capable(CAP_SETGID))
return -EPERM;
- if ((unsigned) gidsetsize > NGROUPS)
- return -EINVAL;
- if (copy_from_user(groups, grouplist, gidsetsize * sizeof(old_gid_t)))
- return -EFAULT;
- for (i = 0 ; i < gidsetsize ; i++)
- new_groups[i] = (gid_t)groups[i];
+ if (gidsetsize) {
+ /*
+ * make sure there is at least OLD_NGROUPS amount of space in
+ * the group list for backwards compatibility's sake.
+ */
+ int alloc_size = (gidsetsize > OLD_NGROUPS) ?
+ gidsetsize : OLD_NGROUPS;
+
+ groups = vmalloc(sizeof(old_gid_t) * gidsetsize);
+ if (!groups)
+ return -ENOMEM;
+
+ if (copy_from_user(groups, grouplist,
+ gidsetsize * sizeof(old_gid_t))) {
+ vfree(groups);
+ return -EFAULT;
+ }
+
+ if (!(new_groups = vmalloc(sizeof(gid_t) * alloc_size))) {
+ vfree(groups);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < gidsetsize; i++)
+ new_groups[i] = (gid_t)groups[i];
+
+ vfree(groups);
+ }
+
i = security_ops->task_setgroups(gidsetsize, new_groups);
- if (i)
+ if (i) {
+ if (new_groups)
+ vfree(new_groups);
return i;
- memcpy(current->groups, new_groups, gidsetsize * sizeof(gid_t));
- current->ngroups = gidsetsize;
- return 0;
+ }
+ /* handles the vmalloc()ed new_groups */
+ return do_setgroups(gidsetsize, new_groups);
}

asmlinkage long sys_getuid16(void)
diff -Nru a/lib/Makefile b/lib/Makefile
--- a/lib/Makefile Mon Nov 11 17:36:55 2002
+++ b/lib/Makefile Mon Nov 11 17:36:55 2002
@@ -9,11 +9,11 @@
L_TARGET := lib.a

export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o \
- crc32.o rbtree.o radix-tree.o kobject.o
+ crc32.o rbtree.o radix-tree.o kobject.o bsearch.o

obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \
bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
- kobject.o
+ kobject.o bsearch.o

obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff -Nru a/lib/bsearch.c b/lib/bsearch.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/lib/bsearch.c Mon Nov 11 17:36:55 2002
@@ -0,0 +1,49 @@
+/* Copyright (C) 1991, 1992, 1997, 2000 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* Perform a binary search for KEY in BASE which has NMEMB elements
+ of SIZE bytes each. The comparisons are done by (*COMPAR)(). */
+void *
+bsearch(const void *key, const void *base, size_t nmemb, size_t size,
+ int (*compar)(const void *, const void *))
+{
+ size_t l, u, idx;
+ const void *p;
+ int comparison;
+
+ l = 0;
+ u = nmemb;
+ while (l < u) {
+ idx = (l + u) / 2;
+ p = (void *)(((const char *)base) + (idx * size));
+ comparison = (*compar)(key, p);
+ if (comparison < 0)
+ u = idx;
+ else if (comparison > 0)
+ l = idx + 1;
+ else
+ return (void *)p;
+ }
+
+ return NULL;
+}
+
+EXPORT_SYMBOL(bsearch);


2002-11-14 23:59:32

by Pete Zaitcev

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Two questions.

1. Why are arrays vmalloc-ed? This is a goochism which you have
to justify.

2. How do these changes sit with LLNL's changes to increase
number of groups that NFS client can support? It's not
a showstopper, but would be nice if you two cooperated.

-- Pete

2002-11-15 00:07:39

by Tim Hockin

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Pete Zaitcev wrote:
> Two questions.
>
> 1. Why are arrays vmalloc-ed? This is a goochism which you have
> to justify.

Because they can be as large as root allows, and when we used kmalloc()
it would actually fail from time to time.

> 2. How do these changes sit with LLNL's changes to increase
> number of groups that NFS client can support? It's not
> a showstopper, but would be nice if you two cooperated.

hmm, I haven't heard anything about them - can you offer an email or URL?

Thanks


--
Tim Hockin
Systems Software Engineer
Sun Microsystems, Linux Kernel Engineering
[email protected]

2002-11-15 00:25:08

by Pete Zaitcev

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

> Date: Thu, 14 Nov 2002 16:14:29 -0800
> From: Tim Hockin <[email protected]>

> > 1. Why are arrays vmalloc-ed? This is a goochism which you have
> > to justify.
>
> Because they can be as large as root allows, and when we used kmalloc()
> it would actually fail from time to time.

OK. I think in your case it's probably harmless. I thought
that two (order 1) 4K pages can hold 2000 4 byte group IDs,
and that "ought to be enough for anybody". If you envision
10,000 groups, then perhaps you are right, except that it may
be about time to think about your data structures a little more.
I'll let Andi to remind you about the performance impact (vmalloc
area is outside of the big TLB area).

> > 2. How do these changes sit with LLNL's changes to increase
> > number of groups that NFS client can support? It's not
> > a showstopper, but would be nice if you two cooperated.
>
> hmm, I haven't heard anything about them - can you offer an email or URL?

http://www.uwsg.iu.edu/hypermail/linux/kernel/0010.0/0788.html

The sad part is that the patch was around since 2000, but the
effort to get it in was a little half-hearted, perhaps.
I am thinking about reviving it.

-- Pete

2002-11-15 00:31:16

by Alan

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Fri, 2002-11-15 at 00:31, Pete Zaitcev wrote:
> > Date: Thu, 14 Nov 2002 16:14:29 -0800
> OK. I think in your case it's probably harmless. I thought
> that two (order 1) 4K pages can hold 2000 4 byte group IDs,
> and that "ought to be enough for anybody". If you envision
> 10,000 groups, then perhaps you are right, except that it may
> be about time to think about your data structures a little more.
> I'll let Andi to remind you about the performance impact (vmalloc
> area is outside of the big TLB area).

vmalloc will allocate 12K of address space or so and will actually die
first with fragmentation problems. We should also keep small numbers of
groups inline otherwise you are going to upset the "million threads"
people 8)

2002-11-15 00:39:44

by Tim Hockin

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Pete Zaitcev wrote:


> be about time to think about your data structures a little more.
> I'll let Andi to remind you about the performance impact (vmalloc
> area is outside of the big TLB area).

Offer an alternative? :) Linked list costs us as much or MORE for
->next as the gid_t. kmalloc() doesn't work for previous reasoning. I
considered a list of gid arr[256] or similar. A voice reminds me that
it doesn't impact us noticably in real use. Now, maybe other
architectures will find a good reason to switch to kmalloc() list of
smaller arrays, and the associated complextities or something else more
clever.

>>hmm, I haven't heard anything about them - can you offer an email or URL?

> http://www.uwsg.iu.edu/hypermail/linux/kernel/0010.0/0788.html

Thanks - will check it..


--
Tim Hockin
Systems Software Engineer
Sun Microsystems, Linux Kernel Engineering
[email protected]

2002-11-15 01:17:24

by Tim Hockin

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Andrew Morton wrote:

> What are you actually using the search for?
>
>>From a quick look, it seems that it's purely to answer
> the question "is this process a member of group X?". Is
> that correct?
>
> If so, test_bit() would work nicely.

This could work if we find the max gid, allocate an array of
max_gid/CHAR_BITS + 1 bytes then test_bit, but given the non-contiguity
(is that a word) of group memberships, we'll waste a lot of space on
holes. Now, it could be argued that 10,000 groups are PROBABLY local
enough. Getting the groups back out will be nasty nastiness, though.

perhaps:

if (gidsetsize < (2 * EXEC_PAGESIZE)/sizeof(gid_t)) { /* or something */
/* use kmalloc() */
else
/* use vmalloc() */

thoughts?

--
Tim Hockin
Systems Software Engineer
Sun Microsystems, Linux Kernel Engineering
[email protected]

2002-11-15 01:15:29

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Thu, Nov 14, 2002 at 04:46:36PM -0800, Tim Hockin wrote:
> Offer an alternative? :) Linked list costs us as much or MORE for
> ->next as the gid_t. kmalloc() doesn't work for previous reasoning. I
> considered a list of gid arr[256] or similar. A voice reminds me that
> it doesn't impact us noticably in real use. Now, maybe other
> architectures will find a good reason to switch to kmalloc() list of
> smaller arrays, and the associated complextities or something else more
> clever.

Well, there are always B-trees; nice low arrival rates to the allocator
owing to elements/node and O(lg(n)) searches with low constants due to
big fat branching factors. Not my call though.


Bill

2002-11-15 01:23:47

by Andrew Morton

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Tim Hockin wrote:
>
> Andrew Morton wrote:
>
> > What are you actually using the search for?
> >
> >>From a quick look, it seems that it's purely to answer
> > the question "is this process a member of group X?". Is
> > that correct?
> >
> > If so, test_bit() would work nicely.
>
> This could work if we find the max gid, allocate an array of
> max_gid/CHAR_BITS + 1 bytes then test_bit, but given the non-contiguity
> (is that a word) of group memberships, we'll waste a lot of space on
> holes. Now, it could be argued that 10,000 groups are PROBABLY local
> enough. Getting the groups back out will be nasty nastiness, though.
>
> perhaps:
>
> if (gidsetsize < (2 * EXEC_PAGESIZE)/sizeof(gid_t)) { /* or something */
> /* use kmalloc() */
> else
> /* use vmalloc() */
>
> thoughts?
>

10,000 bits isn't much. Maybe:

- add `char groups[16]' to task_struct

- add `struct page *groups_page' to task_struct

- then
if (getsetsize <= 256)
use current->groups[] /* 256 groups max */
else
use current->groups_page; /* 32768 groups max */

2002-11-15 01:38:54

by Pete Zaitcev

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

> > Offer an alternative? :) Linked list costs us as much or MORE for
> > ->next as the gid_t. kmalloc() doesn't work for previous reasoning. I
> > considered a list of gid arr[256] or similar. A voice reminds me that
> > it doesn't impact us noticably in real use. Now, maybe other
> > architectures will find a good reason to switch to kmalloc() list of
> > smaller arrays, and the associated complextities or something else more
> > clever.
>
> Well, there are always B-trees; nice low arrival rates to the allocator
> owing to elements/node and O(lg(n)) searches with low constants due to
> big fat branching factors. Not my call though.

This sounds intriguing.

Bill, if I may borrow from your data structure expertise,
what would you do if you wanted gid_t's indexed by two criteria?
Obviously, we want them indexed by value (to look them up
for access checking), but NFS also needs them sorted by usage,
to fit the last 3 into the RPC parameters. This looks like
something requiring two overlaying trees who share leafs,
and every leaf being a single gid_t, with nightmarish overhead.

Before Tim came to the scene, the hope was that lookups would
do exhaustive search of arrays, sorted by LRU, while RPC
picked N leading elements of said sorted array. Tim busts
this scheme to pieces, because he sorts arrays by value
(if I read it right).

-- Pete

2002-11-15 01:48:46

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Thu, Nov 14, 2002 at 08:45:39PM -0500, Pete Zaitcev wrote:
> This sounds intriguing.
> Bill, if I may borrow from your data structure expertise,
> what would you do if you wanted gid_t's indexed by two criteria?
> Obviously, we want them them indexed by value (to look them up
> for access checking), but NFS also needs them sorted by usage,
> to fit the last 3 into the RPC parameters. This looks like
> something requiring two overlaying trees who share leafs,
> and every leaf being a single gid_t, with nightmarish overhead.
> Before Tim came to the scene, the hope was that lookups would
> do exhaustive search of arrays, sorted by LRU, while RPC
> picked N leading elements of said sorted array. Tim busts
> this scheme to pieces, because he sorts arrays by value
> (if I read it right).

B+ trees separate metadata from data entirely, so two distinct
B+ tree "indices" attached will work just fine for this overlaying
of trees that share leaves.


Bill

2002-11-15 02:26:36

by Tim Hockin

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

> 10,000 bits isn't much. Maybe:

That's 10000 USED bits. Remember groups are non-contiguously allocated. If
a task is a member of just groups 32767 and 65535, you'll get one bit per
page used, and when they call getgroups() you need to pull it apart and
return an array of gid_t.

> - add `char groups[16]' to task_struct
>
> - add `struct page *groups_page' to task_struct
>
> - then
> if (getsetsize <= 256)
> use current->groups[] /* 256 groups max */
> else
> use current->groups_page; /* 32768 groups max */

2002-11-15 02:35:03

by Andrew Morton

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Tim Hockin wrote:
>
> > 10,000 bits isn't much. Maybe:
>
> That's 10000 USED bits. Remember groups are non-contiguously allocated. If
> a task is a member of just groups 32767 and 65535, you'll get one bit per
> page used, and when they call getgroups() you need to pull it apart and
> return an array of gid_t.
>

Well that's what I was asking.

What is the maximum group ID? 0xffffffff?

In that case a radix tree _might_ suit. All you need to put in the
node is a (void *)1 or (void *)0. But it won't be very space-efficient
for really sparse groups.

2002-11-15 05:53:34

by Aaron Lehmann

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Thu, Nov 14, 2002 at 05:30:33PM -0800, Andrew Morton wrote:
> - add `char groups[16]' to task_struct
>
> - add `struct page *groups_page' to task_struct

Wouldn't it be better to use a union to save space?

2002-11-15 13:26:58

by Frank van Maarseveen

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Thu, Nov 14, 2002 at 07:31:56PM -0500, Pete Zaitcev wrote:
> > > 2. How do these changes sit with LLNL's changes to increase
> > > number of groups that NFS client can support? It's not
> > > a showstopper, but would be nice if you two cooperated.
> >
> > hmm, I haven't heard anything about them - can you offer an email or URL?
>
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0010.0/0788.html
>
> The sad part is that the patch was around since 2000, but the
> effort to get it in was a little half-hearted, perhaps.
> I am thinking about reviving it.

It's still alive. I'll make a 2.4.20 version of the patch once 2.4.20
is released. Or maybe earlier (a -rc version) since 2.4.20 appears to
be imminent and I don't expect changes in that area. 2.5.latest is next
on my list. Latest version is:

http://web.inter.nl.net/users/fvm/nfs-ngroups/2.4.19-nfs-ngroups-3.98.patch

I have dropped all the older versions (they go back to 2.2.13)

--
Frank

2002-11-15 14:40:43

by Alan

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

On Fri, 2002-11-15 at 02:41, Andrew Morton wrote:
> In that case a radix tree _might_ suit. All you need to put in the
> node is a (void *)1 or (void *)0. But it won't be very space-efficient
> for really sparse groups.

99.999% of users will have < 16 groups, probably less than 8. If the
system doesn't get that default case as fast and memory efficient as
before the priorities are badly wrong.

2002-11-15 16:24:09

by Horst H. von Brand

[permalink] [raw]
Subject: Re: [BK PATCH 1/2] Remove NGROUPS hardlimit (resend w/o qsort)

Timothy Hockin <[email protected]> said:

[...]

> +/* a simple shell-metzner sort */
> +static void groupsort(gid_t *grouplist, int gidsetsize)
> +{
> + int right, left, cur, max, stride;
> +
> + stride = gidsetsize / 2;

This guarantees bad performance for certain gidsetsizes... customary wisdom
(at least, Knuth sayeth so) is to go:

for(stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
;
do {
stride /= 3;
...
} while (stride > 1);

> + while (stride) {
> + max = gidsetsize - stride;
> +
> + for (left = 0; left < max; left++) {
> + cur = left;
> + while (cur >= 0) {
> + right = cur + stride;
> + if (grouplist[right] < grouplist[cur]) {
> + gid_t tmp = grouplist[cur];
> + grouplist[cur] = grouplist[right];
> + grouplist[right] = tmp;
> + cur -= stride;
> + } else {
> + break;
> + }
> + }

You should work by stuffing the new element in a temporary variable, and
just shift the greater ones up, then stuff the one in

My version of shellsort (h is stride, n is size) goes:

/*
* shellsort.c: Shell sort
*/

#include "sort.h"

void
sort(double a[], int n)

{
int i, j, h;
double tmp;

for(h = 1; h < n; h = 3 * h + 1)
;

do {
h /= 3;
for(i = h; i < n; i++) {
tmp = a[i];
for(j = i - h; j >= 0 && tmp < a[j]; j -= h)
a[j + h] = a[j];
a[j + h] = tmp;
}
} while(h > 1);
}

This gets around bad h (stride) values, and does just a bit more than 1
assignment per element moved in the inner loop (you do 3). Plus it is
shorter ;-)
--
Dr. Horst H. von Brand User #22616 counter.li.org
Departamento de Informatica Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria +56 32 654239
Casilla 110-V, Valparaiso, Chile Fax: +56 32 797513