Hello everybody !!
The patch below is what i think is a better approach to
compute int_sqrt().
What about it ?
Thanks !!
---
--- a/lib/int_sqrt.c 2008-04-17 08:19:44.000000000 +0530
+++ b/lib/int_sqrt.c 2008-07-02 11:37:01.000000000 +0530
@@ -1,4 +1,3 @@
-
#include <linux/kernel.h>
#include <linux/module.h>
@@ -7,26 +6,21 @@
* @x: integer of which to calculate the sqrt
*
* A very rough approximation to the sqrt() function.
+ * Improved version from the previous one.
*/
unsigned long int_sqrt(unsigned long x)
{
- unsigned long op, res, one;
-
- op = x;
- res = 0;
-
- one = 1UL << (BITS_PER_LONG - 2);
- while (one > op)
- one >>= 2;
-
- while (one != 0) {
- if (op >= res + one) {
- op = op - (res + one);
- res = res + 2 * one;
- }
- res /= 2;
- one /= 4;
- }
- return res;
+ unsigned long ub, lb, m;
+ lb = 1; /* lower bound */
+ ub = (x >> 5) + 8; /* upper bound */
+ do {
+ m = (ub + lb) >> 1; /* middle value */
+ if((m * m) > x)
+ ub = m - 1;
+ else
+ lb = m + 1;
+ } while(ub >= lb);
+
+ return lb - 1;
}
EXPORT_SYMBOL(int_sqrt);
On Wed, 2008-07-16 at 13:19 -0700, Soumyadip Das Mahapatra wrote:
> Hello everybody !!
> The patch below is what i think is a better approach to
> compute int_sqrt().
>
> What about it ?
Indeed, what about it?
How is it better;
- is it cheaper
- how so
- on what platform
- it is more accurate
- who needs it
Please provide a little more information about why you suggest this
change.
> Thanks !!
>
> ---
> --- a/lib/int_sqrt.c 2008-04-17 08:19:44.000000000 +0530
> +++ b/lib/int_sqrt.c 2008-07-02 11:37:01.000000000 +0530
> @@ -1,4 +1,3 @@
> -
> #include <linux/kernel.h>
> #include <linux/module.h>
>
> @@ -7,26 +6,21 @@
> * @x: integer of which to calculate the sqrt
> *
> * A very rough approximation to the sqrt() function.
> + * Improved version from the previous one.
With the previuos one being gone, this comment adds little but
confusion..
> */
> unsigned long int_sqrt(unsigned long x)
> {
> - unsigned long op, res, one;
> -
> - op = x;
> - res = 0;
> -
> - one = 1UL << (BITS_PER_LONG - 2);
> - while (one > op)
> - one >>= 2;
> -
> - while (one != 0) {
> - if (op >= res + one) {
> - op = op - (res + one);
> - res = res + 2 * one;
> - }
> - res /= 2;
> - one /= 4;
> - }
> - return res;
> + unsigned long ub, lb, m;
> + lb = 1; /* lower bound */
> + ub = (x >> 5) + 8; /* upper bound */
> + do {
> + m = (ub + lb) >> 1; /* middle value */
> + if((m * m) > x)
> + ub = m - 1;
> + else
> + lb = m + 1;
> + } while(ub >= lb);
> +
> + return lb - 1;
> }
> EXPORT_SYMBOL(int_sqrt);
>
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
> From: Peter Zijlstra <[email protected]>
> To: Soumyadip Das Mahapatra <[email protected]>
> Cc: [email protected]
> Sent: Thursday, July 17, 2008 2:21:09 AM
> Subject: Re: [PATCH] : A better approach to compute int_sqrt in lib/int_sqrt.c
>
> On Wed, 2008-07-16 at 13:19 -0700, Soumyadip Das Mahapatra wrote:
> > Hello everybody !!
> > The patch below is what i think is a better approach to
> > compute int_sqrt().
> >
> > What about it ?
>
> Indeed, what about it?
>
> How is it better;
> - is it cheaper
> - how so
> - on what platform
>
> - it is more accurate
> - who needs it
>
> Please provide a little more information about why you suggest this
> change.
>
> > Thanks !!
> >
> > ---
> > --- a/lib/int_sqrt.c 2008-04-17 08:19:44.000000000 +0530
> > +++ b/lib/int_sqrt.c 2008-07-02 11:37:01.000000000 +0530
> > @@ -1,4 +1,3 @@
> > -
> > #include
> > #include
> >
> > @@ -7,26 +6,21 @@
> > * @x: integer of which to calculate the sqrt
> > *
> > * A very rough approximation to the sqrt() function.
> > + * Improved version from the previous one.
>
> With the previuos one being gone, this comment adds little but
> confusion..
>
> > */
> > unsigned long int_sqrt(unsigned long x)
> > {
> > - unsigned long op, res, one;
> > -
> > - op = x;
> > - res = 0;
> > -
> > - one = 1UL << (BITS_PER_LONG - 2);
> > - while (one > op)
> > - one >>= 2;
> > -
> > - while (one != 0) {
> > - if (op >= res + one) {
> > - op = op - (res + one);
> > - res = res + 2 * one;
> > - }
> > - res /= 2;
> > - one /= 4;
> > - }
> > - return res;
> > + unsigned long ub, lb, m;
> > + lb = 1; /* lower bound */
> > + ub = (x >> 5) + 8; /* upper bound */
> > + do {
> > + m = (ub + lb) >> 1; /* middle value */
> > + if((m * m) > x)
> > + ub = m - 1;
> > + else
> > + lb = m + 1;
> > + } while(ub >= lb);
> > +
> > + return lb - 1;
> > }
> > EXPORT_SYMBOL(int_sqrt);
> >
Thanks Peter for noticing :-)
Sorry, I should have it explained before. Really sorry
for that. Here are they...
0 It is better because
o it uses only one loop instead of two
o contains no division operator (older version has two)
which are surely comparatively slow task in computer
0 Currently find . -name '*.[ch]' | xargs grep int_sqrt gives me this
....
./fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
./drivers/video/fbmon.c: h_period = int_sqrt(h_period);
./mm/page_alloc.c: min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
./mm/oom_kill.c: s = int_sqrt(cpu_time);
./mm/oom_kill.c: s = int_sqrt(int_sqrt(run_time));
....
So this function works in critical computing sections like frame-buffer, paging.
Which means betterment of this function should not be ignored.
Besides, if there is a better way to do things then why should not we do that ?
Anyways thanks :-)
On Wed, Jul 16, 2008 at 02:35:56PM -0700, Soumyadip Das Mahapatra wrote:
> Thanks Peter for noticing :-)
> Sorry, I should have it explained before. Really sorry
> for that. Here are they...
>
> 0 It is better because
> o it uses only one loop instead of two
> o contains no division operator (older version has two)
> which are surely comparatively slow task in computer
>
> 0 Currently find . -name '*.[ch]' | xargs grep int_sqrt gives me this
> ....
> ./fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
> ./drivers/video/fbmon.c: h_period = int_sqrt(h_period);
> ./mm/page_alloc.c: min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
> ./mm/oom_kill.c: s = int_sqrt(cpu_time);
> ./mm/oom_kill.c: s = int_sqrt(int_sqrt(run_time));
> ....
> So this function works in critical computing sections like frame-buffer, paging.
> Which means betterment of this function should not be ignored.
> Besides, if there is a better way to do things then why should not we do that ?
>
> Anyways thanks :-)
It is also very inaccurate:
int_sqrt(9380489) returns 3062 with the old code and 146574 with the new
code. I wonder which one is closer to right. It seems as soon as the
input is 2^22 or higher, the new code goes all to hell and starts
returning 2^16-1 or similarly silly values rather than 2^11-1 or
similar.
Here is how I tested:
(compiled with gcc -Wall -O2 -std=c99)
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#define BITS_PER_LONG 32
unsigned long old_int_sqrt(unsigned long x) {
unsigned long op, res, one;
op = x;
res = 0;
one = 1UL << (BITS_PER_LONG - 2);
while (one > op)
one >>= 2;
while (one != 0) {
if (op >= res + one) {
op = op - (res + one);
res = res + 2 * one;
}
res /= 2;
one /= 4;
}
return res;
}
unsigned long new_int_sqrt(unsigned long x) {
unsigned long ub, lb, m;
lb = 1; /* lower bound */
ub = (x >> 5) + 8; /* upper bound */
do {
m = (ub + lb) >> 1; /* middle value */
if((m * m) > x)
ub = m - 1;
else
lb = m + 1;
} while(ub >= lb);
return lb - 1;
}
int main() {
unsigned long i;
unsigned long old;
unsigned long new;
for(i=0;i<10000000;i++) {
old=old_int_sqrt(i);
new=new_int_sqrt(i);
if(new!=old) {
printf("sqrt(%lu)= %lu(new)->%llu %lu(old)->%llu",i,new,(unsigned long long)new*(unsigned long long)new,old,(unsigned long long)old*(unsigned long long)old);
if(llabs((unsigned long long)new*(unsigned long long)new - (unsigned long long)i) < llabs((unsigned long long)old*(unsigned long long)old - (unsigned long long)i)) {
printf(" (new is best)\n");
} else {
printf(" (old is best)\n");
}
}
}
return 0;
}
Example output:
sqrt(9380468)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380469)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380470)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380471)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380472)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380473)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380474)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380475)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380476)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380477)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
sqrt(9380478)= 146574(new)->21483937476 3062(old)->9375844 (old is best)
--
Len Sorensen
On Wednesday 16 July 2008 02:35:56 pm Soumyadip Das Mahapatra wrote:
> > > */
> > > unsigned long int_sqrt(unsigned long x)
> > > {
> > > - unsigned long op, res, one;
> > > -
> > > - op = x;
> > > - res = 0;
> > > -
> > > - one = 1UL << (BITS_PER_LONG - 2);
> > > - while (one > op)
> > > - one >>= 2;
> > > -
> > > - while (one != 0) {
> > > - if (op >= res + one) {
> > > - op = op - (res + one);
> > > - res = res + 2 * one;
> > > - }
> > > - res /= 2;
> > > - one /= 4;
> > > - }
> > > - return res;
> > > + unsigned long ub, lb, m;
> > > + lb = 1; /* lower bound */
> > > + ub = (x >> 5) + 8; /* upper bound */
> > > + do {
> > > + m = (ub + lb) >> 1; /* middle value */
> > > + if((m * m) > x)
> > > + ub = m - 1;
> > > + else
> > > + lb = m + 1;
> > > + } while(ub >= lb);
> > > +
> > > + return lb - 1;
> > > }
> > > EXPORT_SYMBOL(int_sqrt);
>
> 0 It is better because
> o contains no division operator (older version has two)
> which are surely comparatively slow task in computer
Actually, the old version has zero division operators; those are simply
right-shifts.
-- Vadim Lobanov
On Wed, Jul 16, 2008 at 06:05:56PM -0400, Lennart Sorensen wrote:
> It is also very inaccurate:
>
> int_sqrt(9380489) returns 3062 with the old code and 146574 with the new
> code. I wonder which one is closer to right. It seems as soon as the
> input is 2^22 or higher, the new code goes all to hell and starts
> returning 2^16-1 or similarly silly values rather than 2^11-1 or
> similar.
>
> Here is how I tested:
>
> (compiled with gcc -Wall -O2 -std=c99)
>
> #include <stdio.h>
> #include <unistd.h>
> #include <stdlib.h>
>
> #define BITS_PER_LONG 32
>
> unsigned long old_int_sqrt(unsigned long x) {
> unsigned long op, res, one;
>
> op = x;
> res = 0;
>
> one = 1UL << (BITS_PER_LONG - 2);
> while (one > op)
> one >>= 2;
>
> while (one != 0) {
> if (op >= res + one) {
> op = op - (res + one);
> res = res + 2 * one;
> }
> res /= 2;
> one /= 4;
> }
> return res;
> }
>
> unsigned long new_int_sqrt(unsigned long x) {
> unsigned long ub, lb, m;
> lb = 1; /* lower bound */
> ub = (x >> 5) + 8; /* upper bound */
> do {
> m = (ub + lb) >> 1; /* middle value */
> if((m * m) > x)
This line overflows while the result is good if changed to:
if(((unsigned long long)m * (unsigned long long)m) > (unsigned long long)x)
Perhaps there is a better way to do this.
> ub = m - 1;
> else
> lb = m + 1;
> } while(ub >= lb);
>
> return lb - 1;
> }
>
--
Len Sorensen
On Wed, Jul 16, 2008 at 02:35:56PM -0700, Soumyadip Das Mahapatra wrote:
> Thanks Peter for noticing :-)
> Sorry, I should have it explained before. Really sorry
> for that. Here are they...
>
> 0 It is better because
> o it uses only one loop instead of two
> o contains no division operator (older version has two)
> which are surely comparatively slow task in computer
>
> 0 Currently find . -name '*.[ch]' | xargs grep int_sqrt gives me this
> ....
> ./fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
> ./drivers/video/fbmon.c: h_period = int_sqrt(h_period);
> ./mm/page_alloc.c: min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
> ./mm/oom_kill.c: s = int_sqrt(cpu_time);
> ./mm/oom_kill.c: s = int_sqrt(int_sqrt(run_time));
> ....
> So this function works in critical computing sections like frame-buffer, paging.
> Which means betterment of this function should not be ignored.
> Besides, if there is a better way to do things then why should not we do that ?
>
> Anyways thanks :-)
So so far the line:
if((m * m) > x)
overflows easily in which case the result is wrong.
Also at least on my Athlon 2500, this new algorithm takes 3.5 times
longer than the old one to get result when doing all square roots from 0
to 200000000, which is no improvement.
--
Len Sorensen
On Wed, 2008-07-16 at 14:35 -0700, Soumyadip Das Mahapatra wrote:
> 0 It is better because
> o it uses only one loop instead of two
> o contains no division operator (older version has two)
> which are surely comparatively slow task in computer
As Lennart has said, gcc is smart enough to transform a division by a
power-of-two into shifts.
> 0 Currently find . -name '*.[ch]' | xargs grep int_sqrt gives me this
> ....
> ./fs/nfs/write.c: nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
> ./drivers/video/fbmon.c: h_period = int_sqrt(h_period);
> ./mm/page_alloc.c: min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
> ./mm/oom_kill.c: s = int_sqrt(cpu_time);
> ./mm/oom_kill.c: s = int_sqrt(int_sqrt(run_time));
> ....
fs/nfs/write.c is init code
mm/page_alloc.c is also init code
mm/oom_kill.c isn't a hot path
which leaves the fbmon case, which after a quick peek is setup code, so
not a hot path either.
So while that doesn't preclude us from changing it if you can indeed
show its a better implementation, its not on anybodies hit-list.