2004-01-15 20:53:30

by Jesse Barnes

[permalink] [raw]
Subject: [PATCH] readX_relaxed interface

Based on the PIO ordering disucssion, I've come up with the following
patch. It has the potential to help any platform that has seperate PIO
and DMA channels, and allows them to be reorderd wrt each other. Some
SGI MIPS platforms, as well as the SGI Altix (aka sn2) platform behave
this way, and will thus benefit from this patch.

It adds a new PIO read routine for PIOs that don't have to be ordered
wrt DMA on the system.

If it looks ok, I'll add in macros for the other arches and send it out
for inclusion.

Thanks,
Jesse

diff -Nru a/arch/ia64/sn/kernel/sn2/io.c b/arch/ia64/sn/kernel/sn2/io.c
--- a/arch/ia64/sn/kernel/sn2/io.c Thu Jan 15 11:39:13 2004
+++ b/arch/ia64/sn/kernel/sn2/io.c Thu Jan 15 11:39:13 2004
@@ -23,6 +23,10 @@
#undef __sn_readw
#undef __sn_readl
#undef __sn_readq
+#undef __sn_readb_relaxed
+#undef __sn_readw_relaxed
+#undef __sn_readl_relaxed
+#undef __sn_readq_relaxed

unsigned int
__sn_inb (unsigned long port)
@@ -82,6 +86,30 @@
__sn_readq (void *addr)
{
return ___sn_readq (addr);
+}
+
+unsigned char
+__sn_readb_relaxed (void *addr)
+{
+ return ___sn_readb_relaxed (addr);
+}
+
+unsigned short
+__sn_readw_relaxed (void *addr)
+{
+ return ___sn_readw_relaxed (addr);
+}
+
+unsigned int
+__sn_readl_relaxed (void *addr)
+{
+ return ___sn_readl_relaxed (addr);
+}
+
+unsigned long
+__sn_readq_relaxed (void *addr)
+{
+ return ___sn_readq_relaxed (addr);
}

#endif
diff -Nru a/include/asm-ia64/io.h b/include/asm-ia64/io.h
--- a/include/asm-ia64/io.h Thu Jan 15 11:39:13 2004
+++ b/include/asm-ia64/io.h Thu Jan 15 11:39:13 2004
@@ -125,6 +125,10 @@
#define __ia64_readw ___ia64_readw
#define __ia64_readl ___ia64_readl
#define __ia64_readq ___ia64_readq
+#define __ia64_readb_relaxed ___ia64_readb
+#define __ia64_readw_relaxed ___ia64_readw
+#define __ia64_readl_relaxed ___ia64_readl
+#define __ia64_readq_relaxed ___ia64_readq
#define __ia64_writeb ___ia64_writeb
#define __ia64_writew ___ia64_writew
#define __ia64_writel ___ia64_writel
@@ -337,15 +341,27 @@
#define __readw platform_readw
#define __readl platform_readl
#define __readq platform_readq
+#define __readb_relaxed platform_readb_relaxed
+#define __readw_relaxed platform_readw_relaxed
+#define __readl_relaxed platform_readl_relaxed
+#define __readq_relaxed platform_readq_relaxed

#define readb(a) __readb((void *)(a))
#define readw(a) __readw((void *)(a))
#define readl(a) __readl((void *)(a))
#define readq(a) __readq((void *)(a))
+#define readb_relaxed(a) __readb_relaxed((void *)(a))
+#define readw_relaxed(a) __readw_relaxed((void *)(a))
+#define readl_relaxed(a) __readl_relaxed((void *)(a))
+#define readq_relaxed(a) __readq_relaxed((void *)(a))
#define __raw_readb readb
#define __raw_readw readw
#define __raw_readl readl
#define __raw_readq readq
+#define __raw_readb_relaxed readb_relaxed
+#define __raw_readw_relaxed readw_relaxed
+#define __raw_readl_relaxed readl_relaxed
+#define __raw_readq_relaxed readq_relaxed
#define writeb(v,a) __writeb((v), (void *) (a))
#define writew(v,a) __writew((v), (void *) (a))
#define writel(v,a) __writel((v), (void *) (a))
diff -Nru a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
--- a/include/asm-ia64/machvec.h Thu Jan 15 11:39:13 2004
+++ b/include/asm-ia64/machvec.h Thu Jan 15 11:39:13 2004
@@ -64,6 +64,10 @@
typedef unsigned short ia64_mv_readw_t (void *);
typedef unsigned int ia64_mv_readl_t (void *);
typedef unsigned long ia64_mv_readq_t (void *);
+typedef unsigned char ia64_mv_readb_relaxed_t (void *);
+typedef unsigned short ia64_mv_readw_relaxed_t (void *);
+typedef unsigned int ia64_mv_readl_relaxed_t (void *);
+typedef unsigned long ia64_mv_readq_relaxed_t (void *);

extern void machvec_noop (void);
extern void machvec_memory_fence (void);
@@ -114,6 +118,10 @@
# define platform_readw ia64_mv.readw
# define platform_readl ia64_mv.readl
# define platform_readq ia64_mv.readq
+# define platform_readb_relaxed ia64_mv.readb_relaxed
+# define platform_readw_relaxed ia64_mv.readw_relaxed
+# define platform_readl_relaxed ia64_mv.readl_relaxed
+# define platform_readq_relaxed ia64_mv.readq_relaxed
# endif

/* __attribute__((__aligned__(16))) is required to make size of the
@@ -155,6 +163,10 @@
ia64_mv_readw_t *readw;
ia64_mv_readl_t *readl;
ia64_mv_readq_t *readq;
+ ia64_mv_readb_relaxed_t *readb_relaxed;
+ ia64_mv_readw_relaxed_t *readw_relaxed;
+ ia64_mv_readl_relaxed_t *readl_relaxed;
+ ia64_mv_readq_relaxed_t *readq_relaxed;
} __attribute__((__aligned__(16))); /* align attrib? see above comment */

#define MACHVEC_INIT(name) \
@@ -192,6 +204,10 @@
platform_readw, \
platform_readl, \
platform_readq, \
+ platform_readb_relaxed, \
+ platform_readw_relaxed, \
+ platform_readl_relaxed, \
+ platform_readq_relaxed, \
}

extern struct ia64_machine_vector ia64_mv;
@@ -314,6 +330,18 @@
#endif
#ifndef platform_readq
# define platform_readq __ia64_readq
+#endif
+#ifndef platform_readb_relaxed
+# define platform_readb_relaxed __ia64_readb_relaxed
+#endif
+#ifndef platform_readw_relaxed
+# define platform_readw_relaxed __ia64_readw_relaxed
+#endif
+#ifndef platform_readl_relaxed
+# define platform_readl_relaxed __ia64_readl_relaxed
+#endif
+#ifndef platform_readq_relaxed
+# define platform_readq_relaxed __ia64_readq_relaxed
#endif

#endif /* _ASM_IA64_MACHVEC_H */
diff -Nru a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
--- a/include/asm-ia64/machvec_sn2.h Thu Jan 15 11:39:13 2004
+++ b/include/asm-ia64/machvec_sn2.h Thu Jan 15 11:39:13 2004
@@ -51,6 +51,10 @@
extern ia64_mv_readw_t __sn_readw;
extern ia64_mv_readl_t __sn_readl;
extern ia64_mv_readq_t __sn_readq;
+extern ia64_mv_readb_t __sn_readb_relaxed;
+extern ia64_mv_readw_t __sn_readw_relaxed;
+extern ia64_mv_readl_t __sn_readl_relaxed;
+extern ia64_mv_readq_t __sn_readq_relaxed;
extern ia64_mv_dma_alloc_coherent sn_dma_alloc_coherent;
extern ia64_mv_dma_free_coherent sn_dma_free_coherent;
extern ia64_mv_dma_map_single sn_dma_map_single;
@@ -85,6 +89,10 @@
#define platform_readw __sn_readw
#define platform_readl __sn_readl
#define platform_readq __sn_readq
+#define platform_readb_relaxed __sn_readb_relaxed
+#define platform_readw_relaxed __sn_readw_relaxed
+#define platform_readl_relaxed __sn_readl_relaxed
+#define platform_readq_relaxed __sn_readq_relaxed
#define platform_irq_desc sn_irq_desc
#define platform_irq_to_vector sn_irq_to_vector
#define platform_local_vector_to_irq sn_local_vector_to_irq
diff -Nru a/include/asm-ia64/sn/sn2/io.h b/include/asm-ia64/sn/sn2/io.h
--- a/include/asm-ia64/sn/sn2/io.h Thu Jan 15 11:39:13 2004
+++ b/include/asm-ia64/sn/sn2/io.h Thu Jan 15 11:39:13 2004
@@ -27,6 +27,10 @@
#define __sn_readw ___sn_readw
#define __sn_readl ___sn_readl
#define __sn_readq ___sn_readq
+#define __sn_readb_relaxed ___sn_readb_relaxed
+#define __sn_readw_relaxed ___sn_readw_relaxed
+#define __sn_readl_relaxed ___sn_readl_relaxed
+#define __sn_readq_relaxed ___sn_readq_relaxed

/*
* The following routines are SN Platform specific, called when
@@ -208,25 +212,25 @@
}

static inline unsigned char
-sn_readb_fast (void *addr)
+___sn_readb_relaxed (void *addr)
{
return *(volatile unsigned char *)addr;
}

static inline unsigned short
-sn_readw_fast (void *addr)
+___sn_readw_relaxed (void *addr)
{
return *(volatile unsigned short *)addr;
}

static inline unsigned int
-sn_readl_fast (void *addr)
+___sn_readl_relaxed (void *addr)
{
return *(volatile unsigned int *) addr;
}

static inline unsigned long
-sn_readq_fast (void *addr)
+___sn_readq_relaxed (void *addr)
{
return *(volatile unsigned long *) addr;
}


2004-01-15 22:16:05

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 12:49:13PM -0800, Jesse Barnes wrote:
> Based on the PIO ordering disucssion, I've come up with the following
> patch. It has the potential to help any platform that has seperate PIO
> and DMA channels, and allows them to be reorderd wrt each other.

This is only significant for DMA writes (inbound) vs. PIO Read returns.
The ZX1 platforms have reordering enabled for outbound DMA (vs PIO
writes) since last summer.

Outside the context of PCI-X Relaxed Ordering, this violates PCI
ordering rules. Any patches to drivers *using* the new readb()
variants in effect work around this violation. I"m ok with that - just
want it to be clear.

PCI-X support will need a different interface
(eg pcix_enable_relaxed_ordering()) to support
it's form of "Relaxed Ordering".

> Some
> SGI MIPS platforms, as well as the SGI Altix (aka sn2) platform behave
> this way, and will thus benefit from this patch.
>
> It adds a new PIO read routine for PIOs that don't have to be ordered
> wrt DMA on the system.
>
> If it looks ok, I'll add in macros for the other arches and send it out
> for inclusion.

It looks ok to me.

thanks,
grant

2004-01-15 22:59:57

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 02:16:40PM -0800, Grant Grundler wrote:
> Outside the context of PCI-X Relaxed Ordering, this violates PCI
> ordering rules. Any patches to drivers *using* the new readb()
> variants in effect work around this violation. I"m ok with that - just
> want it to be clear.

Yep, that's an advantage of this API--you only use it when you know it's
ok to violate those rules.

> PCI-X support will need a different interface
> (eg pcix_enable_relaxed_ordering()) to support
> it's form of "Relaxed Ordering".

Right, seperate issue.

> > If it looks ok, I'll add in macros for the other arches and send it out
> > for inclusion.
>
> It looks ok to me.

Great, thanks.

Jesse

2004-01-16 01:07:00

by Greg KH

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 12:49:13PM -0800, Jesse Barnes wrote:
> Based on the PIO ordering disucssion, I've come up with the following
> patch. It has the potential to help any platform that has seperate PIO
> and DMA channels, and allows them to be reorderd wrt each other. Some
> SGI MIPS platforms, as well as the SGI Altix (aka sn2) platform behave
> this way, and will thus benefit from this patch.
>
> It adds a new PIO read routine for PIOs that don't have to be ordered
> wrt DMA on the system.
>
> If it looks ok, I'll add in macros for the other arches and send it out
> for inclusion.

It looks ok, but it would really be good if we could indicate if the
read actually was successful. Right now some platforms can detect
faults and do not have a way to get that error back to the driver in a
sane manner. If we were to change the read* functions to look something
like:
int readb(void *addr, u8 *data);
it would be a world easier.

Now I'm not saying I want to change the existing interfaces to support
this, that's too much code to change for even me (and is a 2.7 thing.)

Just wanted to put this idea in people's heads that we need to start
planning for something like it.

thanks,

greg k-h

2004-01-16 02:22:10

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 04:32:25PM -0800, Greg KH wrote:
> It looks ok, but it would really be good if we could indicate if the
> read actually was successful. Right now some platforms can detect
> faults and do not have a way to get that error back to the driver in a
> sane manner. If we were to change the read* functions to look something
> like:
> int readb(void *addr, u8 *data);
> it would be a world easier.

At one point, I thought it would be nice if it took a struct device *
too, but that's probably a bit much.

> Now I'm not saying I want to change the existing interfaces to support
> this, that's too much code to change for even me (and is a 2.7 thing.)
>
> Just wanted to put this idea in people's heads that we need to start
> planning for something like it.

Sounds reasonable. It should be helpful.

Thanks,
Jesse

2004-01-16 03:21:03

by Jeremy Higdon

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 02:16:40PM -0800, Grant Grundler wrote:
> On Thu, Jan 15, 2004 at 12:49:13PM -0800, Jesse Barnes wrote:
> > Based on the PIO ordering disucssion, I've come up with the following
> > patch. It has the potential to help any platform that has seperate PIO
> > and DMA channels, and allows them to be reorderd wrt each other.
>
> This is only significant for DMA writes (inbound) vs. PIO Read returns.

Correct.

> The ZX1 platforms have reordering enabled for outbound DMA (vs PIO
> writes) since last summer.

The SGI NUMA platforms do this also. This is always safe, at least
in real life systems, I think (though someone will now undoubtedly
come up with an example where it isn't), as long as CPU updates to
memory made by the CPU prior to issuing the PIO are coherent by the
time the device sees the PIO. If not, then you need some sort of
cache writeback, which is already provided for in APIs.

> Outside the context of PCI-X Relaxed Ordering, this violates PCI
> ordering rules. Any patches to drivers *using* the new readb()
> variants in effect work around this violation. I"m ok with that - just
> want it to be clear.

I would put it a different way. We are currently conforming to
PCI ordering rules using a relatively expensive sw/hw workaround
in the SN versions of readX().
These readX_relaxed() variants allow us to speed up drivers in
cases where DMA write and PIO read ordering is unnecessary or
taken care of some other way (maybe a previous readX call).

So with this patch, we're providing a fast PIO read that violates
PCI ordering rules, to be used only when the ordering rules are
unnecessary.

Btw, in certain situations, this can cut what would be a 50us or
longer PIO read down to about 1us, which is why we're pushing this.

> PCI-X support will need a different interface
> (eg pcix_enable_relaxed_ordering()) to support
> it's form of "Relaxed Ordering".

Right.


Thanks for the reviews.

jeremy

2004-01-16 05:00:20

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 04:32:25PM -0800, Greg KH wrote:
> It looks ok, but it would really be good if we could indicate if the
> read actually was successful. Right now some platforms can detect
> faults and do not have a way to get that error back to the driver in a
> sane manner. If we were to change the read* functions to look something
> like:
> int readb(void *addr, u8 *data);
> it would be a world easier.

I've worked on systems with that kind of an interface and it
really makes a mess of the code. And many of the drivers just
ignored the read return value.

> Now I'm not saying I want to change the existing interfaces to support
> this, that's too much code to change for even me (and is a 2.7 thing.)

I think you'll find it's extremely invasive if it's going to be useful.
The drivers have to be rewritten to check each PIO return value
and then do something intelligent at that point. HPUX had drivers
that did this for "Host Power Fail" support 10 years ago but
it's *very* difficult to get all the error handling right in
each of the code pathes.

My preference is the driver register a "clean up all pending IO and
free related data structures" so it's back to a state as if it hadn't
been started. Then when a PIO read (or write) fails, the mechanism for
detecting the read failure doesn't depend on synchronous errors being
reported/checked by software on each read. ie the mechanism for
detecting the failure *can* be in the PIO read code path but
doesn't have to be if HW has facilities to detect failures.
(I'm thinking of parisc HPMC and ia64 MCA handling).

> Just wanted to put this idea in people's heads that we need to start
> planning for something like it.

yeah - getting to the next level of availability on higher end systems
is hard. I'm not totally convinced it's the right thing for linux
to do, but if someone wants to fund the work, it'll be interesting
to work on.

grant

2004-01-16 05:27:44

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface



On Thu, 15 Jan 2004, Greg KH wrote:
>
> It looks ok, but it would really be good if we could indicate if the
> read actually was successful. Right now some platforms can detect
> faults and do not have a way to get that error back to the driver in a
> sane manner. If we were to change the read* functions to look something
> like:
> int readb(void *addr, u8 *data);
> it would be a world easier.

NOOOO!

Please don't. 99.99% of all uses don't care one whit, and an interface
like the above ends up being total cr*p to use.

If you care about machine check errors, use a special interface for that.
A _really_ special one. Especially as on many systems you'll likely have
to read status registers etc (and clear them before doing the IO) to see
the errors.

So that way you can get errors working, AND it won't actually make normal
code any uglier.

Linus

2004-01-16 05:49:41

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 04:32:25PM -0800, Greg KH wrote:
[ deleted reminder for readb() to return success/fail codes ]
> Just wanted to put this idea in people's heads that we need to start
> planning for something like it.

I just remembered another part of linux 2.4/2.6 that needs revisiting:
DMA mapping routines don't return an error code.
ie pci_map_single() must panic since it can't return a failure.
It was designed that way on purpose to make life easier for driver
writers (and I agree, it has).

(my guess is x86-64 needs this change more urgently than any other arch.)

I'm sure there are other robustness issues too.
Looking for "panic" will probably give alot of them away.
The current 2.6.1 tree has over 1000 panic() calls.
I used "find -name \*.c | fgrep panic\( | wc ".

And for my amusement:
grundler <506>find drivers/scsi -name \*.c | xargs fgrep panic\( | wc
183 1243 14722
grundler <507>find drivers/net -name \*.c | xargs fgrep panic\( | wc
10 53 662

My point is a substantial number of things can be done to improve
robustness besides (or in addition to?) recovering from IO subsystem
crashes.

grant

2004-01-16 17:21:19

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Thu, Jan 15, 2004 at 09:00:10PM -0800, Linus Torvalds wrote:
> If you care about machine check errors, use a special interface for that.
> A _really_ special one. Especially as on many systems you'll likely have
> to read status registers etc (and clear them before doing the IO) to see
> the errors.
>
> So that way you can get errors working, AND it won't actually make normal
> code any uglier.

How about one that allows you to register an error handling function for
a given address range and/or device? That would cover both read() and
write() cases, and would be optional so drivers wouldn't be forced to
become more complicated.

Jesse

2004-01-19 09:31:58

by Hironobu Ishii

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface


----- Original Message -----
From: "Grant Grundler" <[email protected]>


> On Thu, Jan 15, 2004 at 04:32:25PM -0800, Greg KH wrote:
> > It looks ok, but it would really be good if we could indicate if the
> > read actually was successful. Right now some platforms can detect
> > faults and do not have a way to get that error back to the driver in a
> > sane manner. If we were to change the read* functions to look something
> > like:
> > int readb(void *addr, u8 *data);
> > it would be a world easier.
>
> I've worked on systems with that kind of an interface and it
> really makes a mess of the code. And many of the drivers just
> ignored the read return value.

I've also worked on such system.
I understand it's difficult all drivers use that kind of an interface.
But if some common drivers like scsi, FC and LAN drivers use such interface,
it's usefull for most users.

>
> > Now I'm not saying I want to change the existing interfaces to support
> > this, that's too much code to change for even me (and is a 2.7 thing.)
>
> I think you'll find it's extremely invasive if it's going to be useful.
> The drivers have to be rewritten to check each PIO return value
> and then do something intelligent at that point. HPUX had drivers
> that did this for "Host Power Fail" support 10 years ago but
> it's *very* difficult to get all the error handling right in
> each of the code pathes.
>
> My preference is the driver register a "clean up all pending IO and
> free related data structures" so it's back to a state as if it hadn't
> been started. Then when a PIO read (or write) fails, the mechanism for
> detecting the read failure doesn't depend on synchronous errors being
> reported/checked by software on each read. ie the mechanism for
> detecting the failure *can* be in the PIO read code path but
> doesn't have to be if HW has facilities to detect failures.
> (I'm thinking of parisc HPMC and ia64 MCA handling).

But, when the read thread continues without noticing the error
(before the error is asynchronously notified),
the thread runs based on wrong data and may panic.
So I think PIO read error must be notified synchronously.

On the other hand, PIO write error can be notified asynchronously,
because software does not use it.

Hironobu Ishii

2004-01-19 18:21:59

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH] readX_relaxed interface

On Mon, Jan 19, 2004 at 06:31:42PM +0900, Hironobu Ishii wrote:
> But, when the read thread continues without noticing the error
> (before the error is asynchronously notified),

I wasn't suggesting asynchonous notification.

> the thread runs based on wrong data and may panic.

So far I've been assuming resources/IO requests can be cleaned up
more easily in a shared code path. I was assuming the readb() would
call the "cleanup" and then return a "harmless" value (eg. 0 or -1)
that was provided by the driver before hand. I'm more worried
about the code that evaluates the readb() return value than
synchronous notification or cleaning up resources.

Having unique error recovery code after each PIO read did work
but it was not an elegant solution. It was a problem of too much
"unused" code interferring with the regular code path. And it
didn't distinguish sufficiently between code to handle "platform"
errors (failure to talk to a card) vs card errors (card
failed an IO).

I guess I'd need to modify one driver using my proposal
instead of assuming it doesn't matter wether the recovery code
lives immediately after the PIO read or in some common routine.
Problem is I have other issues to deal with right now
even though I've made clear to my management "HW error recovery"
is required for higher levels of availability with linux.

> So I think PIO read error must be notified synchronously.

I agree.

> On the other hand, PIO write error can be notified asynchronously,
> because software does not use it.

yes.

thanks,
grant