According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.
Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.
Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using less TCE entries.
Signed-off-by: Leonardo Bras <[email protected]>
---
arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
1 file changed, 42 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..6cda1c92597d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,20 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
};
+#define QUERY_DDW_PGSIZE_4K 0x01
+#define QUERY_DDW_PGSIZE_64K 0x02
+#define QUERY_DDW_PGSIZE_16M 0x04
+#define QUERY_DDW_PGSIZE_32M 0x08
+#define QUERY_DDW_PGSIZE_64M 0x10
+#define QUERY_DDW_PGSIZE_128M 0x20
+#define QUERY_DDW_PGSIZE_256M 0x40
+#define QUERY_DDW_PGSIZE_16G 0x80
+
+struct iommu_ddw_pagesize {
+ u32 mask;
+ int shift;
+};
+
static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
struct iommu_table_group *table_group;
@@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
ret);
}
+/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+ const struct iommu_ddw_pagesize ddw_pagesize[] = {
+ { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
+ { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
+ { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
+ { QUERY_DDW_PGSIZE_64M, __builtin_ctz(SZ_64M) },
+ { QUERY_DDW_PGSIZE_32M, __builtin_ctz(SZ_32M) },
+ { QUERY_DDW_PGSIZE_16M, __builtin_ctz(SZ_16M) },
+ { QUERY_DDW_PGSIZE_64K, __builtin_ctz(SZ_64K) },
+ { QUERY_DDW_PGSIZE_4K, __builtin_ctz(SZ_4K) }
+ };
+
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ddw_pagesize); i++) {
+ if (query_page_size & ddw_pagesize[i].mask)
+ return ddw_pagesize[i].shift;
+ }
+
+ /* No valid page size found. */
+ return 0;
+}
+
/*
* If the PE supports dynamic dma windows, and there is space for a table
* that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1245,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
goto out_failed;
}
}
- if (query.page_size & 4) {
- page_shift = 24; /* 16MB */
- } else if (query.page_size & 2) {
- page_shift = 16; /* 64kB */
- } else if (query.page_size & 1) {
- page_shift = 12; /* 4kB */
- } else {
+
+ page_shift = iommu_get_page_shift(query.page_size);
+ if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
query.page_size);
goto out_failed;
--
2.30.2
Hi Leonardo,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.12-rc6 next-20210407]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/faa8b10e5b9652dbd56ed8e759a1cc09b95805be
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
git checkout faa8b10e5b9652dbd56ed8e759a1cc09b95805be
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All warnings (new ones prefixed by >>):
In file included from include/vdso/const.h:5,
from include/linux/const.h:4,
from include/linux/bits.h:5,
from include/linux/bitops.h:6,
from include/linux/kernel.h:11,
from include/asm-generic/bug.h:20,
from arch/powerpc/include/asm/bug.h:109,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/gfp.h:5,
from include/linux/slab.h:15,
from arch/powerpc/platforms/pseries/iommu.c:15:
arch/powerpc/platforms/pseries/iommu.c: In function 'iommu_get_page_shift':
>> include/uapi/linux/const.h:20:19: warning: conversion from 'long long unsigned int' to 'unsigned int' changes value from '17179869184' to '0' [-Woverflow]
20 | #define __AC(X,Y) (X##Y)
| ^~~~~~
include/uapi/linux/const.h:21:18: note: in expansion of macro '__AC'
21 | #define _AC(X,Y) __AC(X,Y)
| ^~~~
include/linux/sizes.h:48:19: note: in expansion of macro '_AC'
48 | #define SZ_16G _AC(0x400000000, ULL)
| ^~~
arch/powerpc/platforms/pseries/iommu.c:1120:42: note: in expansion of macro 'SZ_16G'
1120 | { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
| ^~~~~~
vim +20 include/uapi/linux/const.h
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 6
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 7 /* Some constant macros are used in both assembler and
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 8 * C code. Therefore we cannot annotate them always with
6df95fd7ad9a84 include/linux/const.h Randy Dunlap 2007-05-08 9 * 'UL' and other type specifiers unilaterally. We
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 10 * use the following macros to deal with this.
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 11 *
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 12 * Similarly, _AT() will cast an expression with a type in C, but
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 13 * leave it unchanged in asm.
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 14 */
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 15
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 16 #ifdef __ASSEMBLY__
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 17 #define _AC(X,Y) X
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 18 #define _AT(T,X) X
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 19 #else
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 @20 #define __AC(X,Y) (X##Y)
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 21 #define _AC(X,Y) __AC(X,Y)
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 22 #define _AT(T,X) ((T)(X))
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 23 #endif
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 24
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Leonardo Bras <[email protected]> writes:
> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> will let the OS know all possible pagesizes that can be used for creating a
> new DDW.
>
> Currently Linux will only try using 3 of the 8 available options:
> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> 128M, 256M and 16G.
Do we know of any hardware & hypervisor combination that will actually
give us bigger pages?
> Enabling bigger pages would be interesting for direct mapping systems
> with a lot of RAM, while using less TCE entries.
>
> Signed-off-by: Leonardo Bras <[email protected]>
> ---
> arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
> 1 file changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 9fc5217f0c8e..6cda1c92597d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -53,6 +53,20 @@ enum {
> DDW_EXT_QUERY_OUT_SIZE = 2
> };
A comment saying where the values come from would be good.
> +#define QUERY_DDW_PGSIZE_4K 0x01
> +#define QUERY_DDW_PGSIZE_64K 0x02
> +#define QUERY_DDW_PGSIZE_16M 0x04
> +#define QUERY_DDW_PGSIZE_32M 0x08
> +#define QUERY_DDW_PGSIZE_64M 0x10
> +#define QUERY_DDW_PGSIZE_128M 0x20
> +#define QUERY_DDW_PGSIZE_256M 0x40
> +#define QUERY_DDW_PGSIZE_16G 0x80
I'm not sure the #defines really gain us much vs just putting the
literal values in the array below?
> +struct iommu_ddw_pagesize {
> + u32 mask;
> + int shift;
> +};
> +
> static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> {
> struct iommu_table_group *table_group;
> @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> ret);
> }
>
> +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
> +static int iommu_get_page_shift(u32 query_page_size)
> +{
> + const struct iommu_ddw_pagesize ddw_pagesize[] = {
> + { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
> + { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> + { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> + { QUERY_DDW_PGSIZE_64M, __builtin_ctz(SZ_64M) },
> + { QUERY_DDW_PGSIZE_32M, __builtin_ctz(SZ_32M) },
> + { QUERY_DDW_PGSIZE_16M, __builtin_ctz(SZ_16M) },
> + { QUERY_DDW_PGSIZE_64K, __builtin_ctz(SZ_64K) },
> + { QUERY_DDW_PGSIZE_4K, __builtin_ctz(SZ_4K) }
> + };
cheers
Hello Michael, thank you for this feedback!
Comments inline:
On Thu, 2021-04-08 at 15:37 +1000, Michael Ellerman wrote:
> Leonardo Bras <[email protected]> writes:
> > According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> > will let the OS know all possible pagesizes that can be used for creating a
> > new DDW.
> >
> > Currently Linux will only try using 3 of the 8 available options:
> > 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> > 128M, 256M and 16G.
>
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?
>
> > Enabling bigger pages would be interesting for direct mapping systems
> > with a lot of RAM, while using less TCE entries.
> >
> > Signed-off-by: Leonardo Bras <[email protected]>
> > ---
> > arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
> > 1 file changed, 42 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 9fc5217f0c8e..6cda1c92597d 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,6 +53,20 @@ enum {
> > DDW_EXT_QUERY_OUT_SIZE = 2
> > };
>
> A comment saying where the values come from would be good.
Sure, I will add the information about LoPAR.
>
> > +#define QUERY_DDW_PGSIZE_4K 0x01
> > +#define QUERY_DDW_PGSIZE_64K 0x02
> > +#define QUERY_DDW_PGSIZE_16M 0x04
> > +#define QUERY_DDW_PGSIZE_32M 0x08
> > +#define QUERY_DDW_PGSIZE_64M 0x10
> > +#define QUERY_DDW_PGSIZE_128M 0x20
> > +#define QUERY_DDW_PGSIZE_256M 0x40
> > +#define QUERY_DDW_PGSIZE_16G 0x80
>
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?
My v1 did not use the define approach, what do you think of that?
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/[email protected]/
>
> > +struct iommu_ddw_pagesize {
> > + u32 mask;
> > + int shift;
> > +};
> > +
> > static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> > {
> > struct iommu_table_group *table_group;
> > @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> > ret);
> > }
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> > +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
> > +static int iommu_get_page_shift(u32 query_page_size)
> > +{
> > + const struct iommu_ddw_pagesize ddw_pagesize[] = {
> > + { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
> > + { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> > + { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> > + { QUERY_DDW_PGSIZE_64M, __builtin_ctz(SZ_64M) },
> > + { QUERY_DDW_PGSIZE_32M, __builtin_ctz(SZ_32M) },
> > + { QUERY_DDW_PGSIZE_16M, __builtin_ctz(SZ_16M) },
> > + { QUERY_DDW_PGSIZE_64K, __builtin_ctz(SZ_64K) },
> > + { QUERY_DDW_PGSIZE_4K, __builtin_ctz(SZ_4K) }
> > + };
>
>
> cheers
Best regards,
Leonardo Bras
On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
> > > +#define QUERY_DDW_PGSIZE_4K 0x01
> > > +#define QUERY_DDW_PGSIZE_64K 0x02
> > > +#define QUERY_DDW_PGSIZE_16M 0x04
> > > +#define QUERY_DDW_PGSIZE_32M 0x08
> > > +#define QUERY_DDW_PGSIZE_64M 0x10
> > > +#define QUERY_DDW_PGSIZE_128M 0x20
> > > +#define QUERY_DDW_PGSIZE_256M 0x40
> > > +#define QUERY_DDW_PGSIZE_16G 0x80
> >
> > I'm not sure the #defines really gain us much vs just putting the
> > literal values in the array below?
>
> My v1 did not use the define approach, what do you think of that?
> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/[email protected]/
>
>
(of course, it would be that without the pageshift defines also, using
the __builtin_ctz() approach suggested by Alexey.)
On 08/04/2021 15:37, Michael Ellerman wrote:
> Leonardo Bras <[email protected]> writes:
>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>> will let the OS know all possible pagesizes that can be used for creating a
>> new DDW.
>>
>> Currently Linux will only try using 3 of the 8 available options:
>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>> 128M, 256M and 16G.
>
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?
On P8 16MB host pages and 16MB hardware iommu pages worked.
On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
hardware IOMMU pages.
>
>> Enabling bigger pages would be interesting for direct mapping systems
>> with a lot of RAM, while using less TCE entries.
>>
>> Signed-off-by: Leonardo Bras <[email protected]>
>> ---
>> arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
>> 1 file changed, 42 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 9fc5217f0c8e..6cda1c92597d 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -53,6 +53,20 @@ enum {
>> DDW_EXT_QUERY_OUT_SIZE = 2
>> };
>
> A comment saying where the values come from would be good.
>
>> +#define QUERY_DDW_PGSIZE_4K 0x01
>> +#define QUERY_DDW_PGSIZE_64K 0x02
>> +#define QUERY_DDW_PGSIZE_16M 0x04
>> +#define QUERY_DDW_PGSIZE_32M 0x08
>> +#define QUERY_DDW_PGSIZE_64M 0x10
>> +#define QUERY_DDW_PGSIZE_128M 0x20
>> +#define QUERY_DDW_PGSIZE_256M 0x40
>> +#define QUERY_DDW_PGSIZE_16G 0x80
>
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?
Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,
>> +struct iommu_ddw_pagesize {
>> + u32 mask;
>> + int shift;
>> +};
>> +
>> static struct iommu_table_group *iommu_pseries_alloc_group(int node)
>> {
>> struct iommu_table_group *table_group;
>> @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
>> ret);
>> }
>>
>> +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
>> +static int iommu_get_page_shift(u32 query_page_size)
>> +{
>> + const struct iommu_ddw_pagesize ddw_pagesize[] = {
>> + { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
>> + { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
>> + { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
>> + { QUERY_DDW_PGSIZE_64M, __builtin_ctz(SZ_64M) },
>> + { QUERY_DDW_PGSIZE_32M, __builtin_ctz(SZ_32M) },
>> + { QUERY_DDW_PGSIZE_16M, __builtin_ctz(SZ_16M) },
>> + { QUERY_DDW_PGSIZE_64K, __builtin_ctz(SZ_64K) },
>> + { QUERY_DDW_PGSIZE_4K, __builtin_ctz(SZ_4K) }
>> + };
>
>
> cheers
>
--
Alexey
Hi Leonardo,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.12-rc6 next-20210407]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r016-20210407 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/faa8b10e5b9652dbd56ed8e759a1cc09b95805be
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
git checkout faa8b10e5b9652dbd56ed8e759a1cc09b95805be
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from include/vdso/const.h:5,
from include/linux/const.h:4,
from include/linux/bits.h:5,
from include/linux/bitops.h:6,
from include/linux/kernel.h:11,
from include/asm-generic/bug.h:20,
from arch/powerpc/include/asm/bug.h:109,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/gfp.h:5,
from include/linux/slab.h:15,
from arch/powerpc/platforms/pseries/iommu.c:15:
arch/powerpc/platforms/pseries/iommu.c: In function 'iommu_get_page_shift':
>> include/uapi/linux/const.h:20:19: error: conversion from 'long long unsigned int' to 'unsigned int' changes value from '17179869184' to '0' [-Werror=overflow]
20 | #define __AC(X,Y) (X##Y)
| ^~~~~~
include/uapi/linux/const.h:21:18: note: in expansion of macro '__AC'
21 | #define _AC(X,Y) __AC(X,Y)
| ^~~~
include/linux/sizes.h:48:19: note: in expansion of macro '_AC'
48 | #define SZ_16G _AC(0x400000000, ULL)
| ^~~
arch/powerpc/platforms/pseries/iommu.c:1120:42: note: in expansion of macro 'SZ_16G'
1120 | { QUERY_DDW_PGSIZE_16G, __builtin_ctz(SZ_16G) },
| ^~~~~~
cc1: all warnings being treated as errors
vim +20 include/uapi/linux/const.h
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 6
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 7 /* Some constant macros are used in both assembler and
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 8 * C code. Therefore we cannot annotate them always with
6df95fd7ad9a84 include/linux/const.h Randy Dunlap 2007-05-08 9 * 'UL' and other type specifiers unilaterally. We
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 10 * use the following macros to deal with this.
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 11 *
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 12 * Similarly, _AT() will cast an expression with a type in C, but
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 13 * leave it unchanged in asm.
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 14 */
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 15
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 16 #ifdef __ASSEMBLY__
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 17 #define _AC(X,Y) X
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 18 #define _AT(T,X) X
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 19 #else
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 @20 #define __AC(X,Y) (X##Y)
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 21 #define _AC(X,Y) __AC(X,Y)
74ef649fe847fd include/linux/const.h Jeremy Fitzhardinge 2008-01-30 22 #define _AT(T,X) ((T)(X))
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 23 #endif
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal 2007-05-02 24
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Alexey Kardashevskiy <[email protected]> writes:
> On 08/04/2021 15:37, Michael Ellerman wrote:
>> Leonardo Bras <[email protected]> writes:
>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>>> will let the OS know all possible pagesizes that can be used for creating a
>>> new DDW.
>>>
>>> Currently Linux will only try using 3 of the 8 available options:
>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>>> 128M, 256M and 16G.
>>
>> Do we know of any hardware & hypervisor combination that will actually
>> give us bigger pages?
>
>
> On P8 16MB host pages and 16MB hardware iommu pages worked.
>
> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
> hardware IOMMU pages.
The current code already tries 16MB though.
I'm wondering if we're going to ask for larger sizes that have never
been tested and possibly expose bugs. But it sounds like this is mainly
targeted at future platforms.
>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>> index 9fc5217f0c8e..6cda1c92597d 100644
>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>> @@ -53,6 +53,20 @@ enum {
>>> DDW_EXT_QUERY_OUT_SIZE = 2
>>> };
>>
>> A comment saying where the values come from would be good.
>>
>>> +#define QUERY_DDW_PGSIZE_4K 0x01
>>> +#define QUERY_DDW_PGSIZE_64K 0x02
>>> +#define QUERY_DDW_PGSIZE_16M 0x04
>>> +#define QUERY_DDW_PGSIZE_32M 0x08
>>> +#define QUERY_DDW_PGSIZE_64M 0x10
>>> +#define QUERY_DDW_PGSIZE_128M 0x20
>>> +#define QUERY_DDW_PGSIZE_256M 0x40
>>> +#define QUERY_DDW_PGSIZE_16G 0x80
>>
>> I'm not sure the #defines really gain us much vs just putting the
>> literal values in the array below?
>
> Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,
Yeah that's true. But #defining them doesn't make them less magic, if
you only use them in one place :)
cheers
Leonardo Bras <[email protected]> writes:
> On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
>> > > +#define QUERY_DDW_PGSIZE_4K 0x01
>> > > +#define QUERY_DDW_PGSIZE_64K 0x02
>> > > +#define QUERY_DDW_PGSIZE_16M 0x04
>> > > +#define QUERY_DDW_PGSIZE_32M 0x08
>> > > +#define QUERY_DDW_PGSIZE_64M 0x10
>> > > +#define QUERY_DDW_PGSIZE_128M 0x20
>> > > +#define QUERY_DDW_PGSIZE_256M 0x40
>> > > +#define QUERY_DDW_PGSIZE_16G 0x80
>> >
>> > I'm not sure the #defines really gain us much vs just putting the
>> > literal values in the array below?
>>
>> My v1 did not use the define approach, what do you think of that?
>> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/[email protected]/
>>
>>
> (of course, it would be that without the pageshift defines also, using
> the __builtin_ctz() approach suggested by Alexey.)
Yeah I think I like that better.
cheers
On 08/04/2021 19:04, Michael Ellerman wrote:
> Alexey Kardashevskiy <[email protected]> writes:
>> On 08/04/2021 15:37, Michael Ellerman wrote:
>>> Leonardo Bras <[email protected]> writes:
>>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>>>> will let the OS know all possible pagesizes that can be used for creating a
>>>> new DDW.
>>>>
>>>> Currently Linux will only try using 3 of the 8 available options:
>>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>>>> 128M, 256M and 16G.
>>>
>>> Do we know of any hardware & hypervisor combination that will actually
>>> give us bigger pages?
>>
>>
>> On P8 16MB host pages and 16MB hardware iommu pages worked.
>>
>> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
>> hardware IOMMU pages.
>
> The current code already tries 16MB though.
>
> I'm wondering if we're going to ask for larger sizes that have never
> been tested and possibly expose bugs. But it sounds like this is mainly
> targeted at future platforms.
I tried for fun to pass through a PCI device to a guest with this patch as:
pbuild/qemu-killslof-aiku1904le-ppc64/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline \
-nographic \
-vga none \
-enable-kvm \
-m 16G \
-kernel ./vmldbg \
-initrd /home/aik/t/le.cpio \
-device vfio-pci,id=vfio0001_01_00_0,host=0001:01:00.0 \
-mem-prealloc \
-mem-path qemu_hp_1G_node0 \
-global spapr-pci-host-bridge.pgsz=0xffffff000 \
-machine cap-cfpc=broken,cap-ccf-assist=off \
-smp 1,threads=1 \
-L /home/aik/t/qemu-ppc64-bios/ \
-trace events=qemu_trace_events \
-d guest_errors,mmu \
-chardev socket,id=SOCKET0,server=on,wait=off,path=qemu.mon.1_1_0_0 \
-mon chardev=SOCKET0,mode=control
The guest created a huge window:
xhci_hcd 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000
22 22 returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
The first "22" is page_shift in hex (16GB), the second "22" is
window_shift (so we have 1 TCE).
On the host side the window#1 was created with 1GB pages:
pci 0001:01 : [PE# fd] Setting up window#1
800000000000000..80007ffffffffff pg=40000000
The XHCI seems working. Without the patch 16MB was the maximum.
>
>>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>>> index 9fc5217f0c8e..6cda1c92597d 100644
>>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>>> @@ -53,6 +53,20 @@ enum {
>>>> DDW_EXT_QUERY_OUT_SIZE = 2
>>>> };
>>>
>>> A comment saying where the values come from would be good.
>>>
>>>> +#define QUERY_DDW_PGSIZE_4K 0x01
>>>> +#define QUERY_DDW_PGSIZE_64K 0x02
>>>> +#define QUERY_DDW_PGSIZE_16M 0x04
>>>> +#define QUERY_DDW_PGSIZE_32M 0x08
>>>> +#define QUERY_DDW_PGSIZE_64M 0x10
>>>> +#define QUERY_DDW_PGSIZE_128M 0x20
>>>> +#define QUERY_DDW_PGSIZE_256M 0x40
>>>> +#define QUERY_DDW_PGSIZE_16G 0x80
>>>
>>> I'm not sure the #defines really gain us much vs just putting the
>>> literal values in the array below?
>>
>> Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,
>
> Yeah that's true. But #defining them doesn't make them less magic, if
> you only use them in one place :)
Defining them with "QUERY_DDW" in the names kinda tells where they are
from. Can also grep QEMU using these to see how the other side handles
it. Dunno.
btw the bot complained about __builtin_ctz(SZ_16G) which should be
__builtin_ctzl(SZ_16G) so we have to ask Leonardo to repost anyway :)
--
Alexey
On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> On 08/04/2021 19:04, Michael Ellerman wrote:
> >>>>+#define QUERY_DDW_PGSIZE_4K 0x01
> >>>>+#define QUERY_DDW_PGSIZE_64K 0x02
> >>>>+#define QUERY_DDW_PGSIZE_16M 0x04
> >>>>+#define QUERY_DDW_PGSIZE_32M 0x08
> >>>>+#define QUERY_DDW_PGSIZE_64M 0x10
> >>>>+#define QUERY_DDW_PGSIZE_128M 0x20
> >>>>+#define QUERY_DDW_PGSIZE_256M 0x40
> >>>>+#define QUERY_DDW_PGSIZE_16G 0x80
> >>>
> >>>I'm not sure the #defines really gain us much vs just putting the
> >>>literal values in the array below?
> >>
> >>Then someone says "uuuuu magic values" :) I do not mind either way.
> >>Thanks,
> >
> >Yeah that's true. But #defining them doesn't make them less magic, if
> >you only use them in one place :)
>
> Defining them with "QUERY_DDW" in the names kinda tells where they are
> from. Can also grep QEMU using these to see how the other side handles
> it. Dunno.
And *not* defining anything reduces the mental load a lot. You can add
a comment at the single spot you use them, explaining what this is, in a
much better way!
Comments are *good*.
Segher
On Mon, 2021-04-12 at 17:21 -0500, Segher Boessenkool wrote:
> On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> > On 08/04/2021 19:04, Michael Ellerman wrote:
> > > > > > +#define QUERY_DDW_PGSIZE_4K 0x01
> > > > > > +#define QUERY_DDW_PGSIZE_64K 0x02
> > > > > > +#define QUERY_DDW_PGSIZE_16M 0x04
> > > > > > +#define QUERY_DDW_PGSIZE_32M 0x08
> > > > > > +#define QUERY_DDW_PGSIZE_64M 0x10
> > > > > > +#define QUERY_DDW_PGSIZE_128M 0x20
> > > > > > +#define QUERY_DDW_PGSIZE_256M 0x40
> > > > > > +#define QUERY_DDW_PGSIZE_16G 0x80
> > > > >
> > > > > I'm not sure the #defines really gain us much vs just putting the
> > > > > literal values in the array below?
> > > >
> > > > Then someone says "uuuuu magic values" :) I do not mind either way.
> > > > Thanks,
> > >
> > > Yeah that's true. But #defining them doesn't make them less magic, if
> > > you only use them in one place :)
> >
> > Defining them with "QUERY_DDW" in the names kinda tells where they are
> > from. Can also grep QEMU using these to see how the other side handles
> > it. Dunno.
>
> And *not* defining anything reduces the mental load a lot. You can add
> a comment at the single spot you use them, explaining what this is, in a
> much better way!
>
> Comments are *good*.
>
>
> Segher
Thanks for the feedback Alexey, Michael and Segher!
I have sent a v3 for this patch.
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/[email protected]/
Please let me know of your feedback in it.
Best regards,
Leonardo Bras