2018-04-29 01:56:26

by Guenter Roeck

[permalink] [raw]
Subject: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

Add Raven Ridge root bridge and data fabric PCI IDs.
This is required for amd_pci_dev_to_node_id() and amd_smn_read().

Signed-off-by: Guenter Roeck <[email protected]>
---
This patch is a prerequisite for the second patch in the series.
I'll be happy to apply both patches through hwmon if that is acceptable
(and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
to be available upstream.

Since there is no public documentation available for Raven Ridge,
the PCI IDs are derived from the output of lspci.

arch/x86/kernel/amd_nb.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c88e0b127810..bd33613ecb7c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -14,8 +14,11 @@
#include <asm/amd_nb.h>

#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
+#define PCI_DEVICE_ID_AMD_17H_RR_DF_F4 0x15ec

/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -24,6 +27,7 @@ static u32 *flush_words;

static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_ROOT) },
{}
};

@@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{}
};
@@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
--
2.7.4



2018-04-29 01:56:15

by Guenter Roeck

[permalink] [raw]
Subject: [PATCH 2/2] hwmon: (k10temp) Use API function to access System Management Network

The SMN (System Management Network) on Family 17h AMD CPUs is also accessed
from other drivers, specifically EDAC. Accessing it directly is racy.
On top of that, accessing the SMN through root bridge 00:00 is wrong on
multi-die CPUs and may result in reading the temperature from the wrong
die. Use available API functions to fix the problem.

For this to work, also change the Raven Ridge PCI device ID to point to
Data Fabric Function 3, since this ID is used by the API functions to
find the CPU node.

Signed-off-by: Guenter Roeck <[email protected]>
---
drivers/hwmon/k10temp.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index b06bb1f90853..00e785afae0d 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -23,6 +23,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <asm/amd_nb.h>
#include <asm/processor.h>

MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
@@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
#endif

-#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
-#define PCI_DEVICE_ID_AMD_17H_RR_NB 0x15d0
+#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
+#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x14eb
#endif

/* CPUID function 0x80000001, ebx */
@@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)

static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
{
- amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
- F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
+ amd_smn_read(amd_pci_dev_to_node_id(pdev),
+ F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
}

static ssize_t temp1_input_show(struct device *dev,
@@ -323,7 +324,7 @@ static const struct pci_device_id k10temp_id_table[] = {
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
- { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
{}
};
MODULE_DEVICE_TABLE(pci, k10temp_id_table);
--
2.7.4


2018-04-29 04:20:02

by Gabriel C

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

2018-04-29 3:54 GMT+02:00 Guenter Roeck <[email protected]>:
> Add Raven Ridge root bridge and data fabric PCI IDs.
> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
>
> Signed-off-by: Guenter Roeck <[email protected]>
> ---
> This patch is a prerequisite for the second patch in the series.
> I'll be happy to apply both patches through hwmon if that is acceptable
> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
> to be available upstream.
>
> Since that there is no public documentation available for Raven Ridge,
> PCI IDs are derived from output of lspci.
>
> arch/x86/kernel/amd_nb.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
> index c88e0b127810..bd33613ecb7c 100644
> --- a/arch/x86/kernel/amd_nb.c
> +++ b/arch/x86/kernel/amd_nb.c
> @@ -14,8 +14,11 @@
> #include <asm/amd_nb.h>
>
> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
> #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F4 0x15ec
>
> /* Protect the PCI config register pairs used for SMN and DF indirect access. */
> static DEFINE_MUTEX(smn_mutex);
> @@ -24,6 +27,7 @@ static u32 *flush_words;
>
> static const struct pci_device_id amd_root_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_ROOT) },
> {}
> };
>
> @@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
> {}
> };
> @@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
> {}
> };
> --
> 2.7.4
>

Works fine for me on top stable and on top v4.17-rc2-398-gcdface520934

Tested-by: Gabriel Craciunescu <[email protected]>


Regards

2018-04-29 04:20:02

by Gabriel C

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

2018-04-29 3:54 GMT+02:00 Guenter Roeck <[email protected]>:
> Add Raven Ridge root bridge and data fabric PCI IDs.
> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
>
> Signed-off-by: Guenter Roeck <[email protected]>
> ---
> This patch is a prerequisite for the second patch in the series.
> I'll be happy to apply both patches through hwmon if that is acceptable
> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
> to be available upstream.
>
> Since that there is no public documentation available for Raven Ridge,
> PCI IDs are derived from output of lspci.
>
> arch/x86/kernel/amd_nb.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
> index c88e0b127810..bd33613ecb7c 100644
> --- a/arch/x86/kernel/amd_nb.c
> +++ b/arch/x86/kernel/amd_nb.c
> @@ -14,8 +14,11 @@
> #include <asm/amd_nb.h>
>
> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
> #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F4 0x15ec
>
> /* Protect the PCI config register pairs used for SMN and DF indirect access. */
> static DEFINE_MUTEX(smn_mutex);
> @@ -24,6 +27,7 @@ static u32 *flush_words;
>
> static const struct pci_device_id amd_root_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_ROOT) },
> {}
> };
>
> @@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
> {}
> };
> @@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
> + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F4) },
> { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
> {}
> };
> --
> 2.7.4
>

Works fine for me on top stable and on top v4.17-rc2-398-gcdface520934

Tested-by: Gabriel Craciunescu <[email protected]>

Regards

2018-04-29 04:34:48

by Gabriel C

[permalink] [raw]
Subject: Re: [PATCH 2/2] hwmon: (k10temp) Use API function to access System Management Network

2018-04-29 3:54 GMT+02:00 Guenter Roeck <[email protected]>:
> The SMN (System Management Network) on Family 17h AMD CPUs is also accessed
> from other drivers, specifically EDAC. Accessing it directly is racy.
> On top of that, accessing the SMN through root bridge 00:00 is wrong on
> multi-die CPUs and may result in reading the temperature from the wrong
> die. Use available API functions to fix the problem.
>
> For this to work, also change the Raven Ridge PCI device ID to point to
> Data Fabric Function 3, since this ID is used by the API functions to
> find the CPU node.
>
> Signed-off-by: Guenter Roeck <[email protected]>
> ---
> drivers/hwmon/k10temp.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
> index b06bb1f90853..00e785afae0d 100644
> --- a/drivers/hwmon/k10temp.c
> +++ b/drivers/hwmon/k10temp.c
> @@ -23,6 +23,7 @@
> #include <linux/init.h>
> #include <linux/module.h>
> #include <linux/pci.h>
> +#include <asm/amd_nb.h>
> #include <asm/processor.h>
>
> MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
> @@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
> #endif
>
> -#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
> -#define PCI_DEVICE_ID_AMD_17H_RR_NB 0x15d0
> +#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x14eb
> #endif
>
> /* CPUID function 0x80000001, ebx */
> @@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>
> static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
> {
> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
> - F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
> + amd_smn_read(amd_pci_dev_to_node_id(pdev),
> + F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
> }
>
> static ssize_t temp1_input_show(struct device *dev,
> @@ -323,7 +324,7 @@ static const struct pci_device_id k10temp_id_table[] = {
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
> - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
> + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> {}
> };
> MODULE_DEVICE_TABLE(pci, k10temp_id_table);
> --
> 2.7.4
>

Works fine for me on top stable and on top v4.17-rc2-398-gcdface520934

Tested-by: Gabriel Craciunescu <[email protected]>

Regards

2018-04-29 17:48:51

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH 2/2] hwmon: (k10temp) Use API function to access System Management Network

On 04/28/2018 06:54 PM, Guenter Roeck wrote:
> The SMN (System Management Network) on Family 17h AMD CPUs is also accessed
> from other drivers, specifically EDAC. Accessing it directly is racy.
> On top of that, accessing the SMN through root bridge 00:00 is wrong on
> multi-die CPUs and may result in reading the temperature from the wrong
> die. Use available API functions to fix the problem.
>
> For this to work, also change the Raven Ridge PCI device ID to point to
> Data Fabric Function 3, since this ID is used by the API functions to
> find the CPU node.
>
> Signed-off-by: Guenter Roeck <[email protected]>
> ---
> drivers/hwmon/k10temp.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
> index b06bb1f90853..00e785afae0d 100644
> --- a/drivers/hwmon/k10temp.c
> +++ b/drivers/hwmon/k10temp.c
> @@ -23,6 +23,7 @@
> #include <linux/init.h>
> #include <linux/module.h>
> #include <linux/pci.h>
> +#include <asm/amd_nb.h>
> #include <asm/processor.h>
>
> MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
> @@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
> #endif
>
> -#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
> -#define PCI_DEVICE_ID_AMD_17H_RR_NB 0x15d0
> +#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x14eb

This should have been 0x15eb. I'll resend after a week or so, waiting for more feedback.

Guenter

> #endif
>
> /* CPUID function 0x80000001, ebx */
> @@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>
> static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
> {
> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
> - F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
> + amd_smn_read(amd_pci_dev_to_node_id(pdev),
> + F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
> }
>
> static ssize_t temp1_input_show(struct device *dev,
> @@ -323,7 +324,7 @@ static const struct pci_device_id k10temp_id_table[] = {
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
> - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
> + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> {}
> };
> MODULE_DEVICE_TABLE(pci, k10temp_id_table);
>


2018-04-29 17:54:30

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

On Sat, Apr 28, 2018 at 06:54:38PM -0700, Guenter Roeck wrote:
> Add Raven Ridge root bridge and data fabric PCI IDs.
> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
>
> Signed-off-by: Guenter Roeck <[email protected]>
> ---
> This patch is a prerequisite for the second patch in the series.
> I'll be happy to apply both patches through hwmon if that is acceptable
> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
> to be available upstream.
>
> Since that there is no public documentation available for Raven Ridge,
> PCI IDs are derived from output of lspci.
>
> arch/x86/kernel/amd_nb.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
> index c88e0b127810..bd33613ecb7c 100644
> --- a/arch/x86/kernel/amd_nb.c
> +++ b/arch/x86/kernel/amd_nb.c
> @@ -14,8 +14,11 @@
> #include <asm/amd_nb.h>
>
> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0

I think the nomenclature we decided upon at the time was

...AMD_<family>H_M<model>H...

PCI_DEVICE_ID_AMD_15H_M10H_F3, for example.

And in this case, it should be

PCI_DEVICE_ID_AMD_17H_M<which model is RV>H_F<PCI function number>

Yazen, which is the first model of Raven Ridge?

Thx.

--
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--

2018-04-29 18:21:43

by Gabriel C

[permalink] [raw]
Subject: Re: [PATCH 2/2] hwmon: (k10temp) Use API function to access System Management Network

2018-04-29 19:46 GMT+02:00 Guenter Roeck <[email protected]>:
> On 04/28/2018 06:54 PM, Guenter Roeck wrote:
>>
>> The SMN (System Management Network) on Family 17h AMD CPUs is also
>> accessed
>> from other drivers, specifically EDAC. Accessing it directly is racy.
>> On top of that, accessing the SMN through root bridge 00:00 is wrong on
>> multi-die CPUs and may result in reading the temperature from the wrong
>> die. Use available API functions to fix the problem.
>>
>> For this to work, also change the Raven Ridge PCI device ID to point to
>> Data Fabric Function 3, since this ID is used by the API functions to
>> find the CPU node.
>>
>> Signed-off-by: Guenter Roeck <[email protected]>
>> ---
>> drivers/hwmon/k10temp.c | 11 ++++++-----
>> 1 file changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
>> index b06bb1f90853..00e785afae0d 100644
>> --- a/drivers/hwmon/k10temp.c
>> +++ b/drivers/hwmon/k10temp.c
>> @@ -23,6 +23,7 @@
>> #include <linux/init.h>
>> #include <linux/module.h>
>> #include <linux/pci.h>
>> +#include <asm/amd_nb.h>
>> #include <asm/processor.h>
>> MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
>> @@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
>> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
>> #endif
>> -#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
>> -#define PCI_DEVICE_ID_AMD_17H_RR_NB 0x15d0
>> +#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
>> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x14eb
>
>
> This should have been 0x15eb. I'll resend after a week or so, waiting for
> more feedback.


re-tested with that too .. Doesn't seem to matter here ..

with original patch:

crazy@ant:~/Work/Linux/linux$ sensors
k10temp-pci-00f3
Adapter: PCI adapter
Tdie: +22.2°C (high = +70.0°C)
Tctl: +22.2°C

k10temp-pci-00e3
Adapter: PCI adapter
Tdie: +23.8°C (high = +70.0°C)
Tctl: +23.8°C

k10temp-pci-00d3
Adapter: PCI adapter
Tdie: +23.0°C (high = +70.0°C)
Tctl: +23.0°C

k10temp-pci-00c3
Adapter: PCI adapter
Tdie: +25.0°C (high = +70.0°C)
Tctl: +25.0°C

k10temp-pci-00fb
Adapter: PCI adapter
Tdie: +22.8°C (high = +70.0°C)
Tctl: +22.8°C

k10temp-pci-00eb
Adapter: PCI adapter
Tdie: +23.2°C (high = +70.0°C)
Tctl: +23.2°C

k10temp-pci-00db
Adapter: PCI adapter
Tdie: +22.8°C (high = +70.0°C)
Tctl: +22.8°C

k10temp-pci-00cb
Adapter: PCI adapter
Tdie: +22.6°C (high = +70.0°C)
Tctl: +22.6°C

now with 0x15eb

crazy@ant:~/Work/Linux/linux$ sudo rmmod k10temp
crazy@ant:~/Work/Linux/linux$ git grep -w PCI_DEVICE_ID_AMD_17H_RR_DF_F3
arch/x86/kernel/amd_nb.c:#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
arch/x86/kernel/amd_nb.c: { PCI_DEVICE(PCI_VENDOR_ID_AMD,
PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
drivers/hwmon/k10temp.c:#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
drivers/hwmon/k10temp.c:#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
drivers/hwmon/k10temp.c: { PCI_VDEVICE(AMD,
PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
crazy@ant:~/Work/Linux/linux$ sudo insmod ./drivers/hwmon/k10temp.ko
crazy@ant:~/Work/Linux/linux$ sensors
k10temp-pci-00f3
Adapter: PCI adapter
Tdie: +22.2°C (high = +70.0°C)
Tctl: +22.2°C

k10temp-pci-00e3
Adapter: PCI adapter
Tdie: +23.8°C (high = +70.0°C)
Tctl: +23.8°C

k10temp-pci-00d3
Adapter: PCI adapter
Tdie: +23.0°C (high = +70.0°C)
Tctl: +23.0°C

k10temp-pci-00c3
Adapter: PCI adapter
Tdie: +25.0°C (high = +70.0°C)
Tctl: +25.0°C

k10temp-pci-00fb
Adapter: PCI adapter
Tdie: +22.9°C (high = +70.0°C)
Tctl: +22.9°C

k10temp-pci-00eb
Adapter: PCI adapter
Tdie: +23.2°C (high = +70.0°C)
Tctl: +23.2°C

k10temp-pci-00db
Adapter: PCI adapter
Tdie: +22.8°C (high = +70.0°C)
Tctl: +22.8°C

k10temp-pci-00cb
Adapter: PCI adapter
Tdie: +22.8°C (high = +70.0°C)
Tctl: +22.8°C


>
>> #endif
>> /* CPUID function 0x80000001, ebx */
>> @@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev,
>> u32 *regval)
>> static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
>> {
>> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
>> - F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
>> + amd_smn_read(amd_pci_dev_to_node_id(pdev),
>> + F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
>> }
>> static ssize_t temp1_input_show(struct device *dev,
>> @@ -323,7 +324,7 @@ static const struct pci_device_id k10temp_id_table[] =
>> {
>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
>> - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
>> + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
>> {}
>> };
>> MODULE_DEVICE_TABLE(pci, k10temp_id_table);
>>
>

2018-04-29 18:24:01

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH 2/2] hwmon: (k10temp) Use API function to access System Management Network

On 04/29/2018 11:19 AM, Gabriel C wrote:
> 2018-04-29 19:46 GMT+02:00 Guenter Roeck <[email protected]>:
>> On 04/28/2018 06:54 PM, Guenter Roeck wrote:
>>>
>>> The SMN (System Management Network) on Family 17h AMD CPUs is also
>>> accessed
>>> from other drivers, specifically EDAC. Accessing it directly is racy.
>>> On top of that, accessing the SMN through root bridge 00:00 is wrong on
>>> multi-die CPUs and may result in reading the temperature from the wrong
>>> die. Use available API functions to fix the problem.
>>>
>>> For this to work, also change the Raven Ridge PCI device ID to point to
>>> Data Fabric Function 3, since this ID is used by the API functions to
>>> find the CPU node.
>>>
>>> Signed-off-by: Guenter Roeck <[email protected]>
>>> ---
>>> drivers/hwmon/k10temp.c | 11 ++++++-----
>>> 1 file changed, 6 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
>>> index b06bb1f90853..00e785afae0d 100644
>>> --- a/drivers/hwmon/k10temp.c
>>> +++ b/drivers/hwmon/k10temp.c
>>> @@ -23,6 +23,7 @@
>>> #include <linux/init.h>
>>> #include <linux/module.h>
>>> #include <linux/pci.h>
>>> +#include <asm/amd_nb.h>
>>> #include <asm/processor.h>
>>> MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor");
>>> @@ -40,8 +41,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
>>> #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
>>> #endif
>>> -#ifndef PCI_DEVICE_ID_AMD_17H_RR_NB
>>> -#define PCI_DEVICE_ID_AMD_17H_RR_NB 0x15d0
>>> +#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
>>> +#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x14eb
>>
>>
>> This should have been 0x15eb. I'll resend after a week or so, waiting for
>> more feedback.
>
>
> re-tested with that too .. Doesn't seems to matter here ..
>

Yes, that only matters for Raven Ridge CPUs (eg 2200G, 2400G).

Thanks,
Guenter

> with original patch:
>
> crazy@ant:~/Work/Linux/linux$ sensors
> k10temp-pci-00f3
> Adapter: PCI adapter
> Tdie: +22.2°C (high = +70.0°C)
> Tctl: +22.2°C
>
> k10temp-pci-00e3
> Adapter: PCI adapter
> Tdie: +23.8°C (high = +70.0°C)
> Tctl: +23.8°C
>
> k10temp-pci-00d3
> Adapter: PCI adapter
> Tdie: +23.0°C (high = +70.0°C)
> Tctl: +23.0°C
>
> k10temp-pci-00c3
> Adapter: PCI adapter
> Tdie: +25.0°C (high = +70.0°C)
> Tctl: +25.0°C
>
> k10temp-pci-00fb
> Adapter: PCI adapter
> Tdie: +22.8°C (high = +70.0°C)
> Tctl: +22.8°C
>
> k10temp-pci-00eb
> Adapter: PCI adapter
> Tdie: +23.2°C (high = +70.0°C)
> Tctl: +23.2°C
>
> k10temp-pci-00db
> Adapter: PCI adapter
> Tdie: +22.8°C (high = +70.0°C)
> Tctl: +22.8°C
>
> k10temp-pci-00cb
> Adapter: PCI adapter
> Tdie: +22.6°C (high = +70.0°C)
> Tctl: +22.6°C
>
> now with 0x15eb
>
> crazy@ant:~/Work/Linux/linux$ sudo rmmod k10temp
> crazy@ant:~/Work/Linux/linux$ git grep -w PCI_DEVICE_ID_AMD_17H_RR_DF_F3
> arch/x86/kernel/amd_nb.c:#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
> arch/x86/kernel/amd_nb.c: { PCI_DEVICE(PCI_VENDOR_ID_AMD,
> PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> drivers/hwmon/k10temp.c:#ifndef PCI_DEVICE_ID_AMD_17H_RR_DF_F3
> drivers/hwmon/k10temp.c:#define PCI_DEVICE_ID_AMD_17H_RR_DF_F3 0x15eb
> drivers/hwmon/k10temp.c: { PCI_VDEVICE(AMD,
> PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
> crazy@ant:~/Work/Linux/linux$ sudo insmod ./drivers/hwmon/k10temp.ko
> crazy@ant:~/Work/Linux/linux$ sensors
> k10temp-pci-00f3
> Adapter: PCI adapter
> Tdie: +22.2°C (high = +70.0°C)
> Tctl: +22.2°C
>
> k10temp-pci-00e3
> Adapter: PCI adapter
> Tdie: +23.8°C (high = +70.0°C)
> Tctl: +23.8°C
>
> k10temp-pci-00d3
> Adapter: PCI adapter
> Tdie: +23.0°C (high = +70.0°C)
> Tctl: +23.0°C
>
> k10temp-pci-00c3
> Adapter: PCI adapter
> Tdie: +25.0°C (high = +70.0°C)
> Tctl: +25.0°C
>
> k10temp-pci-00fb
> Adapter: PCI adapter
> Tdie: +22.9°C (high = +70.0°C)
> Tctl: +22.9°C
>
> k10temp-pci-00eb
> Adapter: PCI adapter
> Tdie: +23.2°C (high = +70.0°C)
> Tctl: +23.2°C
>
> k10temp-pci-00db
> Adapter: PCI adapter
> Tdie: +22.8°C (high = +70.0°C)
> Tctl: +22.8°C
>
> k10temp-pci-00cb
> Adapter: PCI adapter
> Tdie: +22.8°C (high = +70.0°C)
> Tctl: +22.8°C
>
>
>>
>>> #endif
>>> /* CPUID function 0x80000001, ebx */
>>> @@ -136,8 +137,8 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev,
>>> u32 *regval)
>>> static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval)
>>> {
>>> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60,
>>> - F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
>>> + amd_smn_read(amd_pci_dev_to_node_id(pdev),
>>> + F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval);
>>> }
>>> static ssize_t temp1_input_show(struct device *dev,
>>> @@ -323,7 +324,7 @@ static const struct pci_device_id k10temp_id_table[] =
>>> {
>>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
>>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
>>> { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
>>> - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_NB) },
>>> + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_RR_DF_F3) },
>>> {}
>>> };
>>> MODULE_DEVICE_TABLE(pci, k10temp_id_table);
>>>
>>
>


2018-04-29 18:25:17

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

On 04/29/2018 10:53 AM, Borislav Petkov wrote:
> On Sat, Apr 28, 2018 at 06:54:38PM -0700, Guenter Roeck wrote:
>> Add Raven Ridge root bridge and data fabric PCI IDs.
>> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
>>
>> Signed-off-by: Guenter Roeck <[email protected]>
>> ---
>> This patch is a prerequisite for the second patch in the series.
>> I'll be happy to apply both patches through hwmon if that is acceptable
>> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
>> to be available upstream.
>>
>> Since that there is no public documentation available for Raven Ridge,
>> PCI IDs are derived from output of lspci.
>>
>> arch/x86/kernel/amd_nb.c | 6 ++++++
>> 1 file changed, 6 insertions(+)
>>
>> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
>> index c88e0b127810..bd33613ecb7c 100644
>> --- a/arch/x86/kernel/amd_nb.c
>> +++ b/arch/x86/kernel/amd_nb.c
>> @@ -14,8 +14,11 @@
>> #include <asm/amd_nb.h>
>>
>> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
>> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
>
> I think the nomenclature we decided upon at the time was
>
> ...AMD_<family>H_M<model>H...
>
> PCI_DEVICE_ID_AMD_15H_M10H_F3, for example.
>
> And in this case, it should be
>
> PCI_DEVICE_ID_AMD_17H_M<which model is RV>H_F<PCI function number>
>

Makes sense.

> Yazen, which is the first model of Raven Ridge?
>

2400G is model 17 (0x11). I was unable to find out whether there are
other chips/models using the same set of PCI IDs.

I'll wait for additional feedback before resending.

Thanks,
Guenter


2018-04-30 15:40:56

by Yazen Ghannam

[permalink] [raw]
Subject: RE: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

> -----Original Message-----
> From: Guenter Roeck <[email protected]> On Behalf Of Guenter Roeck
> Sent: Sunday, April 29, 2018 2:24 PM
> To: Borislav Petkov <[email protected]>; Ghannam, Yazen
> <[email protected]>
> Cc: Thomas Gleixner <[email protected]>; Clemens Ladisch
> <[email protected]>; [email protected]; Jean Delvare <[email protected]>;
> [email protected]; [email protected]; Woods, Brian
> <[email protected]>
> Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs
>
> On 04/29/2018 10:53 AM, Borislav Petkov wrote:
> > On Sat, Apr 28, 2018 at 06:54:38PM -0700, Guenter Roeck wrote:
> >> Add Raven Ridge root bridge and data fabric PCI IDs.
> >> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
> >>
> >> Signed-off-by: Guenter Roeck <[email protected]>
> >> ---
> >> This patch is a prerequisite for the second patch in the series.
> >> I'll be happy to apply both patches through hwmon if that is acceptable
> >> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
> >> to be available upstream.
> >>
> >> Since that there is no public documentation available for Raven Ridge,
> >> PCI IDs are derived from output of lspci.
> >>
> >> arch/x86/kernel/amd_nb.c | 6 ++++++
> >> 1 file changed, 6 insertions(+)
> >>
> >> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
> >> index c88e0b127810..bd33613ecb7c 100644
> >> --- a/arch/x86/kernel/amd_nb.c
> >> +++ b/arch/x86/kernel/amd_nb.c
> >> @@ -14,8 +14,11 @@
> >> #include <asm/amd_nb.h>
> >>
> >> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
> >> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
> >
> > I think the nomenclature we decided upon at the time was
> >
> > ...AMD_<family>H_M<model>H...
> >
> > PCI_DEVICE_ID_AMD_15H_M10H_F3, for example.
> >
> > And in this case, it should be
> >
> > PCI_DEVICE_ID_AMD_17H_M<which model is RV>H_F<PCI function
> number>
> >
>
> Makes sense.
>
> > Yazen, which is the first model of Raven Ridge?
> >
>
> 2400G is model 17 (0x11). I was unable to find information if there are
> other chips/models using the same set of PCI IDs.
>
> I'll wait for additional feedback before resending.

There are other models using the same set of IDs, but the first is 10h.

Also, the F3 IDs for Fam17h and Fam17hMod10h are used in both amd_nb.c
and k10temp.c. Can you please delete the F3 IDs from these files and add
them to "include/linux/pci_ids.h"?

Thanks!

-Yazen

2018-04-30 16:02:28

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs

On Mon, Apr 30, 2018 at 03:38:59PM +0000, Ghannam, Yazen wrote:
> > -----Original Message-----
> > From: Guenter Roeck <[email protected]> On Behalf Of Guenter Roeck
> > Sent: Sunday, April 29, 2018 2:24 PM
> > To: Borislav Petkov <[email protected]>; Ghannam, Yazen
> > <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>; Clemens Ladisch
> > <[email protected]>; [email protected]; Jean Delvare <[email protected]>;
> > [email protected]; [email protected]; Woods, Brian
> > <[email protected]>
> > Subject: Re: [PATCH 1/2] x86/amd_nb: Add support for Raven Ridge CPUs
> >
> > On 04/29/2018 10:53 AM, Borislav Petkov wrote:
> > > On Sat, Apr 28, 2018 at 06:54:38PM -0700, Guenter Roeck wrote:
> > >> Add Raven Ridge root bridge and data fabric PCI IDs.
> > >> This is required for amd_pci_dev_to_node_id() and amd_smn_read().
> > >>
> > >> Signed-off-by: Guenter Roeck <[email protected]>
> > >> ---
> > >> This patch is a prerequisite for the second patch in the series.
> > >> I'll be happy to apply both patches through hwmon if that is acceptable
> > >> (and Cc: stable for 4.16+). If not, I'll be happy to wait for this patch
> > >> to be available upstream.
> > >>
> > >> Since that there is no public documentation available for Raven Ridge,
> > >> PCI IDs are derived from output of lspci.
> > >>
> > >> arch/x86/kernel/amd_nb.c | 6 ++++++
> > >> 1 file changed, 6 insertions(+)
> > >>
> > >> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
> > >> index c88e0b127810..bd33613ecb7c 100644
> > >> --- a/arch/x86/kernel/amd_nb.c
> > >> +++ b/arch/x86/kernel/amd_nb.c
> > >> @@ -14,8 +14,11 @@
> > >> #include <asm/amd_nb.h>
> > >>
> > >> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
> > >> +#define PCI_DEVICE_ID_AMD_17H_RR_ROOT 0x15d0
> > >
> > > I think the nomenclature we decided upon at the time was
> > >
> > > ...AMD_<family>H_M<model>H...
> > >
> > > PCI_DEVICE_ID_AMD_15H_M10H_F3, for example.
> > >
> > > And in this case, it should be
> > >
> > > PCI_DEVICE_ID_AMD_17H_M<which model is RV>H_F<PCI function
> > number>
> > >
> >
> > Makes sense.
> >
> > > Yazen, which is the first model of Raven Ridge?
> > >
> >
> > 2400G is model 17 (0x11). I was unable to find information if there are
> > other chips/models using the same set of PCI IDs.
> >
> > I'll wait for additional feedback before resending.
>
> There are other models using the same set of IDs, but the first is 10h.
>
Ok, I'll make it PCI_DEVICE_ID_AMD_17H_M10H_ROOT and
PCI_DEVICE_ID_AMD_17H_M10H_DF_F{3,4}.

> Also, the F3 IDs for Fam17h and Fam17hMod10h are used in both amd_nb.c
> and k10temp.c. Can you please delete the F3 IDs from these files and add
> them to "include/linux/pci_ids.h"?
>
I'll be happy to do that. However, my preference would be to do that
in a separate patch. I would like to see this patch and patch 2/2
applied to 4.16+ since together they fix a potential race condition
as well as temperature reporting problems with multi-die Zen chips
(Threadripper and EPYC). The pci_ids.h changes are not bug fixes
and would be more appropriate for v4.18.

Thanks,
Guenter

> Thanks!
>
> -Yazen