Add regulator device in pmbus_data & initialize the same during PMBus
regulator register.
Signed-off-by: Naresh Solanki <[email protected]>
---
drivers/hwmon/pmbus/pmbus_core.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index 0ddef2c9ba9b..d93405f1a495 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -81,6 +81,7 @@ struct pmbus_label {
struct pmbus_data {
struct device *dev;
struct device *hwmon_dev;
+ struct regulator_dev **rdevs;
u32 flags; /* from platform data */
@@ -3109,9 +3110,13 @@ static int pmbus_regulator_register(struct pmbus_data *data)
struct device *dev = data->dev;
const struct pmbus_driver_info *info = data->info;
const struct pmbus_platform_data *pdata = dev_get_platdata(dev);
- struct regulator_dev *rdev;
int i;
+ data->rdevs = devm_kzalloc(dev, sizeof(struct regulator_dev *) * info->num_regulators,
+ GFP_KERNEL);
+ if (!data->rdevs)
+ return -ENOMEM;
+
for (i = 0; i < info->num_regulators; i++) {
struct regulator_config config = { };
@@ -3121,10 +3126,10 @@ static int pmbus_regulator_register(struct pmbus_data *data)
if (pdata && pdata->reg_init_data)
config.init_data = &pdata->reg_init_data[i];
- rdev = devm_regulator_register(dev, &info->reg_desc[i],
- &config);
- if (IS_ERR(rdev))
- return dev_err_probe(dev, PTR_ERR(rdev),
+ data->rdevs[i] = devm_regulator_register(dev, &info->reg_desc[i],
+ &config);
+ if (IS_ERR(data->rdevs[i]))
+ return dev_err_probe(dev, PTR_ERR(data->rdevs[i]),
"Failed to register %s regulator\n",
info->reg_desc[i].name);
}
base-commit: 8a863eb1b1162653d133856702e13560f3596b85
--
2.39.1
From: Patrick Rudolph <[email protected]>
Add regulator events corresponding to regulator error in regulator flag
map.
Also capture the same in pmbus_regulator_get_flags.
Signed-off-by: Patrick Rudolph <[email protected]>
Signed-off-by: Naresh Solanki <[email protected]>
---
drivers/hwmon/pmbus/pmbus_core.c | 74 +++++++++++++++++++++-----------
1 file changed, 49 insertions(+), 25 deletions(-)
diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index d93405f1a495..509bc0ef1706 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -2693,9 +2693,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
return 0;
}
-/* A PMBus status flag and the corresponding REGULATOR_ERROR_* flag */
+/* A PMBus status flag and the corresponding REGULATOR_ERROR_* and REGULATOR_EVENTS_* flag */
struct pmbus_status_assoc {
- int pflag, rflag;
+ int pflag, rflag, eflag;
};
/* PMBus->regulator bit mappings for a PMBus status register */
@@ -2710,27 +2710,36 @@ static const struct pmbus_status_category __maybe_unused pmbus_status_flag_map[]
.func = PMBUS_HAVE_STATUS_VOUT,
.reg = PMBUS_STATUS_VOUT,
.bits = (const struct pmbus_status_assoc[]) {
- { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN },
- { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE },
- { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN },
- { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT },
+ { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN,
+ REGULATOR_EVENT_UNDER_VOLTAGE_WARN },
+ { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE,
+ REGULATOR_EVENT_UNDER_VOLTAGE },
+ { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN,
+ REGULATOR_EVENT_OVER_VOLTAGE_WARN },
+ { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT,
+ REGULATOR_EVENT_OVER_VOLTAGE_WARN },
{ },
},
}, {
.func = PMBUS_HAVE_STATUS_IOUT,
.reg = PMBUS_STATUS_IOUT,
.bits = (const struct pmbus_status_assoc[]) {
- { PB_IOUT_OC_WARNING, REGULATOR_ERROR_OVER_CURRENT_WARN },
- { PB_IOUT_OC_FAULT, REGULATOR_ERROR_OVER_CURRENT },
- { PB_IOUT_OC_LV_FAULT, REGULATOR_ERROR_OVER_CURRENT },
+ { PB_IOUT_OC_WARNING, REGULATOR_ERROR_OVER_CURRENT_WARN,
+ REGULATOR_EVENT_OVER_CURRENT_WARN },
+ { PB_IOUT_OC_FAULT, REGULATOR_ERROR_OVER_CURRENT,
+ REGULATOR_EVENT_OVER_CURRENT },
+ { PB_IOUT_OC_LV_FAULT, REGULATOR_ERROR_OVER_CURRENT,
+ REGULATOR_EVENT_OVER_CURRENT },
{ },
},
}, {
.func = PMBUS_HAVE_STATUS_TEMP,
.reg = PMBUS_STATUS_TEMPERATURE,
.bits = (const struct pmbus_status_assoc[]) {
- { PB_TEMP_OT_WARNING, REGULATOR_ERROR_OVER_TEMP_WARN },
- { PB_TEMP_OT_FAULT, REGULATOR_ERROR_OVER_TEMP },
+ { PB_TEMP_OT_WARNING, REGULATOR_ERROR_OVER_TEMP_WARN,
+ REGULATOR_EVENT_OVER_TEMP_WARN },
+ { PB_TEMP_OT_FAULT, REGULATOR_ERROR_OVER_TEMP,
+ REGULATOR_EVENT_OVER_TEMP },
{ },
},
},
@@ -2790,7 +2799,7 @@ static void pmbus_notify(struct pmbus_data *data, int page, int reg, int flags)
}
static int _pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flags,
- bool notify)
+ unsigned int *event, bool notify)
{
int i, status;
const struct pmbus_status_category *cat;
@@ -2800,6 +2809,7 @@ static int _pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flag
int func = data->info->func[page];
*flags = 0;
+ *event = 0;
for (i = 0; i < ARRAY_SIZE(pmbus_status_flag_map); i++) {
cat = &pmbus_status_flag_map[i];
@@ -2810,10 +2820,11 @@ static int _pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flag
if (status < 0)
return status;
- for (bit = cat->bits; bit->pflag; bit++) {
- if (status & bit->pflag)
+ for (bit = cat->bits; bit->pflag; bit++)
+ if (status & bit->pflag) {
*flags |= bit->rflag;
- }
+ *event |= bit->eflag;
+ }
if (notify && status)
pmbus_notify(data, page, cat->reg, status);
@@ -2834,20 +2845,28 @@ static int _pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flag
return status;
if (_pmbus_is_enabled(dev, page)) {
- if (status & PB_STATUS_OFF)
+ if (status & PB_STATUS_OFF) {
*flags |= REGULATOR_ERROR_FAIL;
+ *event |= REGULATOR_EVENT_FAIL;
+ }
- if (status & PB_STATUS_POWER_GOOD_N)
+ if (status & PB_STATUS_POWER_GOOD_N) {
*flags |= REGULATOR_ERROR_REGULATION_OUT;
+ *event |= REGULATOR_EVENT_REGULATION_OUT;
+ }
}
/*
* Unlike most other status bits, PB_STATUS_{IOUT_OC,VOUT_OV} are
* defined strictly as fault indicators (not warnings).
*/
- if (status & PB_STATUS_IOUT_OC)
+ if (status & PB_STATUS_IOUT_OC) {
*flags |= REGULATOR_ERROR_OVER_CURRENT;
- if (status & PB_STATUS_VOUT_OV)
+ *event |= REGULATOR_EVENT_OVER_CURRENT;
+ }
+ if (status & PB_STATUS_VOUT_OV) {
*flags |= REGULATOR_ERROR_REGULATION_OUT;
+ *event |= REGULATOR_EVENT_FAIL;
+ }
/*
* If we haven't discovered any thermal faults or warnings via
@@ -2855,19 +2874,22 @@ static int _pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flag
* a (conservative) best-effort interpretation.
*/
if (!(*flags & (REGULATOR_ERROR_OVER_TEMP | REGULATOR_ERROR_OVER_TEMP_WARN)) &&
- (status & PB_STATUS_TEMPERATURE))
+ (status & PB_STATUS_TEMPERATURE)) {
*flags |= REGULATOR_ERROR_OVER_TEMP_WARN;
+ *event |= REGULATOR_EVENT_OVER_TEMP_WARN;
+ }
+
return 0;
}
static int __maybe_unused pmbus_get_flags(struct pmbus_data *data, u8 page, unsigned int *flags,
- bool notify)
+ unsigned int *event, bool notify)
{
int ret;
mutex_lock(&data->update_lock);
- ret = _pmbus_get_flags(data, page, flags, notify);
+ ret = _pmbus_get_flags(data, page, flags, event, notify);
mutex_unlock(&data->update_lock);
return ret;
@@ -2911,8 +2933,9 @@ static int pmbus_regulator_get_error_flags(struct regulator_dev *rdev, unsigned
struct device *dev = rdev_get_dev(rdev);
struct i2c_client *client = to_i2c_client(dev->parent);
struct pmbus_data *data = i2c_get_clientdata(client);
+ int event;
- return pmbus_get_flags(data, rdev_get_id(rdev), flags, false);
+ return pmbus_get_flags(data, rdev_get_id(rdev), flags, &event, false);
}
static int pmbus_regulator_get_status(struct regulator_dev *rdev)
@@ -3152,10 +3175,11 @@ static irqreturn_t pmbus_fault_handler(int irq, void *pdata)
{
struct pmbus_data *data = pdata;
struct i2c_client *client = to_i2c_client(data->dev);
- int i, status;
+
+ int i, status, event;
mutex_lock(&data->update_lock);
for (i = 0; i < data->info->pages; i++)
- _pmbus_get_flags(data, i, &status, true);
+ _pmbus_get_flags(data, i, &status, &event, true);
pmbus_clear_faults(client);
mutex_unlock(&data->update_lock);
--
2.39.1
Notify regulator events in PMBus irq handler.
Signed-off-by: Naresh Solanki <[email protected]>
---
drivers/hwmon/pmbus/pmbus_core.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index 509bc0ef1706..86cc8001a788 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -3159,11 +3159,29 @@ static int pmbus_regulator_register(struct pmbus_data *data)
return 0;
}
+
+static int pmbus_regulator_notify(struct pmbus_data *data, int page, int event)
+{
+ int j;
+
+ for (j = 0; j < data->info->num_regulators; j++) {
+ if (page == rdev_get_id(data->rdevs[j])) {
+ regulator_notifier_call_chain(data->rdevs[j], event, NULL);
+ break;
+ }
+ }
+ return 0;
+}
#else
static int pmbus_regulator_register(struct pmbus_data *data)
{
return 0;
}
+
+static int pmbus_regulator_notify(struct pmbus_data *data, int page, int event)
+{
+ return 0;
+}
#endif
static int pmbus_write_smbalert_mask(struct i2c_client *client, u8 page, u8 reg, u8 val)
@@ -3178,9 +3196,13 @@ static irqreturn_t pmbus_fault_handler(int irq, void *pdata)
int i, status, event;
mutex_lock(&data->update_lock);
- for (i = 0; i < data->info->pages; i++)
+ for (i = 0; i < data->info->pages; i++) {
_pmbus_get_flags(data, i, &status, &event, true);
+ if (event)
+ pmbus_regulator_notify(data, i, event);
+ }
+
pmbus_clear_faults(client);
mutex_unlock(&data->update_lock);
--
2.39.1
Hi Naresh, all
This mail is maybe more of a question so that I can get on the same page
with the rest of the world than anything else. I just have to ask this
as I am trying to figure out what kind of handling there could be for
regulator errors. I added Mark and couple of others to the CC as I am
under the impression they have done some work with the regulator error
handling lately :)
On 3/28/23 18:03, Naresh Solanki wrote:
> From: Patrick Rudolph <[email protected]>
>
> Add regulator events corresponding to regulator error in regulator flag
> map.
> Also capture the same in pmbus_regulator_get_flags.
>
> Signed-off-by: Patrick Rudolph <[email protected]>
> Signed-off-by: Naresh Solanki <[email protected]>
> ---
> drivers/hwmon/pmbus/pmbus_core.c | 74 +++++++++++++++++++++-----------
> 1 file changed, 49 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
> index d93405f1a495..509bc0ef1706 100644
> --- a/drivers/hwmon/pmbus/pmbus_core.c
> +++ b/drivers/hwmon/pmbus/pmbus_core.c
> @@ -2693,9 +2693,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> return 0;
> }
>
> -/* A PMBus status flag and the corresponding REGULATOR_ERROR_* flag */
> +/* A PMBus status flag and the corresponding REGULATOR_ERROR_* and REGULATOR_EVENTS_* flag */
> struct pmbus_status_assoc {
> - int pflag, rflag;
> + int pflag, rflag, eflag;
> };
>
> /* PMBus->regulator bit mappings for a PMBus status register */
> @@ -2710,27 +2710,36 @@ static const struct pmbus_status_category __maybe_unused pmbus_status_flag_map[]
> .func = PMBUS_HAVE_STATUS_VOUT,
> .reg = PMBUS_STATUS_VOUT,
> .bits = (const struct pmbus_status_assoc[]) {
> - { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN },
> - { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE },
> - { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN },
> - { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT },
> + { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN,
> + REGULATOR_EVENT_UNDER_VOLTAGE_WARN },
> + { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE,
> + REGULATOR_EVENT_UNDER_VOLTAGE },
> + { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN,
> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
> + { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT,
> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
The question I have is: Are these mappings to regulator
errors/notifications always correct?
(I don't know the PMBUS specification in details - and thus I am
_asking_ this here, not telling that the mapping is incorrect).
Let me explain why I am asking this.
What I have gathered is that at least some ICs allow setting for example
'voltage limits' for different PMBUS over-voltage WARNs/FAULTs. I
however don't know if for example the "fatality" of errors indicated by
FAULTS vs WARNs is mandated by any specification - or if a hw designers
have free hands to decide what these events indicate on their board - or
what type of action should be taken when certain ERROR/WARN is emitted.
Then to the handling of regulator errors:
In order to be able to create any handling for the regulator
errors/notifications, we should be able to classify the
errors/notifications at least by the severity. The very fundamental
decision is whether to turn-off the regulator - or even the whole system
- in order to protect the hardware from damage.
There are few other questions related to error handling as well - for
example questions like:
Who should handle error? (we may have many consumers?)
When should consumer use for example forced regulator-off without
knowing the other consumers?
When should we have in-kernel handling for errors?
When should the errors be sent to user-space trusting someone there is
taking care of the situation?
Following is largely my own pondering - and I would like to gain better
understanding while also avoid sending wrong events/errors for detected
HW issues so that we could actually implement recovery actions based on
regulator errors / notifications.
I have been trying to understand how error handling with regulator
events should / could work. In my head (and in the regulator fault
detection limit setting) we have 3 severity categories:
1. PROTECTION:
The most 'severe' type of issue. This is reserved for cases where the
hardware shuts down the regulator(s) without any SW interaction. In most
cases there is no need for notification or error status because soc is
likely to go down when the power is cut off. Border case is when HW
autonomously shuts down a regulator which does not deliver power to any
critical component. I am unsure if such situation should be indicated by
ERROR level notification.
2. ERROR:
Situation where system is no longer usable but the hardware does not do
error handling. I would like to suggest that the proper handling for
this type of events is regulator or system shutdown. I think the
errors/notifications:
#define REGULATOR_ERROR_UNDER_VOLTAGE BIT(1)
#define REGULATOR_ERROR_OVER_CURRENT BIT(2)
#define REGULATOR_ERROR_REGULATION_OUT BIT(3)
#define REGULATOR_ERROR_FAIL BIT(4)
#define REGULATOR_ERROR_OVER_TEMP BIT(5)
#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01
#define REGULATOR_EVENT_OVER_CURRENT 0x02
#define REGULATOR_EVENT_REGULATION_OUT 0x04
#define REGULATOR_EVENT_FAIL 0x08
#define REGULATOR_EVENT_OVER_TEMP 0x10
should only be used to indicate errors with this severity. That would
allow actually implementing the handling for these errors. If these
errors are sent for "less severe" issues, then we will not be able to do
any generic error handling.
3. WARNING:
Situation where something is off-the-spec, but system is still thought
to be usable. Here some - probably board/system (use-case?) specific -
handling may be taking place to prevent things getting worse. I added
following flags for this purpose:
#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000
#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000
#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000
#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000
#define REGULATOR_EVENT_WARN_MASK 0x1E000
#define REGULATOR_ERROR_UNDER_VOLTAGE_WARN BIT(6)
#define REGULATOR_ERROR_OVER_CURRENT_WARN BIT(7)
#define REGULATOR_ERROR_OVER_VOLTAGE_WARN BIT(8)
#define REGULATOR_ERROR_OVER_TEMP_WARN BIT(9)
So, my question(s) are:
1) Is this "classification" sensible and is it still possible?
2) does PMBUS *_WARNING status bits always indicate error which maps to
severity WARNING above? And more importantly
3) does PMBUS *_FAULT status bits always indicate error which maps to
severity ERROR above? Eg, can we assume correct handling for _FAULT is
shutting down the regulator/system?
We have something similar in a few (non PMBUS compatible) PMICs. For
example the ROHM BD9576 has a PROTECTION level error detection
(automatic shutdown by HW) and then another error detection which just
generates an IRQ and allows software to decide what should be done.
While writing the driver for that PMIC my thinking was that the decision
whether IRQ is indicating a fatal error or a warning should be on the
board-designer's table. Thus I implemented it so that the severity and
limit configuration for this error is given via device-tree - and it is
up to board designer to decide whether the fault is ERROR or WARN - and
notification sent by the driver for this IRQ will reflect the given
severity.
I wonder if something like this is needed for PMBUS - or if we can
always say the *_FAULT maps to regulator ERROR and _WARNING maps to
regulator WARNING no matter how board using the IC is designed?
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On 3/28/23 23:48, Matti Vaittinen wrote:
> Hi Naresh, all
>
> This mail is maybe more of a question so that I can get on the same page with the rest of the world than anything else. I just have to ask this as I am trying to figure out what kind of handling there could be for regulator errors. I added Mark and couple of others to the CC as I am under the impression they have done some work with the regulator error handling lately :)
>
> On 3/28/23 18:03, Naresh Solanki wrote:
>> From: Patrick Rudolph <[email protected]>
>>
>> Add regulator events corresponding to regulator error in regulator flag
>> map.
>> Also capture the same in pmbus_regulator_get_flags.
>>
>> Signed-off-by: Patrick Rudolph <[email protected]>
>> Signed-off-by: Naresh Solanki <[email protected]>
>> ---
>> drivers/hwmon/pmbus/pmbus_core.c | 74 +++++++++++++++++++++-----------
>> 1 file changed, 49 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
>> index d93405f1a495..509bc0ef1706 100644
>> --- a/drivers/hwmon/pmbus/pmbus_core.c
>> +++ b/drivers/hwmon/pmbus/pmbus_core.c
>> @@ -2693,9 +2693,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
>> return 0;
>> }
>> -/* A PMBus status flag and the corresponding REGULATOR_ERROR_* flag */
>> +/* A PMBus status flag and the corresponding REGULATOR_ERROR_* and REGULATOR_EVENTS_* flag */
>> struct pmbus_status_assoc {
>> - int pflag, rflag;
>> + int pflag, rflag, eflag;
>> };
>> /* PMBus->regulator bit mappings for a PMBus status register */
>> @@ -2710,27 +2710,36 @@ static const struct pmbus_status_category __maybe_unused pmbus_status_flag_map[]
>> .func = PMBUS_HAVE_STATUS_VOUT,
>> .reg = PMBUS_STATUS_VOUT,
>> .bits = (const struct pmbus_status_assoc[]) {
>> - { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN },
>> - { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE },
>> - { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN },
>> - { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT },
>> + { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN,
>> + REGULATOR_EVENT_UNDER_VOLTAGE_WARN },
>> + { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE,
>> + REGULATOR_EVENT_UNDER_VOLTAGE },
>> + { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN,
>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>> + { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT,
>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>
> The question I have is: Are these mappings to regulator errors/notifications always correct?
>
No, they are not always "correct", and can not be, because each chip implements
the PMBus standard as they see fit.
> (I don't know the PMBUS specification in details - and thus I am _asking_ this here, not telling that the mapping is incorrect).
>
> Let me explain why I am asking this.
>
> What I have gathered is that at least some ICs allow setting for example 'voltage limits' for different PMBUS over-voltage WARNs/FAULTs. I however don't know if for example the "fatality" of errors indicated by FAULTS vs WARNs is mandated by any specification - or if a hw designers have free hands to decide what these events indicate on their board - or what type of action should be taken when certain ERROR/WARN is emitted.
>
PMBus actually specifies which action can and should be taken on faults,
and that action is configurable. That - again - does not mean, however,
that each chip implements this the same way, only that they should.
Possible (standard) actions for voltages are:
- continue operating without interruption
- continue operating for a specified amount of time,
then shut down if fault still exists and optionally
retry
- shut down and optionally retry for a configurable
number of times
- disable output while fault condition is present
(e.g. for temperature faults) and re-enable after
fault condition no longer exists
Similar configuration settings are defined for current faults,
with additional support for current limiting.
> Then to the handling of regulator errors:
>
> In order to be able to create any handling for the regulator errors/notifications, we should be able to classify the errors/notifications at least by the severity. The very fundamental decision is whether to turn-off the regulator - or even the whole system - in order to protect the hardware from damage.
>
> There are few other questions related to error handling as well - for example questions like:
> Who should handle error? (we may have many consumers?)
> When should consumer use for example forced regulator-off without knowing the other consumers?
> When should we have in-kernel handling for errors?
> When should the errors be sent to user-space trusting someone there is taking care of the situation?
>
> Following is largely my own pondering - and I would like to gain better understanding while also avoid sending wrong events/errors for detected HW issues so that we could actually implement recovery actions based on regulator errors / notifications.
>
> I have been trying to understand how error handling with regulator events should / could work. In my head (and in the regulator fault detection limit setting) we have 3 severity categories:
>
> 1. PROTECTION:
> The most 'severe' type of issue. This is reserved for cases where the hardware shuts down the regulator(s) without any SW interaction. In most cases there is no need for notification or error status because soc is likely to go down when the power is cut off. Border case is when HW autonomously shuts down a regulator which does not deliver power to any critical component. I am unsure if such situation should be indicated by ERROR level notification.
>
> 2. ERROR:
> Situation where system is no longer usable but the hardware does not do error handling. I would like to suggest that the proper handling for this type of events is regulator or system shutdown. I think the errors/notifications:
>
> #define REGULATOR_ERROR_UNDER_VOLTAGE BIT(1)
> #define REGULATOR_ERROR_OVER_CURRENT BIT(2)
> #define REGULATOR_ERROR_REGULATION_OUT BIT(3)
> #define REGULATOR_ERROR_FAIL BIT(4)
> #define REGULATOR_ERROR_OVER_TEMP BIT(5)
>
> #define REGULATOR_EVENT_UNDER_VOLTAGE 0x01
> #define REGULATOR_EVENT_OVER_CURRENT 0x02
> #define REGULATOR_EVENT_REGULATION_OUT 0x04
> #define REGULATOR_EVENT_FAIL 0x08
> #define REGULATOR_EVENT_OVER_TEMP 0x10
>
> should only be used to indicate errors with this severity. That would allow actually implementing the handling for these errors. If these errors are sent for "less severe" issues, then we will not be able to do any generic error handling.
>
> 3. WARNING:
> Situation where something is off-the-spec, but system is still thought to be usable. Here some - probably board/system (use-case?) specific - handling may be taking place to prevent things getting worse. I added following flags for this purpose:
>
> #define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000
> #define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000
> #define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000
> #define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000
> #define REGULATOR_EVENT_WARN_MASK 0x1E000
>
> #define REGULATOR_ERROR_UNDER_VOLTAGE_WARN BIT(6)
> #define REGULATOR_ERROR_OVER_CURRENT_WARN BIT(7)
> #define REGULATOR_ERROR_OVER_VOLTAGE_WARN BIT(8)
> #define REGULATOR_ERROR_OVER_TEMP_WARN BIT(9)
>
>
> So, my question(s) are:
>
> 1) Is this "classification" sensible and is it still possible?
> 2) does PMBUS *_WARNING status bits always indicate error which maps to severity WARNING above? And more importantly
> 3) does PMBUS *_FAULT status bits always indicate error which maps to severity ERROR above? Eg, can we assume correct handling for _FAULT is shutting down the regulator/system?
>
No, as per above, since PMBus chips implement (or are supposed to implement)
their own fault handling.
> We have something similar in a few (non PMBUS compatible) PMICs. For example the ROHM BD9576 has a PROTECTION level error detection (automatic shutdown by HW) and then another error detection which just generates an IRQ and allows software to decide what should be done.
>
> While writing the driver for that PMIC my thinking was that the decision whether IRQ is indicating a fatal error or a warning should be on the board-designer's table. Thus I implemented it so that the severity and limit configuration for this error is given via device-tree - and it is up to board designer to decide whether the fault is ERROR or WARN - and notification sent by the driver for this IRQ will reflect the given severity.
>
> I wonder if something like this is needed for PMBUS - or if we can always say the *_FAULT maps to regulator ERROR and _WARNING maps to regulator WARNING no matter how board using the IC is designed?
>
In summary, I must admit that I don't entirely understand your questions
or why they are tied to PMBus regulator implementations, except that you
seem to be saying that because reporting faults to the regulator subsystem as
errors may result in shutdowns requested by the regulator subsystem, PMBus
chips must not report such faults to the regulator subsystem as errors
because that would interfer with fault handling implemented by PMBus chips.
I am not sure if PMBus is made an example of here, but it seems to me
that I don't have a choice. I'll hold up this series until I have a better
understanding of the implications.
Guenter
Hi Guenter,
On 4/5/23 16:15, Guenter Roeck wrote:
> On 3/28/23 23:48, Matti Vaittinen wrote:
>> Hi Naresh, all
>>
>> This mail is maybe more of a question so that I can get on the same
>> page with the rest of the world than anything else. I just have to ask
>> this as I am trying to figure out what kind of handling there could be
>> for regulator errors. I added Mark and couple of others to the CC as I
>> am under the impression they have done some work with the regulator
>> error handling lately :)
>>
>> On 3/28/23 18:03, Naresh Solanki wrote:
>>> From: Patrick Rudolph <[email protected]>
>>>
>>> Add regulator events corresponding to regulator error in regulator flag
>>> map.
>>> Also capture the same in pmbus_regulator_get_flags.
>>>
>>> Signed-off-by: Patrick Rudolph <[email protected]>
>>> Signed-off-by: Naresh Solanki <[email protected]>
>>> ---
>>> drivers/hwmon/pmbus/pmbus_core.c | 74 +++++++++++++++++++++-----------
>>> 1 file changed, 49 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/hwmon/pmbus/pmbus_core.c
>>> b/drivers/hwmon/pmbus/pmbus_core.c
>>> index d93405f1a495..509bc0ef1706 100644
>>> --- a/drivers/hwmon/pmbus/pmbus_core.c
>>> +++ b/drivers/hwmon/pmbus/pmbus_core.c
>>> @@ -2693,9 +2693,9 @@ static int pmbus_init_common(struct i2c_client
>>> *client, struct pmbus_data *data,
>>> return 0;
>>> }
>>> -/* A PMBus status flag and the corresponding REGULATOR_ERROR_* flag */
>>> +/* A PMBus status flag and the corresponding REGULATOR_ERROR_* and
>>> REGULATOR_EVENTS_* flag */
>>> struct pmbus_status_assoc {
>>> - int pflag, rflag;
>>> + int pflag, rflag, eflag;
>>> };
>>> /* PMBus->regulator bit mappings for a PMBus status register */
>>> @@ -2710,27 +2710,36 @@ static const struct pmbus_status_category
>>> __maybe_unused pmbus_status_flag_map[]
>>> .func = PMBUS_HAVE_STATUS_VOUT,
>>> .reg = PMBUS_STATUS_VOUT,
>>> .bits = (const struct pmbus_status_assoc[]) {
>>> - { PB_VOLTAGE_UV_WARNING,
>>> REGULATOR_ERROR_UNDER_VOLTAGE_WARN },
>>> - { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE },
>>> - { PB_VOLTAGE_OV_WARNING,
>>> REGULATOR_ERROR_OVER_VOLTAGE_WARN },
>>> - { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT },
>>> + { PB_VOLTAGE_UV_WARNING,
>>> REGULATOR_ERROR_UNDER_VOLTAGE_WARN,
>>> + REGULATOR_EVENT_UNDER_VOLTAGE_WARN },
>>> + { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE,
>>> + REGULATOR_EVENT_UNDER_VOLTAGE },
>>> + { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN,
>>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>>> + { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT,
>>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>>
>> The question I have is: Are these mappings to regulator
>> errors/notifications always correct?
>>
>
> No, they are not always "correct", and can not be, because each chip
> implements
> the PMBus standard as they see fit.
>
>> (I don't know the PMBUS specification in details - and thus I am
>> _asking_ this here, not telling that the mapping is incorrect).
>>
>> Let me explain why I am asking this.
>>
>> What I have gathered is that at least some ICs allow setting for
>> example 'voltage limits' for different PMBUS over-voltage
>> WARNs/FAULTs. I however don't know if for example the "fatality" of
>> errors indicated by FAULTS vs WARNs is mandated by any specification -
>> or if a hw designers have free hands to decide what these events
>> indicate on their board - or what type of action should be taken when
>> certain ERROR/WARN is emitted.
>>
>
> PMBus actually specifies which action can and should be taken on faults,
> and that action is configurable. That - again - does not mean, however,
> that each chip implements this the same way, only that they should.
>
> Possible (standard) actions for voltages are:
>
> - continue operating without interruption
> - continue operating for a specified amount of time,
> then shut down if fault still exists and optionally
> retry
> - shut down and optionally retry for a configurable
> number of times
> - disable output while fault condition is present
> (e.g. for temperature faults) and re-enable after
> fault condition no longer exists
>
> Similar configuration settings are defined for current faults,
> with additional support for current limiting.
>
>> Then to the handling of regulator errors:
>>
>> In order to be able to create any handling for the regulator
>> errors/notifications, we should be able to classify the
>> errors/notifications at least by the severity. The very fundamental
>> decision is whether to turn-off the regulator - or even the whole
>> system - in order to protect the hardware from damage.
>>
>> There are few other questions related to error handling as well - for
>> example questions like:
>> Who should handle error? (we may have many consumers?)
>> When should consumer use for example forced regulator-off without
>> knowing the other consumers?
>> When should we have in-kernel handling for errors?
>> When should the errors be sent to user-space trusting someone there is
>> taking care of the situation?
>>
>> Following is largely my own pondering - and I would like to gain
>> better understanding while also avoid sending wrong events/errors for
>> detected HW issues so that we could actually implement recovery
>> actions based on regulator errors / notifications.
>>
>> I have been trying to understand how error handling with regulator
>> events should / could work. In my head (and in the regulator fault
>> detection limit setting) we have 3 severity categories:
>>
>> 1. PROTECTION:
>> The most 'severe' type of issue. This is reserved for cases where the
>> hardware shuts down the regulator(s) without any SW interaction. In
>> most cases there is no need for notification or error status because
>> soc is likely to go down when the power is cut off. Border case is
>> when HW autonomously shuts down a regulator which does not deliver
>> power to any critical component. I am unsure if such situation should
>> be indicated by ERROR level notification.
>>
>> 2. ERROR:
>> Situation where system is no longer usable but the hardware does not
>> do error handling. I would like to suggest that the proper handling
>> for this type of events is regulator or system shutdown. I think the
>> errors/notifications:
>>
>> #define REGULATOR_ERROR_UNDER_VOLTAGE BIT(1)
>> #define REGULATOR_ERROR_OVER_CURRENT BIT(2)
>> #define REGULATOR_ERROR_REGULATION_OUT BIT(3)
>> #define REGULATOR_ERROR_FAIL BIT(4)
>> #define REGULATOR_ERROR_OVER_TEMP BIT(5)
>>
>> #define REGULATOR_EVENT_UNDER_VOLTAGE 0x01
>> #define REGULATOR_EVENT_OVER_CURRENT 0x02
>> #define REGULATOR_EVENT_REGULATION_OUT 0x04
>> #define REGULATOR_EVENT_FAIL 0x08
>> #define REGULATOR_EVENT_OVER_TEMP 0x10
>>
>> should only be used to indicate errors with this severity. That would
>> allow actually implementing the handling for these errors. If these
>> errors are sent for "less severe" issues, then we will not be able to
>> do any generic error handling.
>>
>> 3. WARNING:
>> Situation where something is off-the-spec, but system is still thought
>> to be usable. Here some - probably board/system (use-case?) specific -
>> handling may be taking place to prevent things getting worse. I added
>> following flags for this purpose:
>>
>> #define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000
>> #define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000
>> #define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000
>> #define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000
>> #define REGULATOR_EVENT_WARN_MASK 0x1E000
>>
>> #define REGULATOR_ERROR_UNDER_VOLTAGE_WARN BIT(6)
>> #define REGULATOR_ERROR_OVER_CURRENT_WARN BIT(7)
>> #define REGULATOR_ERROR_OVER_VOLTAGE_WARN BIT(8)
>> #define REGULATOR_ERROR_OVER_TEMP_WARN BIT(9)
>>
>>
>> So, my question(s) are:
>>
>> 1) Is this "classification" sensible and is it still possible?
>> 2) does PMBUS *_WARNING status bits always indicate error which maps
>> to severity WARNING above? And more importantly
>> 3) does PMBUS *_FAULT status bits always indicate error which maps to
>> severity ERROR above? Eg, can we assume correct handling for _FAULT is
>> shutting down the regulator/system?
>>
>
> No, as per above, since PMBus chips implement (or are supposed to
> implement)
> their own fault handling.
>
>> We have something similar in a few (non PMBUS compatible) PMICs. For
>> example the ROHM BD9576 has a PROTECTION level error detection
>> (automatic shutdown by HW) and then another error detection which just
>> generates an IRQ and allows software to decide what should be done.
>>
>> While writing the driver for that PMIC my thinking was that the
>> decision whether IRQ is indicating a fatal error or a warning should
>> be on the board-designer's table. Thus I implemented it so that the
>> severity and limit configuration for this error is given via
>> device-tree - and it is up to board designer to decide whether the
>> fault is ERROR or WARN - and notification sent by the driver for this
>> IRQ will reflect the given severity.
>>
>> I wonder if something like this is needed for PMBUS - or if we can
>> always say the *_FAULT maps to regulator ERROR and _WARNING maps to
>> regulator WARNING no matter how board using the IC is designed?
>>
>
> In summary, I must admit that I don't entirely understand your questions
> or why they are tied to PMBus regulator implementations, except that you
> seem to be saying that because reporting faults to the regulator
> subsystem as
> errors may result in shutdowns requested by the regulator subsystem,
At the moment, no. I don't think the regulator subsystem itself requests
shut-down for the system.
As far as I know, the handling of regulator errors is currently done by
regulator consumers. If the only consumers for PMBus regulators are
PMBus specific drivers - then the in-kernel error handling is in the
hands of PMBus specific drivers.
Furthermore, I don't think we have in-tree handling for the regulator
events. Hence, even if there were 'non PMBus specific' consumers for
these regulators - there will probably be no catastrophic things
happening as of now - but if these notifications can leak to non PMBus
regulator consumers, then we place a restriction on handling of these
errors. (Eg, other but PMBus specific regulator consumers must not
handle these).
The one thing to consider is that (as far as I understand) some work has
been done to get these events visible to user-space. I am unsure how the
user-space should know what to do upon these events if these may be
generated for 'different severity' issues.
> PMBus
> chips must not report such faults to the regulator subsystem as errors
> because that would interfer with fault handling implemented by PMBus chips.
I am sorry for making the noise here. As I said above, if the only
consumers for these regulators are PMBus devices - then I guess this
does not cause problems. Yet, I wonder if this means that not all of the
regulator notifications should be sent to user-land.
> I am not sure if PMBus is made an example of here, but it seems to me
> that I don't have a choice. I'll hold up this series until I have a better
> understanding of the implications.
Please, don't block the series if no-one else but me is concerned. It is
highly possible it is me who does not understand the purpose of these
notifications or how they are thought to be handled.
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On 4/5/23 06:57, Matti Vaittinen wrote:
> Hi Guenter,
>
> On 4/5/23 16:15, Guenter Roeck wrote:
>> On 3/28/23 23:48, Matti Vaittinen wrote:
>>> Hi Naresh, all
>>>
>>> This mail is maybe more of a question so that I can get on the same page with the rest of the world than anything else. I just have to ask this as I am trying to figure out what kind of handling there could be for regulator errors. I added Mark and couple of others to the CC as I am under the impression they have done some work with the regulator error handling lately :)
>>>
>>> On 3/28/23 18:03, Naresh Solanki wrote:
>>>> From: Patrick Rudolph <[email protected]>
>>>>
>>>> Add regulator events corresponding to regulator error in regulator flag
>>>> map.
>>>> Also capture the same in pmbus_regulator_get_flags.
>>>>
>>>> Signed-off-by: Patrick Rudolph <[email protected]>
>>>> Signed-off-by: Naresh Solanki <[email protected]>
>>>> ---
>>>> drivers/hwmon/pmbus/pmbus_core.c | 74 +++++++++++++++++++++-----------
>>>> 1 file changed, 49 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
>>>> index d93405f1a495..509bc0ef1706 100644
>>>> --- a/drivers/hwmon/pmbus/pmbus_core.c
>>>> +++ b/drivers/hwmon/pmbus/pmbus_core.c
>>>> @@ -2693,9 +2693,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
>>>> return 0;
>>>> }
>>>> -/* A PMBus status flag and the corresponding REGULATOR_ERROR_* flag */
>>>> +/* A PMBus status flag and the corresponding REGULATOR_ERROR_* and REGULATOR_EVENTS_* flag */
>>>> struct pmbus_status_assoc {
>>>> - int pflag, rflag;
>>>> + int pflag, rflag, eflag;
>>>> };
>>>> /* PMBus->regulator bit mappings for a PMBus status register */
>>>> @@ -2710,27 +2710,36 @@ static const struct pmbus_status_category __maybe_unused pmbus_status_flag_map[]
>>>> .func = PMBUS_HAVE_STATUS_VOUT,
>>>> .reg = PMBUS_STATUS_VOUT,
>>>> .bits = (const struct pmbus_status_assoc[]) {
>>>> - { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN },
>>>> - { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE },
>>>> - { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN },
>>>> - { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT },
>>>> + { PB_VOLTAGE_UV_WARNING, REGULATOR_ERROR_UNDER_VOLTAGE_WARN,
>>>> + REGULATOR_EVENT_UNDER_VOLTAGE_WARN },
>>>> + { PB_VOLTAGE_UV_FAULT, REGULATOR_ERROR_UNDER_VOLTAGE,
>>>> + REGULATOR_EVENT_UNDER_VOLTAGE },
>>>> + { PB_VOLTAGE_OV_WARNING, REGULATOR_ERROR_OVER_VOLTAGE_WARN,
>>>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>>>> + { PB_VOLTAGE_OV_FAULT, REGULATOR_ERROR_REGULATION_OUT,
>>>> + REGULATOR_EVENT_OVER_VOLTAGE_WARN },
>>>
>>> The question I have is: Are these mappings to regulator errors/notifications always correct?
>>>
>>
>> No, they are not always "correct", and can not be, because each chip implements
>> the PMBus standard as they see fit.
>>
>>> (I don't know the PMBUS specification in details - and thus I am _asking_ this here, not telling that the mapping is incorrect).
>>>
>>> Let me explain why I am asking this.
>>>
>>> What I have gathered is that at least some ICs allow setting for example 'voltage limits' for different PMBUS over-voltage WARNs/FAULTs. I however don't know if for example the "fatality" of errors indicated by FAULTS vs WARNs is mandated by any specification - or if a hw designers have free hands to decide what these events indicate on their board - or what type of action should be taken when certain ERROR/WARN is emitted.
>>>
>>
>> PMBus actually specifies which action can and should be taken on faults,
>> and that action is configurable. That - again - does not mean, however,
>> that each chip implements this the same way, only that they should.
>>
>> Possible (standard) actions for voltages are:
>>
>> - continue operating without interruption
>> - continue operating for a specified amount of time,
>> then shut down if fault still exists and optionally
>> retry
>> - shut down and optionally retry for a configurable
>> number of times
>> - disable output while fault condition is present
>> (e.g. for temperature faults) and re-enable after
>> fault condition no longer exists
>>
>> Similar configuration settings are defined for current faults,
>> with additional support for current limiting.
>>
>>> Then to the handling of regulator errors:
>>>
>>> In order to be able to create any handling for the regulator errors/notifications, we should be able to classify the errors/notifications at least by the severity. The very fundamental decision is whether to turn-off the regulator - or even the whole system - in order to protect the hardware from damage.
>>>
>>> There are few other questions related to error handling as well - for example questions like:
>>> Who should handle error? (we may have many consumers?)
>>> When should consumer use for example forced regulator-off without knowing the other consumers?
>>> When should we have in-kernel handling for errors?
>>> When should the errors be sent to user-space trusting someone there is taking care of the situation?
>>>
>>> Following is largely my own pondering - and I would like to gain better understanding while also avoid sending wrong events/errors for detected HW issues so that we could actually implement recovery actions based on regulator errors / notifications.
>>>
>>> I have been trying to understand how error handling with regulator events should / could work. In my head (and in the regulator fault detection limit setting) we have 3 severity categories:
>>>
>>> 1. PROTECTION:
>>> The most 'severe' type of issue. This is reserved for cases where the hardware shuts down the regulator(s) without any SW interaction. In most cases there is no need for notification or error status because soc is likely to go down when the power is cut off. Border case is when HW autonomously shuts down a regulator which does not deliver power to any critical component. I am unsure if such situation should be indicated by ERROR level notification.
>>>
>>> 2. ERROR:
>>> Situation where system is no longer usable but the hardware does not do error handling. I would like to suggest that the proper handling for this type of events is regulator or system shutdown. I think the errors/notifications:
>>>
>>> #define REGULATOR_ERROR_UNDER_VOLTAGE BIT(1)
>>> #define REGULATOR_ERROR_OVER_CURRENT BIT(2)
>>> #define REGULATOR_ERROR_REGULATION_OUT BIT(3)
>>> #define REGULATOR_ERROR_FAIL BIT(4)
>>> #define REGULATOR_ERROR_OVER_TEMP BIT(5)
>>>
>>> #define REGULATOR_EVENT_UNDER_VOLTAGE 0x01
>>> #define REGULATOR_EVENT_OVER_CURRENT 0x02
>>> #define REGULATOR_EVENT_REGULATION_OUT 0x04
>>> #define REGULATOR_EVENT_FAIL 0x08
>>> #define REGULATOR_EVENT_OVER_TEMP 0x10
>>>
>>> should only be used to indicate errors with this severity. That would allow actually implementing the handling for these errors. If these errors are sent for "less severe" issues, then we will not be able to do any generic error handling.
>>>
>>> 3. WARNING:
>>> Situation where something is off-the-spec, but system is still thought to be usable. Here some - probably board/system (use-case?) specific - handling may be taking place to prevent things getting worse. I added following flags for this purpose:
>>>
>>> #define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000
>>> #define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000
>>> #define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000
>>> #define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000
>>> #define REGULATOR_EVENT_WARN_MASK 0x1E000
>>>
>>> #define REGULATOR_ERROR_UNDER_VOLTAGE_WARN BIT(6)
>>> #define REGULATOR_ERROR_OVER_CURRENT_WARN BIT(7)
>>> #define REGULATOR_ERROR_OVER_VOLTAGE_WARN BIT(8)
>>> #define REGULATOR_ERROR_OVER_TEMP_WARN BIT(9)
>>>
>>>
>>> So, my question(s) are:
>>>
>>> 1) Is this "classification" sensible and is it still possible?
>>> 2) does PMBUS *_WARNING status bits always indicate error which maps to severity WARNING above? And more importantly
>>> 3) does PMBUS *_FAULT status bits always indicate error which maps to severity ERROR above? Eg, can we assume correct handling for _FAULT is shutting down the regulator/system?
>>>
>>
>> No, as per above, since PMBus chips implement (or are supposed to implement)
>> their own fault handling.
>>
>>> We have something similar in a few (non PMBUS compatible) PMICs. For example the ROHM BD9576 has a PROTECTION level error detection (automatic shutdown by HW) and then another error detection which just generates an IRQ and allows software to decide what should be done.
>>>
>>> While writing the driver for that PMIC my thinking was that the decision whether IRQ is indicating a fatal error or a warning should be on the board-designer's table. Thus I implemented it so that the severity and limit configuration for this error is given via device-tree - and it is up to board designer to decide whether the fault is ERROR or WARN - and notification sent by the driver for this IRQ will reflect the given severity.
>>>
>>> I wonder if something like this is needed for PMBUS - or if we can always say the *_FAULT maps to regulator ERROR and _WARNING maps to regulator WARNING no matter how board using the IC is designed?
>>>
>>
>> In summary, I must admit that I don't entirely understand your questions
>> or why they are tied to PMBus regulator implementations, except that you
>> seem to be saying that because reporting faults to the regulator subsystem as
>> errors may result in shutdowns requested by the regulator subsystem,
>
> At the moment, no. I don't think the regulator subsystem itself requests shut-down for the system.
>
> As far as I know, the handling of regulator errors is currently done by regulator consumers. If the only consumers for PMBus regulators are PMBus specific drivers - then the in-kernel error handling is in the hands of PMBus specific drivers.
>
> Furthermore, I don't think we have in-tree handling for the regulator events. Hence, even if there were 'non PMBus specific' consumers for these regulators - there will probably be no catastrophic things happening as of now - but if these notifications can leak to non PMBus regulator consumers, then we place a restriction on handling of these errors. (Eg, other but PMBus specific regulator consumers must not handle these).
>
> The one thing to consider is that (as far as I understand) some work has been done to get these events visible to user-space. I am unsure how the user-space should know what to do upon these events if these may be generated for 'different severity' issues.
>
The same is true for hardware monitoring events, which are reported to user space
with sysfs attributes.
>> PMBus
>> chips must not report such faults to the regulator subsystem as errors
>> because that would interfer with fault handling implemented by PMBus chips.
>
> I am sorry for making the noise here. As I said above, if the only consumers for these regulators are PMBus devices - then I guess this does not cause problems. Yet, I wonder if this means that not all of the regulator notifications should be sent to user-land.
>
The consumers for the regulators provided by PMBus devices are the devices
connected to those PMBus devices, just like with (presumably) every other
regulator. Not sure why a regulator instantiated by a regulator driver
would provide a regulator regulating itself.
>> I am not sure if PMBus is made an example of here, but it seems to me
>> that I don't have a choice. I'll hold up this series until I have a better
>> understanding of the implications.
>
> Please, don't block the series if no-one else but me is concerned. It is highly possible it is me who does not understand the purpose of these notifications or how they are thought to be handled.
>
Same situation. I though a regulator driver would notify the regulator subsystem
if it observes a problem with the supplies it regulates, but based on your feedback
I am not sure anymore what those notifications are supposed to be used for,
and if such notifications are appropriate. That means I'll have to read up on all
this, and I don't have the time to do that in the near future given that I am buried
in pending reviews and the work I am actually getting paid for.
Guenter
On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
> The consumers for the regulators provided by PMBus devices are the devices
> connected to those PMBus devices, just like with (presumably) every other
> regulator. Not sure why a regulator instantiated by a regulator driver
> would provide a regulator regulating itself.
Some devices have optionally used internal regulators which can also be
left idle with the supply done externally, and it is also very common to
chain LDOs to DCDCs provided by the same chip (DCDCs provide much better
efficiency but have lower quality output so doing most of the voltage
drop with a DCDC then cleaning up the output with a LDO can be an
effective combination).
> Same situation. I though a regulator driver would notify the regulator subsystem
> if it observes a problem with the supplies it regulates, but based on your feedback
> I am not sure anymore what those notifications are supposed to be used for,
> and if such notifications are appropriate. That means I'll have to read up on all
> this, and I don't have the time to do that in the near future given that I am buried
> in pending reviews and the work I am actually getting paid for.
The theory is that if a consumer detects that the device it's
controlling has bad power then it can take corrective action if there's
some specification mandated error handling (for something like a
standard bus) or risk of hardware damage. It can also try to avoid
interacting with hardware if that might not work.
ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
>
> On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
> > Same situation. I though a regulator driver would notify the regulator subsystem
I think this notification is not processed by the regulator subsystem.
It is just delivered to the consumer driver(s) who have subscribed the
notifications.
> > if it observes a problem with the supplies it regulates, but based on your feedback
> > I am not sure anymore what those notifications are supposed to be used for,
> > and if such notifications are appropriate. That means I'll have to read up on all
> > this, and I don't have the time to do that in the near future given that I am buried
> > in pending reviews and the work I am actually getting paid for.
>
> The theory is that if a consumer detects that the device it's
> controlling has bad power then it can take corrective action if there's
> some specification mandated error handling (for something like a
> standard bus) or risk of hardware damage.
The problem I see here is that, if the error information itself
(severity + notification type) does not 'classify' the error handling
- then I don't see how any consumers can do handling unless they are
targeted to one specific system only. My original thinking has been
that ERROR level notifications are sent only when HW is no longer
operable - and consumers are expected to do what-ever protective
actions they can, including turning off the faulty regulator. That is
why I originally asked about handling the PMBUS errors.
As I said, this is my understanding only - I am not in a position
where I can tell people how to use these notifications. Still,
assuming PMBUS has its own handling for these errors (separately from
the regulator events), and assuming PMBUS does not want regulator
consumers to interfere with this handling - then I might avoid at
least sending the ERROR level notifications to regulator consumers.
> It can also try to avoid
> interacting with hardware if that might not work.
It'd be great to have documentation / specification for sending and/or
handling the regulator events. I don't think we currently have such.
As far as I understand, the notifications can be picked up by all
consumers of a regulator. I am a bit worried about:
a) Situations where notification handlers 'collide' by doing 'actions'
which are unexpected by other handlers
b) Situations where different notification senders send similar
severity-level notifications for faults expecting different types of
handling.
Or, is it so that no "generic handling" of these errors is to be
expected? Eg, consumers who implement any handling must always be
targeted to a very specific system? My thinking has been that the
device sending the notification knows the severity of the problem and
- for example the REGULATOR_EVENT_REGULATION_OUT is only sent with
such severe problems that consumers can try disabling the regulator,
whereas the _WARN level notifications may not warrant such action. But
again, I don't think we have a specification for this - so this is
just my thinking - which may be off.
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
Discuss - Estimate - Plan - Report and finally accomplish this:
void do_work(int time) __attribute__ ((const));
On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
> ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
> > On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
> > > Same situation. I though a regulator driver would notify the regulator subsystem
> I think this notification is not processed by the regulator subsystem.
> It is just delivered to the consumer driver(s) who have subscribed the
> notifications.
Right. The hardware might autonomously do something but there's nothing
in the framework which will take any action.
> > The theory is that if a consumer detects that the device it's
> > controlling has bad power then it can take corrective action if there's
> > some specification mandated error handling (for something like a
> > standard bus) or risk of hardware damage.
> The problem I see here is that, if the error information itself
> (severity + notification type) does not 'classify' the error handling
> - then I don't see how any consumers can do handling unless they are
> targeted to one specific system only. My original thinking has been
> that ERROR level notifications are sent only when HW is no longer
> operable - and consumers are expected to do what-ever protective
> actions they can, including turning off the faulty regulator. That is
> why I originally asked about handling the PMBUS errors.
TBH I think if you're at the point where you've got regulator hardware
flagging problems in most system designs there's serious problems, I'm
not sure how likely it is that it's worth trying to classify the errors.
Perhaps there's some systems that frequently flag low level errors
though.
> > It can also try to avoid
> > interacting with hardware if that might not work.
> It'd be great to have documentation / specification for sending and/or
> handling the regulator events. I don't think we currently have such.
> As far as I understand, the notifications can be picked up by all
> consumers of a regulator. I am a bit worried about:
> a) Situations where notification handlers 'collide' by doing 'actions'
> which are unexpected by other handlers
I'm not sure what you're expecting there? A device working with itself
shouldn't disrupt any other users.
> b) Situations where different notification senders send similar
> severity-level notifications for faults expecting different types of
> handling.
Like I say I'm not sure how much practical difference it makes to think
too hard about differentiating the errors.
> Or, is it so that no "generic handling" of these errors is to be
> expected? Eg, consumers who implement any handling must always be
> targeted to a very specific system? My thinking has been that the
> device sending the notification knows the severity of the problem and
> - for example the REGULATOR_EVENT_REGULATION_OUT is only sent with
> such severe problems that consumers can try disabling the regulator,
> whereas the _WARN level notifications may not warrant such action. But
> again, I don't think we have a specification for this - so this is
> just my thinking - which may be off.
Do we actually have practical examples of systems sending warnings that
aren't followed in very short order by more severe errors, notified or
otherwise?
to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
>
> On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
> > ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
> > > On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
> > > It can also try to avoid
> > > interacting with hardware if that might not work.
>
> > It'd be great to have documentation / specification for sending and/or
> > handling the regulator events. I don't think we currently have such.
> > As far as I understand, the notifications can be picked up by all
> > consumers of a regulator. I am a bit worried about:
> > a) Situations where notification handlers 'collide' by doing 'actions'
> > which are unexpected by other handlers
>
> I'm not sure what you're expecting there? A device working with itself
> shouldn't disrupt any other users.
I have no concrete idea, just a vague uneasy feeling knowing that
devices tend to interact with each other. I guess it is more about the
amount of uncertainty caused by my lack of knowledge regarding what
could be done by these handlers. So, as I already said - if no one
else is bothered by this then I definitely don't want to block the
series. Still, if the error handling should be kept internal to PMBus
- then we should probably either say that consumer drivers must not
(forcibly) turn off the supply when receiving these notifications - or
not send these notifications from PMBus and allow PMBus to decide
error handling internally. (Again, I don't know if any in-tree
consumer drivers do turn off the supply regulator in error handlers -
but I don't think it is actually forbidden). Or am I just making a
problem that does not exist?
> > b) Situations where different notification senders send similar
> > severity-level notifications for faults expecting different types of
> > handling.
>
> Like I say I'm not sure how much practical difference it makes to think
> too hard about differentiating the errors.
I would do at least two classes.
1) critical class - it is Ok for the consumer to forcibly shut down
the regulator, or maybe the whole system.
2) warning class - it is not Ok to forcibly shut down the regulator.
OTOH, after writing this down - if this was the division, then it
could be clearer to implement the shutdown at critical errors in the
regulator driver (or core) and just send a specific notification to
consumers telling this was done.
> > Or, is it so that no "generic handling" of these errors is to be
> > expected? Eg, consumers who implement any handling must always be
> > targeted to a very specific system? My thinking has been that the
> > device sending the notification knows the severity of the problem and
> > - for example the REGULATOR_EVENT_REGULATION_OUT is only sent with
> > such severe problems that consumers can try disabling the regulator,
> > whereas the _WARN level notifications may not warrant such action. But
> > again, I don't think we have a specification for this - so this is
> > just my thinking - which may be off.
>
> Do we actually have practical examples of systems sending warnings that
> aren't followed in very short order by more severe errors, notified or
> otherwise?
No. I don't. I will send one more question about the real-world use of
BD9576 'warning' level IRQs - but I am highly sceptical I will receive
any real information.
Thanks for the education and time Mark & Guenter. It's a bit hard for
me to let go of the thought that we would benefit from the handling of
different severity level errors - but maybe this was just my illusion
after all.
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
Discuss - Estimate - Plan - Report and finally accomplish this:
void do_work(int time) __attribute__ ((const));
On 4/10/23 01:19, Matti Vaittinen wrote:
> to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
>>
>> On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
>>> ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
>>>> On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
>
>>>> It can also try to avoid
>>>> interacting with hardware if that might not work.
>>
>>> It'd be great to have documentation / specification for sending and/or
>>> handling the regulator events. I don't think we currently have such.
>>> As far as I understand, the notifications can be picked up by all
>>> consumers of a regulator. I am a bit worried about:
>>> a) Situations where notification handlers 'collide' by doing 'actions'
>>> which are unexpected by other handlers
>>
>> I'm not sure what you're expecting there? A device working with itself
>> shouldn't disrupt any other users.
>
> I have no concrete idea, just a vague uneasy feeling knowing that
> devices tend to interact with each other. I guess it is more about the
> amount of uncertainty caused by my lack of knowledge regarding what
> could be done by these handlers. So, as I already said - if no one
> else is bothered by this then I definitely don't want to block the
> series. Still, if the error handling should be kept internal to PMBus
> - then we should probably either say that consumer drivers must not
> (forcibly) turn off the supply when receiving these notifications - or
> not send these notifications from PMBus and allow PMBus to decide
> error handling internally. (Again, I don't know if any in-tree
> consumer drivers do turn off the supply regulator in error handlers -
> but I don't think it is actually forbidden). Or am I just making a
> problem that does not exist?
>
For my part I (still) don't understand why this is considered a problem
for this driver but not for all the other drivers reporting various
error conditions to the regulator subsystem. At least some of them
also have programmable reaction to such error conditions.
Guenter
On 4/10/23 17:40, Guenter Roeck wrote:
> On 4/10/23 01:19, Matti Vaittinen wrote:
>> to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
>>>
>>> On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
>>>> ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
>>>>> On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
>>
>>>>> It can also try to avoid
>>>>> interacting with hardware if that might not work.
>>>
>>>> It'd be great to have documentation / specification for sending and/or
>>>> handling the regulator events. I don't think we currently have such.
>>>> As far as I understand, the notifications can be picked up by all
>>>> consumers of a regulator. I am a bit worried about:
>>>> a) Situations where notification handlers 'collide' by doing 'actions'
>>>> which are unexpected by other handlers
>>>
>>> I'm not sure what you're expecting there? A device working with itself
>>> shouldn't disrupt any other users.
>>
>> I have no concrete idea, just a vague uneasy feeling knowing that
>> devices tend to interact with each other. I guess it is more about the
>> amount of uncertainty caused by my lack of knowledge regarding what
>> could be done by these handlers. So, as I already said - if no one
>> else is bothered by this then I definitely don't want to block the
>> series. Still, if the error handling should be kept internal to PMBus
>> - then we should probably either say that consumer drivers must not
>> (forcibly) turn off the supply when receiving these notifications - or
>> not send these notifications from PMBus and allow PMBus to decide
>> error handling internally. (Again, I don't know if any in-tree
>> consumer drivers do turn off the supply regulator in error handlers -
>> but I don't think it is actually forbidden). Or am I just making a
>> problem that does not exist?
>>
>
> For my part I (still) don't understand why this is considered a problem
> for this driver but not for all the other drivers reporting various
> error conditions to the regulator subsystem. At least some of them
> also have programmable reaction to such error conditions.
I may not know the drivers you're referring to. And, as I said, maybe
there is no problem.
To explain why I asked this question for this driver:
What caught my attention was use of the REGULATOR_EVENT_*_WARN flags,
which were originally added so regulators could be flagging 'not yet
critical'-errors Vs. the other, older REGULATOR_EVENT_* flags. From the
discussions I have understood the older flags were informing severe
hardware failures where system is typically thought to be no longer
usable. I have understood that the most likely handling for such
notification is shutting off the regulator. I have further understood
that there are not many consumers doing handling of these events. (This
is all just my understanding based on discussions - take it with grain
of salt).
I was the one who added these warning level notifications. Thus, I have
been following (only) use of those warnings. I have no proper insight to
existing drivers using all these flags.
When grepping for the WARNING level regulator events I can find
following drivers:
drivers/regulator/bd9576-regulator.c
drivers/regulator/max597x-regulator.c
drivers/regulator/tps65219-regulator.c
The difference (in my head) between these and PMBus-core is that these
are very specific PMIC ICs. The board designer should have a good
understanding which of the power-rails may have 'warning level'
failures, and which errors are 'critical'. They should select and set
the IRQ limits and error severities in the device-tree accordingly.
PMBus core (again, in my head) is more generic purpose system. This is
why I originally asked if the 'error severity' in PMBus specifies also
the error handling - and if these regulator notifications map to
intended handling. Now, after this discussion I think that:
PMBus has it's own error-handling which is implemented independently
from these notifications. This handling should not be messed-up by
regulator consumers, for example by touching the regulator state.
This is what made me think sending regulator notifications might not be
the best approach - (but as I said, I may be wrong. I am no longer sure
what kind of handling there is expected for these events. Furthermore,
as we see below, I did not find in-tree consumers taking "radical"
actions when receiving the notifications).
Yep, I didn't find other in-tree consumers for these notifications except:
drivers/usb/host/ohci-da8xx.c
(I was not thorough so may have missed some, but seems there is not many
in-tree consumers.)
I did only a very very brief shallow peek but it seems to me that even
there driver only sets a flag - which is used in debug message. (I may
have missed something here as well).
Judging this it seems to me that, at the moment, these notifications are
mostly ignored by consumers - and they are sent by only a handful of
devices. There probably exist some downstream users for those, but I
have not heard of them. Maybe they are only used on very specific
systems. This could explain why there has been no problems.
I know, I know. Lot's of guessing, assuming and hand waving. Sorry :/
Yours,
-- Matti.
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On Mon, Apr 10, 2023 at 07:53:46PM +0300, Matti Vaittinen wrote:
> On 4/10/23 17:40, Guenter Roeck wrote:
> > On 4/10/23 01:19, Matti Vaittinen wrote:
> > > to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
> > > >
> > > > On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
> > > > > ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
> > > > > > On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
> > >
> > > > > > It can also try to avoid
> > > > > > interacting with hardware if that might not work.
> > > >
> > > > > It'd be great to have documentation / specification for sending and/or
> > > > > handling the regulator events. I don't think we currently have such.
> > > > > As far as I understand, the notifications can be picked up by all
> > > > > consumers of a regulator. I am a bit worried about:
> > > > > a) Situations where notification handlers 'collide' by doing 'actions'
> > > > > which are unexpected by other handlers
> > > >
> > > > I'm not sure what you're expecting there?? A device working with itself
> > > > shouldn't disrupt any other users.
> > >
> > > I have no concrete idea, just a vague uneasy feeling knowing that
> > > devices tend to interact with each other. I guess it is more about the
> > > amount of uncertainty caused by my lack of knowledge regarding what
> > > could be done by these handlers. So, as I already said - if no one
> > > else is bothered by this then I definitely don't want to block the
> > > series. Still, if the error handling should be kept internal to PMBus
> > > - then we should probably either say that consumer drivers must not
> > > (forcibly) turn off the supply when receiving these notifications - or
> > > not send these notifications from PMBus and allow PMBus to decide
> > > error handling internally. (Again, I don't know if any in-tree
> > > consumer drivers do turn off the supply regulator in error handlers -
> > > but I don't think it is actually forbidden). Or am I just making? a
> > > problem that does not exist?
> > >
> >
> > For my part I (still) don't understand why this is considered a problem
> > for this driver but not for all the other drivers reporting various
> > error conditions to the regulator subsystem. At least some of them
> > also have programmable reaction to such error conditions.
>
> I may not know the drivers you're referring to. And, as I said, maybe there
> is no problem.
>
> To explain why I asked this question for this driver:
>
> What caught my attention was use of the REGULATOR_EVENT_*_WARN flags, which
> were originally added so regulators could be flagging 'not yet
> critical'-errors Vs. the other, older REGULATOR_EVENT_* flags. From the
> discussions I have understood the older flags were informing severe hardware
> failures where system is typically thought to be no longer usable. I have
> understood that the most likely handling for such notification is shutting
> off the regulator. I have further understood that there are not many
> consumers doing handling of these events. (This is all just my understanding
> based on discussions - take it with grain of salt).
>
> I was the one who added these warning level notifications. Thus, I have been
> following (only) use of those warnings. I have no proper insight to existing
> drivers using all these flags.
>
> When grepping for the WARNING level regulator events I can find following
> drivers:
> drivers/regulator/bd9576-regulator.c
> drivers/regulator/max597x-regulator.c
> drivers/regulator/tps65219-regulator.c
>
> The difference (in my head) between these and PMBus-core is that these are
> very specific PMIC ICs. The board designer should have a good understanding
> which of the power-rails may have 'warning level' failures, and which errors
> are 'critical'. They should select and set the IRQ limits and error
> severities in the device-tree accordingly.
>
> PMBus core (again, in my head) is more generic purpose system. This is why I
> originally asked if the 'error severity' in PMBus specifies also the error
> handling - and if these regulator notifications map to intended handling.
> Now, after this discussion I think that:
>
> PMBus has it's own error-handling which is implemented independently from
> these notifications. This handling should not be messed-up by regulator
> consumers, for example by touching the regulator state.
>
> This is what made me think sending regulator notifications might not be the
> best approach - (but as I said, I may be wrong. I am no longer sure what
> kind of handling there is expected for these events. Furthermore, as we see
> below, I did not find in-tree consumers taking "radical" actions when
> receiving the notifications).
>
> Yep, I didn't find other in-tree consumers for these notifications except:
> drivers/usb/host/ohci-da8xx.c
>
> (I was not thorough so may have missed some, but seems there is not many
> in-tree consumers.)
>
> I did only a very very brief shallow peek but it seems to me that even there
> driver only sets a flag - which is used in debug message. (I may have missed
> something here as well).
>
> Judging this it seems to me that, at the moment, these notifications are
> mostly ignored by consumers - and they are sent by only a handful of
> devices. There probably exist some downstream users for those, but I have
> not heard of them. Maybe they are only used on very specific systems. This
> could explain why there has been no problems.
>
> I know, I know. Lot's of guessing, assuming and hand waving. Sorry :/
>
Oh, now the problem (though I still don't understand what the problem
actually is) is extended to warnings. I thought you were concerned
about errors.
Personally I think you are concerned about a non-issue, but without
explicit guidance from regulator maintainers (and no clear definition if
and when regulator notifications should or may be sent) I won't be able
to apply this series.
Having said this, I find the whole discussion kind of surreal,
especially since the PMBus drivers already report error states to the
regulator subsystem using the get_error_flags callback, but it is what
it is. Also, no, I won't revert that code without a clear explanation
of the _actual_ (not perceived) problem that such a revert is supposed
to solve.
Guenter
On 4/10/23 20:47, Guenter Roeck wrote:
> On Mon, Apr 10, 2023 at 07:53:46PM +0300, Matti Vaittinen wrote:
>> On 4/10/23 17:40, Guenter Roeck wrote:
>>> On 4/10/23 01:19, Matti Vaittinen wrote:
>>>> to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
>>>>>
>>>>> On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
>>>>>> ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
>>>>>>> On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
>>>>
>>>>>>> It can also try to avoid
>>>>>>> interacting with hardware if that might not work.
>>>>>
>>>>>> It'd be great to have documentation / specification for sending and/or
>>>>>> handling the regulator events. I don't think we currently have such.
>>>>>> As far as I understand, the notifications can be picked up by all
>>>>>> consumers of a regulator. I am a bit worried about:
>>>>>> a) Situations where notification handlers 'collide' by doing 'actions'
>>>>>> which are unexpected by other handlers
>>>>>
>>>>> I'm not sure what you're expecting there? A device working with itself
>>>>> shouldn't disrupt any other users.
>>>>
>>>> I have no concrete idea, just a vague uneasy feeling knowing that
>>>> devices tend to interact with each other. I guess it is more about the
>>>> amount of uncertainty caused by my lack of knowledge regarding what
>>>> could be done by these handlers. So, as I already said - if no one
>>>> else is bothered by this then I definitely don't want to block the
>>>> series. Still, if the error handling should be kept internal to PMBus
>>>> - then we should probably either say that consumer drivers must not
>>>> (forcibly) turn off the supply when receiving these notifications - or
>>>> not send these notifications from PMBus and allow PMBus to decide
>>>> error handling internally. (Again, I don't know if any in-tree
>>>> consumer drivers do turn off the supply regulator in error handlers -
>>>> but I don't think it is actually forbidden). Or am I just making a
>>>> problem that does not exist?
>>>>
>>>
>>> For my part I (still) don't understand why this is considered a problem
>>> for this driver but not for all the other drivers reporting various
>>> error conditions to the regulator subsystem. At least some of them
>>> also have programmable reaction to such error conditions.
>>
>> I may not know the drivers you're referring to. And, as I said, maybe there
>> is no problem.
>>
>> To explain why I asked this question for this driver:
>>
>> What caught my attention was use of the REGULATOR_EVENT_*_WARN flags, which
>> were originally added so regulators could be flagging 'not yet
>> critical'-errors Vs. the other, older REGULATOR_EVENT_* flags. From the
>> discussions I have understood the older flags were informing severe hardware
>> failures where system is typically thought to be no longer usable. I have
>> understood that the most likely handling for such notification is shutting
>> off the regulator. I have further understood that there are not many
>> consumers doing handling of these events. (This is all just my understanding
>> based on discussions - take it with grain of salt).
>>
>> I was the one who added these warning level notifications. Thus, I have been
>> following (only) use of those warnings. I have no proper insight to existing
>> drivers using all these flags.
>>
>> When grepping for the WARNING level regulator events I can find following
>> drivers:
>> drivers/regulator/bd9576-regulator.c
>> drivers/regulator/max597x-regulator.c
>> drivers/regulator/tps65219-regulator.c
>>
>> The difference (in my head) between these and PMBus-core is that these are
>> very specific PMIC ICs. The board designer should have a good understanding
>> which of the power-rails may have 'warning level' failures, and which errors
>> are 'critical'. They should select and set the IRQ limits and error
>> severities in the device-tree accordingly.
>>
>> PMBus core (again, in my head) is more generic purpose system. This is why I
>> originally asked if the 'error severity' in PMBus specifies also the error
>> handling - and if these regulator notifications map to intended handling.
>> Now, after this discussion I think that:
>>
>> PMBus has it's own error-handling which is implemented independently from
>> these notifications. This handling should not be messed-up by regulator
>> consumers, for example by touching the regulator state.
>>
>> This is what made me think sending regulator notifications might not be the
>> best approach - (but as I said, I may be wrong. I am no longer sure what
>> kind of handling there is expected for these events. Furthermore, as we see
>> below, I did not find in-tree consumers taking "radical" actions when
>> receiving the notifications).
>>
>> Yep, I didn't find other in-tree consumers for these notifications except:
>> drivers/usb/host/ohci-da8xx.c
>>
>> (I was not thorough so may have missed some, but seems there is not many
>> in-tree consumers.)
>>
>> I did only a very very brief shallow peek but it seems to me that even there
>> driver only sets a flag - which is used in debug message. (I may have missed
>> something here as well).
>>
>> Judging this it seems to me that, at the moment, these notifications are
>> mostly ignored by consumers - and they are sent by only a handful of
>> devices. There probably exist some downstream users for those, but I have
>> not heard of them. Maybe they are only used on very specific systems. This
>> could explain why there has been no problems.
>>
>> I know, I know. Lot's of guessing, assuming and hand waving. Sorry :/
>>
>
> Oh, now the problem (though I still don't understand what the problem
> actually is) is extended to warnings. I thought you were concerned
> about errors.
Yes. I am concerned about errors - as I thought handling errors might
have an impact to the regulator supply state. I don't see this as a
problem with warnings.
It was the use of warnings that got this series in my mailbox. Hence
this series caught my attention.
The only thing I could be concerned about with warnings is whether they
are sent for problems that are actually errors. But even if they were,
that would not really cause any issues outside the specific use-case.
So, no. I am not really concerned about warnings but I tried to explain
why I was commenting on this PMBus series as you asked that.
> Personally I think you are concerned about a non-issue, but without
> explicit guidance from regulator maintainers (and no clear definition if
> and when regulator notifications should or may be sent) I won't be able
> to apply this series.
Thanks Guenter. I value your opinion on this. (And there was _no_
sarcasm. I've learned that I am often discussing with people who have
much broader understanding than I do, so I do truly appreciate hearing
your opinion).
I think part of the problem is that we lack of insight as to how these
notifications / errors are used. I am trying to clarify this. As a
matter of fact, I today received an email that my talk proposal to ELCE
was accepted. I am going to go through these notifications and error
flags there, and I hope to be able to collect some feed-back as to how
they are / can be utilized.
> Having said this, I find the whole discussion kind of surreal,
Sorry about that. What do you think I should've done differently? Do you
think I should not have brought-up these questions? I have openly
admitted I may not understand all the details and thus You as a
maintainer are free to act as if I never asked a thing. I've explained
why I think there is a potential problem - but if you or others do not
see the issue, then it's your call to decide.
> especially since the PMBus drivers already report error states to the
> regulator subsystem using the get_error_flags callback, but it is what
> it is. Also, no, I won't revert that code without a clear explanation
> of the _actual_ (not perceived) problem that such a revert is supposed
> to solve.
I agree with you that these notifications and error flags go
hand-by-hand. Actually, the use of (warning) notifications does more or
less require using also the error flags (as we don't have a notification
when things get back to normal). The flags do not require using
notifications though, some consumers can do polling of flags.
Let's hope no problems with these error flags surface.
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On 4/6/23 16:43, Mark Brown wrote:
> On Thu, Apr 06, 2023 at 11:00:02AM +0300, Matti Vaittinen wrote:
>> ke 5. huhtik. 2023 klo 18.19 Mark Brown ([email protected]) kirjoitti:
>>> On Wed, Apr 05, 2023 at 07:18:32AM -0700, Guenter Roeck wrote:
>> Or, is it so that no "generic handling" of these errors is to be
>> expected? Eg, consumers who implement any handling must always be
>> targeted to a very specific system? My thinking has been that the
>> device sending the notification knows the severity of the problem and
>> - for example the REGULATOR_EVENT_REGULATION_OUT is only sent with
>> such severe problems that consumers can try disabling the regulator,
>> whereas the _WARN level notifications may not warrant such action. But
>> again, I don't think we have a specification for this - so this is
>> just my thinking - which may be off.
>
> Do we actually have practical examples of systems sending warnings that
> aren't followed in very short order by more severe errors, notified or
> otherwise?
I asked about this from the hardware colleague(s) in Japan. This far
they were not able to provide me a concrete information.
One potential example use-case they mentioned was "rejecting and
re-doing a measurement if regulator warnings are flagged". This sounds
(to me) like using a voltage from a regulator as some measurement
reference or supply, with very strict warning limits. Still, as said, I
did not have a concrete example if/where this is actually implemented -
which means the answer to your question remains still "no".
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On Mon, Apr 10, 2023 at 10:47:18AM -0700, Guenter Roeck wrote:
> Personally I think you are concerned about a non-issue, but without
> explicit guidance from regulator maintainers (and no clear definition if
> and when regulator notifications should or may be sent) I won't be able
> to apply this series.
I would expect that regulator driver should just pass through whatever
they get from the hardware. I really don't understand what the
confusion is here.
On Mon, Apr 10, 2023 at 11:19:41AM +0300, Matti Vaittinen wrote:
> to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
> > I'm not sure what you're expecting there? A device working with itself
> > shouldn't disrupt any other users.
> I have no concrete idea, just a vague uneasy feeling knowing that
> devices tend to interact with each other. I guess it is more about the
> amount of uncertainty caused by my lack of knowledge regarding what
> could be done by these handlers. So, as I already said - if no one
> else is bothered by this then I definitely don't want to block the
> series. Still, if the error handling should be kept internal to PMBus
> - then we should probably either say that consumer drivers must not
> (forcibly) turn off the supply when receiving these notifications - or
> not send these notifications from PMBus and allow PMBus to decide
> error handling internally. (Again, I don't know if any in-tree
> consumer drivers do turn off the supply regulator in error handlers -
> but I don't think it is actually forbidden). Or am I just making a
> problem that does not exist?
I think you are making a problem that doesn't exist.
> > Like I say I'm not sure how much practical difference it makes to think
> > too hard about differentiating the errors.
> I would do at least two classes.
> 1) critical class - it is Ok for the consumer to forcibly shut down
> the regulator, or maybe the whole system.
> 2) warning class - it is not Ok to forcibly shut down the regulator.
How severe an issue bad power is will be partly determined by what the
consumer is doing with the power, it's going to be in a fairly narrow
range but there is a range.
On 4/11/23 15:07, Mark Brown wrote:
> On Mon, Apr 10, 2023 at 11:19:41AM +0300, Matti Vaittinen wrote:
>> to 6. huhtik. 2023 klo 16.43 Mark Brown ([email protected]) kirjoitti:
>
>>> I'm not sure what you're expecting there? A device working with itself
>>> shouldn't disrupt any other users.
>
>> I have no concrete idea, just a vague uneasy feeling knowing that
>> devices tend to interact with each other. I guess it is more about the
>> amount of uncertainty caused by my lack of knowledge regarding what
>> could be done by these handlers. So, as I already said - if no one
>> else is bothered by this then I definitely don't want to block the
>> series. Still, if the error handling should be kept internal to PMBus
>> - then we should probably either say that consumer drivers must not
>> (forcibly) turn off the supply when receiving these notifications - or
>> not send these notifications from PMBus and allow PMBus to decide
>> error handling internally. (Again, I don't know if any in-tree
>> consumer drivers do turn off the supply regulator in error handlers -
>> but I don't think it is actually forbidden). Or am I just making a
>> problem that does not exist?
>
> I think you are making a problem that doesn't exist.
In this case, sorry folks. I thought consumers could be more 'intrusive'
with the handling of these notifications - which apparently is not true
then. Guenter, I hope the statement from Mark cleared confusion I
caused. I have no further questions about this series.
>>> Like I say I'm not sure how much practical difference it makes to think
>>> too hard about differentiating the errors.
>
>> I would do at least two classes.
>
>> 1) critical class - it is Ok for the consumer to forcibly shut down
>> the regulator, or maybe the whole system.
>> 2) warning class - it is not Ok to forcibly shut down the regulator.
>
> How severe an issue bad power is will be partly determined by what the
> consumer is doing with the power, it's going to be in a fairly narrow
> range but there is a range.
No longer related to this patch series - I am still trying to gather
information where the "detection" level PMIC warning IRQs are used in
real-world systems. This might give me some insight how these
notifications are (or could be) used.
Yours,
-- Matti
--
Matti Vaittinen
Linux kernel developer at ROHM Semiconductors
Oulu Finland
~~ When things go utterly wrong vim users can always type :help! ~~
On Tue, Mar 28, 2023 at 05:03:33PM +0200, Naresh Solanki wrote:
> Add regulator device in pmbus_data & initialize the same during PMBus
> regulator register.
>
> Signed-off-by: Naresh Solanki <[email protected]>
Applied to hwmon-next.
Thanks,
Guenter
On Tue, Mar 28, 2023 at 05:03:34PM +0200, Naresh Solanki wrote:
> From: Patrick Rudolph <[email protected]>
>
> Add regulator events corresponding to regulator error in regulator flag
> map.
> Also capture the same in pmbus_regulator_get_flags.
>
> Signed-off-by: Patrick Rudolph <[email protected]>
> Signed-off-by: Naresh Solanki <[email protected]>
Applied to hwmon-next.
Thanks,
Guenter
On Tue, Mar 28, 2023 at 05:03:35PM +0200, Naresh Solanki wrote:
> Notify regulator events in PMBus irq handler.
>
> Signed-off-by: Naresh Solanki <[email protected]>
Applied to hwmon-next.
Please check your e-mail settings: checkpatch complains about a
mismatch between [email protected] and
[email protected]. Not that it matters in this case,
but it is annoying to get a checkpatch alert each time I apply
one of your patches.
Thanks,
Guenter