In order to depricate cmpxchg_double(), replace all its usage with
cmpxchg128().
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/s390/include/asm/cpu_mf.h | 29 ++++++++++++-----
arch/s390/kernel/perf_cpum_sf.c | 65 +++++++++++++++++++++++++---------------
2 files changed, 63 insertions(+), 31 deletions(-)
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -131,19 +131,32 @@ struct hws_combined_entry {
struct hws_diag_entry diag; /* Diagnostic-sampling data entry */
} __packed;
+union hws_flags_and_overflow {
+ struct {
+ unsigned long flags;
+ unsigned long overflow;
+ };
+ u128 full;
+};
+
struct hws_trailer_entry {
union {
struct {
- unsigned int f:1; /* 0 - Block Full Indicator */
- unsigned int a:1; /* 1 - Alert request control */
- unsigned int t:1; /* 2 - Timestamp format */
- unsigned int :29; /* 3 - 31: Reserved */
- unsigned int bsdes:16; /* 32-47: size of basic SDE */
- unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
+ union {
+ struct {
+ unsigned int f:1; /* 0 - Block Full Indicator */
+ unsigned int a:1; /* 1 - Alert request control */
+ unsigned int t:1; /* 2 - Timestamp format */
+ unsigned int :29; /* 3 - 31: Reserved */
+ unsigned int bsdes:16; /* 32-47: size of basic SDE */
+ unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
+ };
+ unsigned long long flags; /* 0 - 63: All indicators */
+ };
+ unsigned long long overflow; /* 64 - sample Overflow count */
};
- unsigned long long flags; /* 0 - 63: All indicators */
+ union hws_flags_and_overflow flags_and_overflow;
};
- unsigned long long overflow; /* 64 - sample Overflow count */
unsigned char timestamp[16]; /* 16 - 31 timestamp */
unsigned long long reserved1; /* 32 -Reserved */
unsigned long long reserved2; /* */
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1227,6 +1227,8 @@ static void hw_collect_samples(struct pe
}
}
+typedef union hws_flags_and_overflow fao_t;
+
/* hw_perf_event_update() - Process sampling buffer
* @event: The perf event
* @flush_all: Flag to also flush partially filled sample-data-blocks
@@ -1243,10 +1245,11 @@ static void hw_collect_samples(struct pe
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
struct hw_perf_event *hwc = &event->hw;
struct hws_trailer_entry *te;
+ fao_t old_fao, new_fao;
unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
int done;
/*
@@ -1294,12 +1297,16 @@ static void hw_perf_event_update(struct
num_sdb++;
/* Reset trailer (using compare-double-and-swap) */
+ old_fao = te->flags_and_overflow;
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ new_fao = (fao_t){
+ .flags = old_fao.flags,
+ .overflow = 0,
+ };
+ new_fao.flags &= ~SDB_TE_BUFFER_FULL_MASK;
+ new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
+ } while (!try_cmpxchg128(&te->flags_and_overflow.full,
+ &old_fao.full, new_fao.full));
/* Advance to next sample-data-block */
sdbt++;
@@ -1475,14 +1482,19 @@ static int aux_output_begin(struct perf_
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
struct hws_trailer_entry *te;
+ fao_t old_fao, new_fao;
te = aux_sdb_trailer(aux, alert_index);
+
+ old_fao = te->flags_and_overflow;
do {
- orig_flags = te->flags;
- *overflow = orig_overflow = te->overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ new_fao = (fao_t){
+ .flags = old_fao.flags,
+ .overflow = 0,
+ };
+ *overflow = old_fao.overflow;
+ if (new_fao.flags & SDB_TE_BUFFER_FULL_MASK) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1490,10 +1502,11 @@ static bool aux_set_alert(struct aux_buf
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
+
+ } while (!try_cmpxchg128(&te->flags_and_overflow.full,
+ &old_fao.full, new_fao.full));
+
return true;
}
@@ -1522,9 +1535,10 @@ static bool aux_set_alert(struct aux_buf
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
unsigned long i, range_scan, idx, idx_old;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;
+ fao_t old_fao, new_fao;
debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
"empty %ld\n", __func__, range, aux->head,
@@ -1554,17 +1568,22 @@ static bool aux_reset_buffer(struct aux_
idx_old = idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+
+ old_fao = te->flags_and_overflow;
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ new_fao = (fao_t){
+ .flags = old_fao.flags,
+ .overflow = 0,
+ };
+ orig_overflow = old_fao.overflow;
+ new_fao.flags &= ~SDB_TE_BUFFER_FULL_MASK;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new_fao.flags |= SDB_TE_ALERT_REQ_MASK;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new_fao.flags &= ~SDB_TE_ALERT_REQ_MASK;
+ } while (!try_cmpxchg128(&te->flags_and_overflow.full,
+ &old_fao.full, new_fao.full));
+
*overflow += orig_overflow;
}
On Mon, Dec 19, 2022 at 04:35:33PM +0100, Peter Zijlstra wrote:
> In order to depricate cmpxchg_double(), replace all its usage with
> cmpxchg128().
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/s390/include/asm/cpu_mf.h | 29 ++++++++++++-----
> arch/s390/kernel/perf_cpum_sf.c | 65 +++++++++++++++++++++++++---------------
> 2 files changed, 63 insertions(+), 31 deletions(-)
So, Alexander Gordeev reported that this code was already prior to your
changes potentially broken with respect to missing READ_ONCE() within the
cmpxchg_double() loops.
In order to fix that and have a patch that can be backported I would go
with something like the patch below, which I would also plan to send for
-rc4, unless there are objections.
This can then easily be converted to the new cmpxchg128() later.
From 7b271f42946b306620a748c0da5f07f8c786888d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <[email protected]>
Date: Thu, 5 Jan 2023 15:44:20 +0100
Subject: [PATCH] s390/cpum_sf: add READ_ONCE() semantics to compare and swap
loops
The current cmpxchg_double() loops within the perf hw sampling code do not
have READ_ONCE() semantics to read the old value from memory. This allows
the compiler to generate code which reads the "old" value several times
from memory, which again allows for inconsistencies.
For example:
/* Reset trailer (using compare-double-and-swap) */
do {
te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
te_flags |= SDB_TE_ALERT_REQ_MASK;
} while (!cmpxchg_double(&te->flags, &te->overflow,
te->flags, te->overflow,
te_flags, 0ULL));
The compiler could generate code where te->flags used within the
cmpxchg_double() call may be refetched from memory and which is not
necessarily identical to the previous read version which was used to
generate te_flags. Which in turn means that an incorrect update could
happen.
Fix this by adding READ_ONCE() semantics to all cmpxchg_double()
loops. Given that READ_ONCE() cannot generate code on s390 which atomically
reads 16 bytes, use a private compare-and-swap-double implementation to
achieve that.
Also replace cmpxchg_double() with the private implementation to be able to
re-use the old value within the loops.
As a side effect this converts the whole code to only use bit fields
to read and modify bits within the hws trailer header.
Reported-by: Alexander Gordeev <[email protected]>
Acked-by: Alexander Gordeev <[email protected]>
Acked-by: Hendrik Brueckner <[email protected]>
Signed-off-by: Heiko Carstens <[email protected]>
---
arch/s390/include/asm/cpu_mf.h | 31 +++++-----
arch/s390/kernel/perf_cpum_sf.c | 101 ++++++++++++++++++++------------
2 files changed, 77 insertions(+), 55 deletions(-)
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index feaba12dbecb..efa103b52a1a 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -131,19 +131,21 @@ struct hws_combined_entry {
struct hws_diag_entry diag; /* Diagnostic-sampling data entry */
} __packed;
-struct hws_trailer_entry {
- union {
- struct {
- unsigned int f:1; /* 0 - Block Full Indicator */
- unsigned int a:1; /* 1 - Alert request control */
- unsigned int t:1; /* 2 - Timestamp format */
- unsigned int :29; /* 3 - 31: Reserved */
- unsigned int bsdes:16; /* 32-47: size of basic SDE */
- unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
- };
- unsigned long long flags; /* 0 - 63: All indicators */
+union hws_trailer_header {
+ struct {
+ unsigned int f:1; /* 0 - Block Full Indicator */
+ unsigned int a:1; /* 1 - Alert request control */
+ unsigned int t:1; /* 2 - Timestamp format */
+ unsigned int :29; /* 3 - 31: Reserved */
+ unsigned int bsdes:16; /* 32-47: size of basic SDE */
+ unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
+ unsigned long long overflow; /* 64 - Overflow Count */
};
- unsigned long long overflow; /* 64 - sample Overflow count */
+ __uint128_t val;
+};
+
+struct hws_trailer_entry {
+ union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */
unsigned char timestamp[16]; /* 16 - 31 timestamp */
unsigned long long reserved1; /* 32 -Reserved */
unsigned long long reserved2; /* */
@@ -290,14 +292,11 @@ static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
return USEC_PER_SEC * qsi->cpu_speed / rate;
}
-#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL
-#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL
-
/* Return TOD timestamp contained in an trailer entry */
static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
{
/* TOD in STCKE format */
- if (te->t)
+ if (te->header.t)
return *((unsigned long long *) &te->timestamp[1]);
/* TOD in STCK format */
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 332a49965130..ce886a03545a 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -163,14 +163,15 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
- unsigned long sdb, *trailer;
+ struct hws_trailer_entry *te;
+ unsigned long sdb;
/* Allocate and initialize sample-data-block */
sdb = get_zeroed_page(gfp_flags);
if (!sdb)
return -ENOMEM;
- trailer = trailer_entry_ptr(sdb);
- *trailer = SDB_TE_ALERT_REQ_MASK;
+ te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ te->header.a = 1;
/* Link SDB into the sample-data-block-table */
*sdbt = sdb;
@@ -1206,7 +1207,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
"%s: Found unknown"
" sampling data entry: te->f %i"
" basic.def %#4x (%p)\n", __func__,
- te->f, sample->def, sample);
+ te->header.f, sample->def, sample);
/* Sample slot is not yet written or other record.
*
* This condition can occur if the buffer was reused
@@ -1217,7 +1218,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* that are not full. Stop processing if the first
* invalid format was detected.
*/
- if (!te->f)
+ if (!te->header.f)
break;
}
@@ -1227,6 +1228,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
}
}
+static inline __uint128_t __cdsg(__uint128_t *ptr, __uint128_t old, __uint128_t new)
+{
+ asm volatile(
+ " cdsg %[old],%[new],%[ptr]\n"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return old;
+}
+
/* hw_perf_event_update() - Process sampling buffer
* @event: The perf event
* @flush_all: Flag to also flush partially filled sample-data-blocks
@@ -1243,10 +1254,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
+ union hws_trailer_header old, prev, new;
struct hw_perf_event *hwc = &event->hw;
struct hws_trailer_entry *te;
unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
int done;
/*
@@ -1266,25 +1278,25 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
/* Leave loop if no more work to do (block full indicator) */
- if (!te->f) {
+ if (!te->header.f) {
done = 1;
if (!flush_all)
break;
}
/* Check the sample overflow count */
- if (te->overflow)
+ if (te->header.overflow)
/* Account sample overflows and, if a particular limit
* is reached, extend the sampling buffer.
* For details, see sfb_account_overflows().
*/
- sampl_overflow += te->overflow;
+ sampl_overflow += te->header.overflow;
/* Timestamps are valid for full sample-data-blocks only */
debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx "
"overflow %llu timestamp %#llx\n",
- __func__, (unsigned long)sdbt, te->overflow,
- (te->f) ? trailer_timestamp(te) : 0ULL);
+ __func__, (unsigned long)sdbt, te->header.overflow,
+ (te->header.f) ? trailer_timestamp(te) : 0ULL);
/* Collect all samples from a single sample-data-block and
* flag if an (perf) event overflow happened. If so, the PMU
@@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
num_sdb++;
/* Reset trailer (using compare-double-and-swap) */
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ old.val = prev.val;
+ new.val = prev.val;
+ new.f = 0;
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
/* Advance to next sample-data-block */
sdbt++;
@@ -1384,7 +1400,7 @@ static void aux_output_end(struct perf_output_handle *handle)
range_scan = AUX_SDB_NUM_ALERT(aux);
for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+ if (!te->header.f)
break;
}
/* i is num of SDBs which are full */
@@ -1392,7 +1408,7 @@ static void aux_output_end(struct perf_output_handle *handle)
/* Remove alert indicators in the buffer */
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags &= ~SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 0;
debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
__func__, i, range_scan, aux->head);
@@ -1437,9 +1453,9 @@ static int aux_output_begin(struct perf_output_handle *handle,
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- te->flags &= ~(SDB_TE_BUFFER_FULL_MASK |
- SDB_TE_ALERT_REQ_MASK);
- te->overflow = 0;
+ te->header.f = 0;
+ te->header.a = 0;
+ te->header.overflow = 0;
}
/* Save the position of empty SDBs */
aux->empty_mark = aux->head + range - 1;
@@ -1448,7 +1464,7 @@ static int aux_output_begin(struct perf_output_handle *handle,
/* Set alert indicator */
aux->alert_mark = aux->head + range/2 - 1;
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 1;
/* Reset hardware buffer head */
head = AUX_SDB_INDEX(aux, aux->head);
@@ -1475,14 +1491,17 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header old, prev, new;
struct hws_trailer_entry *te;
te = aux_sdb_trailer(aux, alert_index);
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- orig_flags = te->flags;
- *overflow = orig_overflow = te->overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ old.val = prev.val;
+ new.val = prev.val;
+ *overflow = old.overflow;
+ if (old.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
return true;
}
@@ -1522,8 +1541,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
unsigned long i, range_scan, idx, idx_old;
+ union hws_trailer_header old, prev, new;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;
debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
@@ -1554,17 +1574,20 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
idx_old = idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+ /* READ_ONCE() 16 byte header */
+ prev.val = __cdsg(&te->header.val, 0, 0);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ old.val = prev.val;
+ new.val = prev.val;
+ orig_overflow = old.overflow;
+ new.f = 0;
+ new.overflow = 0;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new.a = 1;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 0;
+ prev.val = __cdsg(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
*overflow += orig_overflow;
}
--
2.34.1
On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:
> So, Alexander Gordeev reported that this code was already prior to your
> changes potentially broken with respect to missing READ_ONCE() within the
> cmpxchg_double() loops.
Unless there's an early exit, that shouldn't matter. If you managed to
read garbage the cmpxchg itself will simply fail and the loop retries.
> @@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
> num_sdb++;
>
> /* Reset trailer (using compare-double-and-swap) */
> + /* READ_ONCE() 16 byte header */
> + prev.val = __cdsg(&te->header.val, 0, 0);
> do {
> + old.val = prev.val;
> + new.val = prev.val;
> + new.f = 0;
> + new.a = 1;
> + new.overflow = 0;
> + prev.val = __cdsg(&te->header.val, old.val, new.val);
> + } while (prev.val != old.val);
So this, and
> + /* READ_ONCE() 16 byte header */
> + prev.val = __cdsg(&te->header.val, 0, 0);
> do {
> + old.val = prev.val;
> + new.val = prev.val;
> + orig_overflow = old.overflow;
> + new.f = 0;
> + new.overflow = 0;
> if (idx == aux->alert_mark)
> + new.a = 1;
> else
> + new.a = 0;
> + prev.val = __cdsg(&te->header.val, old.val, new.val);
> + } while (prev.val != old.val);
this case are just silly and expensive. If that initial read is split
and manages to read gibberish the cmpxchg will fail and we retry anyway.
> + /* READ_ONCE() 16 byte header */
> + prev.val = __cdsg(&te->header.val, 0, 0);
> do {
> + old.val = prev.val;
> + new.val = prev.val;
> + *overflow = old.overflow;
> + if (old.f) {
> /*
> * SDB is already set by hardware.
> * Abort and try to set somewhere
> @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
> */
> return false;
> }
> + new.a = 1;
> + new.overflow = 0;
> + prev.val = __cdsg(&te->header.val, old.val, new.val);
> + } while (prev.val != old.val);
And while this case has an early exit, it only cares about a single bit
(although you made it a full word) and so also shouldn't care. If
aux_reset_buffer() returns false, @overflow isn't consumed.
So I really don't see the point of this patch.
On Tue, Jan 10, 2023 at 09:32:55AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:
>
> > So, Alexander Gordeev reported that this code was already prior to your
> > changes potentially broken with respect to missing READ_ONCE() within the
> > cmpxchg_double() loops.
>
> Unless there's an early exit, that shouldn't matter. If you managed to
> read garbage the cmpxchg itself will simply fail and the loop retries.
I don't think that's true; without READ_ONCE() the compiler could (but is
very unlikely to) read multiple times, and that could cause problems.
For example:
| prev = *ptr;
|
| do {
| new = some_function_of(prev);
| old = cmpxchg(ptr, prev, new);
| } while (old != prev);
Could effectively become:
| prev1 = *ptr;
| prev2 = *ptr;
|
| do {
| new = some_function_of(prev1)
| old = cmpxchg(ptr, prev2, new);
| } while (old != prev2);
... which would effectively udpate from a stale value, throwing away prev2.
That and the two generated reads could be in either order.
So I do think it's warranted to use READ_ONCE() for the prev value feeding into
a cmpxchg operation, even if that's only for the "once" part rather than lack
of tearing.
Thanks,
Mark.
On Tue, Jan 10, 2023 at 09:32:55AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 10, 2023 at 08:23:05AM +0100, Heiko Carstens wrote:
> > So, Alexander Gordeev reported that this code was already prior to your
> > changes potentially broken with respect to missing READ_ONCE() within the
> > cmpxchg_double() loops.
>
> Unless there's an early exit, that shouldn't matter. If you managed to
> read garbage the cmpxchg itself will simply fail and the loop retries.
>
> > @@ -1294,12 +1306,16 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
> > num_sdb++;
> >
> > /* Reset trailer (using compare-double-and-swap) */
> > + /* READ_ONCE() 16 byte header */
> > + prev.val = __cdsg(&te->header.val, 0, 0);
> > do {
> > + old.val = prev.val;
> > + new.val = prev.val;
> > + new.f = 0;
> > + new.a = 1;
> > + new.overflow = 0;
> > + prev.val = __cdsg(&te->header.val, old.val, new.val);
> > + } while (prev.val != old.val);
>
> So this, and
...
> this case are just silly and expensive. If that initial read is split
> and manages to read gibberish the cmpxchg will fail and we retry anyway.
While I do agree that there is no need to necessarily read the whole 16
bytes atomically in advance here, there is still the problem about the
missing initial READ_ONCE() in the original code.
As I tried to outline here:
For example:
/* Reset trailer (using compare-double-and-swap) */
do {
te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
te_flags |= SDB_TE_ALERT_REQ_MASK;
} while (!cmpxchg_double(&te->flags, &te->overflow,
te->flags, te->overflow,
te_flags, 0ULL));
The compiler could generate code where te->flags used within the
cmpxchg_double() call may be refetched from memory and which is not
necessarily identical to the previous read version which was used to
generate te_flags. Which in turn means that an incorrect update could
happen.
Is there anything that prevents te->flags from being read several times?
> > + /* READ_ONCE() 16 byte header */
> > + prev.val = __cdsg(&te->header.val, 0, 0);
> > do {
> > + old.val = prev.val;
> > + new.val = prev.val;
> > + *overflow = old.overflow;
> > + if (old.f) {
> > /*
> > * SDB is already set by hardware.
> > * Abort and try to set somewhere
> > @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
> > */
> > return false;
> > }
> > + new.a = 1;
> > + new.overflow = 0;
> > + prev.val = __cdsg(&te->header.val, old.val, new.val);
> > + } while (prev.val != old.val);
>
> And while this case has an early exit, it only cares about a single bit
> (although you made it a full word) and so also shouldn't care. If
> aux_reset_buffer() returns false, @overflow isn't consumed.
Yes, except that it is anything but obvious that @overflow isn't consumed.
> So I really don't see the point of this patch.
As stated above: READ_ONCE() is missing. And while at it I wanted to have a
consistent complete previous value - also considering that cdsg is not very
expensive.
And while it also reuse the returned values from cdsg, instead of throwing
them away and reading from memory again in a splitted and potentially
inconsistent way.
On Tue, Jan 10, 2023 at 12:46:44PM +0100, Heiko Carstens wrote:
> > > + /* READ_ONCE() 16 byte header */
> > > + prev.val = __cdsg(&te->header.val, 0, 0);
> > > do {
> > > + old.val = prev.val;
> > > + new.val = prev.val;
> > > + *overflow = old.overflow;
I guess, it would also make sense to place write to overflow
after the while loop. So the output variable left intact in
case the function bailed out. Not sure if it should be part
of this patch though.
> > > + if (old.f) {
> > > /*
> > > * SDB is already set by hardware.
> > > * Abort and try to set somewhere
> > > @@ -1490,10 +1509,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
> > > */
> > > return false;
> > > }
> > > + new.a = 1;
> > > + new.overflow = 0;
> > > + prev.val = __cdsg(&te->header.val, old.val, new.val);
> > > + } while (prev.val != old.val);
> >
> > And while this case has an early exit, it only cares about a single bit
> > (although you made it a full word) and so also shouldn't care. If
> > aux_reset_buffer() returns false, @overflow isn't consumed.
>
> Yes, except that it is anything but obvious that @overflow isn't consumed.