Currently ramoops uses a single zone to store function traces. To make this
work, it has to use locking to synchronize accesses to the buffers. Recently
the synchronization was completely moved from a cmpxchg mechanism to raw
spinlocks due to difficulties in using cmpxchg on uncached memory and also on
RAMs behind PCIe. [1] This change further dropped the performance of ramoops
pstore backend by more than half in my tests.
This patch series improves the situation dramatically by around 280% from what
it is now by creating a ramoops persistent zone for each CPU and avoiding use of
locking altogether for ftrace. At init time, the persistent zones are then
merged together.
Here are some tests to show the improvements. Tested using a qemu quad core
x86_64 instance with -mem-path to persist the guest RAM to a file. I measured
average throughput of dd over 30 seconds:
dd if=/dev/zero | pv | dd of=/dev/null
Without this patch series: 24MB/s
With per-cpu buffers and trace_clock: 51.9 MB/s
With per-cpu buffers and counter increment: 91.5 MB/s (improvement by ~ 281%)
Changes since v1:
- Use PSTORE_ prefix for RAM_LOCK and RAM_NOLOCK macros
Changes since RFC [2]:
- improve commit message clarity for optional locking of zone buffers.
- use macro for better code clarity of locking requirements
- use kcalloc instead of kmalloc for allocating prz array
- print warning if pmsg calls write_buf instead of write_buf_user
- free zones properly for ftrace per CPU usecase.
[1] https://lkml.org/lkml/2016/9/8/375
[2] https://lkml.org/lkml/2016/10/8/12
Joel Fernandes (7):
pstore: Make spinlock per zone instead of global
pstore: locking: dont lock unless caller asks to
pstore: Warn for the case of PSTORE_TYPE_PMSG write using deprecated
function
pstore: Make ramoops_init_przs generic for other prz arrays
ramoops: Split ftrace buffer space into per-CPU zones
pstore: Add support to store timestamp counter in ftrace records
pstore: Merge per-CPU ftrace zones into one zone for output
fs/pstore/ftrace.c | 3 +
fs/pstore/inode.c | 7 +-
fs/pstore/internal.h | 34 -------
fs/pstore/ram.c | 236 +++++++++++++++++++++++++++++++++++----------
fs/pstore/ram_core.c | 30 +++---
include/linux/pstore.h | 69 +++++++++++++
include/linux/pstore_ram.h | 14 ++-
7 files changed, 291 insertions(+), 102 deletions(-)
--
2.7.4
Currently pstore has a global spinlock for all zones. Since the zones are
independent and modify different areas of memory, there's no need to have a
global lock, so we should use a per-zone lock as introduced here. Also,
ramoops's ftrace usecase has a FTRACE_PER_CPU flag introduced in this patch
series which splits the ftrace memory area into a single zone per CPU thus
eliminating the need for locking. In preparation for this, make the locking
optional.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram_core.c | 11 +++++------
include/linux/pstore_ram.h | 1 +
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 3975dee..cb92055 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -48,8 +48,6 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
return atomic_read(&prz->buffer->start);
}
-static DEFINE_RAW_SPINLOCK(buffer_lock);
-
/* increase and wrap the start pointer, returning the old value */
static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
{
@@ -57,7 +55,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
int new;
unsigned long flags;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->start);
new = old + a;
@@ -65,7 +63,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
new -= prz->buffer_size;
atomic_set(&prz->buffer->start, new);
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
return old;
}
@@ -77,7 +75,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
size_t new;
unsigned long flags;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->size);
if (old == prz->buffer_size)
@@ -89,7 +87,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
atomic_set(&prz->buffer->size, new);
exit:
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
}
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
@@ -493,6 +491,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
prz->buffer->sig = sig;
persistent_ram_zap(prz);
+ prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
return 0;
}
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index c668c86..244d242 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -40,6 +40,7 @@ struct persistent_ram_zone {
void *vaddr;
struct persistent_ram_buffer *buffer;
size_t buffer_size;
+ raw_spinlock_t buffer_lock;
/* ECC correction */
char *par_buffer;
--
2.7.4
Currently ramoops_init_przs is hard wired only for panic dump zone array. In
preparation for the ftrace zone array (one zone per-cpu), make the function
more generic to be able to handle this case.
Add a new ramoops_init_dump_przs function and use the generic function for the
dump prz array.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram.c | 72 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 46 insertions(+), 26 deletions(-)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 9104ae9..5caa512 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -405,54 +405,74 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
}
static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
- phys_addr_t *paddr, size_t dump_mem_sz)
+ struct persistent_ram_zone ***przs,
+ phys_addr_t *paddr, size_t mem_sz,
+ unsigned int cnt, u32 sig)
{
int err = -ENOMEM;
int i;
+ size_t zone_sz;
+ struct persistent_ram_zone **prz_ar;
- if (!cxt->record_size)
- return 0;
-
- if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
- dev_err(dev, "no room for dumps\n");
+ if (*paddr + mem_sz - cxt->phys_addr > cxt->size) {
+ dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+ mem_sz, (unsigned long long)*paddr,
+ cxt->size, (unsigned long long)cxt->phys_addr);
return -ENOMEM;
}
- cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
- if (!cxt->max_dump_cnt)
+ zone_sz = mem_sz / cnt;
+ if (!zone_sz)
return -ENOMEM;
- cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt,
- GFP_KERNEL);
- if (!cxt->przs) {
- dev_err(dev, "failed to initialize a prz array for dumps\n");
- goto fail_mem;
+ prz_ar = kcalloc(cnt, sizeof(**przs), GFP_KERNEL);
+ if (!prz_ar) {
+ dev_err(dev, "Failed to initialize prz array\n");
+ return -ENOMEM;
}
- for (i = 0; i < cxt->max_dump_cnt; i++) {
- cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
+ for (i = 0; i < cnt; i++) {
+ prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
cxt->memtype);
- if (IS_ERR(cxt->przs[i])) {
- err = PTR_ERR(cxt->przs[i]);
+ if (IS_ERR(prz_ar[i])) {
+ err = PTR_ERR(prz_ar[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
cxt->record_size, (unsigned long long)*paddr, err);
while (i > 0) {
i--;
- persistent_ram_free(cxt->przs[i]);
+ persistent_ram_free(prz_ar[i]);
}
- goto fail_prz;
+ kfree(prz_ar);
+ return err;
}
- *paddr += cxt->record_size;
+ *paddr += zone_sz;
}
+ *przs = prz_ar;
return 0;
-fail_prz:
- kfree(cxt->przs);
-fail_mem:
- cxt->max_dump_cnt = 0;
- return err;
+}
+
+static int ramoops_init_dump_przs(struct device *dev,
+ struct ramoops_context *cxt,
+ phys_addr_t *paddr, size_t mem_sz)
+{
+ int ret;
+
+ if (!cxt->record_size)
+ return 0;
+
+ cxt->max_dump_cnt = mem_sz / cxt->record_size;
+ if (!cxt->max_dump_cnt)
+ return -ENOMEM;
+
+ ret = ramoops_init_przs(dev, cxt, &cxt->przs, paddr, mem_sz,
+ cxt->max_dump_cnt, 0);
+ if (ret)
+ cxt->max_dump_cnt = 0;
+
+ return ret;
}
static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
@@ -604,7 +624,7 @@ static int ramoops_probe(struct platform_device *pdev)
dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
- cxt->pmsg_size;
- err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
+ err = ramoops_init_dump_przs(dev, cxt, &paddr, dump_mem_sz);
if (err)
goto fail_out;
--
2.7.4
In preparation of not locking at all for certain buffers depending on if
there's contention, make locking optional depending if caller requested it.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram.c | 10 +++++-----
fs/pstore/ram_core.c | 27 ++++++++++++++++-----------
include/linux/pstore_ram.h | 10 +++++++++-
3 files changed, 30 insertions(+), 17 deletions(-)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index f1219af..cb07ef6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -260,7 +260,7 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
compressed ? 'C' : 'D');
WARN_ON_ONCE(!hdr);
len = hdr ? strlen(hdr) : 0;
- persistent_ram_write(prz, hdr, len);
+ persistent_ram_write(prz, hdr, len, PSTORE_RAM_LOCK);
kfree(hdr);
return len;
@@ -280,17 +280,17 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
if (type == PSTORE_TYPE_CONSOLE) {
if (!cxt->cprz)
return -ENOMEM;
- persistent_ram_write(cxt->cprz, buf, size);
+ persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
return 0;
} else if (type == PSTORE_TYPE_FTRACE) {
if (!cxt->fprz)
return -ENOMEM;
- persistent_ram_write(cxt->fprz, buf, size);
+ persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
return 0;
} else if (type == PSTORE_TYPE_PMSG) {
if (!cxt->mprz)
return -ENOMEM;
- persistent_ram_write(cxt->mprz, buf, size);
+ persistent_ram_write(cxt->mprz, buf, size, PSTORE_RAM_LOCK);
return 0;
}
@@ -324,7 +324,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
hlen = ramoops_write_kmsg_hdr(prz, compressed);
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
- persistent_ram_write(prz, buf, size);
+ persistent_ram_write(prz, buf, size, PSTORE_RAM_LOCK);
cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index cb92055..69c7b96 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -49,13 +49,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
}
/* increase and wrap the start pointer, returning the old value */
-static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a,
+ int lock)
{
int old;
int new;
unsigned long flags;
- raw_spin_lock_irqsave(&prz->buffer_lock, flags);
+ if (lock == PSTORE_RAM_LOCK)
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->start);
new = old + a;
@@ -63,19 +65,21 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
new -= prz->buffer_size;
atomic_set(&prz->buffer->start, new);
- raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
+ if (lock == PSTORE_RAM_LOCK)
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
return old;
}
/* increase the size counter until it hits the max size */
-static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add(struct persistent_ram_zone *prz, size_t a, int lock)
{
size_t old;
size_t new;
unsigned long flags;
- raw_spin_lock_irqsave(&prz->buffer_lock, flags);
+ if (lock == PSTORE_RAM_LOCK)
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->size);
if (old == prz->buffer_size)
@@ -87,7 +91,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
atomic_set(&prz->buffer->size, new);
exit:
- raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
+ if (lock == PSTORE_RAM_LOCK)
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
}
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
@@ -300,7 +305,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
}
int notrace persistent_ram_write(struct persistent_ram_zone *prz,
- const void *s, unsigned int count)
+ const void *s, unsigned int count, int lock)
{
int rem;
int c = count;
@@ -311,9 +316,9 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz,
c = prz->buffer_size;
}
- buffer_size_add(prz, c);
+ buffer_size_add(prz, c, lock);
- start = buffer_start_add(prz, c);
+ start = buffer_start_add(prz, c, lock);
rem = prz->buffer_size - start;
if (unlikely(rem < c)) {
@@ -342,9 +347,9 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
c = prz->buffer_size;
}
- buffer_size_add(prz, c);
+ buffer_size_add(prz, c, PSTORE_RAM_LOCK);
- start = buffer_start_add(prz, c);
+ start = buffer_start_add(prz, c, PSTORE_RAM_LOCK);
rem = prz->buffer_size - start;
if (unlikely(rem < c)) {
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 244d242..69f3487 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -24,6 +24,14 @@
#include <linux/list.h>
#include <linux/types.h>
+/*
+ * Choose whether access to the RAM zone requires locking or not. If a zone
+ * can be written to from different CPUs like with ftrace for example, then
+ * PSTORE_RAM_LOCK is used. For all other cases, PSTORE_RAM_NOLOCK should be used.
+ */
+#define PSTORE_RAM_NOLOCK 0
+#define PSTORE_RAM_LOCK 1
+
struct persistent_ram_buffer;
struct rs_control;
@@ -61,7 +69,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz);
void persistent_ram_zap(struct persistent_ram_zone *prz);
int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
- unsigned int count);
+ unsigned int count, int lock);
int persistent_ram_write_user(struct persistent_ram_zone *prz,
const void __user *s, unsigned int count);
--
2.7.4
Up until this patch, each of the per CPU buffers appear as a separate
ftrace-ramoops-N file. In this patch we merge all the zones into one and
populate the ftrace-ramoops-0 file.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 85 insertions(+), 6 deletions(-)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 5e96f78..547b4bb 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -181,6 +181,53 @@ static bool prz_ok(struct persistent_ram_zone *prz)
persistent_ram_ecc_string(prz, NULL, 0));
}
+static void ftrace_old_log_combine(struct persistent_ram_zone *dest,
+ struct persistent_ram_zone *src)
+{
+ int dest_size, src_size, total, destoff, srcoff, i = 0, j = 0, k = 0;
+ void *mbuf;
+ struct pstore_ftrace_record *drec, *srec, *mrec;
+ int record_size = sizeof(struct pstore_ftrace_record);
+
+ destoff = dest->old_log_size % record_size;
+ dest_size = dest->old_log_size - destoff;
+
+ srcoff = src->old_log_size % record_size;
+ src_size = src->old_log_size - srcoff;
+
+ total = dest_size + src_size;
+ mbuf = kmalloc(total, GFP_KERNEL);
+
+ drec = (struct pstore_ftrace_record *)(dest->old_log + destoff);
+ srec = (struct pstore_ftrace_record *)(src->old_log + srcoff);
+ mrec = (struct pstore_ftrace_record *)(mbuf);
+
+ while (dest_size > 0 && src_size > 0) {
+ if (pstore_ftrace_read_timestamp(&drec[i]) <
+ pstore_ftrace_read_timestamp(&srec[j])) {
+ mrec[k++] = drec[i++];
+ dest_size -= record_size;
+ } else {
+ mrec[k++] = srec[j++];
+ src_size -= record_size;
+ }
+ }
+
+ while (dest_size > 0) {
+ mrec[k++] = drec[i++];
+ dest_size -= record_size;
+ }
+
+ while (src_size > 0) {
+ mrec[k++] = srec[j++];
+ src_size -= record_size;
+ }
+
+ kfree(dest->old_log);
+ dest->old_log = mbuf;
+ dest->old_log_size = total;
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
@@ -190,7 +237,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
ssize_t size;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz = NULL;
- int header_length = 0;
+ int header_length = 0, free_prz = 0;
/* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
* PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
@@ -221,13 +268,39 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
if (!prz_ok(prz)) {
- int max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
- while (cxt->ftrace_read_cnt < max && !prz) {
+ if (!(cxt->flags & FTRACE_PER_CPU)) {
prz = ramoops_get_next_prz(cxt->fprzs,
- &cxt->ftrace_read_cnt, max, id, type,
+ &cxt->ftrace_read_cnt, 1, id, type,
PSTORE_TYPE_FTRACE, 0);
- if (!prz_ok(prz))
- continue;
+ } else {
+ /*
+ * Build a new dummy zone which combines all the
+ * per-cpu zones and accumulate the zone data and ecc
+ * info.
+ */
+ int max;
+ struct persistent_ram_zone *tmp_prz, *prz_next;
+
+ tmp_prz = kzalloc(sizeof(struct persistent_ram_zone),
+ GFP_KERNEL);
+ max = nr_cpu_ids;
+ while (cxt->ftrace_read_cnt < max) {
+ prz_next = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt, max, id,
+ type, PSTORE_TYPE_FTRACE, 0);
+
+ if (!prz_ok(prz_next))
+ continue;
+
+ tmp_prz->ecc_info = prz_next->ecc_info;
+ tmp_prz->corrected_bytes +=
+ prz_next->corrected_bytes;
+ tmp_prz->bad_blocks += prz_next->bad_blocks;
+ ftrace_old_log_combine(tmp_prz, prz_next);
+ }
+ *id = 0;
+ prz = tmp_prz;
+ free_prz = 1;
}
}
@@ -247,6 +320,12 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
return -ENOMEM;
memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+
+ if (free_prz) {
+ kfree(prz->old_log);
+ kfree(prz);
+ }
+
persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
return size;
--
2.7.4
In preparation for merging the per CPU buffers into one buffer when we retrieve
the pstore ftrace data, we store the timestamp as a counter in the ftrace
pstore record. We store the CPU number as well if !PSTORE_CPU_IN_IP, in this
case we shift the counter and may lose ordering there but we preserve the same
record size. The timestamp counter is also racy, and not doing any locking or
synchronization here results in the benefit of lower overhead. Since we don't
care much here for exact ordering of function traces across CPUs, we don't
synchronize and may lose some counter updates but I'm ok with that.
Using trace_clock() results in much lower performance so avoid using it since
we don't want accuracy in timestamp and need a rough ordering to perform merge.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ftrace.c | 3 +++
fs/pstore/inode.c | 7 ++---
fs/pstore/internal.h | 34 -------------------------
include/linux/pstore.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 76 insertions(+), 37 deletions(-)
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index d488770..1c6cf86 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -27,6 +27,8 @@
#include <asm/barrier.h>
#include "internal.h"
+static u64 pstore_ftrace_stamp;
+
static void notrace pstore_ftrace_call(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *op,
@@ -42,6 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.ip = ip;
rec.parent_ip = parent_ip;
+ pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
0, sizeof(rec), psinfo);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index ec9ddef..fcc5d1a 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -107,9 +107,10 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
- seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
- pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
- (void *)rec->ip, (void *)rec->parent_ip);
+ seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
+ pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec),
+ rec->ip, rec->parent_ip, (void *)rec->ip,
+ (void *)rec->parent_ip);
return 0;
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index e38a22b..da416e6 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -5,40 +5,6 @@
#include <linux/time.h>
#include <linux/pstore.h>
-#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
-#define PSTORE_CPU_IN_IP 0x1
-#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
-#define PSTORE_CPU_IN_IP 0x3
-#endif
-
-struct pstore_ftrace_record {
- unsigned long ip;
- unsigned long parent_ip;
-#ifndef PSTORE_CPU_IN_IP
- unsigned int cpu;
-#endif
-};
-
-static inline void
-pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
-{
-#ifndef PSTORE_CPU_IN_IP
- rec->cpu = cpu;
-#else
- rec->ip |= cpu;
-#endif
-}
-
-static inline unsigned int
-pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
-{
-#ifndef PSTORE_CPU_IN_IP
- return rec->cpu;
-#else
- return rec->ip & PSTORE_CPU_IN_IP;
-#endif
-}
-
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
extern void pstore_unregister_ftrace(void);
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 92013cc..4e7fc3c 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -89,4 +89,73 @@ extern int pstore_register(struct pstore_info *);
extern void pstore_unregister(struct pstore_info *);
extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
+struct pstore_ftrace_record {
+ unsigned long ip;
+ unsigned long parent_ip;
+ u64 ts;
+};
+
+/*
+ * ftrace related stuff: Both backends and frontends need these so expose them
+ */
+
+#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
+#define PSTORE_CPU_IN_IP 0x1
+#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
+#define PSTORE_CPU_IN_IP 0x3
+#endif
+
+#define TS_CPU_SHIFT 8
+#define TS_CPU_MASK (BIT(TS_CPU_SHIFT) - 1)
+
+/*
+ * If CPU number can be stored in IP, store it there
+ * else store it in the time stamp.
+ */
+#ifdef PSTORE_CPU_IN_IP
+static inline void
+pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
+{
+ rec->ip |= cpu;
+}
+
+static inline unsigned int
+pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
+{
+ return rec->ip & PSTORE_CPU_IN_IP;
+}
+
+static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
+{
+ return rec->ts;
+}
+
+static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
+{
+ rec->ts = val;
+}
+#else
+static inline void
+pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
+{
+ rec->ts &= ~(TS_CPU_MASK);
+ rec->ts |= cpu;
+}
+
+static inline unsigned int
+pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
+{
+ return rec->ts & TS_CPU_MASK;
+}
+
+static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
+{
+ return rec->ts >> TS_CPU_SHIFT;
+}
+
+static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
+{
+ rec->ts = (rec->ts & TS_CPU_MASK) | (val << TS_CPU_SHIFT);
+}
+#endif
#endif /*_LINUX_PSTORE_H*/
--
2.7.4
If FTRACE_PER_CPU flag is passed to ramoops pdata, split the space into
multiple zones depending on the number of CPUs.
This speeds up the performance of function tracing by about 280% in my tests as
we avoid the locking. The trade off being lesser space available per CPU. Let
the ramoops user decide which option they want based on pdata flag.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram.c | 72 +++++++++++++++++++++++++++++++++++-----------
include/linux/pstore_ram.h | 3 ++
2 files changed, 58 insertions(+), 17 deletions(-)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 5caa512..5e96f78 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -87,7 +87,7 @@ MODULE_PARM_DESC(ramoops_ecc,
struct ramoops_context {
struct persistent_ram_zone **przs;
struct persistent_ram_zone *cprz;
- struct persistent_ram_zone *fprz;
+ struct persistent_ram_zone **fprzs;
struct persistent_ram_zone *mprz;
phys_addr_t phys_addr;
unsigned long size;
@@ -97,6 +97,7 @@ struct ramoops_context {
size_t ftrace_size;
size_t pmsg_size;
int dump_oops;
+ int flags;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
@@ -219,9 +220,17 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
- if (!prz_ok(prz))
- prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
- 1, id, type, PSTORE_TYPE_FTRACE, 0);
+ if (!prz_ok(prz)) {
+ int max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
+ while (cxt->ftrace_read_cnt < max && !prz) {
+ prz = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt, max, id, type,
+ PSTORE_TYPE_FTRACE, 0);
+ if (!prz_ok(prz))
+ continue;
+ }
+ }
+
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
1, id, type, PSTORE_TYPE_PMSG, 0);
@@ -283,9 +292,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
return 0;
} else if (type == PSTORE_TYPE_FTRACE) {
- if (!cxt->fprz)
+ int zonenum, lock;
+
+ if (!cxt->fprzs)
return -ENOMEM;
- persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
+ /*
+ * If per-cpu buffers, don't lock. Otherwise there's only
+ * 1 zone for ftrace (zone 0) and all CPUs share it, so lock.
+ */
+ if (cxt->flags & FTRACE_PER_CPU) {
+ zonenum = smp_processor_id();
+ lock = PSTORE_RAM_NOLOCK;
+ } else {
+ zonenum = 0;
+ lock = PSTORE_RAM_LOCK;
+ }
+
+ persistent_ram_write(cxt->fprzs[zonenum], buf, size, lock);
return 0;
} else if (type == PSTORE_TYPE_PMSG) {
pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
@@ -352,6 +375,7 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
{
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz;
+ int max;
switch (type) {
case PSTORE_TYPE_DMESG:
@@ -363,7 +387,10 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
prz = cxt->cprz;
break;
case PSTORE_TYPE_FTRACE:
- prz = cxt->fprz;
+ max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
+ if (id >= max)
+ return -EINVAL;
+ prz = cxt->fprzs[id];
break;
case PSTORE_TYPE_PMSG:
prz = cxt->mprz;
@@ -394,14 +421,23 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
- if (!cxt->przs)
- return;
+ /* Free dump PRZs */
+ if (cxt->przs) {
+ for (i = 0; i < cxt->max_dump_cnt; i++)
+ persistent_ram_free(cxt->przs[i]);
- for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->przs[i]);
+ kfree(cxt->przs);
+ cxt->max_dump_cnt = 0;
+ }
- kfree(cxt->przs);
- cxt->max_dump_cnt = 0;
+ /* Free ftrace PRZs */
+ if (cxt->fprzs) {
+ int nr = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
+
+ for (i = 0; i < nr; i++)
+ persistent_ram_free(cxt->fprzs[i]);
+ kfree(cxt->fprzs);
+ }
}
static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
@@ -618,6 +654,7 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->ftrace_size = pdata->ftrace_size;
cxt->pmsg_size = pdata->pmsg_size;
cxt->dump_oops = pdata->dump_oops;
+ cxt->flags = pdata->flags;
cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
@@ -633,8 +670,9 @@ static int ramoops_probe(struct platform_device *pdev)
if (err)
goto fail_init_cprz;
- err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
- LINUX_VERSION_CODE);
+ err = ramoops_init_przs(dev, cxt, &cxt->fprzs, &paddr, cxt->ftrace_size,
+ (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1,
+ LINUX_VERSION_CODE);
if (err)
goto fail_init_fprz;
@@ -698,7 +736,6 @@ fail_clear:
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
fail_init_mprz:
- persistent_ram_free(cxt->fprz);
fail_init_fprz:
persistent_ram_free(cxt->cprz);
fail_init_cprz:
@@ -717,7 +754,6 @@ static int ramoops_remove(struct platform_device *pdev)
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->fprz);
persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
@@ -759,6 +795,8 @@ static void ramoops_register_dummy(void)
dummy_data->ftrace_size = ramoops_ftrace_size;
dummy_data->pmsg_size = ramoops_pmsg_size;
dummy_data->dump_oops = dump_oops;
+ dummy_data->flags = FTRACE_PER_CPU;
+
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 69f3487..122f78d 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -86,6 +86,8 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
* @mem_address physical memory address to contain ramoops
*/
+#define FTRACE_PER_CPU BIT(0)
+
struct ramoops_platform_data {
unsigned long mem_size;
phys_addr_t mem_address;
@@ -95,6 +97,7 @@ struct ramoops_platform_data {
unsigned long ftrace_size;
unsigned long pmsg_size;
int dump_oops;
+ int flags;
struct persistent_ram_ecc_info ecc_info;
};
--
2.7.4
PMSG now uses ramoops_pstore_write_buf_user instead of ramoops_pstore_write_buf.
Print a ratelimited warning if the ramoops_pstore_write_buf is called.
Signed-off-by: Joel Fernandes <[email protected]>
---
fs/pstore/ram.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index cb07ef6..9104ae9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -288,10 +288,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
return 0;
} else if (type == PSTORE_TYPE_PMSG) {
- if (!cxt->mprz)
- return -ENOMEM;
- persistent_ram_write(cxt->mprz, buf, size, PSTORE_RAM_LOCK);
- return 0;
+ pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
+ __func__);
}
if (type != PSTORE_TYPE_DMESG)
--
2.7.4
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> Currently pstore has a global spinlock for all zones. Since the zones are
> independent and modify different areas of memory, there's no need to have a
> global lock, so we should use a per-zone lock as introduced here. Also,
> ramoops's ftrace usecase has a FTRACE_PER_CPU flag introduced in this patch
> series which splits the ftrace memory area into a single zone per CPU thus
> eliminating the need for locking. In preparation for this, make the locking
> optional.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ram_core.c | 11 +++++------
> include/linux/pstore_ram.h | 1 +
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
> index 3975dee..cb92055 100644
> --- a/fs/pstore/ram_core.c
> +++ b/fs/pstore/ram_core.c
> @@ -48,8 +48,6 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
> return atomic_read(&prz->buffer->start);
> }
>
> -static DEFINE_RAW_SPINLOCK(buffer_lock);
> -
> /* increase and wrap the start pointer, returning the old value */
> static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
> {
> @@ -57,7 +55,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
> int new;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&buffer_lock, flags);
> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>
> old = atomic_read(&prz->buffer->start);
> new = old + a;
> @@ -65,7 +63,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
> new -= prz->buffer_size;
> atomic_set(&prz->buffer->start, new);
>
> - raw_spin_unlock_irqrestore(&buffer_lock, flags);
> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>
> return old;
> }
> @@ -77,7 +75,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
> size_t new;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&buffer_lock, flags);
> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>
> old = atomic_read(&prz->buffer->size);
> if (old == prz->buffer_size)
> @@ -89,7 +87,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
> atomic_set(&prz->buffer->size, new);
>
> exit:
> - raw_spin_unlock_irqrestore(&buffer_lock, flags);
> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
> }
>
> static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
> @@ -493,6 +491,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
>
> prz->buffer->sig = sig;
> persistent_ram_zap(prz);
> + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
>
> return 0;
> }
> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
> index c668c86..244d242 100644
> --- a/include/linux/pstore_ram.h
> +++ b/include/linux/pstore_ram.h
> @@ -40,6 +40,7 @@ struct persistent_ram_zone {
> void *vaddr;
> struct persistent_ram_buffer *buffer;
> size_t buffer_size;
> + raw_spinlock_t buffer_lock;
>
> /* ECC correction */
> char *par_buffer;
> --
> 2.7.4
>
This one looks good, thanks. I'll get this in for -next since it's a
good clean-up on its own.
Have you put these patches through scripts/checkpatch.pl? Some of the
lines wrap or have wide commit messages, but that's just cosmetic to
fix up.
-Kees
--
Kees Cook
Nexus Security
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> In preparation of not locking at all for certain buffers depending on if
> there's contention, make locking optional depending if caller requested it.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ram.c | 10 +++++-----
> fs/pstore/ram_core.c | 27 ++++++++++++++++-----------
> include/linux/pstore_ram.h | 10 +++++++++-
> 3 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index f1219af..cb07ef6 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -260,7 +260,7 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
> compressed ? 'C' : 'D');
> WARN_ON_ONCE(!hdr);
> len = hdr ? strlen(hdr) : 0;
> - persistent_ram_write(prz, hdr, len);
> + persistent_ram_write(prz, hdr, len, PSTORE_RAM_LOCK);
> kfree(hdr);
>
> return len;
> @@ -280,17 +280,17 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
> if (type == PSTORE_TYPE_CONSOLE) {
> if (!cxt->cprz)
> return -ENOMEM;
> - persistent_ram_write(cxt->cprz, buf, size);
> + persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
> return 0;
> } else if (type == PSTORE_TYPE_FTRACE) {
> if (!cxt->fprz)
> return -ENOMEM;
> - persistent_ram_write(cxt->fprz, buf, size);
> + persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
> return 0;
> } else if (type == PSTORE_TYPE_PMSG) {
> if (!cxt->mprz)
> return -ENOMEM;
> - persistent_ram_write(cxt->mprz, buf, size);
> + persistent_ram_write(cxt->mprz, buf, size, PSTORE_RAM_LOCK);
> return 0;
> }
>
> @@ -324,7 +324,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
> hlen = ramoops_write_kmsg_hdr(prz, compressed);
> if (size + hlen > prz->buffer_size)
> size = prz->buffer_size - hlen;
> - persistent_ram_write(prz, buf, size);
> + persistent_ram_write(prz, buf, size, PSTORE_RAM_LOCK);
>
> cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
>
> diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
> index cb92055..69c7b96 100644
> --- a/fs/pstore/ram_core.c
> +++ b/fs/pstore/ram_core.c
> @@ -49,13 +49,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
> }
>
> /* increase and wrap the start pointer, returning the old value */
> -static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
> +static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a,
> + int lock)
> {
> int old;
> int new;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&prz->buffer_lock, flags);
> + if (lock == PSTORE_RAM_LOCK)
> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>
> old = atomic_read(&prz->buffer->start);
> new = old + a;
> @@ -63,19 +65,21 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
> new -= prz->buffer_size;
> atomic_set(&prz->buffer->start, new);
>
> - raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
> + if (lock == PSTORE_RAM_LOCK)
> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>
> return old;
> }
>
> /* increase the size counter until it hits the max size */
> -static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
> +static void buffer_size_add(struct persistent_ram_zone *prz, size_t a, int lock)
> {
> size_t old;
> size_t new;
> unsigned long flags;
>
> - raw_spin_lock_irqsave(&prz->buffer_lock, flags);
> + if (lock == PSTORE_RAM_LOCK)
> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>
> old = atomic_read(&prz->buffer->size);
> if (old == prz->buffer_size)
> @@ -87,7 +91,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
> atomic_set(&prz->buffer->size, new);
>
> exit:
> - raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
> + if (lock == PSTORE_RAM_LOCK)
> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
> }
>
> static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
Does this pass a sparse test? Depending on how dumb sparse is, you may
need to add "else" statements to the lock == RAM_LOCK tests:
else
__acquire(&prz->buffer_lock);
...
else
__release(&prz->buffer_lock);
Otherwise, this looks fine. :)
-Kees
--
Kees Cook
Nexus Security
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> PMSG now uses ramoops_pstore_write_buf_user instead of ramoops_pstore_write_buf
> Print a ratelimited warning if the ramoops_pstore_write_buf is called.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ram.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index cb07ef6..9104ae9 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -288,10 +288,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
> persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
> return 0;
> } else if (type == PSTORE_TYPE_PMSG) {
> - if (!cxt->mprz)
> - return -ENOMEM;
> - persistent_ram_write(cxt->mprz, buf, size, PSTORE_RAM_LOCK);
> - return 0;
> + pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
> + __func__);
> }
>
> if (type != PSTORE_TYPE_DMESG)
> --
> 2.7.4
>
Cool, I'll take this into -next too since it's stand-alone.
Thanks!
-Kees
--
Kees Cook
Nexus Security
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> Currently ramoops_init_przs is hard wired only for panic dump zone array. In
> preparation for the ftrace zone array (one zone per-cpu), make the function
> more generic to be able to handle this case.
>
> Add a new ramoops_init_dump_przs function and use the generic function for the
> dump prz array.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ram.c | 72 ++++++++++++++++++++++++++++++++++++---------------------
> 1 file changed, 46 insertions(+), 26 deletions(-)
>
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index 9104ae9..5caa512 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -405,54 +405,74 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
> }
>
> static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
> - phys_addr_t *paddr, size_t dump_mem_sz)
> + struct persistent_ram_zone ***przs,
> + phys_addr_t *paddr, size_t mem_sz,
> + unsigned int cnt, u32 sig)
> {
> int err = -ENOMEM;
> int i;
> + size_t zone_sz;
> + struct persistent_ram_zone **prz_ar;
>
> - if (!cxt->record_size)
> - return 0;
> -
> - if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
> - dev_err(dev, "no room for dumps\n");
> + if (*paddr + mem_sz - cxt->phys_addr > cxt->size) {
> + dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
> + mem_sz, (unsigned long long)*paddr,
> + cxt->size, (unsigned long long)cxt->phys_addr);
> return -ENOMEM;
> }
>
> - cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
> - if (!cxt->max_dump_cnt)
> + zone_sz = mem_sz / cnt;
> + if (!zone_sz)
> return -ENOMEM;
>
> - cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt,
> - GFP_KERNEL);
> - if (!cxt->przs) {
> - dev_err(dev, "failed to initialize a prz array for dumps\n");
> - goto fail_mem;
> + prz_ar = kcalloc(cnt, sizeof(**przs), GFP_KERNEL);
> + if (!prz_ar) {
> + dev_err(dev, "Failed to initialize prz array\n");
> + return -ENOMEM;
> }
>
> - for (i = 0; i < cxt->max_dump_cnt; i++) {
> - cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
> + for (i = 0; i < cnt; i++) {
> + prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
> &cxt->ecc_info,
> cxt->memtype);
> - if (IS_ERR(cxt->przs[i])) {
> - err = PTR_ERR(cxt->przs[i]);
> + if (IS_ERR(prz_ar[i])) {
> + err = PTR_ERR(prz_ar[i]);
> dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
> cxt->record_size, (unsigned long long)*paddr, err);
>
> while (i > 0) {
> i--;
> - persistent_ram_free(cxt->przs[i]);
> + persistent_ram_free(prz_ar[i]);
> }
> - goto fail_prz;
> + kfree(prz_ar);
> + return err;
> }
> - *paddr += cxt->record_size;
> + *paddr += zone_sz;
> }
>
> + *przs = prz_ar;
> return 0;
> -fail_prz:
> - kfree(cxt->przs);
> -fail_mem:
> - cxt->max_dump_cnt = 0;
> - return err;
> +}
> +
> +static int ramoops_init_dump_przs(struct device *dev,
> + struct ramoops_context *cxt,
> + phys_addr_t *paddr, size_t mem_sz)
> +{
> + int ret;
> +
> + if (!cxt->record_size)
> + return 0;
> +
> + cxt->max_dump_cnt = mem_sz / cxt->record_size;
> + if (!cxt->max_dump_cnt)
> + return -ENOMEM;
> +
> + ret = ramoops_init_przs(dev, cxt, &cxt->przs, paddr, mem_sz,
> + cxt->max_dump_cnt, 0);
> + if (ret)
> + cxt->max_dump_cnt = 0;
> +
> + return ret;
> }
>
> static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
> @@ -604,7 +624,7 @@ static int ramoops_probe(struct platform_device *pdev)
>
> dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
> - cxt->pmsg_size;
> - err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
> + err = ramoops_init_dump_przs(dev, cxt, &paddr, dump_mem_sz);
> if (err)
> goto fail_out;
>
> --
> 2.7.4
>
This looks good. One thing that I think needs to be adjusted is that
the actual size of the allocation ("cnt") keeps being recalculated and
passed around. It seems like the counts need to be managed for each
prz array in the context. (i.e. dump_max_cnt exists, but not for the
ftrace prz array). What do you think of adding that?
-Kees
--
Kees Cook
Nexus Security
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> If FTRACE_PER_CPU flag is passed to ramoops pdata, split the space into
> multiple zones depending on the number of CPUs.
>
> This speeds up the performance of function tracing by about 280% in my tests as
> we avoid the locking. The trade off being lesser space available per CPU. Let
> the ramoops user decide which option they want based on pdata flag.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ram.c | 72 +++++++++++++++++++++++++++++++++++-----------
> include/linux/pstore_ram.h | 3 ++
> 2 files changed, 58 insertions(+), 17 deletions(-)
>
> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
> index 5caa512..5e96f78 100644
> --- a/fs/pstore/ram.c
> +++ b/fs/pstore/ram.c
> @@ -87,7 +87,7 @@ MODULE_PARM_DESC(ramoops_ecc,
> struct ramoops_context {
> struct persistent_ram_zone **przs;
> struct persistent_ram_zone *cprz;
> - struct persistent_ram_zone *fprz;
> + struct persistent_ram_zone **fprzs;
> struct persistent_ram_zone *mprz;
> phys_addr_t phys_addr;
> unsigned long size;
> @@ -97,6 +97,7 @@ struct ramoops_context {
> size_t ftrace_size;
> size_t pmsg_size;
> int dump_oops;
> + int flags;
> struct persistent_ram_ecc_info ecc_info;
> unsigned int max_dump_cnt;
> unsigned int dump_write_cnt;
> @@ -219,9 +220,17 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
> if (!prz_ok(prz))
> prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
> 1, id, type, PSTORE_TYPE_CONSOLE, 0);
> - if (!prz_ok(prz))
> - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
> - 1, id, type, PSTORE_TYPE_FTRACE, 0);
> + if (!prz_ok(prz)) {
> + int max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
> + while (cxt->ftrace_read_cnt < max && !prz) {
> + prz = ramoops_get_next_prz(cxt->fprzs,
> + &cxt->ftrace_read_cnt, max, id, type,
> + PSTORE_TYPE_FTRACE, 0);
> + if (!prz_ok(prz))
> + continue;
> + }
> + }
> +
> if (!prz_ok(prz))
> prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
> 1, id, type, PSTORE_TYPE_PMSG, 0);
> @@ -283,9 +292,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
> persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
> return 0;
> } else if (type == PSTORE_TYPE_FTRACE) {
> - if (!cxt->fprz)
> + int zonenum, lock;
> +
> + if (!cxt->fprzs)
> return -ENOMEM;
> - persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
> + /*
> + * If per-cpu buffers, don't lock. Otherwise there's only
> + * 1 zone for ftrace (zone 0) and all CPUs share it, so lock.
> + */
> + if (cxt->flags & FTRACE_PER_CPU) {
> + zonenum = smp_processor_id();
> + lock = PSTORE_RAM_NOLOCK;
> + } else {
> + zonenum = 0;
> + lock = PSTORE_RAM_LOCK;
> + }
> +
> + persistent_ram_write(cxt->fprzs[zonenum], buf, size, lock);
> return 0;
> } else if (type == PSTORE_TYPE_PMSG) {
> pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
> @@ -352,6 +375,7 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
> {
> struct ramoops_context *cxt = psi->data;
> struct persistent_ram_zone *prz;
> + int max;
>
> switch (type) {
> case PSTORE_TYPE_DMESG:
> @@ -363,7 +387,10 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
> prz = cxt->cprz;
> break;
> case PSTORE_TYPE_FTRACE:
> - prz = cxt->fprz;
> + max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
> + if (id >= max)
> + return -EINVAL;
> + prz = cxt->fprzs[id];
> break;
> case PSTORE_TYPE_PMSG:
> prz = cxt->mprz;
> @@ -394,14 +421,23 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
> {
> int i;
>
> - if (!cxt->przs)
> - return;
> + /* Free dump PRZs */
> + if (cxt->przs) {
> + for (i = 0; i < cxt->max_dump_cnt; i++)
> + persistent_ram_free(cxt->przs[i]);
>
> - for (i = 0; i < cxt->max_dump_cnt; i++)
> - persistent_ram_free(cxt->przs[i]);
> + kfree(cxt->przs);
> + cxt->max_dump_cnt = 0;
> + }
>
> - kfree(cxt->przs);
> - cxt->max_dump_cnt = 0;
> + /* Free ftrace PRZs */
> + if (cxt->fprzs) {
> + int nr = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
> +
> + for (i = 0; i < nr; i++)
> + persistent_ram_free(cxt->fprzs[i]);
> + kfree(cxt->fprzs);
> + }
> }
>
> static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
> @@ -618,6 +654,7 @@ static int ramoops_probe(struct platform_device *pdev)
> cxt->ftrace_size = pdata->ftrace_size;
> cxt->pmsg_size = pdata->pmsg_size;
> cxt->dump_oops = pdata->dump_oops;
> + cxt->flags = pdata->flags;
> cxt->ecc_info = pdata->ecc_info;
>
> paddr = cxt->phys_addr;
> @@ -633,8 +670,9 @@ static int ramoops_probe(struct platform_device *pdev)
> if (err)
> goto fail_init_cprz;
>
> - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
> - LINUX_VERSION_CODE);
> + err = ramoops_init_przs(dev, cxt, &cxt->fprzs, &paddr, cxt->ftrace_size,
> + (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1,
> + LINUX_VERSION_CODE);
> if (err)
> goto fail_init_fprz;
>
> @@ -698,7 +736,6 @@ fail_clear:
> cxt->pstore.bufsize = 0;
> persistent_ram_free(cxt->mprz);
> fail_init_mprz:
> - persistent_ram_free(cxt->fprz);
> fail_init_fprz:
> persistent_ram_free(cxt->cprz);
> fail_init_cprz:
> @@ -717,7 +754,6 @@ static int ramoops_remove(struct platform_device *pdev)
> cxt->pstore.bufsize = 0;
>
> persistent_ram_free(cxt->mprz);
> - persistent_ram_free(cxt->fprz);
> persistent_ram_free(cxt->cprz);
> ramoops_free_przs(cxt);
>
> @@ -759,6 +795,8 @@ static void ramoops_register_dummy(void)
> dummy_data->ftrace_size = ramoops_ftrace_size;
> dummy_data->pmsg_size = ramoops_pmsg_size;
> dummy_data->dump_oops = dump_oops;
> + dummy_data->flags = FTRACE_PER_CPU;
> +
> /*
> * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
> * (using 1 byte for ECC isn't much of use anyway).
> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
> index 69f3487..122f78d 100644
> --- a/include/linux/pstore_ram.h
> +++ b/include/linux/pstore_ram.h
> @@ -86,6 +86,8 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
> * @mem_address physical memory address to contain ramoops
> */
>
> +#define FTRACE_PER_CPU BIT(0)
> +
> struct ramoops_platform_data {
> unsigned long mem_size;
> phys_addr_t mem_address;
> @@ -95,6 +97,7 @@ struct ramoops_platform_data {
> unsigned long ftrace_size;
> unsigned long pmsg_size;
> int dump_oops;
> + int flags;
> struct persistent_ram_ecc_info ecc_info;
> };
>
> --
> 2.7.4
>
Is there a case for not setting FTRACE_PER_CPU?
-Kees
--
Kees Cook
Nexus Security
On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
> In preparation for merging the per CPU buffers into one buffer when we retrieve
> the pstore ftrace data, we store the timestamp as a counter in the ftrace
> pstore record. We store the CPU number as well if !PSTORE_CPU_IN_IP, in this
> case we shift the counter and may lose ordering there but we preserve the same
> record size. The timestamp counter is also racy, and not doing any locking or
> synchronization here results in the benefit of lower overhead. Since we don't
> care much here for exact ordering of function traces across CPUs, we don't
> synchronize and may lose some counter updates but I'm ok with that.
>
> Using trace_clock() results in much lower performance so avoid using it since
> we don't want accuracy in timestamp and need a rough ordering to perform merge.
>
> Signed-off-by: Joel Fernandes <[email protected]>
> ---
> fs/pstore/ftrace.c | 3 +++
> fs/pstore/inode.c | 7 ++---
> fs/pstore/internal.h | 34 -------------------------
> include/linux/pstore.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 76 insertions(+), 37 deletions(-)
>
> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
> index d488770..1c6cf86 100644
> --- a/fs/pstore/ftrace.c
> +++ b/fs/pstore/ftrace.c
> @@ -27,6 +27,8 @@
> #include <asm/barrier.h>
> #include "internal.h"
>
> +static u64 pstore_ftrace_stamp;
I think the race described in the commit message deserves a comment here.
> +
> static void notrace pstore_ftrace_call(unsigned long ip,
> unsigned long parent_ip,
> struct ftrace_ops *op,
> @@ -42,6 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
>
> rec.ip = ip;
> rec.parent_ip = parent_ip;
> + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
> pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
> psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
> 0, sizeof(rec), psinfo);
> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
> index ec9ddef..fcc5d1a 100644
> --- a/fs/pstore/inode.c
> +++ b/fs/pstore/inode.c
> @@ -107,9 +107,10 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
> struct pstore_ftrace_seq_data *data = v;
> struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
>
> - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
> - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
> - (void *)rec->ip, (void *)rec->parent_ip);
> + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
> + pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec),
> + rec->ip, rec->parent_ip, (void *)rec->ip,
> + (void *)rec->parent_ip);
>
> return 0;
> }
> diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
> index e38a22b..da416e6 100644
> --- a/fs/pstore/internal.h
> +++ b/fs/pstore/internal.h
> @@ -5,40 +5,6 @@
> #include <linux/time.h>
> #include <linux/pstore.h>
>
> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
> -#define PSTORE_CPU_IN_IP 0x1
> -#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
> -#define PSTORE_CPU_IN_IP 0x3
> -#endif
> -
> -struct pstore_ftrace_record {
> - unsigned long ip;
> - unsigned long parent_ip;
> -#ifndef PSTORE_CPU_IN_IP
> - unsigned int cpu;
> -#endif
> -};
> -
> -static inline void
> -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
> -{
> -#ifndef PSTORE_CPU_IN_IP
> - rec->cpu = cpu;
> -#else
> - rec->ip |= cpu;
> -#endif
> -}
> -
> -static inline unsigned int
> -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
> -{
> -#ifndef PSTORE_CPU_IN_IP
> - return rec->cpu;
> -#else
> - return rec->ip & PSTORE_CPU_IN_IP;
> -#endif
> -}
> -
> #ifdef CONFIG_PSTORE_FTRACE
> extern void pstore_register_ftrace(void);
> extern void pstore_unregister_ftrace(void);
> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
> index 92013cc..4e7fc3c 100644
> --- a/include/linux/pstore.h
> +++ b/include/linux/pstore.h
> @@ -89,4 +89,73 @@ extern int pstore_register(struct pstore_info *);
> extern void pstore_unregister(struct pstore_info *);
> extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
>
> +struct pstore_ftrace_record {
> + unsigned long ip;
> + unsigned long parent_ip;
> + u64 ts;
> +};
> +
> +/*
> + * ftrace related stuff: Both backends and frontends need these so expose them
> + */
> +
> +#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
> +#define PSTORE_CPU_IN_IP 0x1
> +#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
> +#define PSTORE_CPU_IN_IP 0x3
> +#endif
> +
> +#define TS_CPU_SHIFT 8
> +#define TS_CPU_MASK (BIT(TS_CPU_SHIFT) - 1)
> +
> +/*
> + * If CPU number can be stored in IP, store it there
> + * else store it in the time stamp.
> + */
> +#ifdef PSTORE_CPU_IN_IP
> +static inline void
> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
> +{
> + rec->ip |= cpu;
> +}
> +
> +static inline unsigned int
> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
> +{
> + return rec->ip & PSTORE_CPU_IN_IP;
> +}
> +
> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
> +{
> + return rec->ts;
> +}
> +
> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
> +{
> + rec->ts = val;
> +}
> +#else
> +static inline void
> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
> +{
> + rec->ts &= ~(TS_CPU_MASK);
> + rec->ts |= cpu;
> +}
> +
> +static inline unsigned int
> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
> +{
> + return rec->ts & TS_CPU_MASK;
> +}
> +
> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
> +{
> + return rec->ts >> TS_CPU_SHIFT;
> +}
> +
> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
> +{
> + rec->ts = (rec->ts & TS_CPU_MASK) | (val << TS_CPU_SHIFT);
> +}
> +#endif
> #endif /*_LINUX_PSTORE_H*/
> --
> 2.7.4
>
Should these ftrace helpers live somewhere else? It seems really
specific to ftrace, not pstore?
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 10, 2016 at 4:13 PM, Kees Cook <[email protected]> wrote:
> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>> If FTRACE_PER_CPU flag is passed to ramoops pdata, split the space into
>> multiple zones depending on the number of CPUs.
>>
>> This speeds up the performance of function tracing by about 280% in my tests as
>> we avoid the locking. The trade off being lesser space available per CPU. Let
>> the ramoops user decide which option they want based on pdata flag.
>>
>> Signed-off-by: Joel Fernandes <[email protected]>
>> ---
>> fs/pstore/ram.c | 72 +++++++++++++++++++++++++++++++++++-----------
>> include/linux/pstore_ram.h | 3 ++
>> 2 files changed, 58 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
>> index 5caa512..5e96f78 100644
>> --- a/fs/pstore/ram.c
>> +++ b/fs/pstore/ram.c
>> @@ -87,7 +87,7 @@ MODULE_PARM_DESC(ramoops_ecc,
>> struct ramoops_context {
>> struct persistent_ram_zone **przs;
>> struct persistent_ram_zone *cprz;
>> - struct persistent_ram_zone *fprz;
>> + struct persistent_ram_zone **fprzs;
>> struct persistent_ram_zone *mprz;
>> phys_addr_t phys_addr;
>> unsigned long size;
>> @@ -97,6 +97,7 @@ struct ramoops_context {
>> size_t ftrace_size;
>> size_t pmsg_size;
>> int dump_oops;
>> + int flags;
>> struct persistent_ram_ecc_info ecc_info;
>> unsigned int max_dump_cnt;
>> unsigned int dump_write_cnt;
>> @@ -219,9 +220,17 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
>> if (!prz_ok(prz))
>> prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
>> 1, id, type, PSTORE_TYPE_CONSOLE, 0);
>> - if (!prz_ok(prz))
>> - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
>> - 1, id, type, PSTORE_TYPE_FTRACE, 0);
>> + if (!prz_ok(prz)) {
>> + int max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>> + while (cxt->ftrace_read_cnt < max && !prz) {
>> + prz = ramoops_get_next_prz(cxt->fprzs,
>> + &cxt->ftrace_read_cnt, max, id, type,
>> + PSTORE_TYPE_FTRACE, 0);
>> + if (!prz_ok(prz))
>> + continue;
>> + }
>> + }
>> +
>> if (!prz_ok(prz))
>> prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
>> 1, id, type, PSTORE_TYPE_PMSG, 0);
>> @@ -283,9 +292,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
>> persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
>> return 0;
>> } else if (type == PSTORE_TYPE_FTRACE) {
>> - if (!cxt->fprz)
>> + int zonenum, lock;
>> +
>> + if (!cxt->fprzs)
>> return -ENOMEM;
>> - persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
>> + /*
>> + * If per-cpu buffers, don't lock. Otherwise there's only
>> + * 1 zone for ftrace (zone 0) and all CPUs share it, so lock.
>> + */
>> + if (cxt->flags & FTRACE_PER_CPU) {
>> + zonenum = smp_processor_id();
>> + lock = PSTORE_RAM_NOLOCK;
>> + } else {
>> + zonenum = 0;
>> + lock = PSTORE_RAM_LOCK;
>> + }
>> +
>> + persistent_ram_write(cxt->fprzs[zonenum], buf, size, lock);
>> return 0;
>> } else if (type == PSTORE_TYPE_PMSG) {
>> pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
>> @@ -352,6 +375,7 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
>> {
>> struct ramoops_context *cxt = psi->data;
>> struct persistent_ram_zone *prz;
>> + int max;
>>
>> switch (type) {
>> case PSTORE_TYPE_DMESG:
>> @@ -363,7 +387,10 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
>> prz = cxt->cprz;
>> break;
>> case PSTORE_TYPE_FTRACE:
>> - prz = cxt->fprz;
>> + max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>> + if (id >= max)
>> + return -EINVAL;
>> + prz = cxt->fprzs[id];
>> break;
>> case PSTORE_TYPE_PMSG:
>> prz = cxt->mprz;
>> @@ -394,14 +421,23 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
>> {
>> int i;
>>
>> - if (!cxt->przs)
>> - return;
>> + /* Free dump PRZs */
>> + if (cxt->przs) {
>> + for (i = 0; i < cxt->max_dump_cnt; i++)
>> + persistent_ram_free(cxt->przs[i]);
>>
>> - for (i = 0; i < cxt->max_dump_cnt; i++)
>> - persistent_ram_free(cxt->przs[i]);
>> + kfree(cxt->przs);
>> + cxt->max_dump_cnt = 0;
>> + }
>>
>> - kfree(cxt->przs);
>> - cxt->max_dump_cnt = 0;
>> + /* Free ftrace PRZs */
>> + if (cxt->fprzs) {
>> + int nr = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>> +
>> + for (i = 0; i < nr; i++)
>> + persistent_ram_free(cxt->fprzs[i]);
>> + kfree(cxt->fprzs);
>> + }
>> }
>>
>> static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
>> @@ -618,6 +654,7 @@ static int ramoops_probe(struct platform_device *pdev)
>> cxt->ftrace_size = pdata->ftrace_size;
>> cxt->pmsg_size = pdata->pmsg_size;
>> cxt->dump_oops = pdata->dump_oops;
>> + cxt->flags = pdata->flags;
>> cxt->ecc_info = pdata->ecc_info;
>>
>> paddr = cxt->phys_addr;
>> @@ -633,8 +670,9 @@ static int ramoops_probe(struct platform_device *pdev)
>> if (err)
>> goto fail_init_cprz;
>>
>> - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
>> - LINUX_VERSION_CODE);
>> + err = ramoops_init_przs(dev, cxt, &cxt->fprzs, &paddr, cxt->ftrace_size,
>> + (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1,
>> + LINUX_VERSION_CODE);
>> if (err)
>> goto fail_init_fprz;
>>
>> @@ -698,7 +736,6 @@ fail_clear:
>> cxt->pstore.bufsize = 0;
>> persistent_ram_free(cxt->mprz);
>> fail_init_mprz:
>> - persistent_ram_free(cxt->fprz);
>> fail_init_fprz:
>> persistent_ram_free(cxt->cprz);
>> fail_init_cprz:
>> @@ -717,7 +754,6 @@ static int ramoops_remove(struct platform_device *pdev)
>> cxt->pstore.bufsize = 0;
>>
>> persistent_ram_free(cxt->mprz);
>> - persistent_ram_free(cxt->fprz);
>> persistent_ram_free(cxt->cprz);
>> ramoops_free_przs(cxt);
>>
>> @@ -759,6 +795,8 @@ static void ramoops_register_dummy(void)
>> dummy_data->ftrace_size = ramoops_ftrace_size;
>> dummy_data->pmsg_size = ramoops_pmsg_size;
>> dummy_data->dump_oops = dump_oops;
>> + dummy_data->flags = FTRACE_PER_CPU;
>> +
>> /*
>> * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
>> * (using 1 byte for ECC isn't much of use anyway).
>> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
>> index 69f3487..122f78d 100644
>> --- a/include/linux/pstore_ram.h
>> +++ b/include/linux/pstore_ram.h
>> @@ -86,6 +86,8 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
>> * @mem_address physical memory address to contain ramoops
>> */
>>
>> +#define FTRACE_PER_CPU BIT(0)
>> +
>> struct ramoops_platform_data {
>> unsigned long mem_size;
>> phys_addr_t mem_address;
>> @@ -95,6 +97,7 @@ struct ramoops_platform_data {
>> unsigned long ftrace_size;
>> unsigned long pmsg_size;
>> int dump_oops;
>> + int flags;
>> struct persistent_ram_ecc_info ecc_info;
>> };
>>
>> --
>> 2.7.4
>>
>
> Is there a case for not setting FTRACE_PER_CPU?
Yes, if there are too many CPUs and only a few are in use, then the
rest of the space allocated for the other CPUs not in use goes to waste. If
the ram zone is big enough though, then it still makes sense even
for the many CPUs case.
Thanks,
Joel
On Thu, Nov 10, 2016 at 4:15 PM, Kees Cook <[email protected]> wrote:
> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>> In preparation for merging the per CPU buffers into one buffer when we retrieve
>> the pstore ftrace data, we store the timestamp as a counter in the ftrace
>> pstore record. We store the CPU number as well if !PSTORE_CPU_IN_IP, in this
>> case we shift the counter and may lose ordering there but we preserve the same
>> record size. The timestamp counter is also racy, and not doing any locking or
>> synchronization here results in the benefit of lower overhead. Since we don't
>> care much here for exact ordering of function traces across CPUs, we don't
>> synchronize and may lose some counter updates but I'm ok with that.
>>
>> Using trace_clock() results in much lower performance so avoid using it since
>> we don't want accuracy in timestamp and need a rough ordering to perform merge.
>>
>> Signed-off-by: Joel Fernandes <[email protected]>
>> ---
>> fs/pstore/ftrace.c | 3 +++
>> fs/pstore/inode.c | 7 ++---
>> fs/pstore/internal.h | 34 -------------------------
>> include/linux/pstore.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 76 insertions(+), 37 deletions(-)
>>
>> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
>> index d488770..1c6cf86 100644
>> --- a/fs/pstore/ftrace.c
>> +++ b/fs/pstore/ftrace.c
>> @@ -27,6 +27,8 @@
>> #include <asm/barrier.h>
>> #include "internal.h"
>>
>> +static u64 pstore_ftrace_stamp;
>
> I think the race described in the commit message deserves a comment here.
Sure, I can add that.
>> +
>> static void notrace pstore_ftrace_call(unsigned long ip,
>> unsigned long parent_ip,
>> struct ftrace_ops *op,
>> @@ -42,6 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
>>
>> rec.ip = ip;
>> rec.parent_ip = parent_ip;
>> + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
>> pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
>> psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
>> 0, sizeof(rec), psinfo);
>> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
>> index ec9ddef..fcc5d1a 100644
>> --- a/fs/pstore/inode.c
>> +++ b/fs/pstore/inode.c
>> @@ -107,9 +107,10 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
>> struct pstore_ftrace_seq_data *data = v;
>> struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
>>
>> - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
>> - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
>> - (void *)rec->ip, (void *)rec->parent_ip);
>> + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
>> + pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec),
>> + rec->ip, rec->parent_ip, (void *)rec->ip,
>> + (void *)rec->parent_ip);
>>
>> return 0;
>> }
>> diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
>> index e38a22b..da416e6 100644
>> --- a/fs/pstore/internal.h
>> +++ b/fs/pstore/internal.h
>> @@ -5,40 +5,6 @@
>> #include <linux/time.h>
>> #include <linux/pstore.h>
>>
>> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
>> -#define PSTORE_CPU_IN_IP 0x1
>> -#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
>> -#define PSTORE_CPU_IN_IP 0x3
>> -#endif
>> -
>> -struct pstore_ftrace_record {
>> - unsigned long ip;
>> - unsigned long parent_ip;
>> -#ifndef PSTORE_CPU_IN_IP
>> - unsigned int cpu;
>> -#endif
>> -};
>> -
>> -static inline void
>> -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>> -{
>> -#ifndef PSTORE_CPU_IN_IP
>> - rec->cpu = cpu;
>> -#else
>> - rec->ip |= cpu;
>> -#endif
>> -}
>> -
>> -static inline unsigned int
>> -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>> -{
>> -#ifndef PSTORE_CPU_IN_IP
>> - return rec->cpu;
>> -#else
>> - return rec->ip & PSTORE_CPU_IN_IP;
>> -#endif
>> -}
>> -
>> #ifdef CONFIG_PSTORE_FTRACE
>> extern void pstore_register_ftrace(void);
>> extern void pstore_unregister_ftrace(void);
>> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
>> index 92013cc..4e7fc3c 100644
>> --- a/include/linux/pstore.h
>> +++ b/include/linux/pstore.h
>> @@ -89,4 +89,73 @@ extern int pstore_register(struct pstore_info *);
>> extern void pstore_unregister(struct pstore_info *);
>> extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
>>
>> +struct pstore_ftrace_record {
>> + unsigned long ip;
>> + unsigned long parent_ip;
>> + u64 ts;
>> +};
>> +
>> +/*
>> + * ftrace related stuff: Both backends and frontends need these so expose them
>> + */
>> +
>> +#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
>> +#define PSTORE_CPU_IN_IP 0x1
>> +#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
>> +#define PSTORE_CPU_IN_IP 0x3
>> +#endif
>> +
>> +#define TS_CPU_SHIFT 8
>> +#define TS_CPU_MASK (BIT(TS_CPU_SHIFT) - 1)
>> +
>> +/*
>> + * If CPU number can be stored in IP, store it there
>> + * else store it in the time stamp.
>> + */
>> +#ifdef PSTORE_CPU_IN_IP
>> +static inline void
>> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>> +{
>> + rec->ip |= cpu;
>> +}
>> +
>> +static inline unsigned int
>> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>> +{
>> + return rec->ip & PSTORE_CPU_IN_IP;
>> +}
>> +
>> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
>> +{
>> + return rec->ts;
>> +}
>> +
>> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
>> +{
>> + rec->ts = val;
>> +}
>> +#else
>> +static inline void
>> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>> +{
>> + rec->ts &= ~(TS_CPU_MASK);
>> + rec->ts |= cpu;
>> +}
>> +
>> +static inline unsigned int
>> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>> +{
>> + return rec->ts & TS_CPU_MASK;
>> +}
>> +
>> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
>> +{
>> + return rec->ts >> TS_CPU_SHIFT;
>> +}
>> +
>> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
>> +{
>> + rec->ts = (rec->ts & TS_CPU_MASK) | (val << TS_CPU_SHIFT);
>> +}
>> +#endif
>> #endif /*_LINUX_PSTORE_H*/
>> --
>> 2.7.4
>>
>
> Should these ftrace helpers live somewhere else? It seems really
> specific to ftrace, not pstore?
>
> -Kees
>
Actually the time stamp and CPU information are not specific to low
level ftrace hooks. Ftrace just provides the hooks into the function
calls; it's up to the user to store this information how they want. So
the pstore ftrace helpers should be included in the pstore header IMO.
Thanks,
Joel
On Thu, Nov 10, 2016 at 4:06 PM, Kees Cook <[email protected]> wrote:
> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>> Currently pstore has a global spinlock for all zones. Since the zones are
>> independent and modify different areas of memory, there's no need to have a
>> global lock, so we should use a per-zone lock as introduced here. Also,
>> ramoops's ftrace usecase has a FTRACE_PER_CPU flag introduced in this patch
>> series which splits the ftrace memory area into a single zone per CPU thus
>> eliminating the need for locking. In preparation for this, make the locking
>> optional.
>>
>> Signed-off-by: Joel Fernandes <[email protected]>
>> ---
>> fs/pstore/ram_core.c | 11 +++++------
>> include/linux/pstore_ram.h | 1 +
>> 2 files changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
>> index 3975dee..cb92055 100644
>> --- a/fs/pstore/ram_core.c
>> +++ b/fs/pstore/ram_core.c
>> @@ -48,8 +48,6 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
>> return atomic_read(&prz->buffer->start);
>> }
>>
>> -static DEFINE_RAW_SPINLOCK(buffer_lock);
>> -
>> /* increase and wrap the start pointer, returning the old value */
>> static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
>> {
>> @@ -57,7 +55,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
>> int new;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&buffer_lock, flags);
>> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>>
>> old = atomic_read(&prz->buffer->start);
>> new = old + a;
>> @@ -65,7 +63,7 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
>> new -= prz->buffer_size;
>> atomic_set(&prz->buffer->start, new);
>>
>> - raw_spin_unlock_irqrestore(&buffer_lock, flags);
>> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>>
>> return old;
>> }
>> @@ -77,7 +75,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
>> size_t new;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&buffer_lock, flags);
>> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>>
>> old = atomic_read(&prz->buffer->size);
>> if (old == prz->buffer_size)
>> @@ -89,7 +87,7 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
>> atomic_set(&prz->buffer->size, new);
>>
>> exit:
>> - raw_spin_unlock_irqrestore(&buffer_lock, flags);
>> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>> }
>>
>> static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
>> @@ -493,6 +491,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
>>
>> prz->buffer->sig = sig;
>> persistent_ram_zap(prz);
>> + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
>>
>> return 0;
>> }
>> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
>> index c668c86..244d242 100644
>> --- a/include/linux/pstore_ram.h
>> +++ b/include/linux/pstore_ram.h
>> @@ -40,6 +40,7 @@ struct persistent_ram_zone {
>> void *vaddr;
>> struct persistent_ram_buffer *buffer;
>> size_t buffer_size;
>> + raw_spinlock_t buffer_lock;
>>
>> /* ECC correction */
>> char *par_buffer;
>> --
>> 2.7.4
>>
>
> This one looks good, thanks. I'll get this in for -next since it's a
> good clean-up on its own.
>
Thanks.
> Have you put these patches through scripts/checkpatch.pl? Some of
> lines wrap or have wide commit messages, but that's just cosmetic to
> fix up.
I didn't have any errors, just 1 or 2 warnings about wide commit
messages yes. I'll fix those up on the other patches you commented on
and resend them, thanks!
Regards,
Joel
On Thu, Nov 10, 2016 at 4:10 PM, Kees Cook <[email protected]> wrote:
> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>> In preparation of not locking at all for certain buffers depending on if
>> there's contention, make locking optional depending if caller requested it.
>>
>> Signed-off-by: Joel Fernandes <[email protected]>
>> ---
>> fs/pstore/ram.c | 10 +++++-----
>> fs/pstore/ram_core.c | 27 ++++++++++++++++-----------
>> include/linux/pstore_ram.h | 10 +++++++++-
>> 3 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
>> index f1219af..cb07ef6 100644
>> --- a/fs/pstore/ram.c
>> +++ b/fs/pstore/ram.c
>> @@ -260,7 +260,7 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
>> compressed ? 'C' : 'D');
>> WARN_ON_ONCE(!hdr);
>> len = hdr ? strlen(hdr) : 0;
>> - persistent_ram_write(prz, hdr, len);
>> + persistent_ram_write(prz, hdr, len, PSTORE_RAM_LOCK);
>> kfree(hdr);
>>
>> return len;
>> @@ -280,17 +280,17 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
>> if (type == PSTORE_TYPE_CONSOLE) {
>> if (!cxt->cprz)
>> return -ENOMEM;
>> - persistent_ram_write(cxt->cprz, buf, size);
>> + persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
>> return 0;
>> } else if (type == PSTORE_TYPE_FTRACE) {
>> if (!cxt->fprz)
>> return -ENOMEM;
>> - persistent_ram_write(cxt->fprz, buf, size);
>> + persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
>> return 0;
>> } else if (type == PSTORE_TYPE_PMSG) {
>> if (!cxt->mprz)
>> return -ENOMEM;
>> - persistent_ram_write(cxt->mprz, buf, size);
>> + persistent_ram_write(cxt->mprz, buf, size, PSTORE_RAM_LOCK);
>> return 0;
>> }
>>
>> @@ -324,7 +324,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
>> hlen = ramoops_write_kmsg_hdr(prz, compressed);
>> if (size + hlen > prz->buffer_size)
>> size = prz->buffer_size - hlen;
>> - persistent_ram_write(prz, buf, size);
>> + persistent_ram_write(prz, buf, size, PSTORE_RAM_LOCK);
>>
>> cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
>>
>> diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
>> index cb92055..69c7b96 100644
>> --- a/fs/pstore/ram_core.c
>> +++ b/fs/pstore/ram_core.c
>> @@ -49,13 +49,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
>> }
>>
>> /* increase and wrap the start pointer, returning the old value */
>> -static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
>> +static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a,
>> + int lock)
>> {
>> int old;
>> int new;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>> + if (lock == PSTORE_RAM_LOCK)
>> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>>
>> old = atomic_read(&prz->buffer->start);
>> new = old + a;
>> @@ -63,19 +65,21 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
>> new -= prz->buffer_size;
>> atomic_set(&prz->buffer->start, new);
>>
>> - raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>> + if (lock == PSTORE_RAM_LOCK)
>> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>>
>> return old;
>> }
>>
>> /* increase the size counter until it hits the max size */
>> -static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
>> +static void buffer_size_add(struct persistent_ram_zone *prz, size_t a, int lock)
>> {
>> size_t old;
>> size_t new;
>> unsigned long flags;
>>
>> - raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>> + if (lock == PSTORE_RAM_LOCK)
>> + raw_spin_lock_irqsave(&prz->buffer_lock, flags);
>>
>> old = atomic_read(&prz->buffer->size);
>> if (old == prz->buffer_size)
>> @@ -87,7 +91,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
>> atomic_set(&prz->buffer->size, new);
>>
>> exit:
>> - raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>> + if (lock == PSTORE_RAM_LOCK)
>> + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
>> }
>>
>> static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
>
> Does this pass a sparse test? Depending on how dumb sparse is, you may
> need to add "else" statements to the lock == RAM_LOCK tests:
>
> else
> __acquire(&prz->buffer_lock);
>
> ...
>
> else
> __release(&prz->buffer_lock);
>
>
>
> Otherwise, this looks fine. :)
To answer this: yes, sparse is fine with this. :)
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 10, 2016 at 4:16 PM, Joel Fernandes <[email protected]> wrote:
> On Thu, Nov 10, 2016 at 4:13 PM, Kees Cook <[email protected]> wrote:
>> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>>> If FTRACE_PER_CPU flag is passed to ramoops pdata, split the space into
>>> multiple zones depending on the number of CPUs.
>>>
>>> This speeds up the performance of function tracing by about 280% in my tests as
>>> we avoid the locking. The trade off being lesser space available per CPU. Let
>>> the ramoops user decide which option they want based on pdata flag.
>>>
>>> Signed-off-by: Joel Fernandes <[email protected]>
>>> ---
>>> fs/pstore/ram.c | 72 +++++++++++++++++++++++++++++++++++-----------
>>> include/linux/pstore_ram.h | 3 ++
>>> 2 files changed, 58 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
>>> index 5caa512..5e96f78 100644
>>> --- a/fs/pstore/ram.c
>>> +++ b/fs/pstore/ram.c
>>> @@ -87,7 +87,7 @@ MODULE_PARM_DESC(ramoops_ecc,
>>> struct ramoops_context {
>>> struct persistent_ram_zone **przs;
>>> struct persistent_ram_zone *cprz;
>>> - struct persistent_ram_zone *fprz;
>>> + struct persistent_ram_zone **fprzs;
>>> struct persistent_ram_zone *mprz;
>>> phys_addr_t phys_addr;
>>> unsigned long size;
>>> @@ -97,6 +97,7 @@ struct ramoops_context {
>>> size_t ftrace_size;
>>> size_t pmsg_size;
>>> int dump_oops;
>>> + int flags;
>>> struct persistent_ram_ecc_info ecc_info;
>>> unsigned int max_dump_cnt;
>>> unsigned int dump_write_cnt;
>>> @@ -219,9 +220,17 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
>>> if (!prz_ok(prz))
>>> prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
>>> 1, id, type, PSTORE_TYPE_CONSOLE, 0);
>>> - if (!prz_ok(prz))
>>> - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
>>> - 1, id, type, PSTORE_TYPE_FTRACE, 0);
>>> + if (!prz_ok(prz)) {
>>> + int max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>>> + while (cxt->ftrace_read_cnt < max && !prz) {
>>> + prz = ramoops_get_next_prz(cxt->fprzs,
>>> + &cxt->ftrace_read_cnt, max, id, type,
>>> + PSTORE_TYPE_FTRACE, 0);
>>> + if (!prz_ok(prz))
>>> + continue;
>>> + }
>>> + }
>>> +
>>> if (!prz_ok(prz))
>>> prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
>>> 1, id, type, PSTORE_TYPE_PMSG, 0);
>>> @@ -283,9 +292,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
>>> persistent_ram_write(cxt->cprz, buf, size, PSTORE_RAM_LOCK);
>>> return 0;
>>> } else if (type == PSTORE_TYPE_FTRACE) {
>>> - if (!cxt->fprz)
>>> + int zonenum, lock;
>>> +
>>> + if (!cxt->fprzs)
>>> return -ENOMEM;
>>> - persistent_ram_write(cxt->fprz, buf, size, PSTORE_RAM_LOCK);
>>> + /*
>>> + * If per-cpu buffers, don't lock. Otherwise there's only
>>> + * 1 zone for ftrace (zone 0) and all CPUs share it, so lock.
>>> + */
>>> + if (cxt->flags & FTRACE_PER_CPU) {
>>> + zonenum = smp_processor_id();
>>> + lock = PSTORE_RAM_NOLOCK;
>>> + } else {
>>> + zonenum = 0;
>>> + lock = PSTORE_RAM_LOCK;
>>> + }
>>> +
>>> + persistent_ram_write(cxt->fprzs[zonenum], buf, size, lock);
>>> return 0;
>>> } else if (type == PSTORE_TYPE_PMSG) {
>>> pr_warn_ratelimited("ramoops: warning: PMSG shouldn't call %s\n",
>>> @@ -352,6 +375,7 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
>>> {
>>> struct ramoops_context *cxt = psi->data;
>>> struct persistent_ram_zone *prz;
>>> + int max;
>>>
>>> switch (type) {
>>> case PSTORE_TYPE_DMESG:
>>> @@ -363,7 +387,10 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
>>> prz = cxt->cprz;
>>> break;
>>> case PSTORE_TYPE_FTRACE:
>>> - prz = cxt->fprz;
>>> + max = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>>> + if (id >= max)
>>> + return -EINVAL;
>>> + prz = cxt->fprzs[id];
>>> break;
>>> case PSTORE_TYPE_PMSG:
>>> prz = cxt->mprz;
>>> @@ -394,14 +421,23 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
>>> {
>>> int i;
>>>
>>> - if (!cxt->przs)
>>> - return;
>>> + /* Free dump PRZs */
>>> + if (cxt->przs) {
>>> + for (i = 0; i < cxt->max_dump_cnt; i++)
>>> + persistent_ram_free(cxt->przs[i]);
>>>
>>> - for (i = 0; i < cxt->max_dump_cnt; i++)
>>> - persistent_ram_free(cxt->przs[i]);
>>> + kfree(cxt->przs);
>>> + cxt->max_dump_cnt = 0;
>>> + }
>>>
>>> - kfree(cxt->przs);
>>> - cxt->max_dump_cnt = 0;
>>> + /* Free ftrace PRZs */
>>> + if (cxt->fprzs) {
>>> + int nr = (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1;
>>> +
>>> + for (i = 0; i < nr; i++)
>>> + persistent_ram_free(cxt->fprzs[i]);
>>> + kfree(cxt->fprzs);
>>> + }
>>> }
>>>
>>> static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
>>> @@ -618,6 +654,7 @@ static int ramoops_probe(struct platform_device *pdev)
>>> cxt->ftrace_size = pdata->ftrace_size;
>>> cxt->pmsg_size = pdata->pmsg_size;
>>> cxt->dump_oops = pdata->dump_oops;
>>> + cxt->flags = pdata->flags;
>>> cxt->ecc_info = pdata->ecc_info;
>>>
>>> paddr = cxt->phys_addr;
>>> @@ -633,8 +670,9 @@ static int ramoops_probe(struct platform_device *pdev)
>>> if (err)
>>> goto fail_init_cprz;
>>>
>>> - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
>>> - LINUX_VERSION_CODE);
>>> + err = ramoops_init_przs(dev, cxt, &cxt->fprzs, &paddr, cxt->ftrace_size,
>>> + (cxt->flags & FTRACE_PER_CPU) ? nr_cpu_ids : 1,
>>> + LINUX_VERSION_CODE);
>>> if (err)
>>> goto fail_init_fprz;
>>>
>>> @@ -698,7 +736,6 @@ fail_clear:
>>> cxt->pstore.bufsize = 0;
>>> persistent_ram_free(cxt->mprz);
>>> fail_init_mprz:
>>> - persistent_ram_free(cxt->fprz);
>>> fail_init_fprz:
>>> persistent_ram_free(cxt->cprz);
>>> fail_init_cprz:
>>> @@ -717,7 +754,6 @@ static int ramoops_remove(struct platform_device *pdev)
>>> cxt->pstore.bufsize = 0;
>>>
>>> persistent_ram_free(cxt->mprz);
>>> - persistent_ram_free(cxt->fprz);
>>> persistent_ram_free(cxt->cprz);
>>> ramoops_free_przs(cxt);
>>>
>>> @@ -759,6 +795,8 @@ static void ramoops_register_dummy(void)
>>> dummy_data->ftrace_size = ramoops_ftrace_size;
>>> dummy_data->pmsg_size = ramoops_pmsg_size;
>>> dummy_data->dump_oops = dump_oops;
>>> + dummy_data->flags = FTRACE_PER_CPU;
>>> +
>>> /*
>>> * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
>>> * (using 1 byte for ECC isn't much of use anyway).
>>> diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
>>> index 69f3487..122f78d 100644
>>> --- a/include/linux/pstore_ram.h
>>> +++ b/include/linux/pstore_ram.h
>>> @@ -86,6 +86,8 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
>>> * @mem_address physical memory address to contain ramoops
>>> */
>>>
>>> +#define FTRACE_PER_CPU BIT(0)
>>> +
>>> struct ramoops_platform_data {
>>> unsigned long mem_size;
>>> phys_addr_t mem_address;
>>> @@ -95,6 +97,7 @@ struct ramoops_platform_data {
>>> unsigned long ftrace_size;
>>> unsigned long pmsg_size;
>>> int dump_oops;
>>> + int flags;
>>> struct persistent_ram_ecc_info ecc_info;
>>> };
>>>
>>> --
>>> 2.7.4
>>>
>>
>> Is there a case for not setting FTRACE_PER_CPU?
>
> Yes, if there are too many CPUs and only a few are in use, then the
> rest of the space allocated for the other CPUs not in use goes to waste. If
> the ram zone is big enough though, then it still makes sense even
> for the many CPUs case.
Okay. In this case, let's make flags a u32, since int doesn't make
sense for flags. Also, with the addition of this in the platform data,
"flags" will need to be added to the EFI implementation
(ramoops_parse_dt() with a default of 0) and documentation
(Documentation/devicetree/bindings/reserved-memory/ramoops.txt).
-Kees
--
Kees Cook
Nexus Security
On Thu, Nov 10, 2016 at 4:21 PM, Joel Fernandes <[email protected]> wrote:
> On Thu, Nov 10, 2016 at 4:15 PM, Kees Cook <[email protected]> wrote:
>> On Thu, Oct 20, 2016 at 12:34 AM, Joel Fernandes <[email protected]> wrote:
>>> In preparation for merging the per CPU buffers into one buffer when we retrieve
>>> the pstore ftrace data, we store the timestamp as a counter in the ftrace
>>> pstore record. We store the CPU number as well if !PSTORE_CPU_IN_IP, in this
>>> case we shift the counter and may lose ordering there but we preserve the same
>>> record size. The timestamp counter is also racy, and not doing any locking or
>>> synchronization here results in the benefit of lower overhead. Since we don't
>>> care much here for exact ordering of function traces across CPUs, we don't
>>> synchronize and may lose some counter updates but I'm ok with that.
>>>
>>> Using trace_clock() results in much lower performance so avoid using it since
>>> we don't want accuracy in timestamp and need a rough ordering to perform merge.
>>>
>>> Signed-off-by: Joel Fernandes <[email protected]>
>>> ---
>>> fs/pstore/ftrace.c | 3 +++
>>> fs/pstore/inode.c | 7 ++---
>>> fs/pstore/internal.h | 34 -------------------------
>>> include/linux/pstore.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>> 4 files changed, 76 insertions(+), 37 deletions(-)
>>>
>>> diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
>>> index d488770..1c6cf86 100644
>>> --- a/fs/pstore/ftrace.c
>>> +++ b/fs/pstore/ftrace.c
>>> @@ -27,6 +27,8 @@
>>> #include <asm/barrier.h>
>>> #include "internal.h"
>>>
>>> +static u64 pstore_ftrace_stamp;
>>
>> I think the race described in the commit message deserves a comment here.
>
> Sure, I can add that.
>
>>> +
>>> static void notrace pstore_ftrace_call(unsigned long ip,
>>> unsigned long parent_ip,
>>> struct ftrace_ops *op,
>>> @@ -42,6 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
>>>
>>> rec.ip = ip;
>>> rec.parent_ip = parent_ip;
>>> + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
>>> pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
>>> psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
>>> 0, sizeof(rec), psinfo);
>>> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
>>> index ec9ddef..fcc5d1a 100644
>>> --- a/fs/pstore/inode.c
>>> +++ b/fs/pstore/inode.c
>>> @@ -107,9 +107,10 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
>>> struct pstore_ftrace_seq_data *data = v;
>>> struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
>>>
>>> - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
>>> - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
>>> - (void *)rec->ip, (void *)rec->parent_ip);
>>> + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
>>> + pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec),
>>> + rec->ip, rec->parent_ip, (void *)rec->ip,
>>> + (void *)rec->parent_ip);
>>>
>>> return 0;
>>> }
>>> diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
>>> index e38a22b..da416e6 100644
>>> --- a/fs/pstore/internal.h
>>> +++ b/fs/pstore/internal.h
>>> @@ -5,40 +5,6 @@
>>> #include <linux/time.h>
>>> #include <linux/pstore.h>
>>>
>>> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
>>> -#define PSTORE_CPU_IN_IP 0x1
>>> -#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
>>> -#define PSTORE_CPU_IN_IP 0x3
>>> -#endif
>>> -
>>> -struct pstore_ftrace_record {
>>> - unsigned long ip;
>>> - unsigned long parent_ip;
>>> -#ifndef PSTORE_CPU_IN_IP
>>> - unsigned int cpu;
>>> -#endif
>>> -};
>>> -
>>> -static inline void
>>> -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>>> -{
>>> -#ifndef PSTORE_CPU_IN_IP
>>> - rec->cpu = cpu;
>>> -#else
>>> - rec->ip |= cpu;
>>> -#endif
>>> -}
>>> -
>>> -static inline unsigned int
>>> -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>>> -{
>>> -#ifndef PSTORE_CPU_IN_IP
>>> - return rec->cpu;
>>> -#else
>>> - return rec->ip & PSTORE_CPU_IN_IP;
>>> -#endif
>>> -}
>>> -
>>> #ifdef CONFIG_PSTORE_FTRACE
>>> extern void pstore_register_ftrace(void);
>>> extern void pstore_unregister_ftrace(void);
>>> diff --git a/include/linux/pstore.h b/include/linux/pstore.h
>>> index 92013cc..4e7fc3c 100644
>>> --- a/include/linux/pstore.h
>>> +++ b/include/linux/pstore.h
>>> @@ -89,4 +89,73 @@ extern int pstore_register(struct pstore_info *);
>>> extern void pstore_unregister(struct pstore_info *);
>>> extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
>>>
>>> +struct pstore_ftrace_record {
>>> + unsigned long ip;
>>> + unsigned long parent_ip;
>>> + u64 ts;
>>> +};
>>> +
>>> +/*
>>> + * ftrace related stuff: Both backends and frontends need these so expose them
>>> + */
>>> +
>>> +#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
>>> +#define PSTORE_CPU_IN_IP 0x1
>>> +#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
>>> +#define PSTORE_CPU_IN_IP 0x3
>>> +#endif
>>> +
>>> +#define TS_CPU_SHIFT 8
>>> +#define TS_CPU_MASK (BIT(TS_CPU_SHIFT) - 1)
>>> +
>>> +/*
>>> + * If CPU number can be stored in IP, store it there
>>> + * else store it in the time stamp.
>>> + */
>>> +#ifdef PSTORE_CPU_IN_IP
>>> +static inline void
>>> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>>> +{
>>> + rec->ip |= cpu;
>>> +}
>>> +
>>> +static inline unsigned int
>>> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>>> +{
>>> + return rec->ip & PSTORE_CPU_IN_IP;
>>> +}
>>> +
>>> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
>>> +{
>>> + return rec->ts;
>>> +}
>>> +
>>> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
>>> +{
>>> + rec->ts = val;
>>> +}
>>> +#else
>>> +static inline void
>>> +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
>>> +{
>>> + rec->ts &= ~(TS_CPU_MASK);
>>> + rec->ts |= cpu;
>>> +}
>>> +
>>> +static inline unsigned int
>>> +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
>>> +{
>>> + return rec->ts & TS_CPU_MASK;
>>> +}
>>> +
>>> +static inline u64 pstore_ftrace_read_timestamp(struct pstore_ftrace_record *rec)
>>> +{
>>> + return rec->ts >> TS_CPU_SHIFT;
>>> +}
>>> +
>>> +static inline void pstore_ftrace_write_timestamp(struct pstore_ftrace_record *rec, u64 val)
>>> +{
>>> + rec->ts = (rec->ts & TS_CPU_MASK) | (val << TS_CPU_SHIFT);
>>> +}
>>> +#endif
>>> #endif /*_LINUX_PSTORE_H*/
>>> --
>>> 2.7.4
>>>
>>
>> Should these ftrace helpers live somewhere else? It seems really
>> specific to ftrace, not pstore?
>>
>> -Kees
>>
>
> Actually the time stamp and CPU information are not specific to low
> level ftrace hooks. Ftrace just provides the hooks into the function
> calls; it's up to the user to store this information how they want. So
> the pstore ftrace helpers should be included in the pstore header IMO.
Okay, gotcha. Thanks!
-Kees
--
Kees Cook
Nexus Security