Hi,
This series adds DMA operations support to the qemu fw_cfg kernel
module and populates "etc/vmcoreinfo" with vmcoreinfo location
details (entry added since qemu 2.11 with -device vmcoreinfo).
v13:
- reorder patch series, introduce DMA write before DMA read
- do some measurements of DMA read speed-ups
v12:
- fix virt_to_phys(NULL) panic with CONFIG_DEBUG_VIRTUAL=y
- do not use DMA read, except for kmalloc() memory we allocated
ourself (only fw_cfg_register_dir_entries() so far)
v11:
- add #include <linux/crash_core.h> in last patch,
fixing kbuild .config test
Marc-André Lureau (4):
crash: export paddr_vmcoreinfo_note()
fw_cfg: add DMA register
fw_cfg: write vmcoreinfo details
RFC: fw_cfg: do DMA read operation
drivers/firmware/qemu_fw_cfg.c | 257 ++++++++++++++++++++++++++++++++++++-----
kernel/crash_core.c | 1 +
2 files changed, 230 insertions(+), 28 deletions(-)
--
2.16.1.73.g5832b7e9f2
Add an optional <dma_off> kernel module (or command line) parameter
using the following syntax:
[qemu_fw_cfg.]ioport=<size>@<base>[:<ctrl_off>:<data_off>[:<dma_off>]]
or
[qemu_fw_cfg.]mmio=<size>@<base>[:<ctrl_off>:<data_off>[:<dma_off>]]
and initializes the register address using given or default offset.
Signed-off-by: Marc-André Lureau <[email protected]>
Reviewed-by: Gabriel Somlo <[email protected]>
---
drivers/firmware/qemu_fw_cfg.c | 53 ++++++++++++++++++++++++++++++++----------
1 file changed, 41 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index deb483064f53..740df0df2260 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -10,20 +10,21 @@
* and select subsets of aarch64), a Device Tree node (on arm), or using
* a kernel module (or command line) parameter with the following syntax:
*
- * [qemu_fw_cfg.]ioport=<size>@<base>[:<ctrl_off>:<data_off>]
+ * [qemu_fw_cfg.]ioport=<size>@<base>[:<ctrl_off>:<data_off>[:<dma_off>]]
* or
- * [qemu_fw_cfg.]mmio=<size>@<base>[:<ctrl_off>:<data_off>]
+ * [qemu_fw_cfg.]mmio=<size>@<base>[:<ctrl_off>:<data_off>[:<dma_off>]]
*
* where:
* <size> := size of ioport or mmio range
* <base> := physical base address of ioport or mmio range
* <ctrl_off> := (optional) offset of control register
* <data_off> := (optional) offset of data register
+ * <dma_off> := (optional) offset of dma register
*
* e.g.:
- * qemu_fw_cfg.ioport=2@0x510:0:1 (the default on x86)
+ * qemu_fw_cfg.ioport=12@0x510:0:1:4 (the default on x86)
* or
- * qemu_fw_cfg.mmio=0xA@0x9020000:8:0 (the default on arm)
+ * qemu_fw_cfg.mmio=16@0x9020000:8:0:16 (the default on arm)
*/
#include <linux/module.h>
@@ -63,6 +64,7 @@ static resource_size_t fw_cfg_p_size;
static void __iomem *fw_cfg_dev_base;
static void __iomem *fw_cfg_reg_ctrl;
static void __iomem *fw_cfg_reg_data;
+static void __iomem *fw_cfg_reg_dma;
/* atomic access to fw_cfg device (potentially slow i/o, so using mutex) */
static DEFINE_MUTEX(fw_cfg_dev_lock);
@@ -118,12 +120,14 @@ static void fw_cfg_io_cleanup(void)
# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64))
# define FW_CFG_CTRL_OFF 0x08
# define FW_CFG_DATA_OFF 0x00
+# define FW_CFG_DMA_OFF 0x10
# elif (defined(CONFIG_PPC_PMAC) || defined(CONFIG_SPARC32)) /* ppc/mac,sun4m */
# define FW_CFG_CTRL_OFF 0x00
# define FW_CFG_DATA_OFF 0x02
# elif (defined(CONFIG_X86) || defined(CONFIG_SPARC64)) /* x86, sun4u */
# define FW_CFG_CTRL_OFF 0x00
# define FW_CFG_DATA_OFF 0x01
+# define FW_CFG_DMA_OFF 0x04
# else
# error "QEMU FW_CFG not available on this architecture!"
# endif
@@ -133,7 +137,7 @@ static void fw_cfg_io_cleanup(void)
static int fw_cfg_do_platform_probe(struct platform_device *pdev)
{
char sig[FW_CFG_SIG_SIZE];
- struct resource *range, *ctrl, *data;
+ struct resource *range, *ctrl, *data, *dma;
/* acquire i/o range details */
fw_cfg_is_mmio = false;
@@ -170,6 +174,7 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
/* were custom register offsets provided (e.g. on the command line)? */
ctrl = platform_get_resource_byname(pdev, IORESOURCE_REG, "ctrl");
data = platform_get_resource_byname(pdev, IORESOURCE_REG, "data");
+ dma = platform_get_resource_byname(pdev, IORESOURCE_REG, "dma");
if (ctrl && data) {
fw_cfg_reg_ctrl = fw_cfg_dev_base + ctrl->start;
fw_cfg_reg_data = fw_cfg_dev_base + data->start;
@@ -179,6 +184,13 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
fw_cfg_reg_data = fw_cfg_dev_base + FW_CFG_DATA_OFF;
}
+ if (dma)
+ fw_cfg_reg_dma = fw_cfg_dev_base + dma->start;
+#ifdef FW_CFG_DMA_OFF
+ else
+ fw_cfg_reg_dma = fw_cfg_dev_base + FW_CFG_DMA_OFF;
+#endif
+
/* verify fw_cfg device signature */
fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE);
if (memcmp(sig, "QEMU", FW_CFG_SIG_SIZE) != 0) {
@@ -629,6 +641,7 @@ static struct platform_device *fw_cfg_cmdline_dev;
/* use special scanf/printf modifier for phys_addr_t, resource_size_t */
#define PH_ADDR_SCAN_FMT "@%" __PHYS_ADDR_PREFIX "i%n" \
":%" __PHYS_ADDR_PREFIX "i" \
+ ":%" __PHYS_ADDR_PREFIX "i%n" \
":%" __PHYS_ADDR_PREFIX "i%n"
#define PH_ADDR_PR_1_FMT "0x%" __PHYS_ADDR_PREFIX "x@" \
@@ -638,12 +651,15 @@ static struct platform_device *fw_cfg_cmdline_dev;
":%" __PHYS_ADDR_PREFIX "u" \
":%" __PHYS_ADDR_PREFIX "u"
+#define PH_ADDR_PR_4_FMT PH_ADDR_PR_3_FMT \
+ ":%" __PHYS_ADDR_PREFIX "u"
+
static int fw_cfg_cmdline_set(const char *arg, const struct kernel_param *kp)
{
- struct resource res[3] = {};
+ struct resource res[4] = {};
char *str;
phys_addr_t base;
- resource_size_t size, ctrl_off, data_off;
+ resource_size_t size, ctrl_off, data_off, dma_off;
int processed, consumed = 0;
/* only one fw_cfg device can exist system-wide, so if one
@@ -659,19 +675,20 @@ static int fw_cfg_cmdline_set(const char *arg, const struct kernel_param *kp)
/* consume "<size>" portion of command line argument */
size = memparse(arg, &str);
- /* get "@<base>[:<ctrl_off>:<data_off>]" chunks */
+ /* get "@<base>[:<ctrl_off>:<data_off>[:<dma_off>]]" chunks */
processed = sscanf(str, PH_ADDR_SCAN_FMT,
&base, &consumed,
- &ctrl_off, &data_off, &consumed);
+ &ctrl_off, &data_off, &consumed,
+ &dma_off, &consumed);
- /* sscanf() must process precisely 1 or 3 chunks:
+ /* sscanf() must process precisely 1, 3 or 4 chunks:
* <base> is mandatory, optionally followed by <ctrl_off>
- * and <data_off>;
+ * and <data_off>, and <dma_off>;
* there must be no extra characters after the last chunk,
* so str[consumed] must be '\0'.
*/
if (str[consumed] ||
- (processed != 1 && processed != 3))
+ (processed != 1 && processed != 3 && processed != 4))
return -EINVAL;
res[0].start = base;
@@ -688,6 +705,11 @@ static int fw_cfg_cmdline_set(const char *arg, const struct kernel_param *kp)
res[2].start = data_off;
res[2].flags = IORESOURCE_REG;
}
+ if (processed > 3) {
+ res[3].name = "dma";
+ res[3].start = dma_off;
+ res[3].flags = IORESOURCE_REG;
+ }
/* "processed" happens to nicely match the number of resources
* we need to pass in to this platform device.
@@ -722,6 +744,13 @@ static int fw_cfg_cmdline_get(char *buf, const struct kernel_param *kp)
fw_cfg_cmdline_dev->resource[0].start,
fw_cfg_cmdline_dev->resource[1].start,
fw_cfg_cmdline_dev->resource[2].start);
+ case 4:
+ return snprintf(buf, PAGE_SIZE, PH_ADDR_PR_4_FMT,
+ resource_size(&fw_cfg_cmdline_dev->resource[0]),
+ fw_cfg_cmdline_dev->resource[0].start,
+ fw_cfg_cmdline_dev->resource[1].start,
+ fw_cfg_cmdline_dev->resource[2].start,
+ fw_cfg_cmdline_dev->resource[3].start);
}
/* Should never get here */
--
2.16.1.73.g5832b7e9f2
If the "etc/vmcoreinfo" fw_cfg file is present and we are not running
the kdump kernel, write the addr/size of the vmcoreinfo ELF note.
The DMA operation is expected to run synchronously with today qemu,
but the specification states that it may become async, so we run
"control" field check in a loop for eventual changes.
Signed-off-by: Marc-André Lureau <[email protected]>
---
drivers/firmware/qemu_fw_cfg.c | 157 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 154 insertions(+), 3 deletions(-)
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index 740df0df2260..fd576ba7b337 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -33,6 +33,9 @@
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
+#include <linux/crash_core.h>
MODULE_AUTHOR("Gabriel L. Somlo <[email protected]>");
MODULE_DESCRIPTION("QEMU fw_cfg sysfs support");
@@ -43,12 +46,24 @@ MODULE_LICENSE("GPL");
#define FW_CFG_ID 0x01
#define FW_CFG_FILE_DIR 0x19
+#define FW_CFG_VERSION_DMA 0x02
+#define FW_CFG_DMA_CTL_ERROR 0x01
+#define FW_CFG_DMA_CTL_READ 0x02
+#define FW_CFG_DMA_CTL_SKIP 0x04
+#define FW_CFG_DMA_CTL_SELECT 0x08
+#define FW_CFG_DMA_CTL_WRITE 0x10
+
/* size in bytes of fw_cfg signature */
#define FW_CFG_SIG_SIZE 4
/* fw_cfg "file name" is up to 56 characters (including terminating nul) */
#define FW_CFG_MAX_FILE_PATH 56
+#define VMCOREINFO_FORMAT_ELF 0x1
+
+/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
+static u32 fw_cfg_rev;
+
/* fw_cfg file directory entry type */
struct fw_cfg_file {
u32 size;
@@ -57,6 +72,12 @@ struct fw_cfg_file {
char name[FW_CFG_MAX_FILE_PATH];
};
+struct fw_cfg_dma {
+ u32 control;
+ u32 length;
+ u64 address;
+} __packed;
+
/* fw_cfg device i/o register addresses */
static bool fw_cfg_is_mmio;
static phys_addr_t fw_cfg_p_base;
@@ -75,6 +96,59 @@ static inline u16 fw_cfg_sel_endianness(u16 key)
return fw_cfg_is_mmio ? cpu_to_be16(key) : cpu_to_le16(key);
}
+static inline bool fw_cfg_dma_enabled(void)
+{
+ return fw_cfg_rev & FW_CFG_VERSION_DMA && fw_cfg_reg_dma;
+}
+
+/* qemu fw_cfg device is sync today, but spec says it may become async */
+static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
+{
+ do {
+ u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
+
+ if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
+ return;
+
+ usleep_range(50, 100);
+ } while (true);
+}
+
+static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
+{
+ phys_addr_t dma;
+ struct fw_cfg_dma *d = NULL;
+ ssize_t ret = length;
+
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ *d = (struct fw_cfg_dma) {
+ .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
+ .length = cpu_to_be32(length),
+ .control = cpu_to_be32(control)
+ };
+
+ dma = virt_to_phys(d);
+
+ iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
+ iowrite32be(dma, fw_cfg_reg_dma + 4);
+
+ fw_cfg_wait_for_control(d);
+
+ if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
+ ret = -EIO;
+ }
+
+end:
+ kfree(d);
+
+ return ret;
+}
+
/* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
static inline void fw_cfg_read_blob(u16 key,
void *buf, loff_t pos, size_t count)
@@ -103,6 +177,47 @@ static inline void fw_cfg_read_blob(u16 key,
acpi_release_global_lock(glk);
}
+#ifdef CONFIG_CRASH_CORE
+/* write chunk of given fw_cfg blob (caller responsible for sanity-check) */
+static ssize_t fw_cfg_write_blob(u16 key,
+ void *buf, loff_t pos, size_t count)
+{
+ u32 glk = -1U;
+ acpi_status status;
+ ssize_t ret = count;
+
+ /* If we have ACPI, ensure mutual exclusion against any potential
+ * device access by the firmware, e.g. via AML methods:
+ */
+ status = acpi_acquire_global_lock(ACPI_WAIT_FOREVER, &glk);
+ if (ACPI_FAILURE(status) && status != AE_NOT_CONFIGURED) {
+ /* Should never get here */
+ WARN(1, "%s: Failed to lock ACPI!\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&fw_cfg_dev_lock);
+ if (pos == 0) {
+ ret = fw_cfg_dma_transfer(buf, count, key << 16
+ | FW_CFG_DMA_CTL_SELECT
+ | FW_CFG_DMA_CTL_WRITE);
+ } else {
+ iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
+ ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
+ if (ret < 0)
+ goto end;
+ ret = fw_cfg_dma_transfer(buf, count, FW_CFG_DMA_CTL_WRITE);
+ }
+
+end:
+ mutex_unlock(&fw_cfg_dev_lock);
+
+ acpi_release_global_lock(glk);
+
+ return ret;
+}
+#endif /* CONFIG_CRASH_CORE */
+
/* clean up fw_cfg device i/o */
static void fw_cfg_io_cleanup(void)
{
@@ -201,9 +316,6 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
return 0;
}
-/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
-static u32 fw_cfg_rev;
-
static ssize_t fw_cfg_showrev(struct kobject *k, struct attribute *a, char *buf)
{
return sprintf(buf, "%u\n", fw_cfg_rev);
@@ -224,6 +336,37 @@ struct fw_cfg_sysfs_entry {
struct list_head list;
};
+#ifdef CONFIG_CRASH_CORE
+static ssize_t write_vmcoreinfo(const struct fw_cfg_file *f)
+{
+ struct vmci {
+ __le16 host_format;
+ __le16 guest_format;
+ __le32 size;
+ __le64 paddr;
+ } __packed;
+ static struct vmci *data;
+ ssize_t ret;
+
+ data = kmalloc(sizeof(struct vmci), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ *data = (struct vmci) {
+ .guest_format = cpu_to_le16(VMCOREINFO_FORMAT_ELF),
+ .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
+ .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
+ };
+ /* spare ourself reading host format support for now since we
+ * don't know what else to format - host may ignore ours
+ */
+ ret = fw_cfg_write_blob(f->select, data, 0, sizeof(struct vmci));
+
+ kfree(data);
+ return ret;
+}
+#endif /* CONFIG_CRASH_CORE */
+
/* get fw_cfg_sysfs_entry from kobject member */
static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj)
{
@@ -464,6 +607,14 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f)
int err;
struct fw_cfg_sysfs_entry *entry;
+#ifdef CONFIG_CRASH_CORE
+ if (fw_cfg_dma_enabled() &&
+ strcmp(f->name, "etc/vmcoreinfo") == 0 && !is_kdump_kernel()) {
+ if (write_vmcoreinfo(f) < 0)
+ pr_warn("fw_cfg: failed to write vmcoreinfo");
+ }
+#endif
+
/* allocate new entry */
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
--
2.16.1.73.g5832b7e9f2
The following patch is going to use the symbol from the fw_cfg module,
to call the function and write the note location details in the
vmcoreinfo entry, so qemu can produce dumps with the vmcoreinfo note.
CC: Andrew Morton <[email protected]>
CC: Baoquan He <[email protected]>
CC: Dave Young <[email protected]>
CC: Dave Young <[email protected]>
CC: Hari Bathini <[email protected]>
CC: Tony Luck <[email protected]>
CC: Vivek Goyal <[email protected]>
Signed-off-by: Marc-André Lureau <[email protected]>
Acked-by: Gabriel Somlo <[email protected]>
---
kernel/crash_core.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4f63597c824d..a93590cdd9e1 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa(vmcoreinfo_note);
}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
static int __init crash_save_vmcoreinfo_init(void)
{
--
2.16.1.73.g5832b7e9f2
Modify fw_cfg_read_blob() to use DMA if the device supports it.
Return errors, because the operation may fail.
So far, only one call in fw_cfg_register_dir_entries() is using
kmalloc'ed buf and is thus clearly eligible to DMA read.
Initially, I didn't implement DMA read to speed up boot time, but as a
first step before introducing DMA write (since read operations were
already presents). Even more, I didn't realize fw-cfg entries were
being read by the kernel during boot by default. But actally fw-cfg
entries are being populated during module probe. I knew DMA improved a
lot bios boot time (the main reason the DMA interface was added
afaik). Let see the time it would take to read the whole ACPI
tables (128kb allocated)
# time cat /sys/firmware/qemu_fw_cfg/by_name/etc/acpi/tables/raw
- with DMA: sys 0m0.003s
- without DMA (-global fw_cfg.dma_enabled=off): sys 0m7.674s
FW_CFG_FILE_DIR (0x19) is the only "file" that is read during kernel
boot to populate sysfs qemu_fw_cfg directory, and it is quite
small (1-2kb). Since it does not expose itself, in order to measure
the time it takes to read such small file, I took a comparable sized
file of 2048 bytes and exposed it (-fw_cfg test,file=file with a
modified read_raw enabling DMA)
# perf stat -r 100 cat /sys/firmware/qemu_fw_cfg/by_name/test/raw >/dev/null
- with DMA:
0.636037 task-clock (msec) # 0.141 CPUs utilized ( +- 1.19% )
- without DMA:
6.430128 task-clock (msec) # 0.622 CPUs utilized ( +- 0.22% )
That's a few msec saved during boot by enabling DMA read (the gain
would be more substantial if other & bigger fw-cfg entries are read by
others from sysfs, unfortunately, it's not clear if we can always
enable DMA there)
Signed-off-by: Marc-André Lureau <[email protected]>
---
drivers/firmware/qemu_fw_cfg.c | 47 ++++++++++++++++++++++++++++++------------
1 file changed, 34 insertions(+), 13 deletions(-)
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index fd576ba7b337..3721dc868a2b 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -150,11 +150,13 @@ static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
}
/* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
-static inline void fw_cfg_read_blob(u16 key,
- void *buf, loff_t pos, size_t count)
+static ssize_t fw_cfg_read_blob(u16 key,
+ void *buf, loff_t pos, size_t count,
+ bool dma)
{
u32 glk = -1U;
acpi_status status;
+ ssize_t ret = count;
/* If we have ACPI, ensure mutual exclusion against any potential
* device access by the firmware, e.g. via AML methods:
@@ -164,17 +166,36 @@ static inline void fw_cfg_read_blob(u16 key,
/* Should never get here */
WARN(1, "fw_cfg_read_blob: Failed to lock ACPI!\n");
memset(buf, 0, count);
- return;
+ return -EINVAL;
}
mutex_lock(&fw_cfg_dev_lock);
- iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
- while (pos-- > 0)
- ioread8(fw_cfg_reg_data);
- ioread8_rep(fw_cfg_reg_data, buf, count);
+ if (dma && fw_cfg_dma_enabled()) {
+ if (pos == 0) {
+ ret = fw_cfg_dma_transfer(buf, count, key << 16
+ | FW_CFG_DMA_CTL_SELECT
+ | FW_CFG_DMA_CTL_READ);
+ } else {
+ iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
+ ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
+ if (ret < 0)
+ goto end;
+ ret = fw_cfg_dma_transfer(buf, count,
+ FW_CFG_DMA_CTL_READ);
+ }
+ } else {
+ iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
+ while (pos-- > 0)
+ ioread8(fw_cfg_reg_data);
+ ioread8_rep(fw_cfg_reg_data, buf, count);
+ }
+
+end:
mutex_unlock(&fw_cfg_dev_lock);
acpi_release_global_lock(glk);
+
+ return ret;
}
#ifdef CONFIG_CRASH_CORE
@@ -307,7 +328,7 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
#endif
/* verify fw_cfg device signature */
- fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE);
+ fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE, false);
if (memcmp(sig, "QEMU", FW_CFG_SIG_SIZE) != 0) {
fw_cfg_io_cleanup();
return -ENODEV;
@@ -494,8 +515,8 @@ static ssize_t fw_cfg_sysfs_read_raw(struct file *filp, struct kobject *kobj,
if (count > entry->f.size - pos)
count = entry->f.size - pos;
- fw_cfg_read_blob(entry->f.select, buf, pos, count);
- return count;
+ /* do not use DMA, virt_to_phys(buf) might not be ok */
+ return fw_cfg_read_blob(entry->f.select, buf, pos, count, false);
}
static struct bin_attribute fw_cfg_sysfs_attr_raw = {
@@ -656,7 +677,7 @@ static int fw_cfg_register_dir_entries(void)
struct fw_cfg_file *dir;
size_t dir_size;
- fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count));
+ fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count), false);
count = be32_to_cpu(count);
dir_size = count * sizeof(struct fw_cfg_file);
@@ -664,7 +685,7 @@ static int fw_cfg_register_dir_entries(void)
if (!dir)
return -ENOMEM;
- fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size);
+ fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size, true);
for (i = 0; i < count; i++) {
dir[i].size = be32_to_cpu(dir[i].size);
@@ -713,7 +734,7 @@ static int fw_cfg_sysfs_probe(struct platform_device *pdev)
goto err_probe;
/* get revision number, add matching top-level attribute */
- fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev));
+ fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev), false);
fw_cfg_rev = le32_to_cpu(fw_cfg_rev);
err = sysfs_create_file(fw_cfg_top_ko, &fw_cfg_rev_attr.attr);
if (err)
--
2.16.1.73.g5832b7e9f2
On Wed, Feb 07, 2018 at 02:35:25AM +0100, Marc-Andr? Lureau wrote:
> Modify fw_cfg_read_blob() to use DMA if the device supports it.
> Return errors, because the operation may fail.
>
> So far, only one call in fw_cfg_register_dir_entries() is using
> kmalloc'ed buf and is thus clearly eligible to DMA read.
>
> Initially, I didn't implement DMA read to speed up boot time, but as a
> first step before introducing DMA write (since read operations were
> already presents). Even more, I didn't realize fw-cfg entries were
> being read by the kernel during boot by default. But actally fw-cfg
> entries are being populated during module probe. I knew DMA improved a
> lot bios boot time (the main reason the DMA interface was added
> afaik). Let see the time it would take to read the whole ACPI
> tables (128kb allocated)
>
> # time cat /sys/firmware/qemu_fw_cfg/by_name/etc/acpi/tables/raw
> - with DMA: sys 0m0.003s
> - without DMA (-global fw_cfg.dma_enabled=off): sys 0m7.674s
>
> FW_CFG_FILE_DIR (0x19) is the only "file" that is read during kernel
> boot to populate sysfs qemu_fw_cfg directory, and it is quite
> small (1-2kb). Since it does not expose itself, in order to measure
> the time it takes to read such small file, I took a comparable sized
> file of 2048 bytes and exposed it (-fw_cfg test,file=file with a
> modified read_raw enabling DMA)
>
> # perf stat -r 100 cat /sys/firmware/qemu_fw_cfg/by_name/test/raw >/dev/null
> - with DMA:
> 0.636037 task-clock (msec) # 0.141 CPUs utilized ( +- 1.19% )
> - without DMA:
> 6.430128 task-clock (msec) # 0.622 CPUs utilized ( +- 0.22% )
>
> That's a few msec saved during boot by enabling DMA read (the gain
> would be more substantial if other & bigger fw-cfg entries are read by
> others from sysfs, unfortunately, it's not clear if we can always
> enable DMA there)
>
> Signed-off-by: Marc-Andr? Lureau <[email protected]>
> ---
> drivers/firmware/qemu_fw_cfg.c | 47 ++++++++++++++++++++++++++++++------------
> 1 file changed, 34 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
> index fd576ba7b337..3721dc868a2b 100644
> --- a/drivers/firmware/qemu_fw_cfg.c
> +++ b/drivers/firmware/qemu_fw_cfg.c
> @@ -150,11 +150,13 @@ static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
> }
>
> /* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
> -static inline void fw_cfg_read_blob(u16 key,
> - void *buf, loff_t pos, size_t count)
> +static ssize_t fw_cfg_read_blob(u16 key,
> + void *buf, loff_t pos, size_t count,
> + bool dma)
> {
> u32 glk = -1U;
> acpi_status status;
> + ssize_t ret = count;
>
> /* If we have ACPI, ensure mutual exclusion against any potential
> * device access by the firmware, e.g. via AML methods:
> @@ -164,17 +166,36 @@ static inline void fw_cfg_read_blob(u16 key,
> /* Should never get here */
> WARN(1, "fw_cfg_read_blob: Failed to lock ACPI!\n");
> memset(buf, 0, count);
> - return;
> + return -EINVAL;
> }
>
> mutex_lock(&fw_cfg_dev_lock);
> - iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
> - while (pos-- > 0)
> - ioread8(fw_cfg_reg_data);
> - ioread8_rep(fw_cfg_reg_data, buf, count);
> + if (dma && fw_cfg_dma_enabled()) {
> + if (pos == 0) {
> + ret = fw_cfg_dma_transfer(buf, count, key << 16
> + | FW_CFG_DMA_CTL_SELECT
> + | FW_CFG_DMA_CTL_READ);
> + } else {
> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
> + ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
> + if (ret < 0)
> + goto end;
> + ret = fw_cfg_dma_transfer(buf, count,
> + FW_CFG_DMA_CTL_READ);
> + }
> + } else {
> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
> + while (pos-- > 0)
> + ioread8(fw_cfg_reg_data);
> + ioread8_rep(fw_cfg_reg_data, buf, count);
> + }
> +
> +end:
> mutex_unlock(&fw_cfg_dev_lock);
>
> acpi_release_global_lock(glk);
> +
> + return ret;
> }
>
These two functions share no common code at all.
Pls name the dma one fw_cfg_dma_read or something like this,
cleaner than a flag.
> #ifdef CONFIG_CRASH_CORE
> @@ -307,7 +328,7 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
> #endif
>
> /* verify fw_cfg device signature */
> - fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE);
> + fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE, false);
> if (memcmp(sig, "QEMU", FW_CFG_SIG_SIZE) != 0) {
> fw_cfg_io_cleanup();
> return -ENODEV;
BTW it looks like fw_cfg_read_blob can fail and that failure isn't
handled properly here.
> @@ -494,8 +515,8 @@ static ssize_t fw_cfg_sysfs_read_raw(struct file *filp, struct kobject *kobj,
> if (count > entry->f.size - pos)
> count = entry->f.size - pos;
>
> - fw_cfg_read_blob(entry->f.select, buf, pos, count);
> - return count;
> + /* do not use DMA, virt_to_phys(buf) might not be ok */
> + return fw_cfg_read_blob(entry->f.select, buf, pos, count, false);
> }
>
> static struct bin_attribute fw_cfg_sysfs_attr_raw = {
> @@ -656,7 +677,7 @@ static int fw_cfg_register_dir_entries(void)
> struct fw_cfg_file *dir;
> size_t dir_size;
>
> - fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count));
> + fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count), false);
> count = be32_to_cpu(count);
> dir_size = count * sizeof(struct fw_cfg_file);
>
> @@ -664,7 +685,7 @@ static int fw_cfg_register_dir_entries(void)
> if (!dir)
> return -ENOMEM;
>
> - fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size);
> + fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size, true);
>
> for (i = 0; i < count; i++) {
> dir[i].size = be32_to_cpu(dir[i].size);
> @@ -713,7 +734,7 @@ static int fw_cfg_sysfs_probe(struct platform_device *pdev)
> goto err_probe;
>
> /* get revision number, add matching top-level attribute */
> - fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev));
> + fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev), false);
> fw_cfg_rev = le32_to_cpu(fw_cfg_rev);
> err = sysfs_create_file(fw_cfg_top_ko, &fw_cfg_rev_attr.attr);
> if (err)
> --
> 2.16.1.73.g5832b7e9f2
On Wed, Feb 07, 2018 at 02:35:24AM +0100, Marc-Andr? Lureau wrote:
> If the "etc/vmcoreinfo" fw_cfg file is present and we are not running
> the kdump kernel, write the addr/size of the vmcoreinfo ELF note.
>
> The DMA operation is expected to run synchronously with today qemu,
> but the specification states that it may become async, so we run
> "control" field check in a loop for eventual changes.
>
> Signed-off-by: Marc-Andr? Lureau <[email protected]>
> ---
> drivers/firmware/qemu_fw_cfg.c | 157 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 154 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
> index 740df0df2260..fd576ba7b337 100644
> --- a/drivers/firmware/qemu_fw_cfg.c
> +++ b/drivers/firmware/qemu_fw_cfg.c
> @@ -33,6 +33,9 @@
> #include <linux/slab.h>
> #include <linux/io.h>
> #include <linux/ioport.h>
> +#include <linux/delay.h>
> +#include <linux/crash_dump.h>
> +#include <linux/crash_core.h>
>
> MODULE_AUTHOR("Gabriel L. Somlo <[email protected]>");
> MODULE_DESCRIPTION("QEMU fw_cfg sysfs support");
> @@ -43,12 +46,24 @@ MODULE_LICENSE("GPL");
> #define FW_CFG_ID 0x01
> #define FW_CFG_FILE_DIR 0x19
>
> +#define FW_CFG_VERSION_DMA 0x02
> +#define FW_CFG_DMA_CTL_ERROR 0x01
> +#define FW_CFG_DMA_CTL_READ 0x02
> +#define FW_CFG_DMA_CTL_SKIP 0x04
> +#define FW_CFG_DMA_CTL_SELECT 0x08
> +#define FW_CFG_DMA_CTL_WRITE 0x10
> +
> /* size in bytes of fw_cfg signature */
> #define FW_CFG_SIG_SIZE 4
>
> /* fw_cfg "file name" is up to 56 characters (including terminating nul) */
> #define FW_CFG_MAX_FILE_PATH 56
>
> +#define VMCOREINFO_FORMAT_ELF 0x1
How about exporting interface parts in include/uapi/linux/ ?
QEMU can import it from there then.
This is what virtio does.
> +
> +/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
> +static u32 fw_cfg_rev;
> +
> /* fw_cfg file directory entry type */
> struct fw_cfg_file {
> u32 size;
> @@ -57,6 +72,12 @@ struct fw_cfg_file {
> char name[FW_CFG_MAX_FILE_PATH];
> };
>
> +struct fw_cfg_dma {
> + u32 control;
> + u32 length;
> + u64 address;
> +} __packed;
> +
you can drop __packed here - it's always aligned properly.
> /* fw_cfg device i/o register addresses */
> static bool fw_cfg_is_mmio;
> static phys_addr_t fw_cfg_p_base;
> @@ -75,6 +96,59 @@ static inline u16 fw_cfg_sel_endianness(u16 key)
> return fw_cfg_is_mmio ? cpu_to_be16(key) : cpu_to_le16(key);
> }
>
> +static inline bool fw_cfg_dma_enabled(void)
> +{
> + return fw_cfg_rev & FW_CFG_VERSION_DMA && fw_cfg_reg_dma;
Why do you use () with == below but not with && here?
> +}
> +
> +/* qemu fw_cfg device is sync today, but spec says it may become async */
> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
> +{
> + do {
> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
> +
> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
> + return;
> +
> + usleep_range(50, 100);
> + } while (true);
And you need an smp rmb here.
> +}
> +
> +static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
> +{
> + phys_addr_t dma;
> + struct fw_cfg_dma *d = NULL;
> + ssize_t ret = length;
> +
> + d = kmalloc(sizeof(*d), GFP_KERNEL);
> + if (!d) {
> + ret = -ENOMEM;
> + goto end;
> + }
> +
> + *d = (struct fw_cfg_dma) {
> + .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
> + .length = cpu_to_be32(length),
> + .control = cpu_to_be32(control)
> + };
> +
> + dma = virt_to_phys(d);
Pls add docs on why this DMA bypasses the DMA API.
> +
> + iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
> + iowrite32be(dma, fw_cfg_reg_dma + 4);
> +
> + fw_cfg_wait_for_control(d);
> +
> + if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
> + ret = -EIO;
> + }
> +
> +end:
> + kfree(d);
> +
> + return ret;
> +}
> +
> /* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
> static inline void fw_cfg_read_blob(u16 key,
> void *buf, loff_t pos, size_t count)
> @@ -103,6 +177,47 @@ static inline void fw_cfg_read_blob(u16 key,
> acpi_release_global_lock(glk);
> }
>
> +#ifdef CONFIG_CRASH_CORE
> +/* write chunk of given fw_cfg blob (caller responsible for sanity-check) */
> +static ssize_t fw_cfg_write_blob(u16 key,
> + void *buf, loff_t pos, size_t count)
fw_cfg_dma_write seems like a nicer name.
> +{
> + u32 glk = -1U;
> + acpi_status status;
> + ssize_t ret = count;
> +
> + /* If we have ACPI, ensure mutual exclusion against any potential
> + * device access by the firmware, e.g. via AML methods:
> + */
> + status = acpi_acquire_global_lock(ACPI_WAIT_FOREVER, &glk);
> + if (ACPI_FAILURE(status) && status != AE_NOT_CONFIGURED) {
> + /* Should never get here */
> + WARN(1, "%s: Failed to lock ACPI!\n", __func__);
> + return -EINVAL;
> + }
> +
> + mutex_lock(&fw_cfg_dev_lock);
> + if (pos == 0) {
> + ret = fw_cfg_dma_transfer(buf, count, key << 16
> + | FW_CFG_DMA_CTL_SELECT
> + | FW_CFG_DMA_CTL_WRITE);
> + } else {
> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
> + ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
> + if (ret < 0)
> + goto end;
> + ret = fw_cfg_dma_transfer(buf, count, FW_CFG_DMA_CTL_WRITE);
> + }
> +
> +end:
> + mutex_unlock(&fw_cfg_dev_lock);
> +
> + acpi_release_global_lock(glk);
> +
> + return ret;
> +}
> +#endif /* CONFIG_CRASH_CORE */
> +
> /* clean up fw_cfg device i/o */
> static void fw_cfg_io_cleanup(void)
> {
> @@ -201,9 +316,6 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
> return 0;
> }
>
> -/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
> -static u32 fw_cfg_rev;
> -
> static ssize_t fw_cfg_showrev(struct kobject *k, struct attribute *a, char *buf)
> {
> return sprintf(buf, "%u\n", fw_cfg_rev);
> @@ -224,6 +336,37 @@ struct fw_cfg_sysfs_entry {
> struct list_head list;
> };
>
> +#ifdef CONFIG_CRASH_CORE
> +static ssize_t write_vmcoreinfo(const struct fw_cfg_file *f)
why not prefix with fw_cfg here?
> +{
> + struct vmci {
> + __le16 host_format;
> + __le16 guest_format;
> + __le32 size;
> + __le64 paddr;
> + } __packed;
No need for the __packed attribute.
And pls do not declare structures within functions.
Name them sanely and place in a header or near top of file.
> + static struct vmci *data;
> + ssize_t ret;
> +
> + data = kmalloc(sizeof(struct vmci), GFP_KERNEL);
> + if (!data)
> + return -ENOMEM;
> +
> + *data = (struct vmci) {
> + .guest_format = cpu_to_le16(VMCOREINFO_FORMAT_ELF),
> + .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
> + .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
> + };
> + /* spare ourself reading host format support for now since we
> + * don't know what else to format - host may ignore ours
> + */
> + ret = fw_cfg_write_blob(f->select, data, 0, sizeof(struct vmci));
> +
> + kfree(data);
> + return ret;
> +}
> +#endif /* CONFIG_CRASH_CORE */
> +
> /* get fw_cfg_sysfs_entry from kobject member */
> static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj)
> {
> @@ -464,6 +607,14 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f)
> int err;
> struct fw_cfg_sysfs_entry *entry;
>
> +#ifdef CONFIG_CRASH_CORE
> + if (fw_cfg_dma_enabled() &&
> + strcmp(f->name, "etc/vmcoreinfo") == 0 && !is_kdump_kernel()) {
> + if (write_vmcoreinfo(f) < 0)
> + pr_warn("fw_cfg: failed to write vmcoreinfo");
> + }
> +#endif
> +
> /* allocate new entry */
> entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> if (!entry)
> --
> 2.16.1.73.g5832b7e9f2
Hi
On Mon, Feb 12, 2018 at 4:43 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Wed, Feb 07, 2018 at 02:35:24AM +0100, Marc-André Lureau wrote:
>> If the "etc/vmcoreinfo" fw_cfg file is present and we are not running
>> the kdump kernel, write the addr/size of the vmcoreinfo ELF note.
>>
>> The DMA operation is expected to run synchronously with today qemu,
>> but the specification states that it may become async, so we run
>> "control" field check in a loop for eventual changes.
>>
>> Signed-off-by: Marc-André Lureau <[email protected]>
>> ---
>> drivers/firmware/qemu_fw_cfg.c | 157 ++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 154 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
>> index 740df0df2260..fd576ba7b337 100644
>> --- a/drivers/firmware/qemu_fw_cfg.c
>> +++ b/drivers/firmware/qemu_fw_cfg.c
>> @@ -33,6 +33,9 @@
>> #include <linux/slab.h>
>> #include <linux/io.h>
>> #include <linux/ioport.h>
>> +#include <linux/delay.h>
>> +#include <linux/crash_dump.h>
>> +#include <linux/crash_core.h>
>>
>> MODULE_AUTHOR("Gabriel L. Somlo <[email protected]>");
>> MODULE_DESCRIPTION("QEMU fw_cfg sysfs support");
>> @@ -43,12 +46,24 @@ MODULE_LICENSE("GPL");
>> #define FW_CFG_ID 0x01
>> #define FW_CFG_FILE_DIR 0x19
>>
>> +#define FW_CFG_VERSION_DMA 0x02
>> +#define FW_CFG_DMA_CTL_ERROR 0x01
>> +#define FW_CFG_DMA_CTL_READ 0x02
>> +#define FW_CFG_DMA_CTL_SKIP 0x04
>> +#define FW_CFG_DMA_CTL_SELECT 0x08
>> +#define FW_CFG_DMA_CTL_WRITE 0x10
>> +
>> /* size in bytes of fw_cfg signature */
>> #define FW_CFG_SIG_SIZE 4
>>
>> /* fw_cfg "file name" is up to 56 characters (including terminating nul) */
>> #define FW_CFG_MAX_FILE_PATH 56
>>
>> +#define VMCOREINFO_FORMAT_ELF 0x1
>
> How about exporting interface parts in include/uapi/linux/ ?
> QEMU can import it from there then.
> This is what virtio does.
Good idea, we didn't have it yet. So this is an additional change.
I'll work on it. Though, if this should delay more this series, I
think we should drop it.
>
>> +
>> +/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
>> +static u32 fw_cfg_rev;
>> +
>> /* fw_cfg file directory entry type */
>> struct fw_cfg_file {
>> u32 size;
>> @@ -57,6 +72,12 @@ struct fw_cfg_file {
>> char name[FW_CFG_MAX_FILE_PATH];
>> };
>>
>> +struct fw_cfg_dma {
>> + u32 control;
>> + u32 length;
>> + u64 address;
>> +} __packed;
>> +
>
> you can drop __packed here - it's always aligned properly.
Isn't it preferable to make that explicit? Fwiw, qemu also declares
the struct packed in its headers.
>
>> /* fw_cfg device i/o register addresses */
>> static bool fw_cfg_is_mmio;
>> static phys_addr_t fw_cfg_p_base;
>> @@ -75,6 +96,59 @@ static inline u16 fw_cfg_sel_endianness(u16 key)
>> return fw_cfg_is_mmio ? cpu_to_be16(key) : cpu_to_le16(key);
>> }
>>
>> +static inline bool fw_cfg_dma_enabled(void)
>> +{
>> + return fw_cfg_rev & FW_CFG_VERSION_DMA && fw_cfg_reg_dma;
>
> Why do you use () with == below but not with && here?
>
Let's add them.
>> +}
>> +
>> +/* qemu fw_cfg device is sync today, but spec says it may become async */
>> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
>> +{
>> + do {
>> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
>> +
>> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
>> + return;
>> +
>> + usleep_range(50, 100);
>> + } while (true);
>
> And you need an smp rmb here.
Could you explain? thanks
>
>> +}
>> +
>> +static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
>> +{
>> + phys_addr_t dma;
>> + struct fw_cfg_dma *d = NULL;
>> + ssize_t ret = length;
>> +
>> + d = kmalloc(sizeof(*d), GFP_KERNEL);
>> + if (!d) {
>> + ret = -ENOMEM;
>> + goto end;
>> + }
>> +
>> + *d = (struct fw_cfg_dma) {
>> + .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
>> + .length = cpu_to_be32(length),
>> + .control = cpu_to_be32(control)
>> + };
>> +
>> + dma = virt_to_phys(d);
>
> Pls add docs on why this DMA bypasses the DMA API.
Peter said in his patch: "fw_cfg device does not need IOMMU
protection, so use physical addresses
always. That's how QEMU implements fw_cfg. Otherwise we'll see call
traces during boot."
Is that enough justification?
>
>> +
>> + iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
>> + iowrite32be(dma, fw_cfg_reg_dma + 4);
>> +
>> + fw_cfg_wait_for_control(d);
>> +
>> + if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
>> + ret = -EIO;
>> + }
>> +
>> +end:
>> + kfree(d);
>> +
>> + return ret;
>> +}
>> +
>> /* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
>> static inline void fw_cfg_read_blob(u16 key,
>> void *buf, loff_t pos, size_t count)
>> @@ -103,6 +177,47 @@ static inline void fw_cfg_read_blob(u16 key,
>> acpi_release_global_lock(glk);
>> }
>>
>> +#ifdef CONFIG_CRASH_CORE
>> +/* write chunk of given fw_cfg blob (caller responsible for sanity-check) */
>> +static ssize_t fw_cfg_write_blob(u16 key,
>> + void *buf, loff_t pos, size_t count)
>
> fw_cfg_dma_write seems like a nicer name.
ok (I used the same naming as fw_cfg_read_blob() for consistency)
>
>> +{
>> + u32 glk = -1U;
>> + acpi_status status;
>> + ssize_t ret = count;
>> +
>> + /* If we have ACPI, ensure mutual exclusion against any potential
>> + * device access by the firmware, e.g. via AML methods:
>> + */
>> + status = acpi_acquire_global_lock(ACPI_WAIT_FOREVER, &glk);
>> + if (ACPI_FAILURE(status) && status != AE_NOT_CONFIGURED) {
>> + /* Should never get here */
>> + WARN(1, "%s: Failed to lock ACPI!\n", __func__);
>> + return -EINVAL;
>> + }
>> +
>> + mutex_lock(&fw_cfg_dev_lock);
>> + if (pos == 0) {
>> + ret = fw_cfg_dma_transfer(buf, count, key << 16
>> + | FW_CFG_DMA_CTL_SELECT
>> + | FW_CFG_DMA_CTL_WRITE);
>> + } else {
>> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
>> + ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
>> + if (ret < 0)
>> + goto end;
>> + ret = fw_cfg_dma_transfer(buf, count, FW_CFG_DMA_CTL_WRITE);
>> + }
>> +
>> +end:
>> + mutex_unlock(&fw_cfg_dev_lock);
>> +
>> + acpi_release_global_lock(glk);
>> +
>> + return ret;
>> +}
>> +#endif /* CONFIG_CRASH_CORE */
>> +
>> /* clean up fw_cfg device i/o */
>> static void fw_cfg_io_cleanup(void)
>> {
>> @@ -201,9 +316,6 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
>> return 0;
>> }
>>
>> -/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
>> -static u32 fw_cfg_rev;
>> -
>> static ssize_t fw_cfg_showrev(struct kobject *k, struct attribute *a, char *buf)
>> {
>> return sprintf(buf, "%u\n", fw_cfg_rev);
>> @@ -224,6 +336,37 @@ struct fw_cfg_sysfs_entry {
>> struct list_head list;
>> };
>>
>> +#ifdef CONFIG_CRASH_CORE
>> +static ssize_t write_vmcoreinfo(const struct fw_cfg_file *f)
>
> why not prefix with fw_cfg here?
ok
>
>> +{
>> + struct vmci {
>> + __le16 host_format;
>> + __le16 guest_format;
>> + __le32 size;
>> + __le64 paddr;
>> + } __packed;
>
>
> No need for the __packed attribute.
discussed above
> And pls do not declare structures within functions.
> Name them sanely and place in a header or near top of file.
ok
>
>> + static struct vmci *data;
>> + ssize_t ret;
>> +
>> + data = kmalloc(sizeof(struct vmci), GFP_KERNEL);
>> + if (!data)
>> + return -ENOMEM;
>> +
>> + *data = (struct vmci) {
>> + .guest_format = cpu_to_le16(VMCOREINFO_FORMAT_ELF),
>> + .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
>> + .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
>> + };
>> + /* spare ourself reading host format support for now since we
>> + * don't know what else to format - host may ignore ours
>> + */
>> + ret = fw_cfg_write_blob(f->select, data, 0, sizeof(struct vmci));
>> +
>> + kfree(data);
>> + return ret;
>> +}
>> +#endif /* CONFIG_CRASH_CORE */
>> +
>> /* get fw_cfg_sysfs_entry from kobject member */
>> static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj)
>> {
>> @@ -464,6 +607,14 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f)
>> int err;
>> struct fw_cfg_sysfs_entry *entry;
>>
>> +#ifdef CONFIG_CRASH_CORE
>> + if (fw_cfg_dma_enabled() &&
>> + strcmp(f->name, "etc/vmcoreinfo") == 0 && !is_kdump_kernel()) {
>> + if (write_vmcoreinfo(f) < 0)
>> + pr_warn("fw_cfg: failed to write vmcoreinfo");
>> + }
>> +#endif
>> +
>> /* allocate new entry */
>> entry = kzalloc(sizeof(*entry), GFP_KERNEL);
>> if (!entry)
>> --
>> 2.16.1.73.g5832b7e9f2
thanks
Hi
On Mon, Feb 12, 2018 at 4:30 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Wed, Feb 07, 2018 at 02:35:25AM +0100, Marc-André Lureau wrote:
>> Modify fw_cfg_read_blob() to use DMA if the device supports it.
>> Return errors, because the operation may fail.
>>
>> So far, only one call in fw_cfg_register_dir_entries() is using
>> kmalloc'ed buf and is thus clearly eligible to DMA read.
>>
>> Initially, I didn't implement DMA read to speed up boot time, but as a
>> first step before introducing DMA write (since read operations were
>> already presents). Even more, I didn't realize fw-cfg entries were
>> being read by the kernel during boot by default. But actally fw-cfg
>> entries are being populated during module probe. I knew DMA improved a
>> lot bios boot time (the main reason the DMA interface was added
>> afaik). Let see the time it would take to read the whole ACPI
>> tables (128kb allocated)
>>
>> # time cat /sys/firmware/qemu_fw_cfg/by_name/etc/acpi/tables/raw
>> - with DMA: sys 0m0.003s
>> - without DMA (-global fw_cfg.dma_enabled=off): sys 0m7.674s
>>
>> FW_CFG_FILE_DIR (0x19) is the only "file" that is read during kernel
>> boot to populate sysfs qemu_fw_cfg directory, and it is quite
>> small (1-2kb). Since it does not expose itself, in order to measure
>> the time it takes to read such small file, I took a comparable sized
>> file of 2048 bytes and exposed it (-fw_cfg test,file=file with a
>> modified read_raw enabling DMA)
>>
>> # perf stat -r 100 cat /sys/firmware/qemu_fw_cfg/by_name/test/raw >/dev/null
>> - with DMA:
>> 0.636037 task-clock (msec) # 0.141 CPUs utilized ( +- 1.19% )
>> - without DMA:
>> 6.430128 task-clock (msec) # 0.622 CPUs utilized ( +- 0.22% )
>>
>> That's a few msec saved during boot by enabling DMA read (the gain
>> would be more substantial if other & bigger fw-cfg entries are read by
>> others from sysfs, unfortunately, it's not clear if we can always
>> enable DMA there)
>>
>> Signed-off-by: Marc-André Lureau <[email protected]>
>> ---
>> drivers/firmware/qemu_fw_cfg.c | 47 ++++++++++++++++++++++++++++++------------
>> 1 file changed, 34 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
>> index fd576ba7b337..3721dc868a2b 100644
>> --- a/drivers/firmware/qemu_fw_cfg.c
>> +++ b/drivers/firmware/qemu_fw_cfg.c
>> @@ -150,11 +150,13 @@ static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
>> }
>>
>> /* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
>> -static inline void fw_cfg_read_blob(u16 key,
>> - void *buf, loff_t pos, size_t count)
>> +static ssize_t fw_cfg_read_blob(u16 key,
>> + void *buf, loff_t pos, size_t count,
>> + bool dma)
>> {
>> u32 glk = -1U;
>> acpi_status status;
>> + ssize_t ret = count;
>>
>> /* If we have ACPI, ensure mutual exclusion against any potential
>> * device access by the firmware, e.g. via AML methods:
>> @@ -164,17 +166,36 @@ static inline void fw_cfg_read_blob(u16 key,
>> /* Should never get here */
>> WARN(1, "fw_cfg_read_blob: Failed to lock ACPI!\n");
>> memset(buf, 0, count);
>> - return;
>> + return -EINVAL;
>> }
>>
>> mutex_lock(&fw_cfg_dev_lock);
>> - iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
>> - while (pos-- > 0)
>> - ioread8(fw_cfg_reg_data);
>> - ioread8_rep(fw_cfg_reg_data, buf, count);
>> + if (dma && fw_cfg_dma_enabled()) {
>> + if (pos == 0) {
>> + ret = fw_cfg_dma_transfer(buf, count, key << 16
>> + | FW_CFG_DMA_CTL_SELECT
>> + | FW_CFG_DMA_CTL_READ);
>> + } else {
>> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
>> + ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
>> + if (ret < 0)
>> + goto end;
>> + ret = fw_cfg_dma_transfer(buf, count,
>> + FW_CFG_DMA_CTL_READ);
>> + }
>> + } else {
>> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
>> + while (pos-- > 0)
>> + ioread8(fw_cfg_reg_data);
>> + ioread8_rep(fw_cfg_reg_data, buf, count);
>> + }
>> +
>> +end:
>> mutex_unlock(&fw_cfg_dev_lock);
>>
>> acpi_release_global_lock(glk);
>> +
>> + return ret;
>> }
>>
>
> These two functions share no common code at all.
> Pls name the dma one fw_cfg_dma_read or something like this,
> cleaner than a flag.
They share arguments, ACPI locking, error handling, cleanup. But they
also allow to abstract read over dma-capable and non-dma capable. I'll
split both cases in two functions.
>
>> #ifdef CONFIG_CRASH_CORE
>> @@ -307,7 +328,7 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
>> #endif
>>
>> /* verify fw_cfg device signature */
>> - fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE);
>> + fw_cfg_read_blob(FW_CFG_SIGNATURE, sig, 0, FW_CFG_SIG_SIZE, false);
>> if (memcmp(sig, "QEMU", FW_CFG_SIG_SIZE) != 0) {
>> fw_cfg_io_cleanup();
>> return -ENODEV;
>
> BTW it looks like fw_cfg_read_blob can fail and that failure isn't
> handled properly here.
ok
>
>> @@ -494,8 +515,8 @@ static ssize_t fw_cfg_sysfs_read_raw(struct file *filp, struct kobject *kobj,
>> if (count > entry->f.size - pos)
>> count = entry->f.size - pos;
>>
>> - fw_cfg_read_blob(entry->f.select, buf, pos, count);
>> - return count;
>> + /* do not use DMA, virt_to_phys(buf) might not be ok */
>> + return fw_cfg_read_blob(entry->f.select, buf, pos, count, false);
>> }
>>
>> static struct bin_attribute fw_cfg_sysfs_attr_raw = {
>> @@ -656,7 +677,7 @@ static int fw_cfg_register_dir_entries(void)
>> struct fw_cfg_file *dir;
>> size_t dir_size;
>>
>> - fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count));
>> + fw_cfg_read_blob(FW_CFG_FILE_DIR, &count, 0, sizeof(count), false);
>> count = be32_to_cpu(count);
>> dir_size = count * sizeof(struct fw_cfg_file);
>>
>> @@ -664,7 +685,7 @@ static int fw_cfg_register_dir_entries(void)
>> if (!dir)
>> return -ENOMEM;
>>
>> - fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size);
>> + fw_cfg_read_blob(FW_CFG_FILE_DIR, dir, sizeof(count), dir_size, true);
>>
>> for (i = 0; i < count; i++) {
>> dir[i].size = be32_to_cpu(dir[i].size);
>> @@ -713,7 +734,7 @@ static int fw_cfg_sysfs_probe(struct platform_device *pdev)
>> goto err_probe;
>>
>> /* get revision number, add matching top-level attribute */
>> - fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev));
>> + fw_cfg_read_blob(FW_CFG_ID, &fw_cfg_rev, 0, sizeof(fw_cfg_rev), false);
>> fw_cfg_rev = le32_to_cpu(fw_cfg_rev);
>> err = sysfs_create_file(fw_cfg_top_ko, &fw_cfg_rev_attr.attr);
>> if (err)
>> --
>> 2.16.1.73.g5832b7e9f2
On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
> Hi
>
> On Mon, Feb 12, 2018 at 4:43 AM, Michael S. Tsirkin <[email protected]> wrote:
> > On Wed, Feb 07, 2018 at 02:35:24AM +0100, Marc-Andr? Lureau wrote:
> >> If the "etc/vmcoreinfo" fw_cfg file is present and we are not running
> >> the kdump kernel, write the addr/size of the vmcoreinfo ELF note.
> >>
> >> The DMA operation is expected to run synchronously with today qemu,
> >> but the specification states that it may become async, so we run
> >> "control" field check in a loop for eventual changes.
> >>
> >> Signed-off-by: Marc-Andr? Lureau <[email protected]>
> >> ---
> >> drivers/firmware/qemu_fw_cfg.c | 157 ++++++++++++++++++++++++++++++++++++++++-
> >> 1 file changed, 154 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
> >> index 740df0df2260..fd576ba7b337 100644
> >> --- a/drivers/firmware/qemu_fw_cfg.c
> >> +++ b/drivers/firmware/qemu_fw_cfg.c
> >> @@ -33,6 +33,9 @@
> >> #include <linux/slab.h>
> >> #include <linux/io.h>
> >> #include <linux/ioport.h>
> >> +#include <linux/delay.h>
> >> +#include <linux/crash_dump.h>
> >> +#include <linux/crash_core.h>
> >>
> >> MODULE_AUTHOR("Gabriel L. Somlo <[email protected]>");
> >> MODULE_DESCRIPTION("QEMU fw_cfg sysfs support");
> >> @@ -43,12 +46,24 @@ MODULE_LICENSE("GPL");
> >> #define FW_CFG_ID 0x01
> >> #define FW_CFG_FILE_DIR 0x19
> >>
> >> +#define FW_CFG_VERSION_DMA 0x02
> >> +#define FW_CFG_DMA_CTL_ERROR 0x01
> >> +#define FW_CFG_DMA_CTL_READ 0x02
> >> +#define FW_CFG_DMA_CTL_SKIP 0x04
> >> +#define FW_CFG_DMA_CTL_SELECT 0x08
> >> +#define FW_CFG_DMA_CTL_WRITE 0x10
> >> +
> >> /* size in bytes of fw_cfg signature */
> >> #define FW_CFG_SIG_SIZE 4
> >>
> >> /* fw_cfg "file name" is up to 56 characters (including terminating nul) */
> >> #define FW_CFG_MAX_FILE_PATH 56
> >>
> >> +#define VMCOREINFO_FORMAT_ELF 0x1
> >
> > How about exporting interface parts in include/uapi/linux/ ?
> > QEMU can import it from there then.
> > This is what virtio does.
>
> Good idea, we didn't have it yet. So this is an additional change.
> I'll work on it. Though, if this should delay more this series, I
> think we should drop it.
It's not a new issue so I'm fine doing this as a separate patch on top
if that helps converge.
> >
> >> +
> >> +/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
> >> +static u32 fw_cfg_rev;
> >> +
> >> /* fw_cfg file directory entry type */
> >> struct fw_cfg_file {
> >> u32 size;
> >> @@ -57,6 +72,12 @@ struct fw_cfg_file {
> >> char name[FW_CFG_MAX_FILE_PATH];
> >> };
> >>
> >> +struct fw_cfg_dma {
> >> + u32 control;
> >> + u32 length;
> >> + u64 address;
> >> +} __packed;
> >> +
> >
> > you can drop __packed here - it's always aligned properly.
>
> Isn't it preferable to make that explicit? Fwiw, qemu also declares
> the struct packed in its headers.
qemu has a weird coding style with all kind of theoretical ideas.
This attribute disables structure alignment
rules often making gcc generate more code, which we do not
want in the kernel.
> >
> >> /* fw_cfg device i/o register addresses */
> >> static bool fw_cfg_is_mmio;
> >> static phys_addr_t fw_cfg_p_base;
> >> @@ -75,6 +96,59 @@ static inline u16 fw_cfg_sel_endianness(u16 key)
> >> return fw_cfg_is_mmio ? cpu_to_be16(key) : cpu_to_le16(key);
> >> }
> >>
> >> +static inline bool fw_cfg_dma_enabled(void)
> >> +{
> >> + return fw_cfg_rev & FW_CFG_VERSION_DMA && fw_cfg_reg_dma;
> >
> > Why do you use () with == below but not with && here?
> >
>
> Let's add them.
>
> >> +}
> >> +
> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
> >> +{
> >> + do {
> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
> >> +
> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
> >> + return;
> >> +
> >> + usleep_range(50, 100);
> >> + } while (true);
> >
> > And you need an smp rmb here.
I'd just do rmb() in fact.
> Could you explain? thanks
See Documentation/memory-barriers.txt
You know that control is valid, but following read of
the structure could be reordered. So you need that barrier there.
Same for write: wmb.
> >
> >> +}
> >> +
> >> +static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
> >> +{
> >> + phys_addr_t dma;
> >> + struct fw_cfg_dma *d = NULL;
> >> + ssize_t ret = length;
> >> +
> >> + d = kmalloc(sizeof(*d), GFP_KERNEL);
> >> + if (!d) {
> >> + ret = -ENOMEM;
> >> + goto end;
> >> + }
> >> +
> >> + *d = (struct fw_cfg_dma) {
> >> + .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
> >> + .length = cpu_to_be32(length),
> >> + .control = cpu_to_be32(control)
> >> + };
> >> +
> >> + dma = virt_to_phys(d);
> >
> > Pls add docs on why this DMA bypasses the DMA API.
>
> Peter said in his patch: "fw_cfg device does not need IOMMU
> protection, so use physical addresses
> always. That's how QEMU implements fw_cfg. Otherwise we'll see call
> traces during boot."
>
> Is that enough justification?
what are the reasons for the traces exactly though?
some kind of explanation should go into comments, and
I think it should be a bit more detailed than just "it doesn't
work otherwise".
> >
> >> +
> >> + iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
> >> + iowrite32be(dma, fw_cfg_reg_dma + 4);
> >> +
> >> + fw_cfg_wait_for_control(d);
> >> +
> >> + if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
> >> + ret = -EIO;
> >> + }
> >> +
> >> +end:
> >> + kfree(d);
> >> +
> >> + return ret;
> >> +}
> >> +
> >> /* read chunk of given fw_cfg blob (caller responsible for sanity-check) */
> >> static inline void fw_cfg_read_blob(u16 key,
> >> void *buf, loff_t pos, size_t count)
> >> @@ -103,6 +177,47 @@ static inline void fw_cfg_read_blob(u16 key,
> >> acpi_release_global_lock(glk);
> >> }
> >>
> >> +#ifdef CONFIG_CRASH_CORE
> >> +/* write chunk of given fw_cfg blob (caller responsible for sanity-check) */
> >> +static ssize_t fw_cfg_write_blob(u16 key,
> >> + void *buf, loff_t pos, size_t count)
> >
> > fw_cfg_dma_write seems like a nicer name.
>
> ok (I used the same naming as fw_cfg_read_blob() for consistency)
>
> >
> >> +{
> >> + u32 glk = -1U;
> >> + acpi_status status;
> >> + ssize_t ret = count;
> >> +
> >> + /* If we have ACPI, ensure mutual exclusion against any potential
> >> + * device access by the firmware, e.g. via AML methods:
> >> + */
> >> + status = acpi_acquire_global_lock(ACPI_WAIT_FOREVER, &glk);
> >> + if (ACPI_FAILURE(status) && status != AE_NOT_CONFIGURED) {
> >> + /* Should never get here */
> >> + WARN(1, "%s: Failed to lock ACPI!\n", __func__);
> >> + return -EINVAL;
> >> + }
> >> +
> >> + mutex_lock(&fw_cfg_dev_lock);
> >> + if (pos == 0) {
> >> + ret = fw_cfg_dma_transfer(buf, count, key << 16
> >> + | FW_CFG_DMA_CTL_SELECT
> >> + | FW_CFG_DMA_CTL_WRITE);
> >> + } else {
> >> + iowrite16(fw_cfg_sel_endianness(key), fw_cfg_reg_ctrl);
> >> + ret = fw_cfg_dma_transfer(NULL, pos, FW_CFG_DMA_CTL_SKIP);
> >> + if (ret < 0)
> >> + goto end;
> >> + ret = fw_cfg_dma_transfer(buf, count, FW_CFG_DMA_CTL_WRITE);
> >> + }
> >> +
> >> +end:
> >> + mutex_unlock(&fw_cfg_dev_lock);
> >> +
> >> + acpi_release_global_lock(glk);
> >> +
> >> + return ret;
> >> +}
> >> +#endif /* CONFIG_CRASH_CORE */
> >> +
> >> /* clean up fw_cfg device i/o */
> >> static void fw_cfg_io_cleanup(void)
> >> {
> >> @@ -201,9 +316,6 @@ static int fw_cfg_do_platform_probe(struct platform_device *pdev)
> >> return 0;
> >> }
> >>
> >> -/* fw_cfg revision attribute, in /sys/firmware/qemu_fw_cfg top-level dir. */
> >> -static u32 fw_cfg_rev;
> >> -
> >> static ssize_t fw_cfg_showrev(struct kobject *k, struct attribute *a, char *buf)
> >> {
> >> return sprintf(buf, "%u\n", fw_cfg_rev);
> >> @@ -224,6 +336,37 @@ struct fw_cfg_sysfs_entry {
> >> struct list_head list;
> >> };
> >>
> >> +#ifdef CONFIG_CRASH_CORE
> >> +static ssize_t write_vmcoreinfo(const struct fw_cfg_file *f)
> >
> > why not prefix with fw_cfg here?
>
> ok
>
> >
> >> +{
> >> + struct vmci {
> >> + __le16 host_format;
> >> + __le16 guest_format;
> >> + __le32 size;
> >> + __le64 paddr;
> >> + } __packed;
> >
> >
> > No need for the __packed attribute.
>
> discussed above
>
> > And pls do not declare structures within functions.
> > Name them sanely and place in a header or near top of file.
>
> ok
>
> >
> >> + static struct vmci *data;
> >> + ssize_t ret;
> >> +
> >> + data = kmalloc(sizeof(struct vmci), GFP_KERNEL);
> >> + if (!data)
> >> + return -ENOMEM;
> >> +
> >> + *data = (struct vmci) {
> >> + .guest_format = cpu_to_le16(VMCOREINFO_FORMAT_ELF),
> >> + .size = cpu_to_le32(VMCOREINFO_NOTE_SIZE),
> >> + .paddr = cpu_to_le64(paddr_vmcoreinfo_note())
> >> + };
> >> + /* spare ourself reading host format support for now since we
> >> + * don't know what else to format - host may ignore ours
> >> + */
> >> + ret = fw_cfg_write_blob(f->select, data, 0, sizeof(struct vmci));
> >> +
> >> + kfree(data);
> >> + return ret;
> >> +}
> >> +#endif /* CONFIG_CRASH_CORE */
> >> +
> >> /* get fw_cfg_sysfs_entry from kobject member */
> >> static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj)
> >> {
> >> @@ -464,6 +607,14 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f)
> >> int err;
> >> struct fw_cfg_sysfs_entry *entry;
> >>
> >> +#ifdef CONFIG_CRASH_CORE
> >> + if (fw_cfg_dma_enabled() &&
> >> + strcmp(f->name, "etc/vmcoreinfo") == 0 && !is_kdump_kernel()) {
> >> + if (write_vmcoreinfo(f) < 0)
> >> + pr_warn("fw_cfg: failed to write vmcoreinfo");
> >> + }
> >> +#endif
> >> +
> >> /* allocate new entry */
> >> entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> >> if (!entry)
> >> --
> >> 2.16.1.73.g5832b7e9f2
>
> thanks
Hi
On Mon, Feb 12, 2018 at 10:00 PM, Michael S. Tsirkin <[email protected]> wrote:
> On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
>> >> +}
>> >> +
>> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
>> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
>> >> +{
>> >> + do {
>> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
>> >> +
>> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
>> >> + return;
>> >> +
>> >> + usleep_range(50, 100);
>> >> + } while (true);
>> >
>> > And you need an smp rmb here.
>
> I'd just do rmb() in fact.
>
>> Could you explain? thanks
>
> See Documentation/memory-barriers.txt
> You know that control is valid, but following read of
> the structure could be reordered. So you need that barrier there.
> Same for write: wmb.
Is this ok?
@@ -103,10 +104,14 @@ static ssize_t fw_cfg_dma_transfer(void
*address, u32 length, u32 control)
dma = virt_to_phys(d);
iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
+ /* force memory to sync before notifying device via MMIO */
+ wmb();
iowrite32be(dma, fw_cfg_reg_dma + 4);
fw_cfg_wait_for_control(d);
+ /* do not reorder the read to d->control */
+ rmb();
if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
ret = -EIO;
}
>
>
>> >
>> >> +}
>> >> +
>> >> +static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
>> >> +{
>> >> + phys_addr_t dma;
>> >> + struct fw_cfg_dma *d = NULL;
>> >> + ssize_t ret = length;
>> >> +
>> >> + d = kmalloc(sizeof(*d), GFP_KERNEL);
>> >> + if (!d) {
>> >> + ret = -ENOMEM;
>> >> + goto end;
>> >> + }
>> >> +
>> >> + *d = (struct fw_cfg_dma) {
>> >> + .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
>> >> + .length = cpu_to_be32(length),
>> >> + .control = cpu_to_be32(control)
>> >> + };
>> >> +
>> >> + dma = virt_to_phys(d);
>> >
>> > Pls add docs on why this DMA bypasses the DMA API.
>>
>> Peter said in his patch: "fw_cfg device does not need IOMMU
>> protection, so use physical addresses
>> always. That's how QEMU implements fw_cfg. Otherwise we'll see call
>> traces during boot."
>>
>> Is that enough justification?
>
> what are the reasons for the traces exactly though?
> some kind of explanation should go into comments, and
> I think it should be a bit more detailed than just "it doesn't
> work otherwise".
>
I can use Peter help here. My understanding is because the qemu fw-cfg
device doesn't go through iommu when doing DMA op. Whether it should
or could, I can't answer.
thanks
On Tue, Feb 13, 2018 at 03:14:03PM +0100, Marc-Andre Lureau wrote:
> Hi
>
> On Mon, Feb 12, 2018 at 10:00 PM, Michael S. Tsirkin <[email protected]> wrote:
> > On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
> >> >> +}
> >> >> +
> >> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
> >> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
> >> >> +{
> >> >> + do {
> >> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
> >> >> +
> >> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
> >> >> + return;
> >> >> +
> >> >> + usleep_range(50, 100);
> >> >> + } while (true);
> >> >
> >> > And you need an smp rmb here.
> >
> > I'd just do rmb() in fact.
> >
> >> Could you explain? thanks
> >
> > See Documentation/memory-barriers.txt
> > You know that control is valid, but following read of
> > the structure could be reordered. So you need that barrier there.
> > Same for write: wmb.
>
> Is this ok?
> @@ -103,10 +104,14 @@ static ssize_t fw_cfg_dma_transfer(void
> *address, u32 length, u32 control)
> dma = virt_to_phys(d);
>
> iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
> + /* force memory to sync before notifying device via MMIO */
> + wmb();
> iowrite32be(dma, fw_cfg_reg_dma + 4);
>
> fw_cfg_wait_for_control(d);
>
> + /* do not reorder the read to d->control */
> + rmb();
> if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
> ret = -EIO;
> }
I think you need an rmb after the read of d->control.
> >
> >
> >> >
> >> >> +}
> >> >> +
> >> >> +static ssize_t fw_cfg_dma_transfer(void *address, u32 length, u32 control)
> >> >> +{
> >> >> + phys_addr_t dma;
> >> >> + struct fw_cfg_dma *d = NULL;
> >> >> + ssize_t ret = length;
> >> >> +
> >> >> + d = kmalloc(sizeof(*d), GFP_KERNEL);
> >> >> + if (!d) {
> >> >> + ret = -ENOMEM;
> >> >> + goto end;
> >> >> + }
> >> >> +
> >> >> + *d = (struct fw_cfg_dma) {
> >> >> + .address = address ? cpu_to_be64(virt_to_phys(address)) : 0,
> >> >> + .length = cpu_to_be32(length),
> >> >> + .control = cpu_to_be32(control)
> >> >> + };
> >> >> +
> >> >> + dma = virt_to_phys(d);
> >> >
> >> > Pls add docs on why this DMA bypasses the DMA API.
> >>
> >> Peter said in his patch: "fw_cfg device does not need IOMMU
> >> protection, so use physical addresses
> >> always. That's how QEMU implements fw_cfg. Otherwise we'll see call
> >> traces during boot."
> >>
> >> Is that enough justification?
> >
> > what are the reasons for the traces exactly though?
> > some kind of explanation should go into comments, and
> > I think it should be a bit more detailed than just "it doesn't
> > work otherwise".
> >
>
> I can use Peter help here. My understanding is because the qemu fw-cfg
> device doesn't go through iommu when doing DMA op. Whether it should
> or could, I can't answer.
>
> thanks
Hi
On Tue, Feb 13, 2018 at 3:27 PM, Michael S. Tsirkin <[email protected]> wrote:
> On Tue, Feb 13, 2018 at 03:14:03PM +0100, Marc-Andre Lureau wrote:
>> Hi
>>
>> On Mon, Feb 12, 2018 at 10:00 PM, Michael S. Tsirkin <[email protected]> wrote:
>> > On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
>> >> >> +}
>> >> >> +
>> >> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
>> >> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
>> >> >> +{
>> >> >> + do {
>> >> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
>> >> >> +
>> >> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
>> >> >> + return;
>> >> >> +
>> >> >> + usleep_range(50, 100);
>> >> >> + } while (true);
>> >> >
>> >> > And you need an smp rmb here.
>> >
>> > I'd just do rmb() in fact.
>> >
>> >> Could you explain? thanks
>> >
>> > See Documentation/memory-barriers.txt
>> > You know that control is valid, but following read of
>> > the structure could be reordered. So you need that barrier there.
>> > Same for write: wmb.
>>
>> Is this ok?
>> @@ -103,10 +104,14 @@ static ssize_t fw_cfg_dma_transfer(void
>> *address, u32 length, u32 control)
>> dma = virt_to_phys(d);
>>
>> iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
>> + /* force memory to sync before notifying device via MMIO */
>> + wmb();
>> iowrite32be(dma, fw_cfg_reg_dma + 4);
>>
>> fw_cfg_wait_for_control(d);
>>
>> + /* do not reorder the read to d->control */
>> + rmb();
>> if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
>> ret = -EIO;
>> }
>
> I think you need an rmb after the read of d->control.
>
There are two reads of d->control, one in fw_cfg_wait_for_control() to
wait for completion, and the other one here to handle error. Do you
mean that for clarity rmb() should be moved at the end of
fw_cfg_wait_for_control() instead?
thanks
On Tue, Feb 13, 2018 at 04:16:08PM +0100, Marc-Andre Lureau wrote:
> Hi
>
> On Tue, Feb 13, 2018 at 3:27 PM, Michael S. Tsirkin <[email protected]> wrote:
> > On Tue, Feb 13, 2018 at 03:14:03PM +0100, Marc-Andre Lureau wrote:
> >> Hi
> >>
> >> On Mon, Feb 12, 2018 at 10:00 PM, Michael S. Tsirkin <[email protected]> wrote:
> >> > On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
> >> >> >> +}
> >> >> >> +
> >> >> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
> >> >> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
> >> >> >> +{
> >> >> >> + do {
> >> >> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
> >> >> >> +
> >> >> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
> >> >> >> + return;
> >> >> >> +
> >> >> >> + usleep_range(50, 100);
> >> >> >> + } while (true);
> >> >> >
> >> >> > And you need an smp rmb here.
> >> >
> >> > I'd just do rmb() in fact.
> >> >
> >> >> Could you explain? thanks
> >> >
> >> > See Documentation/memory-barriers.txt
> >> > You know that control is valid, but following read of
> >> > the structure could be reordered. So you need that barrier there.
> >> > Same for write: wmb.
> >>
> >> Is this ok?
> >> @@ -103,10 +104,14 @@ static ssize_t fw_cfg_dma_transfer(void
> >> *address, u32 length, u32 control)
> >> dma = virt_to_phys(d);
> >>
> >> iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
> >> + /* force memory to sync before notifying device via MMIO */
> >> + wmb();
> >> iowrite32be(dma, fw_cfg_reg_dma + 4);
> >>
> >> fw_cfg_wait_for_control(d);
> >>
> >> + /* do not reorder the read to d->control */
> >> + rmb();
> >> if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
> >> ret = -EIO;
> >> }
> >
> > I think you need an rmb after the read of d->control.
> >
>
>
> There are two reads of d->control, one in fw_cfg_wait_for_control() to
> wait for completion, and the other one here to handle error. Do you
> mean that for clarity rmb() should be moved at the end of
> fw_cfg_wait_for_control() instead?
>
> thanks
IMHO that's a reasonable way to do it, yes.
OTOH is looking at DMA data the only way to detect DMA is complete?
Isn't there an IO register for that?
On Tue, Feb 13, 2018 at 4:19 PM, Michael S. Tsirkin <[email protected]> wrote:
> On Tue, Feb 13, 2018 at 04:16:08PM +0100, Marc-Andre Lureau wrote:
>> Hi
>>
>> On Tue, Feb 13, 2018 at 3:27 PM, Michael S. Tsirkin <[email protected]> wrote:
>> > On Tue, Feb 13, 2018 at 03:14:03PM +0100, Marc-Andre Lureau wrote:
>> >> Hi
>> >>
>> >> On Mon, Feb 12, 2018 at 10:00 PM, Michael S. Tsirkin <[email protected]> wrote:
>> >> > On Mon, Feb 12, 2018 at 11:04:49AM +0100, Marc-Andre Lureau wrote:
>> >> >> >> +}
>> >> >> >> +
>> >> >> >> +/* qemu fw_cfg device is sync today, but spec says it may become async */
>> >> >> >> +static void fw_cfg_wait_for_control(struct fw_cfg_dma *d)
>> >> >> >> +{
>> >> >> >> + do {
>> >> >> >> + u32 ctrl = be32_to_cpu(READ_ONCE(d->control));
>> >> >> >> +
>> >> >> >> + if ((ctrl & ~FW_CFG_DMA_CTL_ERROR) == 0)
>> >> >> >> + return;
>> >> >> >> +
>> >> >> >> + usleep_range(50, 100);
>> >> >> >> + } while (true);
>> >> >> >
>> >> >> > And you need an smp rmb here.
>> >> >
>> >> > I'd just do rmb() in fact.
>> >> >
>> >> >> Could you explain? thanks
>> >> >
>> >> > See Documentation/memory-barriers.txt
>> >> > You know that control is valid, but following read of
>> >> > the structure could be reordered. So you need that barrier there.
>> >> > Same for write: wmb.
>> >>
>> >> Is this ok?
>> >> @@ -103,10 +104,14 @@ static ssize_t fw_cfg_dma_transfer(void
>> >> *address, u32 length, u32 control)
>> >> dma = virt_to_phys(d);
>> >>
>> >> iowrite32be((u64)dma >> 32, fw_cfg_reg_dma);
>> >> + /* force memory to sync before notifying device via MMIO */
>> >> + wmb();
>> >> iowrite32be(dma, fw_cfg_reg_dma + 4);
>> >>
>> >> fw_cfg_wait_for_control(d);
>> >>
>> >> + /* do not reorder the read to d->control */
>> >> + rmb();
>> >> if (be32_to_cpu(READ_ONCE(d->control)) & FW_CFG_DMA_CTL_ERROR) {
>> >> ret = -EIO;
>> >> }
>> >
>> > I think you need an rmb after the read of d->control.
>> >
>>
>>
>> There are two reads of d->control, one in fw_cfg_wait_for_control() to
>> wait for completion, and the other one here to handle error. Do you
>> mean that for clarity rmb() should be moved at the end of
>> fw_cfg_wait_for_control() instead?
>>
>> thanks
>
>
> IMHO that's a reasonable way to do it, yes.
> OTOH is looking at DMA data the only way to detect DMA is complete?
> Isn't there an IO register for that?
That's the only thing that indicate completion it seems:
https://git.qemu.org/?p=qemu.git;a=blob;f=hw/nvram/fw_cfg.c;hb=HEAD#l389
The spec doesn't describe other ways either:
https://git.qemu.org/?p=qemu.git;a=blob;f=docs/specs/fw_cfg.txt