2014-07-03 08:53:00

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 00/23] RFC DRBD fixes

Hi,

this series adds a debugfs hierarchy to DRBD. It unveils a lot
of information about timing of states/stages of IO requests.

This information helps a lot while optimizing DRBD for SSD
based IO stacks.

Dan Carpenter (1):
drbd: silence underflow warning in read_in_block()

Lars Ellenberg (22):
drbd: poison free'd device, resource and connection structs
drbd: fix drbd_destroy_device reference count updates
drbd: track meta data IO intent, start and submit time
drbd: gather detailed timing statistics for drbd_requests
drbd: add lists to find oldest pending requests
drbd: add caching oldest request pointers for replication stages
drbd: improve throttling decisions of background resynchronisation
drbd: track timing details of peer_requests
drbd: register peer requests on read_ee early
drbd: track details of bitmap IO
drbd: debugfs: add basic hierarchy
drbd: debugfs: add in_flight_summary data
drbd: debugfs: deal with destructor racing with open of debugfs file
drbd: debugfs: Add in_flight_summary
drbd: debugfs: add callback_history
drbd: debugfs: add per volume oldest_requests
drbd: debugfs: add version tag to debugfs files
drbd: debugfs: add per connection oldest requests
drbd: debugfs: add per device data_gen_id
drbd: resync should only lock out specific ranges
drbd: drop spurious parameters from _drbd_md_sync_page_io
drbd: implicitly truncate cpu-mask

drivers/block/drbd/Makefile | 1 +
drivers/block/drbd/drbd_actlog.c | 80 +++-
drivers/block/drbd/drbd_bitmap.c | 70 ++-
drivers/block/drbd/drbd_debugfs.c | 958 +++++++++++++++++++++++++++++++++++++
drivers/block/drbd/drbd_debugfs.h | 39 ++
drivers/block/drbd/drbd_int.h | 175 ++++++-
drivers/block/drbd/drbd_main.c | 116 +++--
drivers/block/drbd/drbd_nl.c | 4 +-
drivers/block/drbd/drbd_proc.c | 3 +
drivers/block/drbd/drbd_receiver.c | 138 ++++--
drivers/block/drbd/drbd_req.c | 411 +++++++++++-----
drivers/block/drbd/drbd_worker.c | 59 ++-
include/linux/drbd.h | 2 +-
lib/lru_cache.c | 21 +-
14 files changed, 1784 insertions(+), 293 deletions(-)
create mode 100644 drivers/block/drbd/drbd_debugfs.c
create mode 100644 drivers/block/drbd/drbd_debugfs.h

--
1.9.1


2014-07-03 08:43:23

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 01/23] drbd: poison free'd device, resource and connection structs

From: Lars Ellenberg <[email protected]>

Now that we have additional asynchronous kref_get/kref_put
via debugfs, make sure we catch access after free.

Poison struct drbd_device, drbd_connection and drbd_resource
before kfree() with 0xfd, 0xfc, and 0xf2, respectively.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_main.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 686c5a5..92547d1 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2192,6 +2192,7 @@ void drbd_destroy_device(struct kref *kref)
blk_cleanup_queue(device->rq_queue);
kfree(device->rs_plan_s);
kfree(first_peer_device(device));
+ memset(device, 0xfd, sizeof(*device));
kfree(device);

for_each_connection(connection, resource)
@@ -2285,6 +2286,7 @@ void drbd_destroy_resource(struct kref *kref)
idr_destroy(&resource->devices);
free_cpumask_var(resource->cpu_mask);
kfree(resource->name);
+ memset(resource, 0xf2, sizeof(*resource));
kfree(resource);
}

@@ -2665,6 +2667,7 @@ void drbd_destroy_connection(struct kref *kref)
drbd_free_socket(&connection->data);
kfree(connection->int_dig_in);
kfree(connection->int_dig_vv);
+ memset(connection, 0xfc, sizeof(*connection));
kfree(connection);
kref_put(&resource->kref, drbd_destroy_resource);
}
--
1.9.1

2014-07-03 08:43:34

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 12/23] drbd: debugfs: add in_flight_summary data

From: Lars Ellenberg <[email protected]>

To help diagnosing "high latency" or "hung" IO situations on DRBD,
present per drbd resource group a summary of operations currently in progress.

First item is a list of oldest drbd_request objects
waiting for various things:
* still being prepared
* waiting for activity log transaction
* waiting for local disk
* waiting to be sent
* waiting for peer acknowledgement ("receive ack", "write ack")
* waiting for peer epoch acknowledgement ("barrier ack")

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 196 ++++++++++++++++++++++++++++++++++++++
1 file changed, 196 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 9b120c3..d393aee 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -2,7 +2,9 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/stat.h>
+#include <linux/jiffies.h>
#include <linux/list.h>

#include "drbd_int.h"
@@ -13,6 +15,194 @@ static struct dentry *drbd_debugfs_root;
static struct dentry *drbd_debugfs_resources;
static struct dentry *drbd_debugfs_minors;

+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+ if (valid)
+ seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+ else
+ seq_printf(m, "\t-");
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *name)
+{
+ if (!is_set)
+ return;
+ seq_putc(m, *sep);
+ seq_puts(m, name);
+ *sep = '|';
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+ unsigned int s = req->rq_state;
+ char sep = ' ';
+ seq_printf(m, "\t0x%08x", s);
+ seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+ /* RQ_WRITE ignored, already reported */
+ seq_puts(m, "\tlocal:");
+ seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+ seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+ seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ /* for_each_connection ... */
+ seq_printf(m, "\tnet:");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+ seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+ seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+ seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+ seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ seq_printf(m, " :");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+ seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+ if (sep == ' ')
+ seq_puts(m, " -");
+ seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ /* change anything here, fixup header below! */
+ unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+ seq_printf(m, "0x%x\t%llu\t%u\t%s",
+ req->epoch,
+ (unsigned long long)req->i.sector, req->i.size >> 9,
+ (s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+ seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+ seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+ seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+ seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+ seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+ seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+ seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+ seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ unsigned long now)
+{
+ struct drbd_request *req;
+ unsigned int count = 0;
+ unsigned int show_state = 0;
+
+ seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+ unsigned int tmp = 0;
+ unsigned int s;
+ ++count;
+
+ /* don't disable irq "forever" */
+ if (!(count & 0x1ff)) {
+ struct drbd_request *req_next;
+ kref_get(&req->kref);
+ spin_unlock_irq(&resource->req_lock);
+ cond_resched();
+ spin_lock_irq(&resource->req_lock);
+ req_next = list_next_entry(req, tl_requests);
+ if (kref_put(&req->kref, drbd_req_destroy))
+ req = req_next;
+ if (&req->tl_requests == &connection->transfer_log)
+ break;
+ }
+
+ s = req->rq_state;
+
+ /* This is meant to summarize timing issues, to be able to tell
+ * local disk problems from network problems.
+ * Skip requests, if we have shown an even older request with
+ * similar aspects already. */
+ if (req->master_bio == NULL)
+ tmp |= 1;
+ if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+ tmp |= 2;
+ if (s & RQ_NET_MASK) {
+ if (!(s & RQ_NET_SENT))
+ tmp |= 4;
+ if (s & RQ_NET_PENDING)
+ tmp |= 8;
+ if (!(s & RQ_NET_DONE))
+ tmp |= 16;
+ }
+ if ((tmp & show_state) == tmp)
+ continue;
+ show_state |= tmp;
+ seq_printf(m, "%u\t", count);
+ seq_print_minor_vnr_req(m, req, now);
+ if (show_state == 0x1f)
+ break;
+ }
+ spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+ struct drbd_resource *resource = m->private;
+ struct drbd_connection *connection;
+ unsigned long jif = jiffies;
+
+ connection = first_connection(resource);
+ /* This does not happen, actually.
+ * But be robust and prepare for future code changes. */
+ if (!connection)
+ return 0;
+
+ seq_puts(m, "oldest application requests\n");
+ seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+ seq_putc(m, '\n');
+
+ jif = jiffies - jif;
+ if (jif)
+ seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+ return 0;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, in_flight_summary_show, inode->i_private);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+ .owner = THIS_MODULE,
+ .open = in_flight_summary_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void drbd_debugfs_resource_add(struct drbd_resource *resource)
{
struct dentry *dentry;
@@ -34,6 +224,12 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource)
goto fail;
resource->debugfs_res_connections = dentry;

+ dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+ resource->debugfs_res, resource,
+ &in_flight_summary_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_in_flight_summary = dentry;
return;

fail:
--
1.9.1

2014-07-03 08:44:04

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 17/23] drbd: debugfs: add version tag to debugfs files

From: Lars Ellenberg <[email protected]>

Make the first line of debugfs files a version number,
starting now with "v: 0".

If we change content of presentation, we will bump that.
Monitoring or diagnostic scritps that may parse these files
can then easily know when they need to be reviewed.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 52 +++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 45b5205..a4d0666 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -11,7 +11,13 @@
#include "drbd_req.h"
#include "drbd_debugfs.h"

+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
static struct dentry *drbd_debugfs_resources;
static struct dentry *drbd_debugfs_minors;

@@ -368,6 +374,9 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
if (!connection || !kref_get_unless_zero(&connection->kref))
return -ESTALE;

+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
seq_puts(m, "oldest bitmap IO\n");
seq_print_resource_pending_bitmap_io(m, resource, jif);
seq_putc(m, '\n');
@@ -562,6 +571,9 @@ static int callback_history_show(struct seq_file *m, void *ignored)
struct drbd_connection *connection = m->private;
unsigned long jif = jiffies;

+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
seq_puts(m, "n\tage\tcallsite\tfn\n");
seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
@@ -686,6 +698,10 @@ static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
static int device_resync_extents_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
if (get_ldev_if_state(device, D_FAILED)) {
lc_seq_printf_stats(m, device->resync);
lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
@@ -697,6 +713,10 @@ static int device_resync_extents_show(struct seq_file *m, void *ignored)
static int device_act_log_extents_show(struct seq_file *m, void *ignored)
{
struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
if (get_ldev_if_state(device, D_FAILED)) {
lc_seq_printf_stats(m, device->act_log);
lc_seq_dump_details(m, device->act_log, "", NULL);
@@ -713,6 +733,9 @@ static int device_oldest_requests_show(struct seq_file *m, void *ignored)
struct drbd_request *r1, *r2;
int i;

+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
seq_puts(m, RQ_HDR);
spin_lock_irq(&resource->req_lock);
/* WRITE, then READ */
@@ -839,12 +862,36 @@ void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
}

+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+ seq_printf(m, "# %s\n", drbd_buildtag());
+ seq_printf(m, "VERSION=%s\n", REL_VERSION);
+ seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+ seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+ seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+ return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, drbd_version_show, NULL);
+}
+
+static struct file_operations drbd_version_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_version_open,
+ .llseek = seq_lseek,
+ .read = seq_read,
+ .release = single_release,
+};
+
/* not __exit, may be indirectly called
* from the module-load-failure path as well. */
void drbd_debugfs_cleanup(void)
{
drbd_debugfs_remove(&drbd_debugfs_resources);
drbd_debugfs_remove(&drbd_debugfs_minors);
+ drbd_debugfs_remove(&drbd_debugfs_version);
drbd_debugfs_remove(&drbd_debugfs_root);
}

@@ -857,6 +904,11 @@ int __init drbd_debugfs_init(void)
goto fail;
drbd_debugfs_root = dentry;

+ dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_version = dentry;
+
dentry = debugfs_create_dir("resources", drbd_debugfs_root);
if (IS_ERR_OR_NULL(dentry))
goto fail;
--
1.9.1

2014-07-03 08:44:09

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 14/23] drbd: debugfs: Add in_flight_summary

From: Lars Ellenberg <[email protected]>

* Add details about pending meta data operations to in_flight_summary.

* Report number of requests waiting for activity log transactions.

* timing details of peer_requests to in_flight_summary.

* FLUSH details
DRBD devides the incoming request stream into "epochs",
in which peers are allowed to re-order writes independendly.

These epochs are separated by P_BARRIER on the replication link.
Such barrier packets, depending on configuration, may cause
the receiving side to drain the lower level device request queues
and call blkdev_issue_flush().

This is known to be an other major source of latency in DRBD.

Track timing details of calls to blkdev_issue_flush(),
and add them to in_flight_summary.

* data socket stats
To be able to diagnose bottlenecks and root causes of "slow" IO on DRBD,
it is useful to see network buffer stats along with the timing details of
requests, peer requests, and meta data IO.

* pending bitmap IO timing details to in_flight_summary.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 231 ++++++++++++++++++++++++++++++++++++-
drivers/block/drbd/drbd_int.h | 3 +
drivers/block/drbd/drbd_main.c | 33 +++---
drivers/block/drbd/drbd_receiver.c | 8 ++
4 files changed, 255 insertions(+), 20 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 51c64ec..12d072d 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -23,14 +23,24 @@ static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long
seq_printf(m, "\t-");
}

+static void __seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+ if (is_set && set_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, set_name);
+ *sep = '|';
+ } else if (!is_set && unset_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, unset_name);
+ *sep = '|';
+ }
+}
+
static void seq_print_rq_state_bit(struct seq_file *m,
- bool is_set, char *sep, const char *name)
+ bool is_set, char *sep, const char *set_name)
{
- if (!is_set)
- return;
- seq_putc(m, *sep);
- seq_puts(m, name);
- *sep = '|';
+ __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
}

/* pretty print enum drbd_req_state_bits req->rq_state */
@@ -108,6 +118,184 @@ static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req
seq_print_one_request(m, req, now);
}

+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ struct drbd_md_io tmp;
+ /* In theory this is racy,
+ * in the sense that there could have been a
+ * drbd_md_put_buffer(); drbd_md_get_buffer();
+ * between accessing these members here. */
+ tmp = device->md_io;
+ if (atomic_read(&tmp.in_use)) {
+ seq_printf(m, "%u\t%u\t%d\t",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - tmp.start_jif));
+ if (time_before(tmp.submit_jif, tmp.start_jif))
+ seq_puts(m, "-\t");
+ else
+ seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+ seq_printf(m, "%s\n", tmp.current_use);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ unsigned long jif;
+ struct drbd_request *req;
+ int n = atomic_read(&device->ap_actlog_cnt);
+ if (n) {
+ spin_lock_irq(&device->resource->req_lock);
+ req = list_first_entry_or_null(&device->pending_master_completion[1],
+ struct drbd_request, req_pending_master_completion);
+ /* if the oldest request does not wait for the activity log
+ * it is not interesting for us here */
+ if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+ jif = req->start_jif;
+ else
+ req = NULL;
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ if (n) {
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+ if (req)
+ seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+ else
+ seq_puts(m, "-\t");
+ seq_printf(m, "%u\n", n);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+ struct drbd_bm_aio_ctx *ctx;
+ unsigned long start_jif;
+ unsigned int in_flight;
+ unsigned int flags;
+ spin_lock_irq(&device->resource->req_lock);
+ ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+ if (ctx && ctx->done)
+ ctx = NULL;
+ if (ctx) {
+ start_jif = ctx->start_jif;
+ in_flight = atomic_read(&ctx->in_flight);
+ flags = ctx->flags;
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ if (ctx) {
+ seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+ device->minor, device->vnr,
+ (flags & BM_AIO_READ) ? 'R' : 'W',
+ jiffies_to_msecs(now - start_jif),
+ in_flight);
+ }
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_bitmap_io(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+ unsigned long f = peer_req->flags;
+ char sep = ' ';
+
+ __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+ __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+ seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+ seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+ if (f & EE_IS_TRIM) {
+ seq_putc(m, sep);
+ sep = '|';
+ if (f & EE_IS_TRIM_USE_ZEROOUT)
+ seq_puts(m, "zero-out");
+ else
+ seq_puts(m, "trim");
+ }
+ seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+ struct drbd_device *device, struct list_head *lh,
+ unsigned long now)
+{
+ bool reported_preparing = false;
+ struct drbd_peer_request *peer_req;
+ list_for_each_entry(peer_req, lh, w.list) {
+ if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+ continue;
+
+ if (device)
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+ seq_printf(m, "%llu\t%u\t%c\t%u\t",
+ (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+ (peer_req->flags & EE_WRITE) ? 'W' : 'R',
+ jiffies_to_msecs(now - peer_req->submit_jif));
+ seq_print_peer_request_flags(m, peer_req);
+ if (peer_req->flags & EE_SUBMITTED)
+ break;
+ else
+ reported_preparing = true;
+ }
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+ struct drbd_device *device, unsigned long now)
+{
+ seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+ spin_lock_irq(&device->resource->req_lock);
+ seq_print_peer_request(m, device, &device->active_ee, now);
+ seq_print_peer_request(m, device, &device->read_ee, now);
+ seq_print_peer_request(m, device, &device->sync_ee, now);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (test_bit(FLUSH_PENDING, &device->flags)) {
+ seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - device->flush_jif));
+ }
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+ struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_peer_requests(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
static void seq_print_resource_transfer_log_summary(struct seq_file *m,
struct drbd_resource *resource,
struct drbd_connection *connection,
@@ -180,6 +368,37 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
if (!connection || !kref_get_unless_zero(&connection->kref))
return -ESTALE;

+ seq_puts(m, "oldest bitmap IO\n");
+ seq_print_resource_pending_bitmap_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "meta data IO\n");
+ seq_print_resource_pending_meta_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "socket buffer stats\n");
+ /* for each connection ... once we have more than one */
+ rcu_read_lock();
+ if (connection->data.socket) {
+ /* open coded SIOCINQ, the "relevant" part */
+ struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+ int answ = tp->rcv_nxt - tp->copied_seq;
+ seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+ /* open coded SIOCOUTQ, the "relevant" part */
+ answ = tp->write_seq - tp->snd_una;
+ seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+ }
+ rcu_read_unlock();
+ seq_putc(m, '\n');
+
+ seq_puts(m, "oldest peer requests\n");
+ seq_print_resource_pending_peer_requests(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "application requests waiting for activity log\n");
+ seq_print_waiting_for_AL(m, resource, jif);
+ seq_putc(m, '\n');
+
seq_puts(m, "oldest application requests\n");
seq_print_resource_transfer_log_summary(m, resource, connection, jif);
seq_putc(m, '\n');
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 20f2b38..30ed430 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -522,6 +522,9 @@ enum {
DISCARD_MY_DATA, /* discard_my_data flag per volume */
READ_BALANCE_RR,

+ FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush
+ * from drbd_flush_after_epoch() */
+
/* cleared only after backing device related structures have been destroyed. */
GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 01de57e..9712bcc 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2982,24 +2982,29 @@ void drbd_free_ldev(struct drbd_backing_dev *ldev)
kfree(ldev);
}

-void drbd_free_sock(struct drbd_connection *connection)
+static void drbd_free_one_sock(struct drbd_socket *ds)
{
- if (connection->data.socket) {
- mutex_lock(&connection->data.mutex);
- kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
- sock_release(connection->data.socket);
- connection->data.socket = NULL;
- mutex_unlock(&connection->data.mutex);
- }
- if (connection->meta.socket) {
- mutex_lock(&connection->meta.mutex);
- kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
- sock_release(connection->meta.socket);
- connection->meta.socket = NULL;
- mutex_unlock(&connection->meta.mutex);
+ struct socket *s;
+ mutex_lock(&ds->mutex);
+ s = ds->socket;
+ ds->socket = NULL;
+ mutex_unlock(&ds->mutex);
+ if (s) {
+ /* so debugfs does not need to mutex_lock() */
+ synchronize_rcu();
+ kernel_sock_shutdown(s, SHUT_RDWR);
+ sock_release(s);
}
}

+void drbd_free_sock(struct drbd_connection *connection)
+{
+ if (connection->data.socket)
+ drbd_free_one_sock(&connection->data);
+ if (connection->meta.socket)
+ drbd_free_one_sock(&connection->meta);
+}
+
/* meta data management */

void conn_md_sync(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 42e3835..9f6ed24 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1187,8 +1187,16 @@ static void drbd_flush(struct drbd_connection *connection)
kref_get(&device->kref);
rcu_read_unlock();

+ /* Right now, we have only this one synchronous code path
+ * for flushes between request epochs.
+ * We may want to make those asynchronous,
+ * or at least parallelize the flushes to the volume devices.
+ */
+ device->flush_jif = jiffies;
+ set_bit(FLUSH_PENDING, &device->flags);
rv = blkdev_issue_flush(device->ldev->backing_bdev,
GFP_NOIO, NULL);
+ clear_bit(FLUSH_PENDING, &device->flags);
if (rv) {
drbd_info(device, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
--
1.9.1

2014-07-03 08:44:06

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 16/23] drbd: debugfs: add per volume oldest_requests

From: Lars Ellenberg <[email protected]>

Show oldest requests
* pending master bio completion and,
* if different, local disk bio completion.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 153 ++++++++++++++++++++++++++++++++++++--
lib/lru_cache.c | 21 +++---
2 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 230d9e1..45b5205 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -434,12 +434,10 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
goto out;
/* serialize with d_delete() */
mutex_lock(&parent->d_inode->i_mutex);
- if (!debugfs_positive(file->f_dentry))
- goto out_unlock;
/* Make sure the object is still alive */
- if (kref_get_unless_zero(kref))
+ if (debugfs_positive(file->f_dentry)
+ && kref_get_unless_zero(kref))
ret = 0;
-out_unlock:
mutex_unlock(&parent->d_inode->i_mutex);
if (!ret) {
ret = single_open(file, show, data);
@@ -592,6 +590,53 @@ static const struct file_operations connection_callback_history_fops = {
.release = callback_history_release,
};

+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ r1 = connection->req_next;
+ if (r1)
+ seq_print_minor_vnr_req(m, r1, now);
+ r2 = connection->req_ack_pending;
+ if (r2 && r2 != r1) {
+ r1 = r2;
+ seq_print_minor_vnr_req(m, r1, now);
+ }
+ r2 = connection->req_not_net_done;
+ if (r2 && r2 != r1)
+ seq_print_minor_vnr_req(m, r2, now);
+ spin_unlock_irq(&connection->resource->req_lock);
+ return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, connection_oldest_requests_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+ .owner = THIS_MODULE,
+ .open = connection_oldest_requests_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = connection_oldest_requests_release,
+};
+
void drbd_debugfs_connection_add(struct drbd_connection *connection)
{
struct dentry *conns_dir = connection->resource->debugfs_res_connections;
@@ -627,6 +672,89 @@ void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
drbd_debugfs_remove(&connection->debugfs_conn);
}

+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+ struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+ seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
+ test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+ test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+ test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+ );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->resync);
+ lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->act_log);
+ lc_seq_dump_details(m, device->act_log, "", NULL);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_resource *resource = device->resource;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+ int i;
+
+ seq_puts(m, RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ /* WRITE, then READ */
+ for (i = 1; i >= 0; --i) {
+ r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+ struct drbd_request, req_pending_master_completion);
+ r2 = list_first_entry_or_null(&device->pending_completion[i],
+ struct drbd_request, req_pending_local);
+ if (r1)
+ seq_print_one_request(m, r1, now);
+ if (r2 && r2 != r1)
+ seq_print_one_request(m, r2, now);
+ }
+ spin_unlock_irq(&resource->req_lock);
+ return 0;
+}
+
+#define drbd_debugfs_device_attr(name) \
+static int device_ ## name ## _open(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ return drbd_single_open(file, device_ ## name ## _show, device, \
+ &device->kref, drbd_destroy_device); \
+} \
+static int device_ ## name ## _release(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ kref_put(&device->kref, drbd_destroy_device); \
+ return single_release(inode, file); \
+} \
+static const struct file_operations device_ ## name ## _fops = { \
+ .owner = THIS_MODULE, \
+ .open = device_ ## name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = device_ ## name ## _release, \
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+
void drbd_debugfs_device_add(struct drbd_device *device)
{
struct dentry *vols_dir = device->resource->debugfs_res_volumes;
@@ -650,10 +778,25 @@ void drbd_debugfs_device_add(struct drbd_device *device)
if (!slink_name)
goto fail;
dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+ kfree(slink_name);
+ slink_name = NULL;
if (IS_ERR_OR_NULL(dentry))
goto fail;
device->debugfs_minor = dentry;
- kfree(slink_name);
+
+#define DCF(name) do { \
+ dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
+ device->debugfs_vol, device, \
+ &device_ ## name ## _fops); \
+ if (IS_ERR_OR_NULL(dentry)) \
+ goto fail; \
+ device->debugfs_vol_ ## name = dentry; \
+ } while (0)
+
+ DCF(oldest_requests);
+ DCF(act_log_extents);
+ DCF(resync_extents);
+ return;

fail:
drbd_debugfs_device_cleanup(device);
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 6111cd1..852c81e 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
* lc_dump - Dump a complete LRU cache to seq in textual form.
* @lc: the lru cache to operate on
* @seq: the &struct seq_file pointer to seq_printf into
- * @utext: user supplied "heading" or other info
+ * @utext: user supplied additional "heading" or other info
* @detail: function pointer the user may provide to dump further details
- * of the object the lc_element is embedded in.
+ * of the object the lc_element is embedded in. May be NULL.
+ * Note: a leading space ' ' and trailing newline '\n' is implied.
*/
void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
void (*detail) (struct seq_file *, struct lc_element *))
@@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
struct lc_element *e;
int i;

- seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+ seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext);
for (i = 0; i < nr_elements; i++) {
e = lc_element_by_index(lc, i);
- if (e->lc_number == LC_FREE) {
- seq_printf(seq, "\t%2d: FREE\n", i);
- } else {
- seq_printf(seq, "\t%2d: %4u %4u ", i,
- e->lc_number, e->refcnt);
+ if (e->lc_number != e->lc_new_number)
+ seq_printf(seq, "\t%5d: %6d %8d %6d ",
+ i, e->lc_number, e->lc_new_number, e->refcnt);
+ else
+ seq_printf(seq, "\t%5d: %6d %-8s %6d ",
+ i, e->lc_number, "-\"-", e->refcnt);
+ if (detail)
detail(seq, e);
- }
+ seq_putc(seq, '\n');
}
}

--
1.9.1

2014-07-03 08:44:01

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 15/23] drbd: debugfs: add callback_history

From: Lars Ellenberg <[email protected]>

Add a per-connection worker thread callback_history
with timing details, call site and callback function.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 78 ++++++++++++++++++++++++++++++++++++++
drivers/block/drbd/drbd_int.h | 26 +++++++++++++
drivers/block/drbd/drbd_receiver.c | 6 +++
drivers/block/drbd/drbd_worker.c | 37 ++++++++++++++++--
4 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 12d072d..230d9e1 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -521,6 +521,77 @@ void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
drbd_debugfs_remove(&resource->debugfs_res);
}

+static void seq_print_one_timing_detail(struct seq_file *m,
+ const struct drbd_thread_timing_details *tdp,
+ unsigned long now)
+{
+ struct drbd_thread_timing_details td;
+ /* No locking...
+ * use temporary assignment to get at consistent data. */
+ do {
+ td = *tdp;
+ } while (td.cb_nr != tdp->cb_nr);
+ if (!td.cb_addr)
+ return;
+ seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+ td.cb_nr,
+ jiffies_to_msecs(now - td.start_jif),
+ td.caller_fn, td.line,
+ td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+ const char *title,
+ unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+ unsigned int start_idx;
+ unsigned int i;
+
+ seq_printf(m, "%s\n", title);
+ /* If not much is going on, this will result in natural ordering.
+ * If it is very busy, we will possibly skip events, or even see wrap
+ * arounds, which could only be avoided with locking.
+ */
+ start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+ for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+ for (i = 0; i < start_idx; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long jif = jiffies;
+
+ seq_puts(m, "n\tage\tcallsite\tfn\n");
+ seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+ seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+ return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, callback_history_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+ .owner = THIS_MODULE,
+ .open = callback_history_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = callback_history_release,
+};
+
void drbd_debugfs_connection_add(struct drbd_connection *connection)
{
struct dentry *conns_dir = connection->resource->debugfs_res_connections;
@@ -535,6 +606,13 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn = dentry;
+
+ dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_callback_history_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_callback_history = dentry;
return;

fail:
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 30ed430..1a00001 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -697,6 +697,15 @@ struct drbd_resource {
cpumask_var_t cpu_mask;
};

+struct drbd_thread_timing_details
+{
+ unsigned long start_jif;
+ void *cb_addr;
+ const char *caller_fn;
+ unsigned int line;
+ unsigned int cb_nr;
+};
+
struct drbd_connection {
struct list_head connections;
struct drbd_resource *resource;
@@ -759,6 +768,12 @@ struct drbd_connection {
/* sender side */
struct drbd_work_queue sender_work;

+#define DRBD_THREAD_DETAILS_HIST 16
+ unsigned int w_cb_nr; /* keeps counting up */
+ unsigned int r_cb_nr; /* keeps counting up */
+ struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+ struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
struct {
/* whether this sender thread
* has processed a single write yet. */
@@ -774,6 +789,17 @@ struct drbd_connection {
} send;
};

+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+ __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+ __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
struct submit_worker {
struct workqueue_struct *wq;
struct work_struct worker;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9f6ed24..f972988 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2682,9 +2682,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
list_add_tail(&peer_req->w.list, &device->read_ee);
spin_unlock_irq(&device->resource->req_lock);

+ update_receiver_timing_details(connection, drbd_rs_should_slow_down);
if (device->state.peer != R_PRIMARY
&& drbd_rs_should_slow_down(device, sector, false))
schedule_timeout_uninterruptible(HZ/10);
+ update_receiver_timing_details(connection, drbd_rs_begin_io);
if (drbd_rs_begin_io(device, sector))
goto out_free_e;

@@ -2692,6 +2694,7 @@ submit_for_resync:
atomic_add(size >> 9, &device->rs_sect_ev);

submit:
+ update_receiver_timing_details(connection, drbd_submit_peer_request);
inc_unacked(device);
if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
return 0;
@@ -4601,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
struct data_cmd *cmd;

drbd_thread_current_set_cpu(&connection->receiver);
+ update_receiver_timing_details(connection, drbd_recv_header);
if (drbd_recv_header(connection, &pi))
goto err_out;

@@ -4619,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
}

if (shs) {
+ update_receiver_timing_details(connection, drbd_recv_all_warn);
err = drbd_recv_all_warn(connection, pi.data, shs);
if (err)
goto err_out;
pi.size -= shs;
}

+ update_receiver_timing_details(connection, cmd->fn);
err = cmd->fn(connection, &pi);
if (err) {
drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index b908e9b..50776b3 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1905,6 +1905,29 @@ static int do_md_sync(struct drbd_device *device)
return 0;
}

+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line)
+{
+ unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+ struct drbd_thread_timing_details *td = tdp + i;
+
+ td->start_jif = jiffies;
+ td->cb_addr = cb;
+ td->caller_fn = fn;
+ td->line = line;
+ td->cb_nr = *cb_nr;
+
+ i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+ td = tdp + i;
+ memset(td, 0, sizeof(*td));
+
+ ++(*cb_nr);
+}
+
#define WORK_PENDING(work_bit, todo) (todo & (1UL << work_bit))
static void do_device_work(struct drbd_device *device, const unsigned long todo)
{
@@ -2076,11 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);

- if (list_empty(&work_list))
+ if (list_empty(&work_list)) {
+ update_worker_timing_details(connection, wait_for_work);
wait_for_work(connection, &work_list);
+ }

- if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
do_unqueued_work(connection);
+ }

if (signal_pending(current)) {
flush_signals(current);
@@ -2097,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
while (!list_empty(&work_list)) {
w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
continue;
if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -2105,11 +2133,14 @@ int drbd_worker(struct drbd_thread *thi)
}

do {
- if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags))
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
do_unqueued_work(connection);
+ }
while (!list_empty(&work_list)) {
w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
w->cb(w, 1);
}
dequeue_work_batch(&connection->sender_work, &work_list);
--
1.9.1

2014-07-03 08:43:59

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 18/23] drbd: debugfs: add per connection oldest requests

From: Lars Ellenberg <[email protected]>

Information of former /sys/block/drbdX/drbd/oldest_requests
is already with higher detail in these files:
debugfs/drbd/resource/$name/in_flight_summary,
debugfs/drbd/resource/$name/volumes/$vnr/oldest_requests

This patch adds
debugfs/drbd/resource/$name/connections/peer/oldest_requests

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a4d0666..db70d6c 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -670,6 +670,13 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn_callback_history = dentry;
+
+ dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_oldest_requests_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_oldest_requests = dentry;
return;

fail:
--
1.9.1

2014-07-03 08:43:58

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 20/23] drbd: resync should only lock out specific ranges

From: Lars Ellenberg <[email protected]>

During resync, if we need to block some specific incoming write because
of active resync requests to that same range, we potentially caused
*all* new application writes (to "cold" activity log extents) to block
until this one request has been processed.

Improve the do_submit() logic to
* grab all incoming requests to some "incoming" list
* process this list
- move aside requests that are blocked by resync
- prepare activity log transactions,
- commit transactions and submit corresponding requests
- if there are remaining requests that only wait for
activity log extents to become free, stop the fast path
(mark activity log as "starving")
- iterate until no more requests are waiting for the activity log,
but all potentially remaining requests are only blocked by resync
* only then grab new incoming requests

That way, very busy IO on currently "hot" activity log extents cannot
starve scattered IO to "cold" extents. And blocked-by-resync requests
are processed once resync traffic on the affected region has ceased,
without blocking anything else.

The only blocking mode left is when we cannot start requests to "cold"
extents because all currently "hot" extents are actually used.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_actlog.c | 15 ++++-
drivers/block/drbd/drbd_req.c | 136 ++++++++++++++++++++++++---------------
2 files changed, 98 insertions(+), 53 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6ce5c76..e9fbcaf 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -357,8 +357,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
/* We want all necessary updates for a given request within the same transaction
* We could first check how many updates are *actually* needed,
* and use that instead of the worst-case nr_al_extents */
- if (available_update_slots < nr_al_extents)
- return -EWOULDBLOCK;
+ if (available_update_slots < nr_al_extents) {
+ /* Too many activity log extents are currently "hot".
+ *
+ * If we have accumulated pending changes already,
+ * we made progress.
+ *
+ * If we cannot get even a single pending change through,
+ * stop the fast path until we made some progress,
+ * or requests to "cold" extents could be starved. */
+ if (!al->pending_changes)
+ __set_bit(__LC_STARVING, &device->act_log->flags);
+ return -ENOBUFS;
+ }

/* Is resync active in this area? */
for (enr = first; enr <= last; enr++) {
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 74ebef1..c67717d 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1182,6 +1182,8 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
&device->pending_master_completion[1 /* WRITE */]);
spin_unlock_irq(&device->resource->req_lock);
queue_work(device->submit.wq, &device->submit.worker);
+ /* do_submit() may sleep internally on al_wait, too */
+ wake_up(&device->al_wait);
}

/* returns the new drbd_request pointer, if the caller is expected to
@@ -1365,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom

static bool prepare_al_transaction_nonblock(struct drbd_device *device,
struct list_head *incoming,
- struct list_head *pending)
+ struct list_head *pending,
+ struct list_head *later)
{
struct drbd_request *req, *tmp;
int wake = 0;
@@ -1374,44 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
spin_lock_irq(&device->al_lock);
list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
err = drbd_al_begin_io_nonblock(device, &req->i);
+ if (err == -ENOBUFS)
+ break;
if (err == -EBUSY)
wake = 1;
if (err)
- continue;
- list_move_tail(&req->tl_requests, pending);
+ list_move_tail(&req->tl_requests, later);
+ else
+ list_move_tail(&req->tl_requests, pending);
}
spin_unlock_irq(&device->al_lock);
if (wake)
wake_up(&device->al_wait);
-
return !list_empty(pending);
}

+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+
+ list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
+ }
+}
+
void do_submit(struct work_struct *ws)
{
struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
- LIST_HEAD(incoming);
- LIST_HEAD(pending);
- struct drbd_request *req, *tmp;
+ LIST_HEAD(incoming); /* from drbd_make_request() */
+ LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */
+ LIST_HEAD(busy); /* blocked by resync requests */
+
+ /* grab new incoming requests */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);

for (;;) {
- spin_lock_irq(&device->resource->req_lock);
- list_splice_tail_init(&device->submit.writes, &incoming);
- spin_unlock_irq(&device->resource->req_lock);
+ DEFINE_WAIT(wait);

+ /* move used-to-be-busy back to front of incoming */
+ list_splice_init(&busy, &incoming);
submit_fast_path(device, &incoming);
if (list_empty(&incoming))
break;

-skip_fast_path:
- wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
- /* Maybe more was queued, while we prepared the transaction?
- * Try to stuff them into this transaction as well.
- * Be strictly non-blocking here, no wait_event, we already
- * have something to commit.
- * Stop if we don't make any more progres.
- */
for (;;) {
+ prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ list_splice_init(&busy, &incoming);
+ prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+ if (!list_empty(&pending))
+ break;
+
+ schedule();
+
+ /* If all currently "hot" activity log extents are kept busy by
+ * incoming requests, we still must not totally starve new
+ * requests to "cold" extents.
+ * Something left on &incoming means there had not been
+ * enough update slots available, and the activity log
+ * has been marked as "starving".
+ *
+ * Try again now, without looking for new requests,
+ * effectively blocking all new requests until we made
+ * at least _some_ progress with what we currently have.
+ */
+ if (!list_empty(&incoming))
+ continue;
+
+ /* Nothing moved to pending, but nothing left
+ * on incoming: all moved to busy!
+ * Grab new and iterate. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->al_wait, &wait);
+
+ /* If the transaction was full, before all incoming requests
+ * had been processed, skip ahead to commit, and iterate
+ * without splicing in more incoming requests from upper layers.
+ *
+ * Else, if all incoming have been processed,
+ * they have become either "pending" (to be submitted after
+ * next transaction commit) or "busy" (blocked by resync).
+ *
+ * Maybe more was queued, while we prepared the transaction?
+ * Try to stuff those into this transaction as well.
+ * Be strictly non-blocking here,
+ * we already have something to commit.
+ *
+ * Commit if we don't make any more progres.
+ */
+
+ while (list_empty(&incoming)) {
LIST_HEAD(more_pending);
LIST_HEAD(more_incoming);
bool made_progress;
@@ -1428,46 +1492,16 @@ skip_fast_path:
if (list_empty(&more_incoming))
break;

- made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+ made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);

list_splice_tail_init(&more_pending, &pending);
list_splice_tail_init(&more_incoming, &incoming);
-
if (!made_progress)
break;
}
- drbd_al_begin_io_commit(device);

- list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
- req->rq_state |= RQ_IN_ACT_LOG;
- req->in_actlog_jif = jiffies;
- atomic_dec(&device->ap_actlog_cnt);
- list_del_init(&req->tl_requests);
- drbd_send_and_submit(device, req);
- }
-
- /* If all currently hot activity log extents are kept busy by
- * incoming requests, we still must not totally starve new
- * requests to cold extents. In that case, prepare one request
- * in blocking mode. */
- list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
- bool was_cold;
- list_del_init(&req->tl_requests);
- was_cold = drbd_al_begin_io_prepare(device, &req->i);
- if (!was_cold) {
- req->rq_state |= RQ_IN_ACT_LOG;
- req->in_actlog_jif = jiffies;
- atomic_dec(&device->ap_actlog_cnt);
- /* Corresponding extent was hot after all? */
- drbd_send_and_submit(device, req);
- } else {
- /* Found a request to a cold extent.
- * Put on "pending" list,
- * and try to cumulate with more. */
- list_add(&req->tl_requests, &pending);
- goto skip_fast_path;
- }
- }
+ drbd_al_begin_io_commit(device);
+ send_and_submit_pending(device, &pending);
}
}

--
1.9.1

2014-07-03 08:43:56

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 19/23] drbd: debugfs: add per device data_gen_id

From: Lars Ellenberg <[email protected]>

The data generation identifiers used to be exposed via sysfs
at /sys/block/drbdX/drbd/meta_data/data_gen_id (out-of-tree),
for advanced policy scripting.
Bring that information over to debugfs.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index db70d6c..5c20b18 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -760,6 +760,25 @@ static int device_oldest_requests_show(struct seq_file *m, void *ignored)
return 0;
}

+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_md *md;
+ enum drbd_uuid_index idx;
+
+ if (!get_ldev_if_state(device, D_FAILED))
+ return -ENODEV;
+
+ md = &device->ldev->md;
+ spin_lock_irq(&md->uuid_lock);
+ for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+ seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+ }
+ spin_unlock_irq(&md->uuid_lock);
+ put_ldev(device);
+ return 0;
+}
+
#define drbd_debugfs_device_attr(name) \
static int device_ ## name ## _open(struct inode *inode, struct file *file) \
{ \
@@ -784,6 +803,7 @@ static const struct file_operations device_ ## name ## _fops = { \
drbd_debugfs_device_attr(oldest_requests)
drbd_debugfs_device_attr(act_log_extents)
drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)

void drbd_debugfs_device_add(struct drbd_device *device)
{
@@ -826,6 +846,8 @@ void drbd_debugfs_device_add(struct drbd_device *device)
DCF(oldest_requests);
DCF(act_log_extents);
DCF(resync_extents);
+ DCF(data_gen_id);
+#undef DCF
return;

fail:
--
1.9.1

2014-07-03 08:43:55

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 21/23] drbd: drop spurious parameters from _drbd_md_sync_page_io

From: Lars Ellenberg <[email protected]>

size is always 4096,
page is always device->md_io.page.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_actlog.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index e9fbcaf..d26a3fa 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -137,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b

static int _drbd_md_sync_page_io(struct drbd_device *device,
struct drbd_backing_dev *bdev,
- struct page *page, sector_t sector,
- int rw, int size)
+ sector_t sector, int rw)
{
struct bio *bio;
+ /* we do all our meta data IO in aligned 4k blocks. */
+ const int size = 4096;
int err;

device->md_io.done = 0;
@@ -154,7 +155,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
bio->bi_bdev = bdev->md_bdev;
bio->bi_iter.bi_sector = sector;
err = -EIO;
- if (bio_add_page(bio, page, size, 0) != size)
+ if (bio_add_page(bio, device->md_io.page, size, 0) != size)
goto out;
bio->bi_private = device;
bio->bi_end_io = drbd_md_io_complete;
@@ -190,8 +191,6 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
sector_t sector, int rw)
{
int err;
- struct page *iop = device->md_io.page;
-
D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);

BUG_ON(!bdev->md_bdev);
@@ -207,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

- /* we do all our meta data IO in aligned 4k blocks. */
- err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+ err = _drbd_md_sync_page_io(device, bdev, sector, rw);
if (err) {
drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
--
1.9.1

2014-07-03 08:43:53

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 23/23] drbd: silence underflow warning in read_in_block()

From: Dan Carpenter <[email protected]>

My static checker warns that "data_size" could be negative and underflow
the limit check. The code looks suspicious but I don't know if it is a
real bug.

Signed-off-by: Dan Carpenter <[email protected]>

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_receiver.c | 2 +-
include/linux/drbd.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index f972988..9342b8d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1592,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
struct drbd_peer_request *peer_req;
struct page *page;
int dgs, ds, err;
- int data_size = pi->size;
+ unsigned int data_size = pi->size;
void *dig_in = peer_device->connection->int_dig_in;
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 20ec890..debb70d 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
#endif

extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.5"
#define API_VERSION 1
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 101
--
1.9.1

2014-07-03 08:43:52

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 22/23] drbd: implicitly truncate cpu-mask

From: Lars Ellenberg <[email protected]>

Don't error out with misleading "out of memory"
if the cpu-mask has more bits set than there are CPUs.
Just truncate to nr_cpu_ids implicitly.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_main.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 9712bcc..9b465bb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2541,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err == -EOVERFLOW) {
+ /* So what. mask it out. */
+ cpumask_var_t tmp_cpu_mask;
+ if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+ cpumask_setall(tmp_cpu_mask);
+ cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+ drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+ res_opts->cpu_mask,
+ strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+ nr_cpu_ids);
+ free_cpumask_var(tmp_cpu_mask);
+ err = 0;
+ }
+ }
if (err) {
drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
/* retcode = ERR_CPU_MASK_PARSE; */
--
1.9.1

2014-07-03 08:49:24

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 11/23] drbd: debugfs: add basic hierarchy

From: Lars Ellenberg <[email protected]>

Add new debugfs hierarchy /sys/kernel/debug/
drbd/
resources/
$resource_name/connections/peer/$volume_number/
$resource_name/volumes/$volume_number/
minors/$minor_number -> ../resources/$resource_name/volumes/$volume_number/

Followup commits will populate this hierarchy with files containing
statistics, diagnostic information and some attribute data.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/Makefile | 1 +
drivers/block/drbd/drbd_debugfs.c | 191 ++++++++++++++++++++++++++++++++++++++
drivers/block/drbd/drbd_debugfs.h | 39 ++++++++
drivers/block/drbd/drbd_int.h | 30 +++++-
drivers/block/drbd/drbd_main.c | 22 ++++-
5 files changed, 278 insertions(+), 5 deletions(-)
create mode 100644 drivers/block/drbd/drbd_debugfs.c
create mode 100644 drivers/block/drbd/drbd_debugfs.h

diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 8b45033..4464e35 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o

obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 0000000..9b120c3
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,191 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+ struct dentry *dentry;
+ if (!drbd_debugfs_resources)
+ return;
+
+ dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res = dentry;
+
+ dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_volumes = dentry;
+
+ dentry = debugfs_create_dir("connections", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_connections = dentry;
+
+ return;
+
+fail:
+ drbd_debugfs_resource_cleanup(resource);
+ drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+ debugfs_remove(*dp);
+ *dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+ /* it is ok to call debugfs_remove(NULL) */
+ drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+ drbd_debugfs_remove(&resource->debugfs_res_connections);
+ drbd_debugfs_remove(&resource->debugfs_res_volumes);
+ drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+ struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+ struct dentry *dentry;
+ if (!conns_dir)
+ return;
+
+ /* Once we enable mutliple peers,
+ * these connections will have descriptive names.
+ * For now, it is just the one connection to the (only) "peer". */
+ dentry = debugfs_create_dir("peer", conns_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn = dentry;
+ return;
+
+fail:
+ drbd_debugfs_connection_cleanup(connection);
+ drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+ drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+ drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+ drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+ struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+ char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+ char vnr_buf[8]; /* volume number vnr is even 16 bit only; */
+ char *slink_name = NULL;
+
+ struct dentry *dentry;
+ if (!vols_dir || !drbd_debugfs_minors)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, vols_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_vol = dentry;
+
+ snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+ slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+ device->resource->name, device->vnr);
+ if (!slink_name)
+ goto fail;
+ dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_minor = dentry;
+ kfree(slink_name);
+
+fail:
+ drbd_debugfs_device_cleanup(device);
+ drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+ drbd_debugfs_remove(&device->debugfs_minor);
+ drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+ drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+ drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+ struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+ struct dentry *dentry;
+ char vnr_buf[8];
+
+ if (!conn_dir)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, conn_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ peer_device->debugfs_peer_dev = dentry;
+ return;
+
+fail:
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+ drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+ drbd_debugfs_remove(&drbd_debugfs_resources);
+ drbd_debugfs_remove(&drbd_debugfs_minors);
+ drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_create_dir("drbd", NULL);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_root = dentry;
+
+ dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_resources = dentry;
+
+ dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_minors = dentry;
+ return 0;
+
+fail:
+ drbd_debugfs_cleanup();
+ if (dentry)
+ return PTR_ERR(dentry);
+ else
+ return -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 0000000..8bee213
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 40c816c..20f2b38 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -670,6 +670,12 @@ enum {

struct drbd_resource {
char *name;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_res;
+ struct dentry *debugfs_res_volumes;
+ struct dentry *debugfs_res_connections;
+ struct dentry *debugfs_res_in_flight_summary;
+#endif
struct kref kref;
struct idr devices; /* volume number to device mapping */
struct list_head connections;
@@ -691,6 +697,11 @@ struct drbd_resource {
struct drbd_connection {
struct list_head connections;
struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_conn;
+ struct dentry *debugfs_conn_callback_history;
+ struct dentry *debugfs_conn_oldest_requests;
+#endif
struct kref kref;
struct idr peer_devices; /* volume number to peer device mapping */
enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -772,13 +783,29 @@ struct drbd_peer_device {
struct list_head peer_devices;
struct drbd_device *device;
struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_peer_dev;
+#endif
};

struct drbd_device {
struct drbd_resource *resource;
struct list_head peer_devices;
struct list_head pending_bitmap_io;
- int vnr; /* volume number within the connection */
+
+ unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_minor;
+ struct dentry *debugfs_vol;
+ struct dentry *debugfs_vol_oldest_requests;
+ struct dentry *debugfs_vol_act_log_extents;
+ struct dentry *debugfs_vol_resync_extents;
+ struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+ unsigned int vnr; /* volume number within the connection */
+ unsigned int minor; /* device minor number */
+
struct kref kref;

/* things that are stored as / read from meta data on disk */
@@ -895,7 +922,6 @@ struct drbd_device {
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
- unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 922d631..01de57e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -57,8 +57,8 @@
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
-
#include "drbd_vli.h"
+#include "drbd_debugfs.h"

static DEFINE_MUTEX(drbd_main_mutex);
static int drbd_open(struct block_device *bdev, fmode_t mode);
@@ -2308,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)

for_each_connection_safe(connection, tmp, resource) {
list_del(&connection->connections);
+ drbd_debugfs_connection_cleanup(connection);
kref_put(&connection->kref, drbd_destroy_connection);
}
+ drbd_debugfs_resource_cleanup(resource);
kref_put(&resource->kref, drbd_destroy_resource);
}

@@ -2334,6 +2336,7 @@ static void drbd_cleanup(void)
destroy_workqueue(retry.wq);

drbd_genl_unregister();
+ drbd_debugfs_cleanup();

idr_for_each_entry(&drbd_devices, device, i)
drbd_delete_device(device);
@@ -2583,6 +2586,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
mutex_init(&resource->conf_update);
mutex_init(&resource->adm_mutex);
spin_lock_init(&resource->req_lock);
+ drbd_debugfs_resource_add(resource);
return resource;

fail_free_name:
@@ -2593,7 +2597,7 @@ fail:
return NULL;
}

-/* caller must be under genl_lock() */
+/* caller must be under adm_mutex */
struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
{
struct drbd_resource *resource;
@@ -2651,6 +2655,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)

kref_get(&resource->kref);
list_add_tail_rcu(&connection->connections, &resource->connections);
+ drbd_debugfs_connection_add(connection);
return connection;

fail_resource:
@@ -2829,7 +2834,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
for_each_peer_device(peer_device, device)
drbd_connected(peer_device);
}
-
+ /* move to create_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_add(peer_device);
+ drbd_debugfs_device_add(device);
return NO_ERROR;

out_idr_remove_vol:
@@ -2868,8 +2876,13 @@ void drbd_delete_device(struct drbd_device *device)
{
struct drbd_resource *resource = device->resource;
struct drbd_connection *connection;
+ struct drbd_peer_device *peer_device;
int refs = 3;

+ /* move to free_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_debugfs_device_cleanup(device);
for_each_connection(connection, resource) {
idr_remove(&connection->peer_devices, device->vnr);
refs++;
@@ -2938,6 +2951,9 @@ static int __init drbd_init(void)
spin_lock_init(&retry.lock);
INIT_LIST_HEAD(&retry.writes);

+ if (drbd_debugfs_init())
+ pr_notice("failed to initialize debugfs -- will not be available\n");
+
pr_info("initialized. "
"Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
--
1.9.1

2014-07-03 08:49:23

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 13/23] drbd: debugfs: deal with destructor racing with open of debugfs file

From: Lars Ellenberg <[email protected]>

Try to close the race between open() and debugfs_remove_recursive()
from inside an object destructor.
Once open succeeds, the object should stay around.
Open should not succeed if the object has already reached its destructor.

This may be overkill, but to make that happen, we check for existence of
a parent directory, "stale-ness" of "this" dentry, and serialize
kref_get_unless_zero() on the outermost object relevant for this file
with d_delete() on this dentry (using the parent's i_mutex).

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_debugfs.c | 58 ++++++++++++++++++++++++++++++++++++---
1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index d393aee..51c64ec 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -177,8 +177,8 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
connection = first_connection(resource);
/* This does not happen, actually.
* But be robust and prepare for future code changes. */
- if (!connection)
- return 0;
+ if (!connection || !kref_get_unless_zero(&connection->kref))
+ return -ESTALE;

seq_puts(m, "oldest application requests\n");
seq_print_resource_transfer_log_summary(m, resource, connection, jif);
@@ -187,12 +187,62 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
jif = jiffies - jif;
if (jif)
seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+ kref_put(&connection->kref, drbd_destroy_connection);
return 0;
}

+/* simple_positive(file->f_dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it above. :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+ return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+ void *data, struct kref *kref,
+ void (*release)(struct kref *))
+{
+ struct dentry *parent;
+ int ret = -ESTALE;
+
+ /* Are we still linked,
+ * or has debugfs_remove() already been called? */
+ parent = file->f_dentry->d_parent;
+ /* not sure if this can happen: */
+ if (!parent || !parent->d_inode)
+ goto out;
+ /* serialize with d_delete() */
+ mutex_lock(&parent->d_inode->i_mutex);
+ if (!debugfs_positive(file->f_dentry))
+ goto out_unlock;
+ /* Make sure the object is still alive */
+ if (kref_get_unless_zero(kref))
+ ret = 0;
+out_unlock:
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (!ret) {
+ ret = single_open(file, show, data);
+ if (ret)
+ kref_put(kref, release);
+ }
+out:
+ return ret;
+}
+
static int in_flight_summary_open(struct inode *inode, struct file *file)
{
- return single_open(file, in_flight_summary_show, inode->i_private);
+ struct drbd_resource *resource = inode->i_private;
+ return drbd_single_open(file, in_flight_summary_show, resource,
+ &resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+ struct drbd_resource *resource = inode->i_private;
+ kref_put(&resource->kref, drbd_destroy_resource);
+ return single_release(inode, file);
}

static const struct file_operations in_flight_summary_fops = {
@@ -200,7 +250,7 @@ static const struct file_operations in_flight_summary_fops = {
.open = in_flight_summary_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = in_flight_summary_release,
};

void drbd_debugfs_resource_add(struct drbd_resource *resource)
--
1.9.1

2014-07-03 08:50:25

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 06/23] drbd: add caching oldest request pointers for replication stages

From: Lars Ellenberg <[email protected]>

A request that is to be shipped to the peer goes through a few stages:
- queued
- sent, waiting for ack
- ack received, waiting for "barrier ack", which is re-order epoch being
closed on the peer by acknowledging a "cache flush" equivalent
on the lower level device.

In the later two stages, depending on protocol, we may have already
completed this request to the upper layers, so it won't be found anymore
on device->pending_master_completion[] lists.

Track the oldest request yet to be sent (req_next), the oldest not yet
acknowledged (req_ack_pending) and the oldest "still waiting for
something from the peer" (req_not_net_done), doing short list walks on
the transfer log to find the next pending one whenever such a request
makes progress.

Now we have a fast way to look up the oldest requests,
don't do a transfer log walk every time.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_int.h | 7 ++
drivers/block/drbd/drbd_req.c | 169 ++++++++++++++++++++++++++++++++----------
2 files changed, 136 insertions(+), 40 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f29f107..fa010ea 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -720,6 +720,13 @@ struct drbd_connection {
struct drbd_thread worker;
struct drbd_thread asender;

+ /* cached pointers,
+ * so we can look up the oldest pending requests more quickly.
+ * protected by resource->req_lock */
+ struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+ struct drbd_request *req_ack_pending;
+ struct drbd_request *req_not_net_done;
+
/* sender side */
struct drbd_work_queue sender_work;

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 23cd909..3f6a6ed 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -345,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
return 1;
}

+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next == NULL)
+ connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if (s & RQ_NET_QUEUED)
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending == NULL)
+ connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done == NULL)
+ connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_not_net_done = req;
+}
+
/* I'd like this to be the only place that manipulates
* req->completion_ref and req->kref. */
static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
int clear, int set)
{
struct drbd_device *device = req->device;
+ struct drbd_peer_device *peer_device = first_peer_device(device);
unsigned s = req->rq_state;
int c_put = 0;
int k_put = 0;
@@ -379,6 +458,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,

if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
atomic_inc(&req->completion_ref);
+ set_if_null_req_next(peer_device, req);
}

if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
@@ -386,8 +466,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,

if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
/* potentially already completed in the asender thread */
- if (!(s & RQ_NET_DONE))
+ if (!(s & RQ_NET_DONE)) {
atomic_add(req->i.size >> 9, &device->ap_in_flight);
+ set_if_null_req_not_net_done(peer_device, req);
+ }
+ if (s & RQ_NET_PENDING)
+ set_if_null_req_ack_pending(peer_device, req);
}

if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
@@ -418,10 +502,13 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
dec_ap_pending(device);
++c_put;
req->acked_jif = jiffies;
+ advance_conn_req_ack_pending(peer_device, req);
}

- if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+ if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
++c_put;
+ advance_conn_req_next(peer_device, req);
+ }

if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
if (s & RQ_NET_SENT)
@@ -429,6 +516,13 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
if (s & RQ_EXP_BARR_ACK)
++k_put;
req->net_done_jif = jiffies;
+
+ /* in ahead/behind mode, or just in case,
+ * before we finally destroy this request,
+ * the caching pointers must not reference it anymore */
+ advance_conn_req_next(peer_device, req);
+ advance_conn_req_ack_pending(peer_device, req);
+ advance_conn_req_not_net_done(peer_device, req);
}

/* potentially complete and destroy */
@@ -1423,36 +1517,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
return limit;
}

-static void find_oldest_requests(
- struct drbd_connection *connection,
- struct drbd_device *device,
- struct drbd_request **oldest_req_waiting_for_peer,
- struct drbd_request **oldest_req_waiting_for_disk)
-{
- struct drbd_request *r;
- *oldest_req_waiting_for_peer = NULL;
- *oldest_req_waiting_for_disk = NULL;
- list_for_each_entry(r, &connection->transfer_log, tl_requests) {
- const unsigned s = r->rq_state;
- if (!*oldest_req_waiting_for_peer
- && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
- *oldest_req_waiting_for_peer = r;
-
- if (!*oldest_req_waiting_for_disk
- && (s & RQ_LOCAL_PENDING) && r->device == device)
- *oldest_req_waiting_for_disk = r;
-
- if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
- break;
- }
-}
-
void request_timer_fn(unsigned long data)
{
struct drbd_device *device = (struct drbd_device *) data;
struct drbd_connection *connection = first_peer_device(device)->connection;
- struct drbd_request *req_disk, *req_peer; /* oldest request */
+ struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
struct net_conf *nc;
+ unsigned long oldest_submit_jif;
unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
unsigned long now;

@@ -1473,14 +1544,31 @@ void request_timer_fn(unsigned long data)
return; /* Recurring timer stopped */

now = jiffies;
+ nt = now + et;

spin_lock_irq(&device->resource->req_lock);
- find_oldest_requests(connection, device, &req_peer, &req_disk);
- if (req_peer == NULL && req_disk == NULL) {
- spin_unlock_irq(&device->resource->req_lock);
- mod_timer(&device->request_timer, now + et);
- return;
- }
+ req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+ req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+ req_peer = connection->req_not_net_done;
+ /* maybe the oldest request waiting for the peer is in fact still
+ * blocking in tcp sendmsg */
+ if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+ req_peer = connection->req_next;
+
+ /* evaluate the oldest peer request only in one timer! */
+ if (req_peer && req_peer->device != device)
+ req_peer = NULL;
+
+ /* do we have something to evaluate? */
+ if (req_peer == NULL && req_write == NULL && req_read == NULL)
+ goto out;
+
+ oldest_submit_jif =
+ (req_write && req_read)
+ ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+ ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+ : req_write ? req_write->pre_submit_jif
+ : req_read ? req_read->pre_submit_jif : now;

/* The request is considered timed out, if
* - we have some effective timeout from the configuration,
@@ -1499,13 +1587,13 @@ void request_timer_fn(unsigned long data)
* to expire twice (worst case) to become effective. Good enough.
*/
if (ent && req_peer &&
- time_after(now, req_peer->start_jif + ent) &&
+ time_after(now, req_peer->pre_send_jif + ent) &&
!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
}
- if (dt && req_disk &&
- time_after(now, req_disk->start_jif + dt) &&
+ if (dt && oldest_submit_jif != now &&
+ time_after(now, oldest_submit_jif + dt) &&
!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1513,11 +1601,12 @@ void request_timer_fn(unsigned long data)

/* Reschedule timer for the nearest not already expired timeout.
* Fallback to now + min(effective network timeout, disk timeout). */
- ent = (ent && req_peer && time_before(now, req_peer->start_jif + ent))
- ? req_peer->start_jif + ent : now + et;
- dt = (dt && req_disk && time_before(now, req_disk->start_jif + dt))
- ? req_disk->start_jif + dt : now + et;
+ ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+ ? req_peer->pre_send_jif + ent : now + et;
+ dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+ ? oldest_submit_jif + dt : now + et;
nt = time_before(ent, dt) ? ent : dt;
+out:
spin_unlock_irq(&connection->resource->req_lock);
mod_timer(&device->request_timer, nt);
}
--
1.9.1

2014-07-03 08:50:27

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 05/23] drbd: add lists to find oldest pending requests

From: Lars Ellenberg <[email protected]>

Adding requests to per-device fifo lists as soon as possible after
allocating them leaves a simple list_first_entry_or_null() to find the
oldest request, regardless what it is still waiting for.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_int.h | 11 ++++++++++-
drivers/block/drbd/drbd_main.c | 7 ++++++-
drivers/block/drbd/drbd_req.c | 45 +++++++++++++++++++++++++++++++-----------
3 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 08fa2dc..f29f107 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -318,6 +318,10 @@ struct drbd_request {
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */

+ /* see struct drbd_device */
+ struct list_head req_pending_master_completion;
+ struct list_head req_pending_local;
+
/* for generic IO accounting */
unsigned long start_jif;

@@ -738,7 +742,7 @@ struct submit_worker {
struct workqueue_struct *wq;
struct work_struct worker;

- spinlock_t lock;
+ /* protected by ..->resource->req_lock */
struct list_head writes;
};

@@ -795,6 +799,11 @@ struct drbd_device {
struct rb_root read_requests;
struct rb_root write_requests;

+ /* for statistics and timeouts */
+ /* [0] read, [1] write */
+ struct list_head pending_master_completion[2];
+ struct list_head pending_completion[2];
+
/* use checksums for *this* resync */
bool use_csums;
/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0baec7a..5886596 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1934,6 +1934,10 @@ void drbd_init_set_defaults(struct drbd_device *device)
INIT_LIST_HEAD(&device->resync_work.list);
INIT_LIST_HEAD(&device->unplug_work.list);
INIT_LIST_HEAD(&device->bm_io_work.w.list);
+ INIT_LIST_HEAD(&device->pending_master_completion[0]);
+ INIT_LIST_HEAD(&device->pending_master_completion[1]);
+ INIT_LIST_HEAD(&device->pending_completion[0]);
+ INIT_LIST_HEAD(&device->pending_completion[1]);

device->resync_work.cb = w_resync_timer;
device->unplug_work.cb = w_send_write_hint;
@@ -2268,6 +2272,8 @@ static void do_retry(struct work_struct *ws)
}
}

+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
void drbd_restart_request(struct drbd_request *req)
{
unsigned long flags;
@@ -2687,7 +2693,6 @@ static int init_submitter(struct drbd_device *device)
return -ENOMEM;

INIT_WORK(&device->submit.worker, do_submit);
- spin_lock_init(&device->submit.lock);
INIT_LIST_HEAD(&device->submit.writes);
return 0;
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 1319bea..23cd909 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,

INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
+ INIT_LIST_HEAD(&req->req_pending_master_completion);
+ INIT_LIST_HEAD(&req->req_pending_local);

/* one reference to be put by __drbd_make_request */
atomic_set(&req->completion_ref, 1);
@@ -120,12 +122,14 @@ void drbd_req_destroy(struct kref *kref)
return;
}

- /* remove it from the transfer log.
- * well, only if it had been there in the first
- * place... if it had not (local only or conflicting
- * and never sent), it should still be "empty" as
- * initialized in drbd_req_new(), so we can list_del() it
- * here unconditionally */
+ /* If called from mod_rq_state (expected normal case) or
+ * drbd_send_and_submit (the less likely normal path), this holds the
+ * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+ * though it may be still empty (never added to the transfer log).
+ *
+ * If called from do_retry(), we do NOT hold the req_lock, but we are
+ * still allowed to unconditionally list_del(&req->tl_requests),
+ * because it will be on a local on-stack list only. */
list_del_init(&req->tl_requests);

/* finally remove the request from the conflict detection
@@ -312,8 +316,15 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)

if (req->i.waiting)
wake_up(&device->misc_wait);
+
+ /* Either we are about to complete to upper layers,
+ * or we will restart this request.
+ * In either case, the request object will be destroyed soon,
+ * so better remove it from all lists. */
+ list_del_init(&req->req_pending_master_completion);
}

+/* still holds resource->req_lock */
static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
{
struct drbd_device *device = req->device;
@@ -400,6 +411,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
++k_put;
else
++c_put;
+ list_del_init(&req->req_pending_local);
}

if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
@@ -1070,9 +1082,11 @@ drbd_submit_req_private_bio(struct drbd_request *req)

static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
{
- spin_lock(&device->submit.lock);
+ spin_lock_irq(&device->resource->req_lock);
list_add_tail(&req->tl_requests, &device->submit.writes);
- spin_unlock(&device->submit.lock);
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[1 /* WRITE */]);
+ spin_unlock_irq(&device->resource->req_lock);
queue_work(device->submit.wq, &device->submit.worker);
}

@@ -1186,8 +1200,15 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
no_remote = true;
}

+ /* If it took the fast path in drbd_request_prepare, add it here.
+ * The slow path has added it already. */
+ if (list_empty(&req->req_pending_master_completion))
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[rw == WRITE]);
if (req->private_bio) {
/* needs to be marked within the same spinlock */
+ list_add_tail(&req->req_pending_local,
+ &device->pending_completion[rw == WRITE]);
_req_mod(req, TO_BE_SUBMITTED);
/* but we need to give up the spinlock to submit */
submit_private_bio = true;
@@ -1278,9 +1299,9 @@ void do_submit(struct work_struct *ws)
struct drbd_request *req, *tmp;

for (;;) {
- spin_lock(&device->submit.lock);
+ spin_lock_irq(&device->resource->req_lock);
list_splice_tail_init(&device->submit.writes, &incoming);
- spin_unlock(&device->submit.lock);
+ spin_unlock_irq(&device->resource->req_lock);

submit_fast_path(device, &incoming);
if (list_empty(&incoming))
@@ -1304,9 +1325,9 @@ skip_fast_path:
if (list_empty(&device->submit.writes))
break;

- spin_lock(&device->submit.lock);
+ spin_lock_irq(&device->resource->req_lock);
list_splice_tail_init(&device->submit.writes, &more_incoming);
- spin_unlock(&device->submit.lock);
+ spin_unlock_irq(&device->resource->req_lock);

if (list_empty(&more_incoming))
break;
--
1.9.1

2014-07-03 08:50:23

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 07/23] drbd: improve throttling decisions of background resynchronisation

From: Lars Ellenberg <[email protected]>

Background resynchronisation does some "side-stepping", or throttles
itself, if it detects application IO activity, and the current resync
rate estimate is above the configured "cmin-rate".

What was not detected: if there is no application IO,
because it blocks on activity log transactions.

Introduce a new atomic_t ap_actlog_cnt, tracking such blocked requests,
and count non-zero as application IO activity.
This counter is exposed at proc_details level 2 and above.

Also make sure to release the currently locked resync extent
if we side-step due to such voluntary throttling.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_actlog.c | 29 ++++++++++++++++++++++++++---
drivers/block/drbd/drbd_int.h | 4 +++-
drivers/block/drbd/drbd_main.c | 1 +
drivers/block/drbd/drbd_proc.c | 3 +++
drivers/block/drbd/drbd_receiver.c | 19 ++++++++++++-------
drivers/block/drbd/drbd_req.c | 4 ++++
drivers/block/drbd/drbd_worker.c | 9 ++-------
7 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d7e8066..6ce5c76 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -991,6 +991,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
struct lc_element *e;
struct bm_extent *bm_ext;
int i;
+ bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+ /* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+ * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+ * need to throttle. There is at most one such half-locked extent,
+ * which is remembered in resync_wenr. */
+
+ if (throttle && device->resync_wenr != enr)
+ return -EAGAIN;

spin_lock_irq(&device->al_lock);
if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1014,8 +1023,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
clear_bit(BME_NO_WRITES, &bm_ext->flags);
device->resync_wenr = LC_FREE;
- if (lc_put(device->resync, &bm_ext->lce) == 0)
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
device->resync_locked--;
+ }
wake_up(&device->al_wait);
} else {
drbd_alert(device, "LOGIC BUG\n");
@@ -1077,8 +1088,20 @@ proceed:
return 0;

try_again:
- if (bm_ext)
- device->resync_wenr = enr;
+ if (bm_ext) {
+ if (throttle) {
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
+ device->resync_locked--;
+ }
+ wake_up(&device->al_wait);
+ } else
+ device->resync_wenr = enr;
+ }
spin_unlock_irq(&device->al_lock);
return -EAGAIN;
}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index fa010ea..81f4af4 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -797,6 +797,7 @@ struct drbd_device {
unsigned int al_writ_cnt;
unsigned int bm_writ_cnt;
atomic_t ap_bio_cnt; /* Requests we need to complete */
+ atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
atomic_t unacked_cnt; /* Need to send replies for */
@@ -1454,7 +1455,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_asender(struct drbd_thread *thi);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting);
extern int drbd_submit_peer_request(struct drbd_device *,
struct drbd_peer_request *, const unsigned,
const int);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 5886596..ad7c0e8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1909,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
drbd_set_defaults(device);

atomic_set(&device->ap_bio_cnt, 0);
+ atomic_set(&device->ap_actlog_cnt, 0);
atomic_set(&device->ap_pending_cnt, 0);
atomic_set(&device->rs_pending_cnt, 0);
atomic_set(&device->unacked_cnt, 0);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 9059d7b..06e6147 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -335,6 +335,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
lc_seq_printf_stats(seq, device->act_log);
put_ldev(device);
}
+
+ if (proc_details >= 2)
+ seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
}
rcu_read_unlock();

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 7a1078d..0d3cbd8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2417,13 +2417,14 @@ out_interrupted:
* The current sync rate used here uses only the most recent two step marks,
* to have a short time average so we can react faster.
*/
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting)
{
struct lc_element *tmp;
- bool throttle = true;
+ bool throttle = drbd_rs_c_min_rate_throttle(device);

- if (!drbd_rs_c_min_rate_throttle(device))
- return false;
+ if (!throttle || throttle_if_app_is_waiting)
+ return throttle;

spin_lock_irq(&device->al_lock);
tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2431,7 +2432,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
if (test_bit(BME_PRIORITY, &bm_ext->flags))
throttle = false;
- /* Do not slow down if app IO is already waiting for this extent */
+ /* Do not slow down if app IO is already waiting for this extent,
+ * and our progress is necessary for application IO to complete. */
}
spin_unlock_irq(&device->al_lock);

@@ -2456,7 +2458,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
(int)part_stat_read(&disk->part0, sectors[1]) -
atomic_read(&device->rs_sect_ev);
- if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
+
+ if (atomic_read(&device->ap_actlog_cnt)
+ || !device->rs_last_events || curr_events - device->rs_last_events > 64) {
unsigned long rs_left;
int i;

@@ -2646,7 +2650,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
* we would also throttle its application reads.
* In that case, throttling is done on the SyncTarget only.
*/
- if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+ if (device->state.peer != R_PRIMARY
+ && drbd_rs_should_slow_down(device, sector, false))
schedule_timeout_uninterruptible(HZ/10);
if (drbd_rs_begin_io(device, sector))
goto out_free_e;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3f6a6ed..74ebef1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1218,6 +1218,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &device->flags)) {
if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+ atomic_inc(&device->ap_actlog_cnt);
drbd_queue_write(device, req);
return NULL;
}
@@ -1354,6 +1355,7 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom

req->rq_state |= RQ_IN_ACT_LOG;
req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
}

list_del_init(&req->tl_requests);
@@ -1439,6 +1441,7 @@ skip_fast_path:
list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
req->rq_state |= RQ_IN_ACT_LOG;
req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
list_del_init(&req->tl_requests);
drbd_send_and_submit(device, req);
}
@@ -1454,6 +1457,7 @@ skip_fast_path:
if (!was_cold) {
req->rq_state |= RQ_IN_ACT_LOG;
req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
/* Corresponding extent was hot after all? */
drbd_send_and_submit(device, req);
} else {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0ff8f46..48975a2 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -395,9 +395,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
if (!get_ldev(device))
return -EIO;

- if (drbd_rs_should_slow_down(device, sector))
- goto defer;
-
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -651,8 +648,7 @@ next_sector:

sector = BM_BIT_TO_SECT(bit);

- if (drbd_rs_should_slow_down(device, sector) ||
- drbd_try_rs_begin_io(device, sector)) {
+ if (drbd_try_rs_begin_io(device, sector)) {
device->bm_resync_fo = bit;
goto requeue;
}
@@ -783,8 +779,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)

size = BM_BLOCK_SIZE;

- if (drbd_rs_should_slow_down(device, sector) ||
- drbd_try_rs_begin_io(device, sector)) {
+ if (drbd_try_rs_begin_io(device, sector)) {
device->ov_position = sector;
goto requeue;
}
--
1.9.1

2014-07-03 08:50:21

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 08/23] drbd: track timing details of peer_requests

From: Lars Ellenberg <[email protected]>

To be able to present timing details in debugfs,
we need to track preparation/submit times of peer requests.

Track peer request flags early,
before they are put on the epoch_entry lists.

Waiting for activity log transactions may be a major latency factor.
We want to be able to present the peer_request state accurately in
debugfs, and what it is waiting for.

Consistently mark/unmark peer requests with EE_CALL_AL_COMPLETE_IO.
Set it only *after* calling drbd_al_begin_io(),
clear it as soon as we call drbd_al_complete_io().

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_int.h | 15 ++++++++
drivers/block/drbd/drbd_receiver.c | 78 +++++++++++++++++++++++---------------
drivers/block/drbd/drbd_worker.c | 1 +
3 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 81f4af4..c7a409f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -422,6 +422,7 @@ struct drbd_peer_request {
struct drbd_interval i;
/* see comments on ee flag bits below */
unsigned long flags;
+ unsigned long submit_jif;
union {
u64 block_id;
struct digest_info *digest;
@@ -464,6 +465,17 @@ enum {

/* Is set when net_conf had two_primaries set while creating this peer_req */
__EE_IN_INTERVAL_TREE,
+
+ /* for debugfs: */
+ /* has this been submitted, or does it still wait for something else? */
+ __EE_SUBMITTED,
+
+ /* this is/was a write request */
+ __EE_WRITE,
+
+ /* this originates from application on peer
+ * (not some resync or verify or other DRBD internal request) */
+ __EE_APPLICATION,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
@@ -475,6 +487,9 @@ enum {
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED (1<<__EE_SUBMITTED)
+#define EE_WRITE (1<<__EE_WRITE)
+#define EE_APPLICATION (1<<__EE_APPLICATION)

/* flag bits per device */
enum {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 0d3cbd8..2f67dc0 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -389,11 +389,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
int is_net)
{
+ might_sleep();
if (peer_req->flags & EE_HAS_DIGEST)
kfree(peer_req->digest);
drbd_free_pages(device, peer_req->pages, is_net);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+ if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+ drbd_al_complete_io(device, &peer_req->i);
+ }
mempool_free(peer_req, drbd_ee_mempool);
}

@@ -1372,6 +1377,8 @@ int drbd_submit_peer_request(struct drbd_device *device,
conn_wait_active_ee_empty(first_peer_device(device)->connection);
/* add it to the active list now,
* so we can find it to present it in debugfs */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
@@ -1443,6 +1450,9 @@ submit:
D_ASSERT(device, page == NULL);

atomic_set(&peer_req->pending_bios, n_bios);
+ /* for debugfs: update timestamp, mark as submitted */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
do {
bio = bios;
bios = bios->bi_next;
@@ -1624,6 +1634,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
if (!peer_req)
return NULL;

+ peer_req->flags |= EE_WRITE;
if (trim)
return peer_req;

@@ -1780,6 +1791,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
* respective _drbd_clear_done_ee */

peer_req->w.cb = e_end_resync_block;
+ peer_req->submit_jif = jiffies;

spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->sync_ee);
@@ -2196,7 +2208,6 @@ static int handle_write_conflicts(struct drbd_device *device,
(unsigned long long)sector, size,
superseded ? "local" : "remote");

- inc_unacked(device);
peer_req->w.cb = superseded ? e_send_superseded :
e_send_retry_write;
list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2255,6 +2266,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
{
struct drbd_peer_device *peer_device;
struct drbd_device *device;
+ struct net_conf *nc;
sector_t sector;
struct drbd_peer_request *peer_req;
struct p_data *p = pi->data;
@@ -2294,6 +2306,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
}

peer_req->w.cb = e_end_block;
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_APPLICATION;

dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(dp_flags);
@@ -2320,9 +2334,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
spin_unlock(&connection->epoch_lock);

rcu_read_lock();
- tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ tp = nc->two_primaries;
+ if (peer_device->connection->agreed_pro_version < 100) {
+ switch (nc->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
+ }
+ }
rcu_read_unlock();
+
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
+ inc_unacked(device);
+ /* corresponding dec_unacked() in e_end_block()
+ * respective _drbd_clear_done_ee */
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
+ /* I really don't like it that the receiver thread
+ * sends on the msock, but anyways */
+ drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+ }
+
if (tp) {
+ /* two primaries implies protocol C */
+ D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
peer_req->flags |= EE_IN_INTERVAL_TREE;
err = wait_for_and_update_peer_seq(peer_device, peer_seq);
if (err)
@@ -2352,38 +2393,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
if (device->state.conn == C_SYNC_TARGET)
wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));

- if (peer_device->connection->agreed_pro_version < 100) {
- rcu_read_lock();
- switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
- case DRBD_PROT_C:
- dp_flags |= DP_SEND_WRITE_ACK;
- break;
- case DRBD_PROT_B:
- dp_flags |= DP_SEND_RECEIVE_ACK;
- break;
- }
- rcu_read_unlock();
- }
-
- if (dp_flags & DP_SEND_WRITE_ACK) {
- peer_req->flags |= EE_SEND_WRITE_ACK;
- inc_unacked(device);
- /* corresponding dec_unacked() in e_end_block()
- * respective _drbd_clear_done_ee */
- }
-
- if (dp_flags & DP_SEND_RECEIVE_ACK) {
- /* I really don't like it that the receiver thread
- * sends on the msock, but anyways */
- drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
- }
-
if (device->state.pdsk < D_INCONSISTENT) {
/* In case we have the only disk of the cluster, */
drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
- peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
drbd_al_begin_io(device, &peer_req->i);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
}

err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2396,8 +2411,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
list_del(&peer_req->w.list);
drbd_remove_epoch_entry_interval(device, peer_req);
spin_unlock_irq(&device->resource->req_lock);
- if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
drbd_al_complete_io(device, &peer_req->i);
+ }

out_interrupted:
drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2561,6 +2578,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
peer_req->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
+ peer_req->flags |= EE_APPLICATION;
goto submit;

case P_RS_DATA_REQUEST:
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 48975a2..b908e9b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -132,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
i = peer_req->i;
do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
block_id = peer_req->block_id;
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;

spin_lock_irqsave(&device->resource->req_lock, flags);
device->writ_cnt += peer_req->i.size >> 9;
--
1.9.1

2014-07-03 08:50:19

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 09/23] drbd: register peer requests on read_ee early

From: Lars Ellenberg <[email protected]>

Initialize peer_request with timestamp and proper empty list head.
Add peer_request to list early, so debugfs can find this request and
report it as "preparing", even if we sleep before we actually submit it.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_receiver.c | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 2f67dc0..42e3835 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
goto fail;
}

+ memset(peer_req, 0, sizeof(*peer_req));
+ INIT_LIST_HEAD(&peer_req->w.list);
drbd_clear_interval(&peer_req->i);
peer_req->i.size = data_size;
peer_req->i.sector = sector;
- peer_req->i.local = false;
- peer_req->i.waiting = false;
-
- peer_req->epoch = NULL;
+ peer_req->submit_jif = jiffies;
peer_req->peer_device = peer_device;
peer_req->pages = page;
- atomic_set(&peer_req->pending_bios, 0);
- peer_req->flags = 0;
/*
* The block_id is opaque to the receiver. It is not endianness
* converted, and sent back to the sender unchanged.
@@ -2668,6 +2665,15 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
* we would also throttle its application reads.
* In that case, throttling is done on the SyncTarget only.
*/
+
+ /* Even though this may be a resync request, we do add to "read_ee";
+ * "sync_ee" is only used for resync WRITEs.
+ * Add to list early, so debugfs can find this request
+ * even if we have to sleep below. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
if (device->state.peer != R_PRIMARY
&& drbd_rs_should_slow_down(device, sector, false))
schedule_timeout_uninterruptible(HZ/10);
@@ -2679,21 +2685,18 @@ submit_for_resync:

submit:
inc_unacked(device);
- spin_lock_irq(&device->resource->req_lock);
- list_add_tail(&peer_req->w.list, &device->read_ee);
- spin_unlock_irq(&device->resource->req_lock);
-
if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
return 0;

/* don't care for the reason here */
drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
spin_lock_irq(&device->resource->req_lock);
list_del(&peer_req->w.list);
spin_unlock_irq(&device->resource->req_lock);
/* no drbd_rs_complete_io(), we are dropping the connection anyways */

-out_free_e:
put_ldev(device);
drbd_free_peer_req(device, peer_req);
return -EIO;
--
1.9.1

2014-07-03 08:50:17

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 10/23] drbd: track details of bitmap IO

From: Lars Ellenberg <[email protected]>

Track start and submit time of bitmap operations, and
add pending bitmap IO contexts to a new pending_bitmap_io list.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_bitmap.c | 70 +++++++++++++++++++---------------------
drivers/block/drbd/drbd_int.h | 16 +++++++++
drivers/block/drbd/drbd_main.c | 1 +
3 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index c5f5a20..f435e7e 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -928,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
spin_unlock_irq(&b->bm_lock);
}

-struct bm_aio_ctx {
- struct drbd_device *device;
- atomic_t in_flight;
- unsigned int done;
- unsigned flags;
-#define BM_AIO_COPY_PAGES 1
-#define BM_AIO_WRITE_HINTED 2
-#define BM_WRITE_ALL_PAGES 4
- int error;
- struct kref kref;
-};
-
-static void bm_aio_ctx_destroy(struct kref *kref)
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
{
- struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+ struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+ unsigned long flags;

+ spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+ list_del(&ctx->list);
+ spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
put_ldev(ctx->device);
kfree(ctx);
}
@@ -951,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
/* bv_page may be a copy, or may be the original */
static void bm_async_io_complete(struct bio *bio, int error)
{
- struct bm_aio_ctx *ctx = bio->bi_private;
+ struct drbd_bm_aio_ctx *ctx = bio->bi_private;
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -994,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
if (atomic_dec_and_test(&ctx->in_flight)) {
ctx->done = 1;
wake_up(&device->misc_wait);
- kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
}
}

-static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
struct bio *bio = bio_alloc_drbd(GFP_NOIO);
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
struct page *page;
unsigned int len;
+ unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;

sector_t on_disk_sector =
device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1050,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
/*
* bm_rw: read/write the whole bitmap from/to its on disk location.
*/
-static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
- struct bm_aio_ctx *ctx;
+ struct drbd_bm_aio_ctx *ctx;
struct drbd_bitmap *b = device->bitmap;
int num_pages, i, count = 0;
unsigned long now;
@@ -1068,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
* as we submit copies of pages anyways.
*/

- ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+ ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
if (!ctx)
return -ENOMEM;

- *ctx = (struct bm_aio_ctx) {
+ *ctx = (struct drbd_bm_aio_ctx) {
.device = device,
+ .start_jif = jiffies,
.in_flight = ATOMIC_INIT(1),
.done = 0,
.flags = flags,
@@ -1081,7 +1075,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
.kref = { ATOMIC_INIT(2) },
};

- if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
+ if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */
drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
kfree(ctx);
return -ENODEV;
@@ -1089,9 +1083,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
drbd_adm_attach(), after device->ldev was assigned. */

- if (!ctx->flags)
+ if (0 == (ctx->flags & ~BM_AIO_READ))
WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));

+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&ctx->list, &device->pending_bitmap_io);
+ spin_unlock_irq(&device->resource->req_lock);
+
num_pages = b->bm_number_of_pages;

now = jiffies;
@@ -1101,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
/* ignore completely unchanged pages */
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
- if (rw & WRITE) {
+ if (!(flags & BM_AIO_READ)) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;

- if (!(flags & BM_WRITE_ALL_PAGES) &&
+ if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
continue;
@@ -1121,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
}
}
atomic_inc(&ctx->in_flight);
- bm_page_io_async(ctx, i, rw);
+ bm_page_io_async(ctx, i);
++count;
cond_resched();
}
@@ -1137,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
if (!atomic_dec_and_test(&ctx->in_flight))
wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
else
- kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);

/* summary for global bitmap IO */
if (flags == 0)
drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
- rw == WRITE ? "WRITE" : "READ",
+ (flags & BM_AIO_READ) ? "READ" : "WRITE",
count, jiffies - now);

if (ctx->error) {
@@ -1155,18 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
err = -EIO; /* Disk timeout/force-detach during IO... */

now = jiffies;
- if (rw == READ) {
+ if (flags & BM_AIO_READ) {
b->bm_set = bm_count_bits(b);
drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
jiffies - now);
}
now = b->bm_set;

- if (flags == 0)
+ if ((flags & ~BM_AIO_READ) == 0)
drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

- kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
return err;
}

@@ -1176,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
*/
int drbd_bm_read(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(device, READ, 0, 0);
+ return bm_rw(device, BM_AIO_READ, 0);
}

/**
@@ -1187,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
*/
int drbd_bm_write(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(device, WRITE, 0, 0);
+ return bm_rw(device, 0, 0);
}

/**
@@ -1198,7 +1196,7 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
*/
int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+ return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
}

/**
@@ -1214,7 +1212,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
*/
int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+ return bm_rw(device, BM_AIO_COPY_PAGES, 0);
}

/**
@@ -1223,7 +1221,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
*/
int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
{
- return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+ return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
}

/* NOTE
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c7a409f..40c816c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -777,6 +777,7 @@ struct drbd_peer_device {
struct drbd_device {
struct drbd_resource *resource;
struct list_head peer_devices;
+ struct list_head pending_bitmap_io;
int vnr; /* volume number within the connection */
struct kref kref;

@@ -918,6 +919,21 @@ struct drbd_device {
struct submit_worker submit;
};

+struct drbd_bm_aio_ctx {
+ struct drbd_device *device;
+ struct list_head list; /* on device->pending_bitmap_io */;
+ unsigned long start_jif;
+ atomic_t in_flight;
+ unsigned int done;
+ unsigned flags;
+#define BM_AIO_COPY_PAGES 1
+#define BM_AIO_WRITE_HINTED 2
+#define BM_AIO_WRITE_ALL_PAGES 4
+#define BM_AIO_READ 8
+ int error;
+ struct kref kref;
+};
+
struct drbd_config_context {
/* assigned from drbd_genlmsghdr */
unsigned int minor;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ad7c0e8..922d631 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2793,6 +2793,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
kref_get(&device->kref);

INIT_LIST_HEAD(&device->peer_devices);
+ INIT_LIST_HEAD(&device->pending_bitmap_io);
for_each_connection(connection, resource) {
peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
if (!peer_device)
--
1.9.1

2014-07-03 08:43:20

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 02/23] drbd: fix drbd_destroy_device reference count updates

From: Lars Ellenberg <[email protected]>

drbd_destroy_device means to give up reference counts
on the connection(s) reachable via the peer_device(s).

It must not do that by iterating via device->resource->connections,
resource and connections may have already been disassociated
by drbd_free_resource, and we'd leak connection refs.

Instead, iterate via device->peer_devices->connection.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_main.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92547d1..56cf11b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2160,7 +2160,7 @@ void drbd_destroy_device(struct kref *kref)
{
struct drbd_device *device = container_of(kref, struct drbd_device, kref);
struct drbd_resource *resource = device->resource;
- struct drbd_connection *connection;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;

del_timer_sync(&device->request_timer);

@@ -2191,12 +2191,16 @@ void drbd_destroy_device(struct kref *kref)
put_disk(device->vdisk);
blk_cleanup_queue(device->rq_queue);
kfree(device->rs_plan_s);
- kfree(first_peer_device(device));
+
+ /* not for_each_connection(connection, resource):
+ * those may have been cleaned up and disassociated already.
+ */
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+ kfree(peer_device);
+ }
memset(device, 0xfd, sizeof(*device));
kfree(device);
-
- for_each_connection(connection, resource)
- kref_put(&connection->kref, drbd_destroy_connection);
kref_put(&resource->kref, drbd_destroy_resource);
}

--
1.9.1

2014-07-03 08:52:58

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 04/23] drbd: gather detailed timing statistics for drbd_requests

From: Lars Ellenberg <[email protected]>

Record (in jiffies) how much time a request spends in which stages.
Followup commits will use and present this additional timing information
so we can better locate and tackle the root causes of latency spikes,
or present the backlog for asynchronous replication.

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_int.h | 54 ++++++++++++++++++++++-
drivers/block/drbd/drbd_main.c | 7 +--
drivers/block/drbd/drbd_req.c | 93 +++++++++++++++++++++++++---------------
drivers/block/drbd/drbd_worker.c | 3 ++
4 files changed, 119 insertions(+), 38 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3f8281b..08fa2dc 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -317,7 +317,59 @@ struct drbd_request {

struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
- unsigned long start_time;
+
+ /* for generic IO accounting */
+ unsigned long start_jif;
+
+ /* for DRBD internal statistics */
+
+ /* Minimal set of time stamps to determine if we wait for activity log
+ * transactions, local disk or peer. 32 bit "jiffies" are good enough,
+ * we don't expect a DRBD request to be stalled for several month.
+ */
+
+ /* before actual request processing */
+ unsigned long in_actlog_jif;
+
+ /* local disk */
+ unsigned long pre_submit_jif;
+
+ /* per connection */
+ unsigned long pre_send_jif;
+ unsigned long acked_jif;
+ unsigned long net_done_jif;
+
+ /* Possibly even more detail to track each phase:
+ * master_completion_jif
+ * how long did it take to complete the master bio
+ * (application visible latency)
+ * allocated_jif
+ * how long the master bio was blocked until we finally allocated
+ * a tracking struct
+ * in_actlog_jif
+ * how long did we wait for activity log transactions
+ *
+ * net_queued_jif
+ * when did we finally queue it for sending
+ * pre_send_jif
+ * when did we start sending it
+ * post_send_jif
+ * how long did we block in the network stack trying to send it
+ * acked_jif
+ * when did we receive (or fake, in protocol A) a remote ACK
+ * net_done_jif
+ * when did we receive final acknowledgement (P_BARRIER_ACK),
+ * or decide, e.g. on connection loss, that we do no longer expect
+ * anything from this peer for this request.
+ *
+ * pre_submit_jif
+ * post_sub_jif
+ * when did we start submiting to the lower level device,
+ * and how long did we block in that submit function
+ * local_completion_jif
+ * how long did it take the lower level device to complete this request
+ */
+

/* once it hits 0, we may complete the master_bio */
atomic_t completion_ref;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 3ab7461..0baec7a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -29,6 +29,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
+#include <linux/jiffies.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
@@ -264,7 +265,7 @@ bail:

/**
* _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @device: DRBD device.
+ * @connection: DRBD connection to operate on.
* @what: The action/event to perform with all request objects
*
* @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -2228,7 +2229,7 @@ static void do_retry(struct work_struct *ws)
list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
struct drbd_device *device = req->device;
struct bio *bio = req->master_bio;
- unsigned long start_time = req->start_time;
+ unsigned long start_jif = req->start_jif;
bool expected;

expected =
@@ -2263,7 +2264,7 @@ static void do_retry(struct work_struct *ws)
/* We are not just doing generic_make_request(),
* as we want to keep the start_time information. */
inc_ap_bio(device);
- __drbd_make_request(device, bio, start_time);
+ __drbd_make_request(device, bio, start_jif);
}
}

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3824d5c..1319bea 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
{
int rw = bio_data_dir(req->master_bio);
- unsigned long duration = jiffies - req->start_time;
+ unsigned long duration = jiffies - req->start_jif;
int cpu;
cpu = part_stat_lock();
part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
{
struct drbd_request *req;

- req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
if (!req)
return NULL;

@@ -366,14 +366,18 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
atomic_inc(&req->completion_ref);
}

- if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+ if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
atomic_inc(&req->completion_ref);
+ }

if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
kref_get(&req->kref); /* wait for the DONE */

- if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
- atomic_add(req->i.size >> 9, &device->ap_in_flight);
+ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+ /* potentially already completed in the asender thread */
+ if (!(s & RQ_NET_DONE))
+ atomic_add(req->i.size >> 9, &device->ap_in_flight);
+ }

if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
atomic_inc(&req->completion_ref);
@@ -401,15 +405,18 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
dec_ap_pending(device);
++c_put;
+ req->acked_jif = jiffies;
}

if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
++c_put;

- if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
- if (req->rq_state & RQ_NET_SENT)
+ if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+ if (s & RQ_NET_SENT)
atomic_sub(req->i.size >> 9, &device->ap_in_flight);
- ++k_put;
+ if (s & RQ_EXP_BARR_ACK)
+ ++k_put;
+ req->net_done_jif = jiffies;
}

/* potentially complete and destroy */
@@ -449,6 +456,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
bdevname(device->ldev->backing_bdev, b));
}

+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+ return (req->rq_state &
+ (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+ == (RQ_WRITE|RQ_NET_PENDING);
+}
+
/* obviously this could be coded as many single functions
* instead of one huge switch,
* or by putting the code directly in the respective locations
@@ -627,18 +647,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,

case HANDED_OVER_TO_NETWORK:
/* assert something? */
- if (bio_data_dir(req->master_bio) == WRITE &&
- !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+ if (is_pending_write_protocol_A(req))
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer. */
- if (req->rq_state & RQ_NET_PENDING)
- mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
- /* else: neg-ack was faster... */
- /* it is still not yet RQ_NET_DONE until the
- * corresponding epoch barrier got acked as well,
- * so we know what to dirty on connection loss */
- }
- mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+ mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+ RQ_NET_SENT|RQ_NET_OK);
+ else
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+ /* It is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss. */
break;

case OOS_HANDED_TO_NETWORK:
@@ -1037,6 +1055,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(device)) {
+ req->pre_submit_jif = jiffies;
if (drbd_insert_fault(device,
rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
@@ -1063,7 +1082,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
* Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
*/
static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
{
const int rw = bio_data_dir(bio);
struct drbd_request *req;
@@ -1078,7 +1097,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
bio_endio(bio, -ENOMEM);
return ERR_PTR(-ENOMEM);
}
- req->start_time = start_time;
+ req->start_jif = start_jif;

if (!get_ldev(device)) {
bio_put(req->private_bio);
@@ -1095,6 +1114,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
return NULL;
}
req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
}

return req;
@@ -1197,9 +1217,9 @@ out:
complete_master_bio(device, &m);
}

-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
{
- struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+ struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
if (IS_ERR_OR_NULL(req))
return;
drbd_send_and_submit(device, req);
@@ -1218,6 +1238,7 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
continue;

req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
}

list_del_init(&req->tl_requests);
@@ -1240,7 +1261,6 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
wake = 1;
if (err)
continue;
- req->rq_state |= RQ_IN_ACT_LOG;
list_move_tail(&req->tl_requests, pending);
}
spin_unlock_irq(&device->al_lock);
@@ -1302,6 +1322,8 @@ skip_fast_path:
drbd_al_begin_io_commit(device);

list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
list_del_init(&req->tl_requests);
drbd_send_and_submit(device, req);
}
@@ -1311,9 +1333,12 @@ skip_fast_path:
* requests to cold extents. In that case, prepare one request
* in blocking mode. */
list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+ bool was_cold;
list_del_init(&req->tl_requests);
- req->rq_state |= RQ_IN_ACT_LOG;
- if (!drbd_al_begin_io_prepare(device, &req->i)) {
+ was_cold = drbd_al_begin_io_prepare(device, &req->i);
+ if (!was_cold) {
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
/* Corresponding extent was hot after all? */
drbd_send_and_submit(device, req);
} else {
@@ -1330,9 +1355,9 @@ skip_fast_path:
void drbd_make_request(struct request_queue *q, struct bio *bio)
{
struct drbd_device *device = (struct drbd_device *) q->queuedata;
- unsigned long start_time;
+ unsigned long start_jif;

- start_time = jiffies;
+ start_jif = jiffies;

/*
* what we "blindly" assume:
@@ -1340,7 +1365,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));

inc_ap_bio(device);
- __drbd_make_request(device, bio, start_time);
+ __drbd_make_request(device, bio, start_jif);
}

/* This is called by bio_add_page().
@@ -1453,13 +1478,13 @@ void request_timer_fn(unsigned long data)
* to expire twice (worst case) to become effective. Good enough.
*/
if (ent && req_peer &&
- time_after(now, req_peer->start_time + ent) &&
+ time_after(now, req_peer->start_jif + ent) &&
!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
}
if (dt && req_disk &&
- time_after(now, req_disk->start_time + dt) &&
+ time_after(now, req_disk->start_jif + dt) &&
!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1467,10 +1492,10 @@ void request_timer_fn(unsigned long data)

/* Reschedule timer for the nearest not already expired timeout.
* Fallback to now + min(effective network timeout, disk timeout). */
- ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
- ? req_peer->start_time + ent : now + et;
- dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
- ? req_disk->start_time + dt : now + et;
+ ent = (ent && req_peer && time_before(now, req_peer->start_jif + ent))
+ ? req_peer->start_jif + ent : now + et;
+ dt = (dt && req_disk && time_before(now, req_disk->start_jif + dt))
+ ? req_disk->start_jif + dt : now + et;
nt = time_before(ent, dt) ? ent : dt;
spin_unlock_irq(&connection->resource->req_lock);
mod_timer(&device->request_timer, nt);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3978d9e..0ff8f46 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1368,6 +1368,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
req_mod(req, SEND_CANCELED);
return 0;
}
+ req->pre_send_jif = jiffies;

/* this time, no connection->send.current_epoch_writes++;
* If it was sent, it was the closing barrier for the last
@@ -1398,6 +1399,7 @@ int w_send_dblock(struct drbd_work *w, int cancel)
req_mod(req, SEND_CANCELED);
return 0;
}
+ req->pre_send_jif = jiffies;

re_init_if_first_write(connection, req->epoch);
maybe_send_barrier(connection, req->epoch);
@@ -1426,6 +1428,7 @@ int w_send_read_req(struct drbd_work *w, int cancel)
req_mod(req, SEND_CANCELED);
return 0;
}
+ req->pre_send_jif = jiffies;

/* Even read requests may close a write epoch,
* if there was any yet. */
--
1.9.1

2014-07-03 08:52:57

by Philipp Reisner

[permalink] [raw]
Subject: [PATCH 03/23] drbd: track meta data IO intent, start and submit time

From: Lars Ellenberg <[email protected]>

For diagnostic purposes, track intent, start time
and latest submit time of meta data IO.

Move separate members from struct drbd_device
into the embeded struct drbd_md_io.
s/md_io_(page|in_use)/md_io.\1/

Signed-off-by: Philipp Reisner <[email protected]>
Signed-off-by: Lars Ellenberg <[email protected]>
---
drivers/block/drbd/drbd_actlog.c | 26 +++++++++++++++++---------
drivers/block/drbd/drbd_int.h | 9 ++++++---
drivers/block/drbd/drbd_main.c | 14 +++++++-------
drivers/block/drbd/drbd_nl.c | 4 ++--
drivers/block/drbd/drbd_worker.c | 9 +++------
5 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 5f82a36..d7e8066 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,20 +92,26 @@ struct __packed al_transaction_on_disk {
__be32 context[AL_CONTEXT_PER_TRANSACTION];
};

-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
{
int r;

wait_event(device->misc_wait,
- (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+ (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
device->state.disk <= D_FAILED);

- return r ? NULL : page_address(device->md_io_page);
+ if (r)
+ return NULL;
+
+ device->md_io.current_use = intent;
+ device->md_io.start_jif = jiffies;
+ device->md_io.submit_jif = device->md_io.start_jif - 1;
+ return page_address(device->md_io.page);
}

void drbd_md_put_buffer(struct drbd_device *device)
{
- if (atomic_dec_and_test(&device->md_io_in_use))
+ if (atomic_dec_and_test(&device->md_io.in_use))
wake_up(&device->misc_wait);
}

@@ -150,7 +156,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
err = -EIO;
if (bio_add_page(bio, page, size, 0) != size)
goto out;
- bio->bi_private = &device->md_io;
+ bio->bi_private = device;
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;

@@ -165,7 +171,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
}

bio_get(bio); /* one bio_put() is in the completion handler */
- atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+ atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+ device->md_io.submit_jif = jiffies;
if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
@@ -183,9 +190,9 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
sector_t sector, int rw)
{
int err;
- struct page *iop = device->md_io_page;
+ struct page *iop = device->md_io.page;

- D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+ D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);

BUG_ON(!bdev->md_bdev);

@@ -465,7 +472,8 @@ int al_write_transaction(struct drbd_device *device)
return -EIO;
}

- buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+ /* protects md_io_buffer, al_tr_cycle, ... */
+ buffer = drbd_md_get_buffer(device, __func__);
if (!buffer) {
drbd_err(device, "disk failed while waiting for md_io buffer\n");
put_ldev(device);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d85f43c..3f8281b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -542,6 +542,11 @@ struct drbd_backing_dev {
};

struct drbd_md_io {
+ struct page *page;
+ unsigned long start_jif; /* last call to drbd_md_get_buffer */
+ unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */
+ const char *current_use;
+ atomic_t in_use;
unsigned int done;
int error;
};
@@ -795,9 +800,7 @@ struct drbd_device {
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
- struct page *md_io_page; /* one page buffer for md_io */
struct drbd_md_io md_io;
- atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log; /* activity log */
@@ -1336,7 +1339,7 @@ extern void resume_next_sg(struct drbd_device *device);
extern void suspend_other_sg(struct drbd_device *device);
extern int drbd_resync_finished(struct drbd_device *device);
/* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
extern void drbd_md_put_buffer(struct drbd_device *device);
extern int drbd_md_sync_page_io(struct drbd_device *device,
struct drbd_backing_dev *bdev, sector_t sector, int rw);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 56cf11b..3ab7461 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1916,7 +1916,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
atomic_set(&device->rs_sect_in, 0);
atomic_set(&device->rs_sect_ev, 0);
atomic_set(&device->ap_in_flight, 0);
- atomic_set(&device->md_io_in_use, 0);
+ atomic_set(&device->md_io.in_use, 0);

mutex_init(&device->own_state_mutex);
device->state_mutex = &device->own_state_mutex;
@@ -2187,7 +2187,7 @@ void drbd_destroy_device(struct kref *kref)

if (device->bitmap) /* should no longer be there. */
drbd_bm_cleanup(device);
- __free_page(device->md_io_page);
+ __free_page(device->md_io.page);
put_disk(device->vdisk);
blk_cleanup_queue(device->rq_queue);
kfree(device->rs_plan_s);
@@ -2756,8 +2756,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
blk_queue_merge_bvec(q, drbd_merge_bvec);
q->queue_lock = &resource->req_lock;

- device->md_io_page = alloc_page(GFP_KERNEL);
- if (!device->md_io_page)
+ device->md_io.page = alloc_page(GFP_KERNEL);
+ if (!device->md_io.page)
goto out_no_io_page;

if (drbd_bm_init(device))
@@ -2845,7 +2845,7 @@ out_idr_remove_minor:
out_no_minor_idr:
drbd_bm_cleanup(device);
out_no_bitmap:
- __free_page(device->md_io_page);
+ __free_page(device->md_io.page);
out_no_io_page:
put_disk(disk);
out_no_disk:
@@ -3079,7 +3079,7 @@ void drbd_md_sync(struct drbd_device *device)
if (!get_ldev_if_state(device, D_FAILED))
return;

- buffer = drbd_md_get_buffer(device);
+ buffer = drbd_md_get_buffer(device, __func__);
if (!buffer)
goto out;

@@ -3239,7 +3239,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
if (device->state.disk != D_DISKLESS)
return ERR_DISK_CONFIGURED;

- buffer = drbd_md_get_buffer(device);
+ buffer = drbd_md_get_buffer(device, __func__);
if (!buffer)
return ERR_NOMEM;

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c2e047f..7fcdc54 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -887,7 +887,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
* still lock the act_log to not trigger ASSERTs there.
*/
drbd_suspend_io(device);
- buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+ buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
if (!buffer) {
drbd_resume_io(device);
return DS_ERROR;
@@ -1899,7 +1899,7 @@ static int adm_detach(struct drbd_device *device, int force)
}

drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
- drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+ drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
retcode = drbd_request_state(device, NS(disk, D_FAILED));
drbd_md_put_buffer(device);
/* D_FAILED will transition to DISKLESS. */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ad57129..3978d9e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
*/
void drbd_md_io_complete(struct bio *bio, int error)
{
- struct drbd_md_io *md_io;
struct drbd_device *device;

- md_io = (struct drbd_md_io *)bio->bi_private;
- device = container_of(md_io, struct drbd_device, md_io);
-
- md_io->error = error;
+ device = bio->bi_private;
+ device->md_io.error = error;

/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
* to timeout on the lower level device, and eventually detach from it.
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
* ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
*/
drbd_md_put_buffer(device);
- md_io->done = 1;
+ device->md_io.done = 1;
wake_up(&device->misc_wait);
bio_put(bio);
if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
--
1.9.1