2008-08-16 11:37:21

by Stefan Richter

[permalink] [raw]
Subject: [patch 1/3] ieee1394: regression in 2.6.25: updates should happen before probes

Regression since commit 73cf60232ef16e1f8a64defa97214a1722db1e6c,
"ieee1394: use class iteration api": The two loops for (1.) driver
updates and (2.) driver probes were replaced by a single loop with
bogus needs_probe checks. Hence updates and probes were now intermixed,
and especially sbp2 updates (reconnects) held up longer than necessary.

While we fix it, change the needs_probe flag to bool type for clarity.

Signed-off-by: Stefan Richter <[email protected]>
---

I will make sure that this goes into -stable too.

Dave, I won't Cc you on patch 2/3 and 3/3 of this series because they
are not related to the loop regression.

drivers/ieee1394/nodemgr.c | 14 ++++++++------
drivers/ieee1394/nodemgr.h | 2 +-
2 files changed, 9 insertions(+), 7 deletions(-)

Index: linux/drivers/ieee1394/nodemgr.c
===================================================================
--- linux.orig/drivers/ieee1394/nodemgr.c
+++ linux/drivers/ieee1394/nodemgr.c
@@ -843,7 +843,7 @@ static struct node_entry *nodemgr_create
ne->host = host;
ne->nodeid = nodeid;
ne->generation = generation;
- ne->needs_probe = 1;
+ ne->needs_probe = true;

ne->guid = guid;
ne->guid_vendor_id = (guid >> 40) & 0xffffff;
@@ -1141,7 +1141,7 @@ static void nodemgr_process_root_directo
struct csr1212_keyval *kv, *vendor_name_kv = NULL;
u8 last_key_id = 0;

- ne->needs_probe = 0;
+ ne->needs_probe = false;

csr1212_for_each_dir_entry(ne->csr, kv, ne->csr->root_kv, dentry) {
switch (kv->key.id) {
@@ -1292,7 +1292,7 @@ static void nodemgr_update_node(struct n
nodemgr_update_bus_options(ne);

/* Mark the node as new, so it gets re-probed */
- ne->needs_probe = 1;
+ ne->needs_probe = true;
} else {
/* old cache is valid, so update its generation */
struct nodemgr_csr_info *ci = ne->csr->private;
@@ -1560,6 +1560,7 @@ static void nodemgr_probe_ne(struct host
struct probe_param {
struct host_info *hi;
int generation;
+ bool probe_now;
};

static int __nodemgr_node_probe(struct device *dev, void *data)
@@ -1568,9 +1569,7 @@ static int __nodemgr_node_probe(struct d
struct node_entry *ne;

ne = container_of(dev, struct node_entry, node_dev);
- if (!ne->needs_probe)
- nodemgr_probe_ne(param->hi, ne, param->generation);
- if (ne->needs_probe)
+ if (ne->needs_probe == param->probe_now)
nodemgr_probe_ne(param->hi, ne, param->generation);
return 0;
}
@@ -1591,6 +1590,9 @@ static void nodemgr_node_probe(struct ho
* while probes are time-consuming. (Well, those probes need some
* improvement...) */

+ param.probe_now = false;
+ class_for_each_device(&nodemgr_ne_class, &param, __nodemgr_node_probe);
+ param.probe_now = true;
class_for_each_device(&nodemgr_ne_class, &param, __nodemgr_node_probe);

/* If we had a bus reset while we were scanning the bus, it is
Index: linux/drivers/ieee1394/nodemgr.h
===================================================================
--- linux.orig/drivers/ieee1394/nodemgr.h
+++ linux/drivers/ieee1394/nodemgr.h
@@ -97,7 +97,7 @@ struct node_entry {
struct hpsb_host *host; /* Host this node is attached to */
nodeid_t nodeid; /* NodeID */
struct bus_options busopt; /* Bus Options */
- int needs_probe;
+ bool needs_probe;
unsigned int generation; /* Synced with hpsb generation */

/* The following is read from the config rom */

--
Stefan Richter
-=====-==--- =--- =----
http://arcgraph.de/sr/


2008-08-16 11:38:38

by Stefan Richter

[permalink] [raw]
Subject: [patch 2/3] ieee1394: don't drop nodes during bus reset series

nodemgr_node_probe checked for generation increments too late and
therefore prematurely reported nodes as "suspended".

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=11349 for me.

Signed-off-by: Stefan Richter <[email protected]>
---
drivers/ieee1394/nodemgr.c | 42 +++++++++++++++++++------------------
1 file changed, 22 insertions(+), 20 deletions(-)

Index: linux/drivers/ieee1394/nodemgr.c
===================================================================
--- linux.orig/drivers/ieee1394/nodemgr.c
+++ linux/drivers/ieee1394/nodemgr.c
@@ -1563,11 +1563,14 @@ struct probe_param {
bool probe_now;
};

-static int __nodemgr_node_probe(struct device *dev, void *data)
+static int node_probe(struct device *dev, void *data)
{
struct probe_param *param = (struct probe_param *)data;
struct node_entry *ne;

+ if (param->generation != get_hpsb_generation(param->hi->host))
+ return -EAGAIN;
+
ne = container_of(dev, struct node_entry, node_dev);
if (ne->needs_probe == param->probe_now)
nodemgr_probe_ne(param->hi, ne, param->generation);
@@ -1576,42 +1579,41 @@ static int __nodemgr_node_probe(struct d

static void nodemgr_node_probe(struct host_info *hi, int generation)
{
- struct hpsb_host *host = hi->host;
struct probe_param param;

param.hi = hi;
param.generation = generation;
- /* Do some processing of the nodes we've probed. This pulls them
+ /*
+ * Do some processing of the nodes we've probed. This pulls them
* into the sysfs layer if needed, and can result in processing of
* unit-directories, or just updating the node and it's
* unit-directories.
*
* Run updates before probes. Usually, updates are time-critical
- * while probes are time-consuming. (Well, those probes need some
- * improvement...) */
-
+ * while probes are time-consuming.
+ *
+ * Meanwhile, another bus reset may have happened. In this case we
+ * skip everything here and let the next bus scan handle it.
+ * Otherwise we may prematurely remove nodes which are still there.
+ */
param.probe_now = false;
- class_for_each_device(&nodemgr_ne_class, &param, __nodemgr_node_probe);
- param.probe_now = true;
- class_for_each_device(&nodemgr_ne_class, &param, __nodemgr_node_probe);
+ if (class_for_each_device(&nodemgr_ne_class, &param, node_probe) != 0)
+ return;

- /* If we had a bus reset while we were scanning the bus, it is
- * possible that we did not probe all nodes. In that case, we
- * skip the clean up for now, since we could remove nodes that
- * were still on the bus. Another bus scan is pending which will
- * do the clean up eventually.
- *
+ param.probe_now = true;
+ if (class_for_each_device(&nodemgr_ne_class, &param, node_probe) != 0)
+ return;
+ /*
* Now let's tell the bus to rescan our devices. This may seem
* like overhead, but the driver-model core will only scan a
* device for a driver when either the device is added, or when a
* new driver is added. A bus reset is a good reason to rescan
* devices that were there before. For example, an sbp2 device
* may become available for login, if the host that held it was
- * just removed. */
-
- if (generation == get_hpsb_generation(host))
- if (bus_rescan_devices(&ieee1394_bus_type))
- HPSB_DEBUG("bus_rescan_devices had an error");
+ * just removed.
+ */
+ if (bus_rescan_devices(&ieee1394_bus_type) != 0)
+ HPSB_DEBUG("bus_rescan_devices had an error");
}

static int nodemgr_send_resume_packet(struct hpsb_host *host)

--
Stefan Richter
-=====-==--- =--- =----
http://arcgraph.de/sr/

2008-08-16 11:39:51

by Stefan Richter

[permalink] [raw]
Subject: [patch 3/3] ieee1394: sbp2: let nodemgr retry node updates during bus reset series

sbp2 was too quick to run away screaming "Failed to reconnect to sbp2
device!" when additional nodes on the same bus came online.

Signed-off-by: Stefan Richter <[email protected]>
---
drivers/ieee1394/sbp2.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)

Index: linux/drivers/ieee1394/sbp2.c
===================================================================
--- linux.orig/drivers/ieee1394/sbp2.c
+++ linux/drivers/ieee1394/sbp2.c
@@ -733,15 +733,26 @@ static int sbp2_update(struct unit_direc
{
struct sbp2_lu *lu = ud->device.driver_data;

- if (sbp2_reconnect_device(lu)) {
- /* Reconnect has failed. Perhaps we didn't reconnect fast
- * enough. Try a regular login, but first log out just in
- * case of any weirdness. */
+ if (sbp2_reconnect_device(lu) != 0) {
+ /*
+ * Reconnect failed. If another bus reset happened,
+ * let nodemgr proceed and call sbp2_update again later
+ * (or sbp2_remove if this node went away).
+ */
+ if (!hpsb_node_entry_valid(lu->ne))
+ return 0;
+ /*
+ * Or the target rejected the reconnect because we weren't
+ * fast enough. Try a regular login, but first log out
+ * just in case of any weirdness.
+ */
sbp2_logout_device(lu);

- if (sbp2_login_device(lu)) {
- /* Login failed too, just fail, and the backend
- * will call our sbp2_remove for us */
+ if (sbp2_login_device(lu) != 0) {
+ if (!hpsb_node_entry_valid(lu->ne))
+ return 0;
+
+ /* Maybe another initiator won the login. */
SBP2_ERR("Failed to reconnect to sbp2 device!");
return -EBUSY;
}

--
Stefan Richter
-=====-==--- =--- =----
http://arcgraph.de/sr/

2008-08-19 19:28:50

by Stefan Richter

[permalink] [raw]
Subject: Re: [patch 2/3] ieee1394: don't drop nodes during bus reset series

I wrote:
> nodemgr_node_probe checked for generation increments too late and
> therefore prematurely reported nodes as "suspended".
>
> Fixes http://bugzilla.kernel.org/show_bug.cgi?id=11349 for me.

This and the accompanying sbp2 patch 3/3 allows the drivers to keep
going if they have temporary trouble with the protocol traffic due to
bus resets in a series.

I now implemented an additional patch which lets the drivers even
tolerate it if nodes vanish a few seconds from the bus --- e.g. a wonky
repeater blacks out briefly, or user plugs cables from left to right,
and so on. As a preparation for this enhancement, I have a cleanup in
nodemgr in the pipeline. Next up:

patch 1/2) ieee1394: nodemgr clean up class iterators
patch 2/2) ieee1394: survive a few seconds connection loss

With this you can indeed unplug a disk, even a bus powered one, and plug
it back in in the next few seconds, while programs are actively
accessing the disk. The IO of these programs will merely be blocked
during the disturbance, but sbp2 will log back in and IO will continue
without error.

Of course situations like these should rather be avoided; but they _do_
happen for example on a bus with PC--disk_A--disk_B when disk_A is
switched from self power to bus power and continues to work as repeater
when bus-powered. The repeater function is only established after a
short disruption of the bus though. Not nice at all if you forgot to
unmount disk_B for the time being. From now on, unmounting it will not
be necessary because it is very very likely that sbp2 will be able to
re-login to disk_B.

The patches which I'll post will apply to 2.6.27-rc only. Variants for
2.6.25 and .26 will be uploaded to
http://user.in-berlin.de/~s5r6/linux1394/updates/ in a few minutes.

The ieee1394 core driver actually contained stubs for this capability
for years, but the implementation wasn't fleshed out for this purpose
until now. Therefore my patches even remove more code than they add.

The new firewire stack desperately needs a similar feature. It monitors
the bus for PHYs vanishing even more precisely than the ieee1394 stack,
thus there is an even higher probability that firewire-sbp2
unnecessarily withdraws a disk from the SCSI stack.
--
Stefan Richter
-=====-==--- =--- =--==
http://arcgraph.de/sr/

2008-08-19 19:29:54

by Stefan Richter

[permalink] [raw]
Subject: [patch 1/2] ieee1394: nodemgr clean up class iterators

Remove useless pointer type casts.
Remove unnecessary hi->host indirection where only host is used.
Remove an unnecessary WARN_ON.
Change a few names.

Signed-off-by: Stefan Richter <[email protected]>
---
drivers/ieee1394/nodemgr.c | 178 ++++++++++++++++---------------------
1 file changed, 81 insertions(+), 97 deletions(-)

Index: linux-2.6.27-rc3/drivers/ieee1394/nodemgr.c
===================================================================
--- linux-2.6.27-rc3.orig/drivers/ieee1394/nodemgr.c
+++ linux-2.6.27-rc3/drivers/ieee1394/nodemgr.c
@@ -154,7 +154,7 @@ struct host_info {

static int nodemgr_bus_match(struct device * dev, struct device_driver * drv);
static int nodemgr_uevent(struct device *dev, struct kobj_uevent_env *env);
-static void nodemgr_resume_ne(struct node_entry *ne);
+static void nodemgr_reactivate_ne(struct node_entry *ne);
static void nodemgr_remove_ne(struct node_entry *ne);
static struct node_entry *find_entry_by_guid(u64 guid);

@@ -734,10 +734,10 @@ static int nodemgr_bus_match(struct devi

static DEFINE_MUTEX(nodemgr_serialize_remove_uds);

-static int __match_ne(struct device *dev, void *data)
+static int match_ne(struct device *dev, void *data)
{
struct unit_directory *ud;
- struct node_entry *ne = (struct node_entry *)data;
+ struct node_entry *ne = data;

ud = container_of(dev, struct unit_directory, unit_dev);
return ud->ne == ne;
@@ -754,8 +754,7 @@ static void nodemgr_remove_uds(struct no
*/
mutex_lock(&nodemgr_serialize_remove_uds);
for (;;) {
- dev = class_find_device(&nodemgr_ud_class, NULL, ne,
- __match_ne);
+ dev = class_find_device(&nodemgr_ud_class, NULL, ne, match_ne);
if (!dev)
break;
ud = container_of(dev, struct unit_directory, unit_dev);
@@ -785,7 +784,7 @@ static void nodemgr_remove_ne(struct nod
put_device(dev);
}

-static int __nodemgr_remove_host_dev(struct device *dev, void *data)
+static int remove_host_dev(struct device *dev, void *data)
{
if (dev->bus == &ieee1394_bus_type)
nodemgr_remove_ne(container_of(dev, struct node_entry,
@@ -795,7 +794,7 @@ static int __nodemgr_remove_host_dev(str

static void nodemgr_remove_host_dev(struct device *dev)
{
- WARN_ON(device_for_each_child(dev, NULL, __nodemgr_remove_host_dev));
+ device_for_each_child(dev, NULL, remove_host_dev);
sysfs_remove_link(&dev->kobj, "irm_id");
sysfs_remove_link(&dev->kobj, "busmgr_id");
sysfs_remove_link(&dev->kobj, "host_id");
@@ -830,11 +829,10 @@ static void nodemgr_update_bus_options(s
}


-static struct node_entry *nodemgr_create_node(octlet_t guid, struct csr1212_csr *csr,
- struct host_info *hi, nodeid_t nodeid,
- unsigned int generation)
+static struct node_entry *nodemgr_create_node(octlet_t guid,
+ struct csr1212_csr *csr, struct hpsb_host *host,
+ nodeid_t nodeid, unsigned int generation)
{
- struct hpsb_host *host = hi->host;
struct node_entry *ne;

ne = kzalloc(sizeof(*ne), GFP_KERNEL);
@@ -888,10 +886,10 @@ fail_alloc:
return NULL;
}

-static int __match_ne_guid(struct device *dev, void *data)
+static int match_ne_guid(struct device *dev, void *data)
{
struct node_entry *ne;
- u64 *guid = (u64 *)data;
+ u64 *guid = data;

ne = container_of(dev, struct node_entry, node_dev);
return ne->guid == *guid;
@@ -902,8 +900,7 @@ static struct node_entry *find_entry_by_
struct device *dev;
struct node_entry *ne;

- dev = class_find_device(&nodemgr_ne_class, NULL, &guid,
- __match_ne_guid);
+ dev = class_find_device(&nodemgr_ne_class, NULL, &guid, match_ne_guid);
if (!dev)
return NULL;
ne = container_of(dev, struct node_entry, node_dev);
@@ -912,21 +909,21 @@ static struct node_entry *find_entry_by_
return ne;
}

-struct match_nodeid_param {
+struct match_nodeid_parameter {
struct hpsb_host *host;
nodeid_t nodeid;
};

-static int __match_ne_nodeid(struct device *dev, void *data)
+static int match_ne_nodeid(struct device *dev, void *data)
{
int found = 0;
struct node_entry *ne;
- struct match_nodeid_param *param = (struct match_nodeid_param *)data;
+ struct match_nodeid_parameter *p = data;

if (!dev)
goto ret;
ne = container_of(dev, struct node_entry, node_dev);
- if (ne->host == param->host && ne->nodeid == param->nodeid)
+ if (ne->host == p->host && ne->nodeid == p->nodeid)
found = 1;
ret:
return found;
@@ -937,13 +934,12 @@ static struct node_entry *find_entry_by_
{
struct device *dev;
struct node_entry *ne;
- struct match_nodeid_param param;
+ struct match_nodeid_parameter p;

- param.host = host;
- param.nodeid = nodeid;
+ p.host = host;
+ p.nodeid = nodeid;

- dev = class_find_device(&nodemgr_ne_class, NULL, &param,
- __match_ne_nodeid);
+ dev = class_find_device(&nodemgr_ne_class, NULL, &p, match_ne_nodeid);
if (!dev)
return NULL;
ne = container_of(dev, struct node_entry, node_dev);
@@ -990,7 +986,7 @@ fail_devreg:
* immediate unit directories looking for software_id and
* software_version entries, in order to get driver autoloading working. */
static struct unit_directory *nodemgr_process_unit_directory
- (struct host_info *hi, struct node_entry *ne, struct csr1212_keyval *ud_kv,
+ (struct node_entry *ne, struct csr1212_keyval *ud_kv,
unsigned int *id, struct unit_directory *parent)
{
struct unit_directory *ud;
@@ -1083,7 +1079,7 @@ static struct unit_directory *nodemgr_pr
nodemgr_register_device(ne, ud, &ne->device);

/* process the child unit */
- ud_child = nodemgr_process_unit_directory(hi, ne, kv, id, ud);
+ ud_child = nodemgr_process_unit_directory(ne, kv, id, ud);

if (ud_child == NULL)
break;
@@ -1137,7 +1133,7 @@ unit_directory_error:
}


-static void nodemgr_process_root_directory(struct host_info *hi, struct node_entry *ne)
+static void nodemgr_process_root_directory(struct node_entry *ne)
{
unsigned int ud_id = 0;
struct csr1212_dentry *dentry;
@@ -1157,7 +1153,7 @@ static void nodemgr_process_root_directo
break;

case CSR1212_KV_ID_UNIT:
- nodemgr_process_unit_directory(hi, ne, kv, &ud_id, NULL);
+ nodemgr_process_unit_directory(ne, kv, &ud_id, NULL);
break;

case CSR1212_KV_ID_DESCRIPTOR:
@@ -1273,8 +1269,7 @@ void hpsb_unregister_protocol(struct hps
* the to take whatever actions required.
*/
static void nodemgr_update_node(struct node_entry *ne, struct csr1212_csr *csr,
- struct host_info *hi, nodeid_t nodeid,
- unsigned int generation)
+ nodeid_t nodeid, unsigned int generation)
{
if (ne->nodeid != nodeid) {
HPSB_DEBUG("Node changed: " NODE_BUS_FMT " -> " NODE_BUS_FMT,
@@ -1306,7 +1301,7 @@ static void nodemgr_update_node(struct n
}

if (ne->in_limbo)
- nodemgr_resume_ne(ne);
+ nodemgr_reactivate_ne(ne);

/* Mark the node current */
ne->generation = generation;
@@ -1314,10 +1309,9 @@ static void nodemgr_update_node(struct n



-static void nodemgr_node_scan_one(struct host_info *hi,
+static void nodemgr_node_scan_one(struct hpsb_host *host,
nodeid_t nodeid, int generation)
{
- struct hpsb_host *host = hi->host;
struct node_entry *ne;
octlet_t guid;
struct csr1212_csr *csr;
@@ -1373,16 +1367,15 @@ static void nodemgr_node_scan_one(struct
}

if (!ne)
- nodemgr_create_node(guid, csr, hi, nodeid, generation);
+ nodemgr_create_node(guid, csr, host, nodeid, generation);
else
- nodemgr_update_node(ne, csr, hi, nodeid, generation);
+ nodemgr_update_node(ne, csr, nodeid, generation);
}


-static void nodemgr_node_scan(struct host_info *hi, int generation)
+static void nodemgr_node_scan(struct hpsb_host *host, int generation)
{
int count;
- struct hpsb_host *host = hi->host;
struct selfid *sid = (struct selfid *)host->topology_map;
nodeid_t nodeid = LOCAL_BUS;

@@ -1395,15 +1388,15 @@ static void nodemgr_node_scan(struct hos
nodeid++;
continue;
}
- nodemgr_node_scan_one(hi, nodeid++, generation);
+ nodemgr_node_scan_one(host, nodeid++, generation);
}
}

-static int __nodemgr_driver_suspend(struct device *dev, void *data)
+static int pause_ne(struct device *dev, void *data)
{
struct unit_directory *ud;
struct device_driver *drv;
- struct node_entry *ne = (struct node_entry *)data;
+ struct node_entry *ne = data;
int error;

ud = container_of(dev, struct unit_directory, unit_dev);
@@ -1425,11 +1418,23 @@ static int __nodemgr_driver_suspend(stru
return 0;
}

-static int __nodemgr_driver_resume(struct device *dev, void *data)
+static void nodemgr_pause_ne(struct node_entry *ne)
+{
+ HPSB_DEBUG("Node suspended: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
+ NODE_BUS_ARGS(ne->host, ne->nodeid),
+ (unsigned long long)ne->guid);
+
+ ne->in_limbo = 1;
+ WARN_ON(device_create_file(&ne->device, &dev_attr_ne_in_limbo));
+
+ class_for_each_device(&nodemgr_ud_class, NULL, ne, pause_ne);
+}
+
+static int reactivate_ne(struct device *dev, void *data)
{
struct unit_directory *ud;
struct device_driver *drv;
- struct node_entry *ne = (struct node_entry *)data;
+ struct node_entry *ne = data;

ud = container_of(dev, struct unit_directory, unit_dev);
if (ud->ne == ne) {
@@ -1447,37 +1452,23 @@ static int __nodemgr_driver_resume(struc
return 0;
}

-static void nodemgr_suspend_ne(struct node_entry *ne)
-{
- HPSB_DEBUG("Node suspended: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
- NODE_BUS_ARGS(ne->host, ne->nodeid),
- (unsigned long long)ne->guid);
-
- ne->in_limbo = 1;
- WARN_ON(device_create_file(&ne->device, &dev_attr_ne_in_limbo));
-
- class_for_each_device(&nodemgr_ud_class, NULL, ne,
- __nodemgr_driver_suspend);
-}
-
-
-static void nodemgr_resume_ne(struct node_entry *ne)
+static void nodemgr_reactivate_ne(struct node_entry *ne)
{
ne->in_limbo = 0;
device_remove_file(&ne->device, &dev_attr_ne_in_limbo);

- class_for_each_device(&nodemgr_ud_class, NULL, ne,
- __nodemgr_driver_resume);
+ class_for_each_device(&nodemgr_ud_class, NULL, ne, reactivate_ne);
HPSB_DEBUG("Node resumed: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
- NODE_BUS_ARGS(ne->host, ne->nodeid), (unsigned long long)ne->guid);
+ NODE_BUS_ARGS(ne->host, ne->nodeid),
+ (unsigned long long)ne->guid);
}

-static int __nodemgr_update_pdrv(struct device *dev, void *data)
+static int update_pdrv(struct device *dev, void *data)
{
struct unit_directory *ud;
struct device_driver *drv;
struct hpsb_protocol_driver *pdrv;
- struct node_entry *ne = (struct node_entry *)data;
+ struct node_entry *ne = data;
int error;

ud = container_of(dev, struct unit_directory, unit_dev);
@@ -1503,8 +1494,7 @@ static int __nodemgr_update_pdrv(struct

static void nodemgr_update_pdrv(struct node_entry *ne)
{
- class_for_each_device(&nodemgr_ud_class, NULL, ne,
- __nodemgr_update_pdrv);
+ class_for_each_device(&nodemgr_ud_class, NULL, ne, update_pdrv);
}


@@ -1535,11 +1525,12 @@ static void nodemgr_irm_write_bc(struct
}


-static void nodemgr_probe_ne(struct host_info *hi, struct node_entry *ne, int generation)
+static void nodemgr_probe_ne(struct hpsb_host *host, struct node_entry *ne,
+ int generation)
{
struct device *dev;

- if (ne->host != hi->host || ne->in_limbo)
+ if (ne->host != host || ne->in_limbo)
return;

dev = get_device(&ne->device);
@@ -1554,40 +1545,40 @@ static void nodemgr_probe_ne(struct host
* down to the drivers. Otherwise, this is a dead node and we
* suspend it. */
if (ne->needs_probe)
- nodemgr_process_root_directory(hi, ne);
+ nodemgr_process_root_directory(ne);
else if (ne->generation == generation)
nodemgr_update_pdrv(ne);
else
- nodemgr_suspend_ne(ne);
+ nodemgr_pause_ne(ne);

put_device(dev);
}

-struct probe_param {
- struct host_info *hi;
+struct node_probe_parameter {
+ struct hpsb_host *host;
int generation;
bool probe_now;
};

static int node_probe(struct device *dev, void *data)
{
- struct probe_param *p = data;
+ struct node_probe_parameter *p = data;
struct node_entry *ne;

- if (p->generation != get_hpsb_generation(p->hi->host))
+ if (p->generation != get_hpsb_generation(p->host))
return -EAGAIN;

ne = container_of(dev, struct node_entry, node_dev);
if (ne->needs_probe == p->probe_now)
- nodemgr_probe_ne(p->hi, ne, p->generation);
+ nodemgr_probe_ne(p->host, ne, p->generation);
return 0;
}

-static void nodemgr_node_probe(struct host_info *hi, int generation)
+static void nodemgr_node_probe(struct hpsb_host *host, int generation)
{
- struct probe_param p;
+ struct node_probe_parameter p;

- p.hi = hi;
+ p.host = host;
p.generation = generation;
/*
* Do some processing of the nodes we've probed. This pulls them
@@ -1730,10 +1721,9 @@ static int nodemgr_check_irm_capability(
return 1;
}

-static int nodemgr_host_thread(void *__hi)
+static int nodemgr_host_thread(void *data)
{
- struct host_info *hi = (struct host_info *)__hi;
- struct hpsb_host *host = hi->host;
+ struct hpsb_host *host = data;
unsigned int g, generation = 0;
int i, reset_cycles = 0;

@@ -1787,11 +1777,11 @@ static int nodemgr_host_thread(void *__h
* entries. This does not do the sysfs stuff, since that
* would trigger uevents and such, which is a bad idea at
* this point. */
- nodemgr_node_scan(hi, generation);
+ nodemgr_node_scan(host, generation);

/* This actually does the full probe, with sysfs
* registration. */
- nodemgr_node_probe(hi, generation);
+ nodemgr_node_probe(host, generation);

/* Update some of our sysfs symlinks */
nodemgr_update_host_dev_links(host);
@@ -1801,22 +1791,20 @@ exit:
return 0;
}

-struct host_iter_param {
+struct per_host_parameter {
void *data;
int (*cb)(struct hpsb_host *, void *);
};

-static int __nodemgr_for_each_host(struct device *dev, void *data)
+static int per_host(struct device *dev, void *data)
{
struct hpsb_host *host;
- struct host_iter_param *hip = (struct host_iter_param *)data;
- int error = 0;
+ struct per_host_parameter *p = data;

host = container_of(dev, struct hpsb_host, host_dev);
- error = hip->cb(host, hip->data);
-
- return error;
+ return p->cb(host, p->data);
}
+
/**
* nodemgr_for_each_host - call a function for each IEEE 1394 host
* @data: an address to supply to the callback
@@ -1831,15 +1819,11 @@ static int __nodemgr_for_each_host(struc
*/
int nodemgr_for_each_host(void *data, int (*cb)(struct hpsb_host *, void *))
{
- struct host_iter_param hip;
- int error;
+ struct per_host_parameter p;

- hip.cb = cb;
- hip.data = data;
- error = class_for_each_device(&hpsb_host_class, NULL, &hip,
- __nodemgr_for_each_host);
-
- return error;
+ p.cb = cb;
+ p.data = data;
+ return class_for_each_device(&hpsb_host_class, NULL, &p, per_host);
}

/* The following two convenience functions use a struct node_entry
@@ -1893,7 +1877,7 @@ static void nodemgr_add_host(struct hpsb
return;
}
hi->host = host;
- hi->thread = kthread_run(nodemgr_host_thread, hi, "knodemgrd_%d",
+ hi->thread = kthread_run(nodemgr_host_thread, host, "knodemgrd_%d",
host->id);
if (IS_ERR(hi->thread)) {
HPSB_ERR("NodeMgr: cannot start thread for host %d", host->id);

--
Stefan Richter
-=====-==--- =--- =--==
http://arcgraph.de/sr/

2008-08-19 19:30:47

by Stefan Richter

[permalink] [raw]
Subject: [patch 2/2] ieee1394: survive a few seconds connection loss

There are situations when nodes vanish from the bus and come back in
quickly thereafter:
- When certain bus-powered hubs are plugged in,
- when certain disk enclosures are switched from self-power to bus
power or vice versa and break the daisy chain during the transition,
- when the user plugs a cable out and quickly plugs it back in, e.g.
to reorder a daisy chain (works on Mac OS X if done quickly enough),
- when certain hubs temporarily malfunction during high bus traffic.

The ieee1394 driver's nodemgr already contained a function to set
vanished nodes aside into "limbo"; i.e. they wouldn't actually be
deleted right away. (In fact, only unloading the driver or writing into
an obscure sysfs attribute would delete them eventually.) If nodes
reappeared later, they would be resurrected out of limbo.

Moving nodes into and out of limbo was accompanied with calling the
.suspend() and .resume() driver methods of the drivers which were bound
to a respective node's unit directories. Not only is this somewhat
strange due to the primary use of these drivers for power management,
also the sbp2 driver in particular does not implement .suspend() and
.resume().

Hence sbp2 would be disconnected from devices in situations as listed
above.

We now:
- leave drivers bound when nodes go into limbo,
- call the drivers' .update() when nodes come out of limbo,
- automatically delete in-limbo nodes 5 seconds after the last
bus reset and bus rescan.
- Because of the automatic removal, the now obsolete bus attribute
/sys/bus/ieee1394/destroy_node is removed.

This especially lets sbp2 survive brief disconnections. You can for
example yank a disk's cable and plug it back in while reading the
respective disk with dd, but dd will happily continue as if nothing
happened.

Signed-off-by: Stefan Richter <[email protected]>
---
drivers/ieee1394/nodemgr.c | 146 ++++++++++++-------------------------
drivers/ieee1394/nodemgr.h | 2
2 files changed, 52 insertions(+), 96 deletions(-)

Index: linux-2.6.27-rc3/drivers/ieee1394/nodemgr.c
===================================================================
--- linux-2.6.27-rc3.orig/drivers/ieee1394/nodemgr.c
+++ linux-2.6.27-rc3/drivers/ieee1394/nodemgr.c
@@ -154,7 +154,6 @@ struct host_info {

static int nodemgr_bus_match(struct device * dev, struct device_driver * drv);
static int nodemgr_uevent(struct device *dev, struct kobj_uevent_env *env);
-static void nodemgr_reactivate_ne(struct node_entry *ne);
static void nodemgr_remove_ne(struct node_entry *ne);
static struct node_entry *find_entry_by_guid(u64 guid);

@@ -385,27 +384,6 @@ static ssize_t fw_get_ignore_driver(stru
static DEVICE_ATTR(ignore_driver, S_IWUSR | S_IRUGO, fw_get_ignore_driver, fw_set_ignore_driver);


-static ssize_t fw_set_destroy_node(struct bus_type *bus, const char *buf, size_t count)
-{
- struct node_entry *ne;
- u64 guid = (u64)simple_strtoull(buf, NULL, 16);
-
- ne = find_entry_by_guid(guid);
-
- if (ne == NULL || !ne->in_limbo)
- return -EINVAL;
-
- nodemgr_remove_ne(ne);
-
- return count;
-}
-static ssize_t fw_get_destroy_node(struct bus_type *bus, char *buf)
-{
- return sprintf(buf, "You can destroy in_limbo nodes by writing their GUID to this file\n");
-}
-static BUS_ATTR(destroy_node, S_IWUSR | S_IRUGO, fw_get_destroy_node, fw_set_destroy_node);
-
-
static ssize_t fw_set_rescan(struct bus_type *bus, const char *buf,
size_t count)
{
@@ -442,7 +420,6 @@ static BUS_ATTR(ignore_drivers, S_IWUSR


struct bus_attribute *const fw_bus_attrs[] = {
- &bus_attr_destroy_node,
&bus_attr_rescan,
&bus_attr_ignore_drivers,
NULL
@@ -1300,14 +1277,19 @@ static void nodemgr_update_node(struct n
csr1212_destroy_csr(csr);
}

- if (ne->in_limbo)
- nodemgr_reactivate_ne(ne);
-
/* Mark the node current */
ne->generation = generation;
-}

+ if (ne->in_limbo) {
+ device_remove_file(&ne->device, &dev_attr_ne_in_limbo);
+ ne->in_limbo = false;

+ HPSB_DEBUG("Node reactivated: "
+ "ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
+ NODE_BUS_ARGS(ne->host, ne->nodeid),
+ (unsigned long long)ne->guid);
+ }
+}

static void nodemgr_node_scan_one(struct hpsb_host *host,
nodeid_t nodeid, int generation)
@@ -1392,75 +1374,14 @@ static void nodemgr_node_scan(struct hps
}
}

-static int pause_ne(struct device *dev, void *data)
-{
- struct unit_directory *ud;
- struct device_driver *drv;
- struct node_entry *ne = data;
- int error;
-
- ud = container_of(dev, struct unit_directory, unit_dev);
- if (ud->ne == ne) {
- drv = get_driver(ud->device.driver);
- if (drv) {
- error = 1; /* release if suspend is not implemented */
- if (drv->suspend) {
- down(&ud->device.sem);
- error = drv->suspend(&ud->device, PMSG_SUSPEND);
- up(&ud->device.sem);
- }
- if (error)
- device_release_driver(&ud->device);
- put_driver(drv);
- }
- }
-
- return 0;
-}
-
static void nodemgr_pause_ne(struct node_entry *ne)
{
- HPSB_DEBUG("Node suspended: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
+ HPSB_DEBUG("Node paused: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
NODE_BUS_ARGS(ne->host, ne->nodeid),
(unsigned long long)ne->guid);

- ne->in_limbo = 1;
+ ne->in_limbo = true;
WARN_ON(device_create_file(&ne->device, &dev_attr_ne_in_limbo));
-
- class_for_each_device(&nodemgr_ud_class, NULL, ne, pause_ne);
-}
-
-static int reactivate_ne(struct device *dev, void *data)
-{
- struct unit_directory *ud;
- struct device_driver *drv;
- struct node_entry *ne = data;
-
- ud = container_of(dev, struct unit_directory, unit_dev);
- if (ud->ne == ne) {
- drv = get_driver(ud->device.driver);
- if (drv) {
- if (drv->resume) {
- down(&ud->device.sem);
- drv->resume(&ud->device);
- up(&ud->device.sem);
- }
- put_driver(drv);
- }
- }
-
- return 0;
-}
-
-static void nodemgr_reactivate_ne(struct node_entry *ne)
-{
- ne->in_limbo = 0;
- device_remove_file(&ne->device, &dev_attr_ne_in_limbo);
-
- class_for_each_device(&nodemgr_ud_class, NULL, ne, reactivate_ne);
- HPSB_DEBUG("Node resumed: ID:BUS[" NODE_BUS_FMT "] GUID[%016Lx]",
- NODE_BUS_ARGS(ne->host, ne->nodeid),
- (unsigned long long)ne->guid);
}

static int update_pdrv(struct device *dev, void *data)
@@ -1497,7 +1418,6 @@ static void nodemgr_update_pdrv(struct n
class_for_each_device(&nodemgr_ud_class, NULL, ne, update_pdrv);
}

-
/* Write the BROADCAST_CHANNEL as per IEEE1394a 8.3.2.3.11 and 8.4.2.3. This
* seems like an optional service but in the end it is practically mandatory
* as a consequence of these clauses.
@@ -1574,7 +1494,7 @@ static int node_probe(struct device *dev
return 0;
}

-static void nodemgr_node_probe(struct hpsb_host *host, int generation)
+static int nodemgr_node_probe(struct hpsb_host *host, int generation)
{
struct node_probe_parameter p;

@@ -1595,11 +1515,11 @@ static void nodemgr_node_probe(struct hp
*/
p.probe_now = false;
if (class_for_each_device(&nodemgr_ne_class, NULL, &p, node_probe) != 0)
- return;
+ return 0;

p.probe_now = true;
if (class_for_each_device(&nodemgr_ne_class, NULL, &p, node_probe) != 0)
- return;
+ return 0;
/*
* Now let's tell the bus to rescan our devices. This may seem
* like overhead, but the driver-model core will only scan a
@@ -1611,6 +1531,27 @@ static void nodemgr_node_probe(struct hp
*/
if (bus_rescan_devices(&ieee1394_bus_type) != 0)
HPSB_DEBUG("bus_rescan_devices had an error");
+
+ return 1;
+}
+
+static int remove_nodes_in_limbo(struct device *dev, void *data)
+{
+ struct node_entry *ne;
+
+ if (dev->bus != &ieee1394_bus_type)
+ return 0;
+
+ ne = container_of(dev, struct node_entry, device);
+ if (ne->in_limbo)
+ nodemgr_remove_ne(ne);
+
+ return 0;
+}
+
+static void nodemgr_remove_nodes_in_limbo(struct hpsb_host *host)
+{
+ device_for_each_child(&host->device, NULL, remove_nodes_in_limbo);
}

static int nodemgr_send_resume_packet(struct hpsb_host *host)
@@ -1781,10 +1722,25 @@ static int nodemgr_host_thread(void *dat

/* This actually does the full probe, with sysfs
* registration. */
- nodemgr_node_probe(host, generation);
+ if (!nodemgr_node_probe(host, generation))
+ continue;

/* Update some of our sysfs symlinks */
nodemgr_update_host_dev_links(host);
+
+ /* Sleep 5 seconds */
+ for (i = 0; i < 5000/100 ; i++) {
+ msleep_interruptible(100);
+ if (kthread_should_stop())
+ goto exit;
+
+ if (generation != get_hpsb_generation(host))
+ break;
+ }
+
+ /* Remove nodes which are gone, unless a bus reset happened */
+ if (i == 5000/100)
+ nodemgr_remove_nodes_in_limbo(host);
}
exit:
HPSB_VERBOSE("NodeMgr: Exiting thread");
Index: linux-2.6.27-rc3/drivers/ieee1394/nodemgr.h
===================================================================
--- linux-2.6.27-rc3.orig/drivers/ieee1394/nodemgr.h
+++ linux-2.6.27-rc3/drivers/ieee1394/nodemgr.h
@@ -110,7 +110,7 @@ struct node_entry {
struct device node_dev;

/* Means this node is not attached anymore */
- int in_limbo;
+ bool in_limbo;

struct csr1212_csr *csr;
};

--
Stefan Richter
-=====-==--- =--- =--==
http://arcgraph.de/sr/

2008-10-10 17:13:54

by Stefan Richter

[permalink] [raw]
Subject: [patch amendment] ieee1394: survive a few seconds connection loss

On 19 Aug, Stefan Richter wrote:
> There are situations when nodes vanish from the bus and come back in
> quickly thereafter:
> - When certain bus-powered hubs are plugged in,
> - when certain disk enclosures are switched from self-power to bus
> power or vice versa and break the daisy chain during the transition,
> - when the user plugs a cable out and quickly plugs it back in, e.g.
> to reorder a daisy chain (works on Mac OS X if done quickly enough),
> - when certain hubs temporarily malfunction during high bus traffic.
>
> The ieee1394 driver's nodemgr already contained a function to set
> vanished nodes aside into "limbo"; i.e. they wouldn't actually be
> deleted right away. (In fact, only unloading the driver or writing into
> an obscure sysfs attribute would delete them eventually.) If nodes
> reappeared later, they would be resurrected out of limbo.
>
> Moving nodes into and out of limbo was accompanied with calling the
> .suspend() and .resume() driver methods of the drivers which were bound
> to a respective node's unit directories. Not only is this somewhat
> strange due to the primary use of these drivers for power management,
> also the sbp2 driver in particular does not implement .suspend() and
> .resume().
>
> Hence sbp2 would be disconnected from devices in situations as listed
> above.
>
> We now:
> - leave drivers bound when nodes go into limbo,
> - call the drivers' .update() when nodes come out of limbo,
> - automatically delete in-limbo nodes 5 seconds after the last
> bus reset and bus rescan.
> - Because of the automatic removal, the now obsolete bus attribute
> /sys/bus/ieee1394/destroy_node is removed.
>
> This especially lets sbp2 survive brief disconnections. You can for
> example yank a disk's cable and plug it back in while reading the
> respective disk with dd, but dd will happily continue as if nothing
> happened.

Amendment: Reduce timeout from 5 to 3 seconds. This is enough because
the timeout is restarted if another bus reset happened during the
timeout.

Signed-off-by: Stefan Richter <[email protected]>
---
drivers/ieee1394/nodemgr.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

Index: linux/drivers/ieee1394/nodemgr.c
===================================================================
--- linux.orig/drivers/ieee1394/nodemgr.c
+++ linux/drivers/ieee1394/nodemgr.c
@@ -1726,18 +1726,17 @@ static int nodemgr_host_thread(void *dat
/* Update some of our sysfs symlinks */
nodemgr_update_host_dev_links(host);

- /* Sleep 5 seconds */
- for (i = 0; i < 5000/100 ; i++) {
- msleep_interruptible(100);
+ /* Sleep 3 seconds */
+ for (i = 3000/200; i; i--) {
+ msleep_interruptible(200);
if (kthread_should_stop())
goto exit;

if (generation != get_hpsb_generation(host))
break;
}
-
/* Remove nodes which are gone, unless a bus reset happened */
- if (i == 5000/100)
+ if (!i)
nodemgr_remove_nodes_in_limbo(host);
}
exit:


--
Stefan Richter
-=====-==--- =-=- -=-=-
http://arcgraph.de/sr/