2015-02-03 17:00:52

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 0/4] Drivers: hv: Further protection for the rescind path

This series is a continuation of the "Drivers: hv: vmbus: serialize Offer and
Rescind offer". I'm trying to address a number of theoretically possible issues
with rescind offer handling. All these complications come from the fact that
a rescind offer results in vmbus channel being freed and we must ensure nobody
still uses it. Instead of introducing new locks I suggest we switch channels
usage to the get/put workflow.

The main part of the series is [PATCH 1/4] which introduces the workflow for
vmbus channels, all other patches fix different corner cases using this
workflow. I'm not sure all such cases are covered with this series (probably
not), but in case protection is required in some other places it should become
relatively easy to add one.

I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing popped out,
however, additional testing would be much appreciated.

K.Y., Haiyang, I'm not sending this series to netdev@ and linux-scsi@ as it is
supposed to be applied as a whole, please resend these patches with your
sign-offs when (and if) we're done with reviews. Thanks!

Vitaly Kuznetsov (4):
Drivers: hv: vmbus: implement get/put usage workflow for vmbus
channels
Drivers: hv: vmbus: do not lose rescind offer on failure in
vmbus_process_offer()
Drivers: hv: vmbus: protect vmbus_get_outgoing_channel() against
channel removal
hyperv: netvsc: improve protection against rescind offer

drivers/hv/channel_mgmt.c | 75 +++++++++++++++++++++++++++++++++++++--------
drivers/hv/connection.c | 7 +++--
drivers/hv/hyperv_vmbus.h | 4 +++
drivers/net/hyperv/netvsc.c | 10 ++++--
drivers/scsi/storvsc_drv.c | 2 ++
include/linux/hyperv.h | 13 ++++++++
6 files changed, 95 insertions(+), 16 deletions(-)

--
1.9.3


2015-02-03 17:01:24

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels

free_channel() function frees the channel unconditionally so we need to make
sure nobody has any link to it. This is not trivial and there are several
examples of races we have:

1) In vmbus_onoffer_rescind() we check for channel existence with
relid2channel() and then use it. This can go wrong if we're in the middle
of channel removal (free_channel() was already called).

2) In process_chn_event() we check for channel existence with
pcpu_relid2channel() and then use it. This can also go wrong.

3) vmbus_free_channels() just frees all channels, in case we're in the middle
of vmbus_process_rescind_offer() crash is possible.

The issue can be solved by holding vmbus_connection.channel_lock everywhere,
however, it looks like a way to deadlocks and performance degradation. Get/put
workflow fits here the best.

Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
free_channel().

Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
drivers/hv/channel_mgmt.c | 45 ++++++++++++++++++++++++++++++++++++++-------
drivers/hv/connection.c | 7 +++++--
drivers/hv/hyperv_vmbus.h | 4 ++++
include/linux/hyperv.h | 13 +++++++++++++
4 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 36bacc7..eb9ce94 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
return NULL;

channel->id = atomic_inc_return(&chan_num);
+ atomic_set(&channel->count, 1);
+
spin_lock_init(&channel->inbound_lock);
spin_lock_init(&channel->lock);

@@ -178,19 +180,47 @@ static void release_channel(struct work_struct *work)
}

/*
- * free_channel - Release the resources used by the vmbus channel object
+ * vmbus_put_channel - Decrease the channel usage counter and release the
+ * resources when this counter reaches zero.
*/
-static void free_channel(struct vmbus_channel *channel)
+void vmbus_put_channel(struct vmbus_channel *channel)
{
+ unsigned long flags;

/*
* We have to release the channel's workqueue/thread in the vmbus's
* workqueue/thread context
* ie we can't destroy ourselves.
*/
- INIT_WORK(&channel->work, release_channel);
- queue_work(vmbus_connection.work_queue, &channel->work);
+ spin_lock_irqsave(&channel->lock, flags);
+ if (atomic_dec_and_test(&channel->count)) {
+ channel->dying = true;
+ INIT_WORK(&channel->work, release_channel);
+ spin_unlock_irqrestore(&channel->lock, flags);
+ queue_work(vmbus_connection.work_queue, &channel->work);
+ } else
+ spin_unlock_irqrestore(&channel->lock, flags);
+}
+EXPORT_SYMBOL_GPL(vmbus_put_channel);
+
+/* vmbus_get_channel - Get additional reference to the channel */
+struct vmbus_channel *vmbus_get_channel(struct vmbus_channel *channel)
+{
+ unsigned long flags;
+ struct vmbus_channel *ret = NULL;
+
+ if (!channel)
+ return NULL;
+
+ spin_lock_irqsave(&channel->lock, flags);
+ if (!channel->dying) {
+ atomic_inc(&channel->count);
+ ret = channel;
+ }
+ spin_unlock_irqrestore(&channel->lock, flags);
+ return ret;
}
+EXPORT_SYMBOL_GPL(vmbus_get_channel);

static void percpu_channel_enq(void *arg)
{
@@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct work_struct *work)
list_del(&channel->sc_list);
spin_unlock_irqrestore(&primary_channel->lock, flags);
}
- free_channel(channel);
+ vmbus_put_channel(channel);
}

void vmbus_free_channels(void)
@@ -262,7 +292,7 @@ void vmbus_free_channels(void)

list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
vmbus_device_unregister(channel->device_obj);
- free_channel(channel);
+ vmbus_put_channel(channel);
}
}

@@ -391,7 +421,7 @@ done_init_rescind:
spin_unlock_irqrestore(&newchannel->lock, flags);
return;
err_free_chan:
- free_channel(newchannel);
+ vmbus_put_channel(newchannel);
}

enum {
@@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
queue_work(channel->controlwq, &channel->work);

spin_unlock_irqrestore(&channel->lock, flags);
+ vmbus_put_channel(channel);
}

/*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index c4acd1c..d1ce134 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -247,7 +247,8 @@ void vmbus_disconnect(void)
* Map the given relid to the corresponding channel based on the
* per-cpu list of channels that have been affinitized to this CPU.
* This will be used in the channel callback path as we can do this
- * mapping in a lock-free fashion.
+ * mapping in a lock-free fashion. Takes additional reference to the
+ * channel, all users are supposed to do vmbus_put_channel().
*/
static struct vmbus_channel *pcpu_relid2channel(u32 relid)
{
@@ -263,7 +264,7 @@ static struct vmbus_channel *pcpu_relid2channel(u32 relid)
}
}

- return found_channel;
+ return vmbus_get_channel(found_channel);
}

/*
@@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
}
}
}
+ found_channel = vmbus_get_channel(found_channel);
spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);

return found_channel;
@@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
pr_err("no channel callback for relid - %u\n", relid);
}

+ vmbus_put_channel(channel);
}

/*
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index b055e53..40d70f0 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device *device_obj);
/* VmbusChildDeviceDestroy( */
/* struct hv_device *); */

+/*
+ * Get the channel by its relid. Takes additional reference to the channel so
+ * all users are supposed to do vmbus_put_channel() when they're done.
+ */
struct vmbus_channel *relid2channel(u32 relid);

void vmbus_free_channels(void);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index e73cfeb..c576d2d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -649,6 +649,9 @@ struct vmbus_channel {
/* Unique channel id */
int id;

+ /* Active reference count */
+ atomic_t count;
+
struct list_head listentry;

struct hv_device *device_obj;
@@ -666,6 +669,7 @@ struct vmbus_channel {
u8 monitor_bit;

bool rescind; /* got rescind msg */
+ bool dying; /* channel is dying */

u32 ringbuffer_gpadlhandle;

@@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct vmbus_channel *channel,

extern void vmbus_ontimer(unsigned long data);

+/*
+ * Decrease reference count for the channel. Frees the channel when its usage
+ * count reaches zero.
+ */
+extern void vmbus_put_channel(struct vmbus_channel *channel);
+
+/* Get additional reference to the channel */
+extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel *channel);
+
/* Base driver object */
struct hv_driver {
const char *name;
--
1.9.3

2015-02-03 17:01:03

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 2/4] Drivers: hv: vmbus: do not lose rescind offer on failure in vmbus_process_offer()

In case we hit a failure condition in vmbus_process_offer() and a rescind offer
was pending for the channel we just do free_channel() so CHANNELMSG_RELID_RELEASED
will never be send to the host. We have to follow vmbus_process_rescind_offer()
path anyway.

To support the change we need to protect list_del in vmbus_process_rescind_offer()
hitting an uninitialized list.

Reported-by: Dexuan Cui <[email protected]>
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
drivers/hv/channel_mgmt.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index eb9ce94..fdccd16 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -152,6 +152,7 @@ static struct vmbus_channel *alloc_channel(void)
spin_lock_init(&channel->inbound_lock);
spin_lock_init(&channel->lock);

+ INIT_LIST_HEAD(&channel->listentry);
INIT_LIST_HEAD(&channel->sc_list);
INIT_LIST_HEAD(&channel->percpu_list);

@@ -308,6 +309,7 @@ static void vmbus_process_offer(struct work_struct *work)
struct vmbus_channel *channel;
bool fnew = true;
bool enq = false;
+ bool failure = false;
int ret;
unsigned long flags;

@@ -408,19 +410,33 @@ static void vmbus_process_offer(struct work_struct *work)
spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
list_del(&newchannel->listentry);
spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+ /*
+ * Init listentry again as vmbus_process_rescind_offer can try
+ * doing list_del again.
+ */
+ INIT_LIST_HEAD(&channel->listentry);
kfree(newchannel->device_obj);
+ newchannel->device_obj = NULL;
goto err_free_chan;
}
+ goto done_init_rescind;
+err_free_chan:
+ failure = true;
done_init_rescind:
+ /*
+ * Get additional reference as vmbus_put_channel() can be called
+ * either directly or through vmbus_process_rescind_offer().
+ */
+ vmbus_get_channel(newchannel);
spin_lock_irqsave(&newchannel->lock, flags);
/* The next possible work is rescind handling */
INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
/* Check if rescind offer was already received */
if (newchannel->rescind)
queue_work(newchannel->controlwq, &newchannel->work);
+ else if (failure)
+ vmbus_put_channel(newchannel);
spin_unlock_irqrestore(&newchannel->lock, flags);
- return;
-err_free_chan:
vmbus_put_channel(newchannel);
}

--
1.9.3

2015-02-03 17:01:27

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 3/4] Drivers: hv: vmbus: protect vmbus_get_outgoing_channel() against channel removal

list_for_each_safe() we have in vmbus_get_outgoing_channel() works, however, we
are not protected against the channel being removed (e.g. after receiving rescind
offer). Users of this function (storvsc_do_io() is the only one at this moment)
can get a link to an already freed channel. Make vmbus_get_outgoing_channel()
search holding primary->lock as child channels are not being freed unless they're
removed from parent's list.

Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
drivers/hv/channel_mgmt.c | 10 +++++++---
drivers/scsi/storvsc_drv.c | 2 ++
2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index fdccd16..af6243c 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -881,18 +881,20 @@ cleanup:
*/
struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
{
- struct list_head *cur, *tmp;
+ struct list_head *cur;
int cur_cpu;
struct vmbus_channel *cur_channel;
struct vmbus_channel *outgoing_channel = primary;
int cpu_distance, new_cpu_distance;
+ unsigned long flags;

if (list_empty(&primary->sc_list))
- return outgoing_channel;
+ return vmbus_get_channel(outgoing_channel);

cur_cpu = hv_context.vp_index[get_cpu()];
put_cpu();
- list_for_each_safe(cur, tmp, &primary->sc_list) {
+ spin_lock_irqsave(&primary->lock, flags);
+ list_for_each(cur, &primary->sc_list) {
cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
if (cur_channel->state != CHANNEL_OPENED_STATE)
continue;
@@ -913,6 +915,8 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)

outgoing_channel = cur_channel;
}
+ outgoing_channel = vmbus_get_channel(outgoing_channel);
+ spin_unlock_irqrestore(&primary->lock, flags);

return outgoing_channel;
}
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 4cff0dd..3b9b851 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1370,6 +1370,8 @@ static int storvsc_do_io(struct hv_device *device,
VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
}

+ vmbus_put_channel(outgoing_channel);
+
if (ret != 0)
return ret;

--
1.9.3

2015-02-03 17:00:58

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 4/4] hyperv: netvsc: improve protection against rescind offer

The check added in commit c3582a2c4d0b ("hyperv: Add support for vNIC hot
removal") is incomplete as there is no synchronization between
vmbus_onoffer_rescind() and netvsc_send(). In case we get the offer after we
checked out_channel->rescind and before netvsc_send() finishes its job we can
get a crash as we'll be dealing with already freed channel.

Make netvsc_send() take additional reference to the channel with newly
introduced vmbus_get_channel(), this guarantees we won't lose the channel. We
can still get rescind while we're processing but this won't cause a crash.

Reported-by: Jason Wang <[email protected]>
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
drivers/net/hyperv/netvsc.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 9f49c01..d9b13a1 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -763,11 +763,16 @@ int netvsc_send(struct hv_device *device,
out_channel = net_device->chn_table[packet->q_idx];
if (out_channel == NULL)
out_channel = device->channel;
- packet->channel = out_channel;
+ packet->channel = vmbus_get_channel(out_channel);

- if (out_channel->rescind)
+ if (!packet->channel)
return -ENODEV;

+ if (out_channel->rescind) {
+ vmbus_put_channel(out_channel);
+ return -ENODEV;
+ }
+
if (packet->page_buf_cnt) {
ret = vmbus_sendpacket_pagebuffer(out_channel,
packet->page_buf,
@@ -810,6 +815,7 @@ int netvsc_send(struct hv_device *device,
packet, ret);
}

+ vmbus_put_channel(packet->channel);
return ret;
}

--
1.9.3

2015-02-04 07:43:45

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 3/4] Drivers: hv: vmbus: protect vmbus_get_outgoing_channel() against channel removal

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Wednesday, February 4, 2015 1:01 AM
> To: KY Srinivasan; [email protected]
> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> Subject: [PATCH 3/4] Drivers: hv: vmbus: protect vmbus_get_outgoing_channel()
> against channel removal
>
> list_for_each_safe() we have in vmbus_get_outgoing_channel() works, however,
> we
> are not protected against the channel being removed (e.g. after receiving
> rescind
> offer). Users of this function (storvsc_do_io() is the only one at this moment)
> can get a link to an already freed channel. Make vmbus_get_outgoing_channel()
> search holding primary->lock as child channels are not being freed unless they're
> removed from parent's list.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> drivers/hv/channel_mgmt.c | 10 +++++++---
> drivers/scsi/storvsc_drv.c | 2 ++
> 2 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index fdccd16..af6243c 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -881,18 +881,20 @@ cleanup:
> */
> struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel
> *primary)
> {
> - struct list_head *cur, *tmp;
> + struct list_head *cur;
> int cur_cpu;
> struct vmbus_channel *cur_channel;
> struct vmbus_channel *outgoing_channel = primary;
> int cpu_distance, new_cpu_distance;
> + unsigned long flags;
>
> if (list_empty(&primary->sc_list))
> - return outgoing_channel;
> + return vmbus_get_channel(outgoing_channel);
>
> cur_cpu = hv_context.vp_index[get_cpu()];
> put_cpu();
> - list_for_each_safe(cur, tmp, &primary->sc_list) {
> + spin_lock_irqsave(&primary->lock, flags);
hmm, we should avoid the locking here because it's a performance killer...
How about adding vmbus_get/put_channel() in vmbus_open/close()?

Thanks,
-- Dexuan

> + list_for_each(cur, &primary->sc_list) {
> cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
> if (cur_channel->state != CHANNEL_OPENED_STATE)
> continue;
> @@ -913,6 +915,8 @@ struct vmbus_channel
> *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
>
> outgoing_channel = cur_channel;
> }
> + outgoing_channel = vmbus_get_channel(outgoing_channel);
> + spin_unlock_irqrestore(&primary->lock, flags);
>
> return outgoing_channel;
> }
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index 4cff0dd..3b9b851 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1370,6 +1370,8 @@ static int storvsc_do_io(struct hv_device *device,
>
> VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
> }
>
> + vmbus_put_channel(outgoing_channel);
> +
> if (ret != 0)
> return ret;
>
> --
> 1.9.3

2015-02-04 07:29:28

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 4/4] hyperv: netvsc: improve protection against rescind offer

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Wednesday, February 4, 2015 1:01 AM
> To: KY Srinivasan; [email protected]
> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> Subject: [PATCH 4/4] hyperv: netvsc: improve protection against rescind offer
>
> The check added in commit c3582a2c4d0b ("hyperv: Add support for vNIC hot
> removal") is incomplete as there is no synchronization between
> vmbus_onoffer_rescind() and netvsc_send(). In case we get the offer after we
> checked out_channel->rescind and before netvsc_send() finishes its job we can
> get a crash as we'll be dealing with already freed channel.
>
> Make netvsc_send() take additional reference to the channel with newly
> introduced vmbus_get_channel(), this guarantees we won't lose the channel.
> We
> can still get rescind while we're processing but this won't cause a crash.
>
> Reported-by: Jason Wang <[email protected]>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> drivers/net/hyperv/netvsc.c | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 9f49c01..d9b13a1 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -763,11 +763,16 @@ int netvsc_send(struct hv_device *device,
> out_channel = net_device->chn_table[packet->q_idx];
> if (out_channel == NULL)
> out_channel = device->channel;
> - packet->channel = out_channel;
> + packet->channel = vmbus_get_channel(out_channel);
>
> - if (out_channel->rescind)
> + if (!packet->channel)
> return -ENODEV;
>
> + if (out_channel->rescind) {
> + vmbus_put_channel(out_channel);

IMO the patch doesn't resolve the real issue.
At most it prevents the channel from disappearing in netvsc_send() only,
while actually we also need to make sure the channel is not freed before the
driver runs netvsc_remove() -> rndis_filter_device_remove() ->
-> netvsc_device_remove() -> vmbus_close().

I suggest we add vmbus_get/put_channel() in vmbus_open/close()?

-- Dexuan

> + return -ENODEV;
> + }
> +
> if (packet->page_buf_cnt) {
> ret = vmbus_sendpacket_pagebuffer(out_channel,
> packet->page_buf,
> @@ -810,6 +815,7 @@ int netvsc_send(struct hv_device *device,
> packet, ret);
> }
>
> + vmbus_put_channel(packet->channel);
> return ret;
> }
>
> --

2015-02-04 07:42:57

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 2/4] Drivers: hv: vmbus: do not lose rescind offer on failure in vmbus_process_offer()

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Wednesday, February 4, 2015 1:01 AM
> To: KY Srinivasan; [email protected]
> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> Subject: [PATCH 2/4] Drivers: hv: vmbus: do not lose rescind offer on failure in
> vmbus_process_offer()
>
> In case we hit a failure condition in vmbus_process_offer() and a rescind offer
> was pending for the channel we just do free_channel() so
> CHANNELMSG_RELID_RELEASED
> will never be send to the host. We have to follow vmbus_process_rescind_offer()
> path anyway.
>
> To support the change we need to protect list_del in
> vmbus_process_rescind_offer()
> hitting an uninitialized list.
>
> Reported-by: Dexuan Cui <[email protected]>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> drivers/hv/channel_mgmt.c | 20 ++++++++++++++++++--
> 1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index eb9ce94..fdccd16 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -152,6 +152,7 @@ static struct vmbus_channel *alloc_channel(void)
> spin_lock_init(&channel->inbound_lock);
> spin_lock_init(&channel->lock);
>
> + INIT_LIST_HEAD(&channel->listentry);
> INIT_LIST_HEAD(&channel->sc_list);
> INIT_LIST_HEAD(&channel->percpu_list);
>
> @@ -308,6 +309,7 @@ static void vmbus_process_offer(struct work_struct
> *work)
> struct vmbus_channel *channel;
> bool fnew = true;
> bool enq = false;
> + bool failure = false;
> int ret;
> unsigned long flags;
>
> @@ -408,19 +410,33 @@ static void vmbus_process_offer(struct work_struct
> *work)
> spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
> list_del(&newchannel->listentry);
> spin_unlock_irqrestore(&vmbus_connection.channel_lock,
> flags);
> + /*
> + * Init listentry again as vmbus_process_rescind_offer can try
> + * doing list_del again.
> + */
> + INIT_LIST_HEAD(&channel->listentry);
> kfree(newchannel->device_obj);
> + newchannel->device_obj = NULL;
> goto err_free_chan;
> }
> + goto done_init_rescind;
> +err_free_chan:
> + failure = true;
> done_init_rescind:
> + /*
> + * Get additional reference as vmbus_put_channel() can be called
> + * either directly or through vmbus_process_rescind_offer().
> + */
> + vmbus_get_channel(newchannel);
> spin_lock_irqsave(&newchannel->lock, flags);
here we get the lock.

> /* The next possible work is rescind handling */
> INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
> /* Check if rescind offer was already received */
> if (newchannel->rescind)
> queue_work(newchannel->controlwq, &newchannel->work);
> + else if (failure)
> + vmbus_put_channel(newchannel);
Here in vmbus_put_channel(), we try to get the same spinlock -- dead lock.

-- Dexuan

> spin_unlock_irqrestore(&newchannel->lock, flags);
> - return;
> -err_free_chan:
> vmbus_put_channel(newchannel);
> }
>
> --
> 1.9.3

2015-02-04 08:18:46

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Wednesday, February 4, 2015 1:01 AM
> To: KY Srinivasan; [email protected]
> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> Subject: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for
> vmbus channels
>
> free_channel() function frees the channel unconditionally so we need to make
> sure nobody has any link to it. This is not trivial and there are several
> examples of races we have:
>
> 1) In vmbus_onoffer_rescind() we check for channel existence with
> relid2channel() and then use it. This can go wrong if we're in the middle
> of channel removal (free_channel() was already called).
>
> 2) In process_chn_event() we check for channel existence with
> pcpu_relid2channel() and then use it. This can also go wrong.
>
> 3) vmbus_free_channels() just frees all channels, in case we're in the middle
> of vmbus_process_rescind_offer() crash is possible.
>
> The issue can be solved by holding vmbus_connection.channel_lock everywhere,
> however, it looks like a way to deadlocks and performance degradation. Get/put
> workflow fits here the best.
>
> Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
> free_channel().
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> drivers/hv/channel_mgmt.c | 45
> ++++++++++++++++++++++++++++++++++++++-------
> drivers/hv/connection.c | 7 +++++--
> drivers/hv/hyperv_vmbus.h | 4 ++++
> include/linux/hyperv.h | 13 +++++++++++++
> 4 files changed, 60 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 36bacc7..eb9ce94 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
> return NULL;
>
> channel->id = atomic_inc_return(&chan_num);
> + atomic_set(&channel->count, 1);
> +
> spin_lock_init(&channel->inbound_lock);
> spin_lock_init(&channel->lock);
>
> @@ -178,19 +180,47 @@ static void release_channel(struct work_struct *work)
> }
>
> /*
> - * free_channel - Release the resources used by the vmbus channel object
> + * vmbus_put_channel - Decrease the channel usage counter and release the
> + * resources when this counter reaches zero.
> */
> -static void free_channel(struct vmbus_channel *channel)
> +void vmbus_put_channel(struct vmbus_channel *channel)
> {
> + unsigned long flags;
>
> /*
> * We have to release the channel's workqueue/thread in the vmbus's
> * workqueue/thread context
> * ie we can't destroy ourselves.
> */
> - INIT_WORK(&channel->work, release_channel);
> - queue_work(vmbus_connection.work_queue, &channel->work);
> + spin_lock_irqsave(&channel->lock, flags);
> + if (atomic_dec_and_test(&channel->count)) {
> + channel->dying = true;
> + INIT_WORK(&channel->work, release_channel);
> + spin_unlock_irqrestore(&channel->lock, flags);
> + queue_work(vmbus_connection.work_queue, &channel->work);
> + } else
> + spin_unlock_irqrestore(&channel->lock, flags);
> +}
> +EXPORT_SYMBOL_GPL(vmbus_put_channel);
> +
> +/* vmbus_get_channel - Get additional reference to the channel */
> +struct vmbus_channel *vmbus_get_channel(struct vmbus_channel *channel)
> +{
> + unsigned long flags;
> + struct vmbus_channel *ret = NULL;
> +
> + if (!channel)
> + return NULL;
> +
> + spin_lock_irqsave(&channel->lock, flags);
> + if (!channel->dying) {
> + atomic_inc(&channel->count);
> + ret = channel;
> + }
> + spin_unlock_irqrestore(&channel->lock, flags);
> + return ret;
> }
> +EXPORT_SYMBOL_GPL(vmbus_get_channel);
>
> static void percpu_channel_enq(void *arg)
> {
> @@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct
> work_struct *work)
> list_del(&channel->sc_list);
> spin_unlock_irqrestore(&primary_channel->lock, flags);
> }
> - free_channel(channel);
> + vmbus_put_channel(channel);
> }
>
> void vmbus_free_channels(void)
> @@ -262,7 +292,7 @@ void vmbus_free_channels(void)
>
> list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> vmbus_device_unregister(channel->device_obj);
> - free_channel(channel);
> + vmbus_put_channel(channel);
> }
> }
>
> @@ -391,7 +421,7 @@ done_init_rescind:
> spin_unlock_irqrestore(&newchannel->lock, flags);
> return;
> err_free_chan:
> - free_channel(newchannel);
> + vmbus_put_channel(newchannel);
> }
>
> enum {
> @@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct
> vmbus_channel_message_header *hdr)
> queue_work(channel->controlwq, &channel->work);
>
> spin_unlock_irqrestore(&channel->lock, flags);
> + vmbus_put_channel(channel);
> }
>
> /*
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index c4acd1c..d1ce134 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -247,7 +247,8 @@ void vmbus_disconnect(void)
> * Map the given relid to the corresponding channel based on the
> * per-cpu list of channels that have been affinitized to this CPU.
> * This will be used in the channel callback path as we can do this
> - * mapping in a lock-free fashion.
> + * mapping in a lock-free fashion. Takes additional reference to the
> + * channel, all users are supposed to do vmbus_put_channel().
> */
> static struct vmbus_channel *pcpu_relid2channel(u32 relid)
> {
> @@ -263,7 +264,7 @@ static struct vmbus_channel *pcpu_relid2channel(u32
> relid)
> }
> }
>
> - return found_channel;
> + return vmbus_get_channel(found_channel);
> }
>
> /*
> @@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
> }
> }
> }
> + found_channel = vmbus_get_channel(found_channel);
> spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
>
> return found_channel;
> @@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
> pr_err("no channel callback for relid - %u\n", relid);
> }
>
> + vmbus_put_channel(channel);
> }
>
> /*
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index b055e53..40d70f0 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device
> *device_obj);
> /* VmbusChildDeviceDestroy( */
> /* struct hv_device *); */
>
> +/*
> + * Get the channel by its relid. Takes additional reference to the channel so
> + * all users are supposed to do vmbus_put_channel() when they're done.
> + */
> struct vmbus_channel *relid2channel(u32 relid);
>
> void vmbus_free_channels(void);
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index e73cfeb..c576d2d 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -649,6 +649,9 @@ struct vmbus_channel {
> /* Unique channel id */
> int id;
>
> + /* Active reference count */
> + atomic_t count;
> +
> struct list_head listentry;
>
> struct hv_device *device_obj;
> @@ -666,6 +669,7 @@ struct vmbus_channel {
> u8 monitor_bit;
>
> bool rescind; /* got rescind msg */
> + bool dying; /* channel is dying */
>
> u32 ringbuffer_gpadlhandle;
>
> @@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct
> vmbus_channel *channel,
>
> extern void vmbus_ontimer(unsigned long data);
>
> +/*
> + * Decrease reference count for the channel. Frees the channel when its usage
> + * count reaches zero.
> + */
> +extern void vmbus_put_channel(struct vmbus_channel *channel);
> +
> +/* Get additional reference to the channel */
> +extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
> *channel);
> +
> /* Base driver object */
> struct hv_driver {
> const char *name;
> --

Hi Vitaly,
Thanks for the patchset!
I once tried to do the same work, but just didn't find enough time. :-)

Please see my below questions:

Since we always get channel->lock when accessing channel->count, I don't
think the counter has to be of atomic_t?

I suggest we add vmbus_get/put_channel() in vmbus_open/close().
In this way, IMO patch 3/4 and 4/4 would be unnecessary?

In vmbus_exit(), I suspect we should swap the order of the 2 lines:
vmbus_free_channels();
bus_unregister(&hv_bus);
I suppose in bus_unregister(), the .remove callbacks of all the drivers are
invoked, and vmbus_close() is invoked for every channel.


BTW, I just noticed: alloc_channel() -> alloc_workqueue(,,max_active==1,):
due to max_active==1 here, a channel's process_offer() and
process_rescind_offer() is already serialized.
I'm not sure if this fact can be used to simplify the logic?
Is the previous commit "Drivers: hv: vmbus: serialize Offer and Rescind offer"
necessary?

BTW2, I think there is an unsafe race in the 2 paths (this is an existing issue,
not related to this patch of yours):

vmbus_on_event() -> process_chn_event() -> pcpu_relid2channel ->
list_for_each_entry(channel, pcpu_head, percpu_list)

and

vmbus_process_rescind_offer() -> percpu_channel_deq() ->
list_del(&channel->percpu_list)

because the former is running in tasklet and hence can preempt the latter (running
in process context). percpu_channel_enq() should have the same issue(?)

We should add local_bh_disable()/enable() accordingly?
Can you please help to fix this?

vmbus_exit() -> vmbus_free_channels() -> vmbus_put_channel() may have
a race with vmbus_process_rescind_offer()?
After vmbus_process_rescind_offer() -> vmbus_device_unregister(), the count
can already be the initial 1, later vmbus_free_channels() -> vmbus_put_channel() can
kfree the channel before vmbus_process_rescind_offer() completely finishes?

Thanks,
-- Dexuan

2015-02-04 09:14:44

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels



On Wed, Feb 4, 2015 at 1:00 AM, Vitaly Kuznetsov <[email protected]>
wrote:
> free_channel() function frees the channel unconditionally so we need
> to make
> sure nobody has any link to it. This is not trivial and there are
> several
> examples of races we have:
>
> 1) In vmbus_onoffer_rescind() we check for channel existence with
> relid2channel() and then use it. This can go wrong if we're in the
> middle
> of channel removal (free_channel() was already called).
>
> 2) In process_chn_event() we check for channel existence with
> pcpu_relid2channel() and then use it. This can also go wrong.
>
> 3) vmbus_free_channels() just frees all channels, in case we're in
> the middle
> of vmbus_process_rescind_offer() crash is possible.
>
> The issue can be solved by holding vmbus_connection.channel_lock
> everywhere,
> however, it looks like a way to deadlocks and performance
> degradation. Get/put
> workflow fits here the best.
>
> Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
> free_channel().
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> drivers/hv/channel_mgmt.c | 45
> ++++++++++++++++++++++++++++++++++++++-------
> drivers/hv/connection.c | 7 +++++--
> drivers/hv/hyperv_vmbus.h | 4 ++++
> include/linux/hyperv.h | 13 +++++++++++++
> 4 files changed, 60 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 36bacc7..eb9ce94 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
> return NULL;
>
> channel->id = atomic_inc_return(&chan_num);
> + atomic_set(&channel->count, 1);
> +
> spin_lock_init(&channel->inbound_lock);
> spin_lock_init(&channel->lock);
>
> @@ -178,19 +180,47 @@ static void release_channel(struct work_struct
> *work)
> }
>
> /*
> - * free_channel - Release the resources used by the vmbus channel
> object
> + * vmbus_put_channel - Decrease the channel usage counter and
> release the
> + * resources when this counter reaches zero.
> */
> -static void free_channel(struct vmbus_channel *channel)
> +void vmbus_put_channel(struct vmbus_channel *channel)
> {
> + unsigned long flags;
>
> /*
> * We have to release the channel's workqueue/thread in the vmbus's
> * workqueue/thread context
> * ie we can't destroy ourselves.
> */
> - INIT_WORK(&channel->work, release_channel);
> - queue_work(vmbus_connection.work_queue, &channel->work);
> + spin_lock_irqsave(&channel->lock, flags);
> + if (atomic_dec_and_test(&channel->count)) {
> + channel->dying = true;
> + INIT_WORK(&channel->work, release_channel);
> + spin_unlock_irqrestore(&channel->lock, flags);
> + queue_work(vmbus_connection.work_queue, &channel->work);
> + } else
> + spin_unlock_irqrestore(&channel->lock, flags);
> +}
> +EXPORT_SYMBOL_GPL(vmbus_put_channel);
> +
> +/* vmbus_get_channel - Get additional reference to the channel */
> +struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
> *channel)
> +{
> + unsigned long flags;
> + struct vmbus_channel *ret = NULL;
> +
> + if (!channel)
> + return NULL;
> +
> + spin_lock_irqsave(&channel->lock, flags);
> + if (!channel->dying) {
> + atomic_inc(&channel->count);
> + ret = channel;
> + }
> + spin_unlock_irqrestore(&channel->lock, flags);

Looks like we can use atomic_inc_return_safe() here to avoid extra
dying. And then there's also no need for the spinlock.

if (atomic_inc_return_safe(&channel->count) > 0)
return channel;
else
return NULL;
>
> + return ret;
> }
> +EXPORT_SYMBOL_GPL(vmbus_get_channel);
>
> static void percpu_channel_enq(void *arg)
> {
> @@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct
> work_struct *work)
> list_del(&channel->sc_list);
> spin_unlock_irqrestore(&primary_channel->lock, flags);
> }
> - free_channel(channel);
> + vmbus_put_channel(channel);
> }
>
> void vmbus_free_channels(void)
> @@ -262,7 +292,7 @@ void vmbus_free_channels(void)
>
> list_for_each_entry(channel, &vmbus_connection.chn_list, listentry)
> {
> vmbus_device_unregister(channel->device_obj);
> - free_channel(channel);
> + vmbus_put_channel(channel);
> }
> }
>
> @@ -391,7 +421,7 @@ done_init_rescind:
> spin_unlock_irqrestore(&newchannel->lock, flags);
> return;
> err_free_chan:
> - free_channel(newchannel);
> + vmbus_put_channel(newchannel);
> }
>
> enum {
> @@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct
> vmbus_channel_message_header *hdr)
> queue_work(channel->controlwq, &channel->work);
>
> spin_unlock_irqrestore(&channel->lock, flags);
> + vmbus_put_channel(channel);
> }
>
> /*
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index c4acd1c..d1ce134 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -247,7 +247,8 @@ void vmbus_disconnect(void)
> * Map the given relid to the corresponding channel based on the
> * per-cpu list of channels that have been affinitized to this CPU.
> * This will be used in the channel callback path as we can do this
> - * mapping in a lock-free fashion.
> + * mapping in a lock-free fashion. Takes additional reference to the
> + * channel, all users are supposed to do vmbus_put_channel().
> */
> static struct vmbus_channel *pcpu_relid2channel(u32 relid)
> {
> @@ -263,7 +264,7 @@ static struct vmbus_channel
> *pcpu_relid2channel(u32 relid)
> }
> }
>
> - return found_channel;
> + return vmbus_get_channel(found_channel);
> }
>
> /*
> @@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
> }
> }
> }
> + found_channel = vmbus_get_channel(found_channel);
> spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
>
> return found_channel;
> @@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
> pr_err("no channel callback for relid - %u\n", relid);
> }
>
> + vmbus_put_channel(channel);
> }
>
> /*
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index b055e53..40d70f0 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device
> *device_obj);
> /* VmbusChildDeviceDestroy( */
> /* struct hv_device *); */
>
> +/*
> + * Get the channel by its relid. Takes additional reference to the
> channel so
> + * all users are supposed to do vmbus_put_channel() when they're
> done.
> + */
> struct vmbus_channel *relid2channel(u32 relid);
>
> void vmbus_free_channels(void);
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index e73cfeb..c576d2d 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -649,6 +649,9 @@ struct vmbus_channel {
> /* Unique channel id */
> int id;
>
> + /* Active reference count */
> + atomic_t count;
> +
> struct list_head listentry;
>
> struct hv_device *device_obj;
> @@ -666,6 +669,7 @@ struct vmbus_channel {
> u8 monitor_bit;
>
> bool rescind; /* got rescind msg */
> + bool dying; /* channel is dying */
>
> u32 ringbuffer_gpadlhandle;
>
> @@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct
> vmbus_channel *channel,
>
> extern void vmbus_ontimer(unsigned long data);
>
> +/*
> + * Decrease reference count for the channel. Frees the channel when
> its usage
> + * count reaches zero.
> + */
> +extern void vmbus_put_channel(struct vmbus_channel *channel);
> +
> +/* Get additional reference to the channel */
> +extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
> *channel);
> +
> /* Base driver object */
> struct hv_driver {
> const char *name;
> --
> 1.9.3
>

2015-02-04 09:32:46

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels

Dexuan Cui <[email protected]> writes:

>> -----Original Message-----
>> From: Vitaly Kuznetsov [mailto:[email protected]]
>> Sent: Wednesday, February 4, 2015 1:01 AM
>> To: KY Srinivasan; [email protected]
>> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
>> Subject: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for
>> vmbus channels
>>
>> free_channel() function frees the channel unconditionally so we need to make
>> sure nobody has any link to it. This is not trivial and there are several
>> examples of races we have:
>>
>> 1) In vmbus_onoffer_rescind() we check for channel existence with
>> relid2channel() and then use it. This can go wrong if we're in the middle
>> of channel removal (free_channel() was already called).
>>
>> 2) In process_chn_event() we check for channel existence with
>> pcpu_relid2channel() and then use it. This can also go wrong.
>>
>> 3) vmbus_free_channels() just frees all channels, in case we're in the middle
>> of vmbus_process_rescind_offer() crash is possible.
>>
>> The issue can be solved by holding vmbus_connection.channel_lock everywhere,
>> however, it looks like a way to deadlocks and performance degradation. Get/put
>> workflow fits here the best.
>>
>> Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
>> free_channel().
>>
>> Signed-off-by: Vitaly Kuznetsov <[email protected]>
>> ---
>> drivers/hv/channel_mgmt.c | 45
>> ++++++++++++++++++++++++++++++++++++++-------
>> drivers/hv/connection.c | 7 +++++--
>> drivers/hv/hyperv_vmbus.h | 4 ++++
>> include/linux/hyperv.h | 13 +++++++++++++
>> 4 files changed, 60 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
>> index 36bacc7..eb9ce94 100644
>> --- a/drivers/hv/channel_mgmt.c
>> +++ b/drivers/hv/channel_mgmt.c
>> @@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
>> return NULL;
>>
>> channel->id = atomic_inc_return(&chan_num);
>> + atomic_set(&channel->count, 1);
>> +
>> spin_lock_init(&channel->inbound_lock);
>> spin_lock_init(&channel->lock);
>>
>> @@ -178,19 +180,47 @@ static void release_channel(struct work_struct *work)
>> }
>>
>> /*
>> - * free_channel - Release the resources used by the vmbus channel object
>> + * vmbus_put_channel - Decrease the channel usage counter and release the
>> + * resources when this counter reaches zero.
>> */
>> -static void free_channel(struct vmbus_channel *channel)
>> +void vmbus_put_channel(struct vmbus_channel *channel)
>> {
>> + unsigned long flags;
>>
>> /*
>> * We have to release the channel's workqueue/thread in the vmbus's
>> * workqueue/thread context
>> * ie we can't destroy ourselves.
>> */
>> - INIT_WORK(&channel->work, release_channel);
>> - queue_work(vmbus_connection.work_queue, &channel->work);
>> + spin_lock_irqsave(&channel->lock, flags);
>> + if (atomic_dec_and_test(&channel->count)) {
>> + channel->dying = true;
>> + INIT_WORK(&channel->work, release_channel);
>> + spin_unlock_irqrestore(&channel->lock, flags);
>> + queue_work(vmbus_connection.work_queue, &channel->work);
>> + } else
>> + spin_unlock_irqrestore(&channel->lock, flags);
>> +}
>> +EXPORT_SYMBOL_GPL(vmbus_put_channel);
>> +
>> +/* vmbus_get_channel - Get additional reference to the channel */
>> +struct vmbus_channel *vmbus_get_channel(struct vmbus_channel *channel)
>> +{
>> + unsigned long flags;
>> + struct vmbus_channel *ret = NULL;
>> +
>> + if (!channel)
>> + return NULL;
>> +
>> + spin_lock_irqsave(&channel->lock, flags);
>> + if (!channel->dying) {
>> + atomic_inc(&channel->count);
>> + ret = channel;
>> + }
>> + spin_unlock_irqrestore(&channel->lock, flags);
>> + return ret;
>> }
>> +EXPORT_SYMBOL_GPL(vmbus_get_channel);
>>
>> static void percpu_channel_enq(void *arg)
>> {
>> @@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct
>> work_struct *work)
>> list_del(&channel->sc_list);
>> spin_unlock_irqrestore(&primary_channel->lock, flags);
>> }
>> - free_channel(channel);
>> + vmbus_put_channel(channel);
>> }
>>
>> void vmbus_free_channels(void)
>> @@ -262,7 +292,7 @@ void vmbus_free_channels(void)
>>
>> list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
>> vmbus_device_unregister(channel->device_obj);
>> - free_channel(channel);
>> + vmbus_put_channel(channel);
>> }
>> }
>>
>> @@ -391,7 +421,7 @@ done_init_rescind:
>> spin_unlock_irqrestore(&newchannel->lock, flags);
>> return;
>> err_free_chan:
>> - free_channel(newchannel);
>> + vmbus_put_channel(newchannel);
>> }
>>
>> enum {
>> @@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct
>> vmbus_channel_message_header *hdr)
>> queue_work(channel->controlwq, &channel->work);
>>
>> spin_unlock_irqrestore(&channel->lock, flags);
>> + vmbus_put_channel(channel);
>> }
>>
>> /*
>> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
>> index c4acd1c..d1ce134 100644
>> --- a/drivers/hv/connection.c
>> +++ b/drivers/hv/connection.c
>> @@ -247,7 +247,8 @@ void vmbus_disconnect(void)
>> * Map the given relid to the corresponding channel based on the
>> * per-cpu list of channels that have been affinitized to this CPU.
>> * This will be used in the channel callback path as we can do this
>> - * mapping in a lock-free fashion.
>> + * mapping in a lock-free fashion. Takes additional reference to the
>> + * channel, all users are supposed to do vmbus_put_channel().
>> */
>> static struct vmbus_channel *pcpu_relid2channel(u32 relid)
>> {
>> @@ -263,7 +264,7 @@ static struct vmbus_channel *pcpu_relid2channel(u32
>> relid)
>> }
>> }
>>
>> - return found_channel;
>> + return vmbus_get_channel(found_channel);
>> }
>>
>> /*
>> @@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
>> }
>> }
>> }
>> + found_channel = vmbus_get_channel(found_channel);
>> spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
>>
>> return found_channel;
>> @@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
>> pr_err("no channel callback for relid - %u\n", relid);
>> }
>>
>> + vmbus_put_channel(channel);
>> }
>>
>> /*
>> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
>> index b055e53..40d70f0 100644
>> --- a/drivers/hv/hyperv_vmbus.h
>> +++ b/drivers/hv/hyperv_vmbus.h
>> @@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device
>> *device_obj);
>> /* VmbusChildDeviceDestroy( */
>> /* struct hv_device *); */
>>
>> +/*
>> + * Get the channel by its relid. Takes additional reference to the channel so
>> + * all users are supposed to do vmbus_put_channel() when they're done.
>> + */
>> struct vmbus_channel *relid2channel(u32 relid);
>>
>> void vmbus_free_channels(void);
>> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
>> index e73cfeb..c576d2d 100644
>> --- a/include/linux/hyperv.h
>> +++ b/include/linux/hyperv.h
>> @@ -649,6 +649,9 @@ struct vmbus_channel {
>> /* Unique channel id */
>> int id;
>>
>> + /* Active reference count */
>> + atomic_t count;
>> +
>> struct list_head listentry;
>>
>> struct hv_device *device_obj;
>> @@ -666,6 +669,7 @@ struct vmbus_channel {
>> u8 monitor_bit;
>>
>> bool rescind; /* got rescind msg */
>> + bool dying; /* channel is dying */
>>
>> u32 ringbuffer_gpadlhandle;
>>
>> @@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct
>> vmbus_channel *channel,
>>
>> extern void vmbus_ontimer(unsigned long data);
>>
>> +/*
>> + * Decrease reference count for the channel. Frees the channel when its usage
>> + * count reaches zero.
>> + */
>> +extern void vmbus_put_channel(struct vmbus_channel *channel);
>> +
>> +/* Get additional reference to the channel */
>> +extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
>> *channel);
>> +
>> /* Base driver object */
>> struct hv_driver {
>> const char *name;
>> --
>
> Hi Vitaly,
> Thanks for the patchset!
> I once tried to do the same work, but just didn't find enough time. :-)
>
> Please see my below questions:

Thanks for your detailed review! I should have added 'RFC' to my series :-)

>
> Since we always get channel->lock when accessing channel->count, I don't
> think the counter has to be of atomic_t?

Agreed

>
> I suggest we add vmbus_get/put_channel() in vmbus_open/close().
> In this way, IMO patch 3/4 and 4/4 would be unnecessary?
>

I was thinking about that but I wasn't sure it is going to be enough for
sub-channels. I'll look again and report.

> In vmbus_exit(), I suspect we should swap the order of the 2 lines:
> vmbus_free_channels();
> bus_unregister(&hv_bus);
> I suppose in bus_unregister(), the .remove callbacks of all the drivers are
> invoked, and vmbus_close() is invoked for every channel.

Makes sense!

>
> BTW, I just noticed: alloc_channel() -> alloc_workqueue(,,max_active==1,):
> due to max_active==1 here, a channel's process_offer() and
> process_rescind_offer() is already serialized.
> I'm not sure if this fact can be used to simplify the logic?

Workers are serialized but e.g. vmbus_onoffer_rescind() is not
serialized with them, e.g. there is no protection against channel
dessapearance after we found it with relid2channel() (e.g. we're on
failure path in vmbus_process_offer()).

Anyway, we have another examples of (theoretically) possible issues and
get/put workflow should simplify the syncronization.

> Is the previous commit "Drivers: hv: vmbus: serialize Offer and Rescind offer"
> necessary?

It is as not checking work function is bad anyway as (in theory) we can
run vmbus_process_offer() twice if vmbus_onoffer_rescind() runs before
vmbus_process_offer() was started.

>
> BTW2, I think there is an unsafe race in the 2 paths (this is an existing issue,
> not related to this patch of yours):
>
> vmbus_on_event() -> process_chn_event() -> pcpu_relid2channel ->
> list_for_each_entry(channel, pcpu_head, percpu_list)
>
> and
>
> vmbus_process_rescind_offer() -> percpu_channel_deq() ->
> list_del(&channel->percpu_list)
>
> because the former is running in tasklet and hence can preempt the latter (running
> in process context). percpu_channel_enq() should have the same
> issue(?)

As I said in my cover letter I don't believe my series solves all issues :-)

>
> We should add local_bh_disable()/enable() accordingly?
> Can you please help to fix this?

Yes, sure, I'll take a look.

>
> vmbus_exit() -> vmbus_free_channels() -> vmbus_put_channel() may have
> a race with vmbus_process_rescind_offer()?
> After vmbus_process_rescind_offer() -> vmbus_device_unregister(), the count
> can already be the initial 1, later vmbus_free_channels() -> vmbus_put_channel() can
> kfree the channel before vmbus_process_rescind_offer() completely
> finishes?

I was going to rewrite vmbus_free_channels() not only because of
that. E.g. vmbus_get_outgoing_channel() presumes all lists (in both
channel.sc_list and vmbus_connection.chn_list) are valid and we don't
take care of that. Not that I believe vmbus_get_outgoing_channel()
may happen when vmbus_free_channels() is active (and we call
vmbus_free_channels() only if we unload the module)


Thanks for your thorough review, it gave me further inspiration :-)

--
Vitaly

2015-02-04 09:33:49

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels

Jason Wang <[email protected]> writes:

> On Wed, Feb 4, 2015 at 1:00 AM, Vitaly Kuznetsov <[email protected]>
> wrote:
>> free_channel() function frees the channel unconditionally so we need
>> to make
>> sure nobody has any link to it. This is not trivial and there are
>> several
>> examples of races we have:
>>
>> 1) In vmbus_onoffer_rescind() we check for channel existence with
>> relid2channel() and then use it. This can go wrong if we're in
>> the middle
>> of channel removal (free_channel() was already called).
>>
>> 2) In process_chn_event() we check for channel existence with
>> pcpu_relid2channel() and then use it. This can also go wrong.
>>
>> 3) vmbus_free_channels() just frees all channels, in case we're in
>> the middle
>> of vmbus_process_rescind_offer() crash is possible.
>>
>> The issue can be solved by holding vmbus_connection.channel_lock
>> everywhere,
>> however, it looks like a way to deadlocks and performance
>> degradation. Get/put
>> workflow fits here the best.
>>
>> Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
>> free_channel().
>>
>> Signed-off-by: Vitaly Kuznetsov <[email protected]>
>> ---
>> drivers/hv/channel_mgmt.c | 45
>> ++++++++++++++++++++++++++++++++++++++-------
>> drivers/hv/connection.c | 7 +++++--
>> drivers/hv/hyperv_vmbus.h | 4 ++++
>> include/linux/hyperv.h | 13 +++++++++++++
>> 4 files changed, 60 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
>> index 36bacc7..eb9ce94 100644
>> --- a/drivers/hv/channel_mgmt.c
>> +++ b/drivers/hv/channel_mgmt.c
>> @@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
>> return NULL;
>> channel->id = atomic_inc_return(&chan_num);
>> + atomic_set(&channel->count, 1);
>> +
>> spin_lock_init(&channel->inbound_lock);
>> spin_lock_init(&channel->lock);
>> @@ -178,19 +180,47 @@ static void release_channel(struct
>> work_struct *work)
>> }
>> /*
>> - * free_channel - Release the resources used by the vmbus channel
>> object
>> + * vmbus_put_channel - Decrease the channel usage counter and
>> release the
>> + * resources when this counter reaches zero.
>> */
>> -static void free_channel(struct vmbus_channel *channel)
>> +void vmbus_put_channel(struct vmbus_channel *channel)
>> {
>> + unsigned long flags;
>> /*
>> * We have to release the channel's workqueue/thread in the vmbus's
>> * workqueue/thread context
>> * ie we can't destroy ourselves.
>> */
>> - INIT_WORK(&channel->work, release_channel);
>> - queue_work(vmbus_connection.work_queue, &channel->work);
>> + spin_lock_irqsave(&channel->lock, flags);
>> + if (atomic_dec_and_test(&channel->count)) {
>> + channel->dying = true;
>> + INIT_WORK(&channel->work, release_channel);
>> + spin_unlock_irqrestore(&channel->lock, flags);
>> + queue_work(vmbus_connection.work_queue, &channel->work);
>> + } else
>> + spin_unlock_irqrestore(&channel->lock, flags);
>> +}
>> +EXPORT_SYMBOL_GPL(vmbus_put_channel);
>> +
>> +/* vmbus_get_channel - Get additional reference to the channel */
>> +struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
>> *channel)
>> +{
>> + unsigned long flags;
>> + struct vmbus_channel *ret = NULL;
>> +
>> + if (!channel)
>> + return NULL;
>> +
>> + spin_lock_irqsave(&channel->lock, flags);
>> + if (!channel->dying) {
>> + atomic_inc(&channel->count);
>> + ret = channel;
>> + }
>> + spin_unlock_irqrestore(&channel->lock, flags);
>
> Looks like we can use atomic_inc_return_safe() here to avoid extra
> dying. And then there's also no need for the spinlock.
>
> if (atomic_inc_return_safe(&channel->count) > 0)
> return channel;
> else
> return NULL;

Good idea, thanks! I'll try.

>>
>> + return ret;
>> }
>> +EXPORT_SYMBOL_GPL(vmbus_get_channel);
>> static void percpu_channel_enq(void *arg)
>> {
>> @@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct
>> work_struct *work)
>> list_del(&channel->sc_list);
>> spin_unlock_irqrestore(&primary_channel->lock, flags);
>> }
>> - free_channel(channel);
>> + vmbus_put_channel(channel);
>> }
>> void vmbus_free_channels(void)
>> @@ -262,7 +292,7 @@ void vmbus_free_channels(void)
>> list_for_each_entry(channel, &vmbus_connection.chn_list,
>> listentry) {
>> vmbus_device_unregister(channel->device_obj);
>> - free_channel(channel);
>> + vmbus_put_channel(channel);
>> }
>> }
>> @@ -391,7 +421,7 @@ done_init_rescind:
>> spin_unlock_irqrestore(&newchannel->lock, flags);
>> return;
>> err_free_chan:
>> - free_channel(newchannel);
>> + vmbus_put_channel(newchannel);
>> }
>> enum {
>> @@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct
>> vmbus_channel_message_header *hdr)
>> queue_work(channel->controlwq, &channel->work);
>> spin_unlock_irqrestore(&channel->lock, flags);
>> + vmbus_put_channel(channel);
>> }
>> /*
>> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
>> index c4acd1c..d1ce134 100644
>> --- a/drivers/hv/connection.c
>> +++ b/drivers/hv/connection.c
>> @@ -247,7 +247,8 @@ void vmbus_disconnect(void)
>> * Map the given relid to the corresponding channel based on the
>> * per-cpu list of channels that have been affinitized to this CPU.
>> * This will be used in the channel callback path as we can do this
>> - * mapping in a lock-free fashion.
>> + * mapping in a lock-free fashion. Takes additional reference to the
>> + * channel, all users are supposed to do vmbus_put_channel().
>> */
>> static struct vmbus_channel *pcpu_relid2channel(u32 relid)
>> {
>> @@ -263,7 +264,7 @@ static struct vmbus_channel
>> *pcpu_relid2channel(u32 relid)
>> }
>> }
>> - return found_channel;
>> + return vmbus_get_channel(found_channel);
>> }
>> /*
>> @@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
>> }
>> }
>> }
>> + found_channel = vmbus_get_channel(found_channel);
>> spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
>> return found_channel;
>> @@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
>> pr_err("no channel callback for relid - %u\n", relid);
>> }
>> + vmbus_put_channel(channel);
>> }
>> /*
>> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
>> index b055e53..40d70f0 100644
>> --- a/drivers/hv/hyperv_vmbus.h
>> +++ b/drivers/hv/hyperv_vmbus.h
>> @@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device
>> *device_obj);
>> /* VmbusChildDeviceDestroy( */
>> /* struct hv_device *); */
>> +/*
>> + * Get the channel by its relid. Takes additional reference to the
>> channel so
>> + * all users are supposed to do vmbus_put_channel() when they're
>> done.
>> + */
>> struct vmbus_channel *relid2channel(u32 relid);
>> void vmbus_free_channels(void);
>> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
>> index e73cfeb..c576d2d 100644
>> --- a/include/linux/hyperv.h
>> +++ b/include/linux/hyperv.h
>> @@ -649,6 +649,9 @@ struct vmbus_channel {
>> /* Unique channel id */
>> int id;
>> + /* Active reference count */
>> + atomic_t count;
>> +
>> struct list_head listentry;
>> struct hv_device *device_obj;
>> @@ -666,6 +669,7 @@ struct vmbus_channel {
>> u8 monitor_bit;
>> bool rescind; /* got rescind msg */
>> + bool dying; /* channel is dying */
>> u32 ringbuffer_gpadlhandle;
>> @@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct
>> vmbus_channel *channel,
>> extern void vmbus_ontimer(unsigned long data);
>> +/*
>> + * Decrease reference count for the channel. Frees the channel when
>> its usage
>> + * count reaches zero.
>> + */
>> +extern void vmbus_put_channel(struct vmbus_channel *channel);
>> +
>> +/* Get additional reference to the channel */
>> +extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
>> *channel);
>> +
>> /* Base driver object */
>> struct hv_driver {
>> const char *name;
>> --
>> 1.9.3
>>

--
Vitaly

2015-02-04 10:09:03

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow for vmbus channels

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Wednesday, February 4, 2015 17:32 PM
> To: Dexuan Cui
> Cc: KY Srinivasan; [email protected]; Haiyang Zhang; linux-
> [email protected]; Jason Wang
> Subject: Re: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow
> for vmbus channels
>
> Dexuan Cui <[email protected]> writes:
>
> >> -----Original Message-----
> >> From: Vitaly Kuznetsov [mailto:[email protected]]
> >> Sent: Wednesday, February 4, 2015 1:01 AM
> >> To: KY Srinivasan; [email protected]
> >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> >> Subject: [PATCH 1/4] Drivers: hv: vmbus: implement get/put usage workflow
> for
> >> vmbus channels
> >>
> >> free_channel() function frees the channel unconditionally so we need to
> make
> >> sure nobody has any link to it. This is not trivial and there are several
> >> examples of races we have:
> >>
> >> 1) In vmbus_onoffer_rescind() we check for channel existence with
> >> relid2channel() and then use it. This can go wrong if we're in the middle
> >> of channel removal (free_channel() was already called).
> >>
> >> 2) In process_chn_event() we check for channel existence with
> >> pcpu_relid2channel() and then use it. This can also go wrong.
> >>
> >> 3) vmbus_free_channels() just frees all channels, in case we're in the middle
> >> of vmbus_process_rescind_offer() crash is possible.
> >>
> >> The issue can be solved by holding vmbus_connection.channel_lock
> everywhere,
> >> however, it looks like a way to deadlocks and performance degradation.
> Get/put
> >> workflow fits here the best.
> >>
> >> Implement vmbus_get_channel()/vmbus_put_channel() pair instead of
> >> free_channel().
> >>
> >> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> >> ---
> >> drivers/hv/channel_mgmt.c | 45
> >> ++++++++++++++++++++++++++++++++++++++-------
> >> drivers/hv/connection.c | 7 +++++--
> >> drivers/hv/hyperv_vmbus.h | 4 ++++
> >> include/linux/hyperv.h | 13 +++++++++++++
> >> 4 files changed, 60 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> >> index 36bacc7..eb9ce94 100644
> >> --- a/drivers/hv/channel_mgmt.c
> >> +++ b/drivers/hv/channel_mgmt.c
> >> @@ -147,6 +147,8 @@ static struct vmbus_channel *alloc_channel(void)
> >> return NULL;
> >>
> >> channel->id = atomic_inc_return(&chan_num);
> >> + atomic_set(&channel->count, 1);
> >> +
> >> spin_lock_init(&channel->inbound_lock);
> >> spin_lock_init(&channel->lock);
> >>
> >> @@ -178,19 +180,47 @@ static void release_channel(struct work_struct
> *work)
> >> }
> >>
> >> /*
> >> - * free_channel - Release the resources used by the vmbus channel object
> >> + * vmbus_put_channel - Decrease the channel usage counter and release
> the
> >> + * resources when this counter reaches zero.
> >> */
> >> -static void free_channel(struct vmbus_channel *channel)
> >> +void vmbus_put_channel(struct vmbus_channel *channel)
> >> {
> >> + unsigned long flags;
> >>
> >> /*
> >> * We have to release the channel's workqueue/thread in the vmbus's
> >> * workqueue/thread context
> >> * ie we can't destroy ourselves.
> >> */
> >> - INIT_WORK(&channel->work, release_channel);
> >> - queue_work(vmbus_connection.work_queue, &channel->work);
> >> + spin_lock_irqsave(&channel->lock, flags);
> >> + if (atomic_dec_and_test(&channel->count)) {
> >> + channel->dying = true;
> >> + INIT_WORK(&channel->work, release_channel);
> >> + spin_unlock_irqrestore(&channel->lock, flags);
> >> + queue_work(vmbus_connection.work_queue, &channel->work);
> >> + } else
> >> + spin_unlock_irqrestore(&channel->lock, flags);
> >> +}
> >> +EXPORT_SYMBOL_GPL(vmbus_put_channel);
> >> +
> >> +/* vmbus_get_channel - Get additional reference to the channel */
> >> +struct vmbus_channel *vmbus_get_channel(struct vmbus_channel *channel)
> >> +{
> >> + unsigned long flags;
> >> + struct vmbus_channel *ret = NULL;
> >> +
> >> + if (!channel)
> >> + return NULL;
> >> +
> >> + spin_lock_irqsave(&channel->lock, flags);
> >> + if (!channel->dying) {
> >> + atomic_inc(&channel->count);
> >> + ret = channel;
> >> + }
> >> + spin_unlock_irqrestore(&channel->lock, flags);
> >> + return ret;
> >> }
> >> +EXPORT_SYMBOL_GPL(vmbus_get_channel);
> >>
> >> static void percpu_channel_enq(void *arg)
> >> {
> >> @@ -253,7 +283,7 @@ static void vmbus_process_rescind_offer(struct
> >> work_struct *work)
> >> list_del(&channel->sc_list);
> >> spin_unlock_irqrestore(&primary_channel->lock, flags);
> >> }
> >> - free_channel(channel);
> >> + vmbus_put_channel(channel);
> >> }
> >>
> >> void vmbus_free_channels(void)
> >> @@ -262,7 +292,7 @@ void vmbus_free_channels(void)
> >>
> >> list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> >> vmbus_device_unregister(channel->device_obj);
> >> - free_channel(channel);
> >> + vmbus_put_channel(channel);
> >> }
> >> }
> >>
> >> @@ -391,7 +421,7 @@ done_init_rescind:
> >> spin_unlock_irqrestore(&newchannel->lock, flags);
> >> return;
> >> err_free_chan:
> >> - free_channel(newchannel);
> >> + vmbus_put_channel(newchannel);
> >> }
> >>
> >> enum {
> >> @@ -549,6 +579,7 @@ static void vmbus_onoffer_rescind(struct
> >> vmbus_channel_message_header *hdr)
> >> queue_work(channel->controlwq, &channel->work);
> >>
> >> spin_unlock_irqrestore(&channel->lock, flags);
> >> + vmbus_put_channel(channel);
> >> }
> >>
> >> /*
> >> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> >> index c4acd1c..d1ce134 100644
> >> --- a/drivers/hv/connection.c
> >> +++ b/drivers/hv/connection.c
> >> @@ -247,7 +247,8 @@ void vmbus_disconnect(void)
> >> * Map the given relid to the corresponding channel based on the
> >> * per-cpu list of channels that have been affinitized to this CPU.
> >> * This will be used in the channel callback path as we can do this
> >> - * mapping in a lock-free fashion.
> >> + * mapping in a lock-free fashion. Takes additional reference to the
> >> + * channel, all users are supposed to do vmbus_put_channel().
> >> */
> >> static struct vmbus_channel *pcpu_relid2channel(u32 relid)
> >> {
> >> @@ -263,7 +264,7 @@ static struct vmbus_channel
> *pcpu_relid2channel(u32
> >> relid)
> >> }
> >> }
> >>
> >> - return found_channel;
> >> + return vmbus_get_channel(found_channel);
> >> }
> >>
> >> /*
> >> @@ -297,6 +298,7 @@ struct vmbus_channel *relid2channel(u32 relid)
> >> }
> >> }
> >> }
> >> + found_channel = vmbus_get_channel(found_channel);
> >> spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
> >>
> >> return found_channel;
> >> @@ -360,6 +362,7 @@ static void process_chn_event(u32 relid)
> >> pr_err("no channel callback for relid - %u\n", relid);
> >> }
> >>
> >> + vmbus_put_channel(channel);
> >> }
> >>
> >> /*
> >> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> >> index b055e53..40d70f0 100644
> >> --- a/drivers/hv/hyperv_vmbus.h
> >> +++ b/drivers/hv/hyperv_vmbus.h
> >> @@ -687,6 +687,10 @@ void vmbus_device_unregister(struct hv_device
> >> *device_obj);
> >> /* VmbusChildDeviceDestroy( */
> >> /* struct hv_device *); */
> >>
> >> +/*
> >> + * Get the channel by its relid. Takes additional reference to the channel so
> >> + * all users are supposed to do vmbus_put_channel() when they're done.
> >> + */
> >> struct vmbus_channel *relid2channel(u32 relid);
> >>
> >> void vmbus_free_channels(void);
> >> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> >> index e73cfeb..c576d2d 100644
> >> --- a/include/linux/hyperv.h
> >> +++ b/include/linux/hyperv.h
> >> @@ -649,6 +649,9 @@ struct vmbus_channel {
> >> /* Unique channel id */
> >> int id;
> >>
> >> + /* Active reference count */
> >> + atomic_t count;
> >> +
> >> struct list_head listentry;
> >>
> >> struct hv_device *device_obj;
> >> @@ -666,6 +669,7 @@ struct vmbus_channel {
> >> u8 monitor_bit;
> >>
> >> bool rescind; /* got rescind msg */
> >> + bool dying; /* channel is dying */
> >>
> >> u32 ringbuffer_gpadlhandle;
> >>
> >> @@ -907,6 +911,15 @@ extern int vmbus_recvpacket_raw(struct
> >> vmbus_channel *channel,
> >>
> >> extern void vmbus_ontimer(unsigned long data);
> >>
> >> +/*
> >> + * Decrease reference count for the channel. Frees the channel when its
> usage
> >> + * count reaches zero.
> >> + */
> >> +extern void vmbus_put_channel(struct vmbus_channel *channel);
> >> +
> >> +/* Get additional reference to the channel */
> >> +extern struct vmbus_channel *vmbus_get_channel(struct vmbus_channel
> >> *channel);
> >> +
> >> /* Base driver object */
> >> struct hv_driver {
> >> const char *name;
> >> --
> >
> > Hi Vitaly,
> > Thanks for the patchset!
> > I once tried to do the same work, but just didn't find enough time. :-)
> >
> > Please see my below questions:
>
> Thanks for your detailed review! I should have added 'RFC' to my series :-)
>
> >
> > Since we always get channel->lock when accessing channel->count, I don't
> > think the counter has to be of atomic_t?
>
> Agreed
>
> >
> > I suggest we add vmbus_get/put_channel() in vmbus_open/close().
> > In this way, IMO patch 3/4 and 4/4 would be unnecessary?
> >
>
> I was thinking about that but I wasn't sure it is going to be enough for
> sub-channels. I'll look again and report.
>
> > In vmbus_exit(), I suspect we should swap the order of the 2 lines:
> > vmbus_free_channels();
> > bus_unregister(&hv_bus);
> > I suppose in bus_unregister(), the .remove callbacks of all the drivers are
> > invoked, and vmbus_close() is invoked for every channel.
>
> Makes sense!
>
> >
> > BTW, I just noticed: alloc_channel() -> alloc_workqueue(,,max_active==1,):
> > due to max_active==1 here, a channel's process_offer() and
> > process_rescind_offer() is already serialized.
> > I'm not sure if this fact can be used to simplify the logic?
>
> Workers are serialized but e.g. vmbus_onoffer_rescind() is not
> serialized with them, e.g. there is no protection against channel
> dessapearance after we found it with relid2channel() (e.g. we're on
> failure path in vmbus_process_offer()).
>
> Anyway, we have another examples of (theoretically) possible issues and
> get/put workflow should simplify the syncronization.
>
> > Is the previous commit "Drivers: hv: vmbus: serialize Offer and Rescind offer"
> > necessary?
>
> It is as not checking work function is bad anyway as (in theory) we can
> run vmbus_process_offer() twice if vmbus_onoffer_rescind() runs before
> vmbus_process_offer() was started.
>
> >
> > BTW2, I think there is an unsafe race in the 2 paths (this is an existing issue,
> > not related to this patch of yours):
> >
> > vmbus_on_event() -> process_chn_event() -> pcpu_relid2channel ->
> > list_for_each_entry(channel, pcpu_head, percpu_list)
> >
> > and
> >
> > vmbus_process_rescind_offer() -> percpu_channel_deq() ->
> > list_del(&channel->percpu_list)
> >
> > because the former is running in tasklet and hence can preempt the latter
> (running
> > in process context). percpu_channel_enq() should have the same
> > issue(?)
>
> As I said in my cover letter I don't believe my series solves all issues :-)
>
> >
> > We should add local_bh_disable()/enable() accordingly?
> > Can you please help to fix this?
>
> Yes, sure, I'll take a look.
>
> >
> > vmbus_exit() -> vmbus_free_channels() -> vmbus_put_channel() may have
> > a race with vmbus_process_rescind_offer()?
> > After vmbus_process_rescind_offer() -> vmbus_device_unregister(), the count
> > can already be the initial 1, later vmbus_free_channels() ->
> vmbus_put_channel() can
> > kfree the channel before vmbus_process_rescind_offer() completely
> > finishes?
>
> I was going to rewrite vmbus_free_channels() not only because of
> that. E.g. vmbus_get_outgoing_channel() presumes all lists (in both
> channel.sc_list and vmbus_connection.chn_list) are valid and we don't
> take care of that. Not that I believe vmbus_get_outgoing_channel()
> may happen when vmbus_free_channels() is active (and we call
> vmbus_free_channels() only if we unload the module)
>
>
> Thanks for your thorough review, it gave me further inspiration :-)
>
> --
> Vitaly

Thanks, Vitaly!

I know it's tricky here. :-)

Thanks,
-- Dexuan

2015-02-04 18:26:30

by KY Srinivasan

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path



> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Tuesday, February 3, 2015 9:01 AM
> To: KY Srinivasan; [email protected]
> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> Subject: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>
> This series is a continuation of the "Drivers: hv: vmbus: serialize Offer and
> Rescind offer". I'm trying to address a number of theoretically possible issues
> with rescind offer handling. All these complications come from the fact that a
> rescind offer results in vmbus channel being freed and we must ensure
> nobody still uses it. Instead of introducing new locks I suggest we switch
> channels usage to the get/put workflow.
>
> The main part of the series is [PATCH 1/4] which introduces the workflow for
> vmbus channels, all other patches fix different corner cases using this
> workflow. I'm not sure all such cases are covered with this series (probably
> not), but in case protection is required in some other places it should become
> relatively easy to add one.
>
> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
> popped out, however, additional testing would be much appreciated.
>
> K.Y., Haiyang, I'm not sending this series to netdev@ and linux-scsi@ as it is
> supposed to be applied as a whole, please resend these patches with your
> sign-offs when (and if) we're done with reviews. Thanks!

Vitaly,

Thanks for looking into this issue. While today, rescind offer results in the freeing of the channel, I don't think
that is required. By not freeing up the channel in the rescind path, we can have a safe way to access the channel and
that does not have to involve taking a reference on the channel every time you access it - the get/put workflow in your
patch set. As part of the network performance improvement work, I had eliminated all locks in the receive path by setting
up per-cpu data structures for mapping the relid to channel etc. These set of patches introduces locking/atomic operations
in performance critical code paths to deal with an event that is truly rare - the channel getting rescinded.

All channel messages are handled in a single work context:

vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel messages [offer, rescind etc.]

So, the rescind message cannot be processed while we are processing the offer message and since an offer
cannot be rescinded before it is offered, offer and rescind are naturally serialized (I think I have patchset in my queue
from you that is trying to solve the concurrent execution of offer and rescind and looking at the code I cannot see how
this can occur).

As part of handling the rescind message, we will just set the channel state to indicate that the offer is rescinded (we can add
the rescind state to the channel states already defined and this will be done under the protection of the channel lock).
The cleanup of the channel and sending of the RELID release message will only be done in the context of the driver as part of
driver remove function. I think this should be doable in a way that does not penalize the normal path. If it is ok with you, I will
try to put together a patch along the lines I have described here.

Regards,

K. Y



>
> Vitaly Kuznetsov (4):
> Drivers: hv: vmbus: implement get/put usage workflow for vmbus
> channels
> Drivers: hv: vmbus: do not lose rescind offer on failure in
> vmbus_process_offer()
> Drivers: hv: vmbus: protect vmbus_get_outgoing_channel() against
> channel removal
> hyperv: netvsc: improve protection against rescind offer
>
> drivers/hv/channel_mgmt.c | 75
> +++++++++++++++++++++++++++++++++++++--------
> drivers/hv/connection.c | 7 +++--
> drivers/hv/hyperv_vmbus.h | 4 +++
> drivers/net/hyperv/netvsc.c | 10 ++++-- drivers/scsi/storvsc_drv.c | 2 ++
> include/linux/hyperv.h | 13 ++++++++
> 6 files changed, 95 insertions(+), 16 deletions(-)
>
> --
> 1.9.3

2015-02-05 10:14:05

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the rescind path

KY Srinivasan <[email protected]> writes:

>> -----Original Message-----
>> From: Vitaly Kuznetsov [mailto:[email protected]]
>> Sent: Tuesday, February 3, 2015 9:01 AM
>> To: KY Srinivasan; [email protected]
>> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
>> Subject: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>>
>> This series is a continuation of the "Drivers: hv: vmbus: serialize Offer and
>> Rescind offer". I'm trying to address a number of theoretically possible issues
>> with rescind offer handling. All these complications come from the fact that a
>> rescind offer results in vmbus channel being freed and we must ensure
>> nobody still uses it. Instead of introducing new locks I suggest we switch
>> channels usage to the get/put workflow.
>>
>> The main part of the series is [PATCH 1/4] which introduces the workflow for
>> vmbus channels, all other patches fix different corner cases using this
>> workflow. I'm not sure all such cases are covered with this series (probably
>> not), but in case protection is required in some other places it should become
>> relatively easy to add one.
>>
>> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
>> popped out, however, additional testing would be much appreciated.
>>
>> K.Y., Haiyang, I'm not sending this series to netdev@ and linux-scsi@ as it is
>> supposed to be applied as a whole, please resend these patches with your
>> sign-offs when (and if) we're done with reviews. Thanks!
>
> Vitaly,
>
> Thanks for looking into this issue. While today, rescind offer results in the freeing of the channel, I don't think
> that is required. By not freeing up the channel in the rescind path, we can have a safe way to access the channel and
> that does not have to involve taking a reference on the channel every time you access it - the get/put workflow in your
> patch set. As part of the network performance improvement work, I had eliminated all locks in the receive path by setting
> up per-cpu data structures for mapping the relid to channel etc. These set of patches introduces locking/atomic operations
> in performance critical code paths to deal with an event that is truly
> rare - the channel getting rescinded.

It is possible to eliminate all locks/atomic operations from performance
critical pyth in my patch series by following Dexuan's suggestion -
we'll get the channel in vmbus_open and put it in vmbus_close (and on
processing offer/rescind offer) this won't affect performance. I'm in
the middle of testing this approach.

>
> All channel messages are handled in a single work context:
>
> vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel messages [offer, rescind etc.]
>
> So, the rescind message cannot be processed while we are processing the offer message and since an offer
> cannot be rescinded before it is offered, offer and rescind are naturally serialized (I think I have patchset in my queue
> from you that is trying to solve the concurrent execution of offer and rescind and looking at the code I cannot see how
> this can occur).
>
> As part of handling the rescind message, we will just set the channel state to indicate that the offer is rescinded (we can add
> the rescind state to the channel states already defined and this will be done under the protection of the channel lock).
> The cleanup of the channel and sending of the RELID release message will only be done in the context of the driver as part of
> driver remove function. I think this should be doable in a way that does not penalize the normal path. If it is ok with you, I will
> try to put together a patch along the lines I have described here.
>

Yes, if we consider rescind event as a very rare event we can avoid
freeing channels, but if (in some conditions) it happens frequently
we'll have significant memory leakage.

We can also free them with something like schedule_deyalyed_work with
e.g. 10 second delay after removing it from all lists so probability of
hitting a crash will me very low, I seriously doubt we will ever hit it.

Please let me know what you think is better. In case we follow 'never
free' or 'delayed free' approach I'll extract and send separately PATCH
2/4 from my series to address 'loosing rescind offer' issue pointed out
by Dexuan.

Thanks,

> Regards,
>
> K. Y
>
>>
>> Vitaly Kuznetsov (4):
>> Drivers: hv: vmbus: implement get/put usage workflow for vmbus
>> channels
>> Drivers: hv: vmbus: do not lose rescind offer on failure in
>> vmbus_process_offer()
>> Drivers: hv: vmbus: protect vmbus_get_outgoing_channel() against
>> channel removal
>> hyperv: netvsc: improve protection against rescind offer
>>
>> drivers/hv/channel_mgmt.c | 75
>> +++++++++++++++++++++++++++++++++++++--------
>> drivers/hv/connection.c | 7 +++--
>> drivers/hv/hyperv_vmbus.h | 4 +++
>> drivers/net/hyperv/netvsc.c | 10 ++++-- drivers/scsi/storvsc_drv.c | 2 ++
>> include/linux/hyperv.h | 13 ++++++++
>> 6 files changed, 95 insertions(+), 16 deletions(-)
>>
>> --
>> 1.9.3

--
Vitaly

2015-02-05 12:45:18

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path

> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Thursday, February 5, 2015 18:10 PM
> To: KY Srinivasan
> Cc: [email protected]; Haiyang Zhang; [email protected];
> Dexuan Cui; Jason Wang
> Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>
> KY Srinivasan <[email protected]> writes:
>
> >> -----Original Message-----
> >> From: Vitaly Kuznetsov [mailto:[email protected]]
> >> Sent: Tuesday, February 3, 2015 9:01 AM
> >> To: KY Srinivasan; [email protected]
> >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason Wang
> >> Subject: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
> >>
> >> This series is a continuation of the "Drivers: hv: vmbus: serialize Offer and
> >> Rescind offer". I'm trying to address a number of theoretically possible issues
> >> with rescind offer handling. All these complications come from the fact that a
> >> rescind offer results in vmbus channel being freed and we must ensure
> >> nobody still uses it. Instead of introducing new locks I suggest we switch
> >> channels usage to the get/put workflow.
> >>
> >> The main part of the series is [PATCH 1/4] which introduces the workflow for
> >> vmbus channels, all other patches fix different corner cases using this
> >> workflow. I'm not sure all such cases are covered with this series (probably
> >> not), but in case protection is required in some other places it should become
> >> relatively easy to add one.
> >>
> >> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
> >> popped out, however, additional testing would be much appreciated.
> >>
> >> K.Y., Haiyang, I'm not sending this series to netdev@ and linux-scsi@ as it is
> >> supposed to be applied as a whole, please resend these patches with your
> >> sign-offs when (and if) we're done with reviews. Thanks!
> >
> > Vitaly,
> >
> > Thanks for looking into this issue. While today, rescind offer results in the
> freeing of the channel, I don't think
> > that is required. By not freeing up the channel in the rescind path, we can have
> a safe way to access the channel and
> > that does not have to involve taking a reference on the channel every time you
> access it - the get/put workflow in your
> > patch set. As part of the network performance improvement work, I had
> eliminated all locks in the receive path by setting
> > up per-cpu data structures for mapping the relid to channel etc. These set of
> patches introduces locking/atomic operations
> > in performance critical code paths to deal with an event that is truly
> > rare - the channel getting rescinded.
>
> It is possible to eliminate all locks/atomic operations from performance
> critical pyth in my patch series by following Dexuan's suggestion -
> we'll get the channel in vmbus_open and put it in vmbus_close (and on
> processing offer/rescind offer) this won't affect performance. I'm in
> the middle of testing this approach.
>
> >
> > All channel messages are handled in a single work context:
> > vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel
> messages [offer, rescind etc.]
This is true.

> >
> > So, the rescind message cannot be processed while we are processing the
> offer message and since an offer
> > cannot be rescinded before it is offered, offer and rescind are naturally
> > serialized (I think I have patchset in my queue
IMO this may not be true.
The cause is:
(I'm using the latest linux-next repo, which includes Vitaly's patch
"Drivers: hv: vmbus: serialize Offer and Rescind offer".)

vmbus_onoffer_rescind() runs in vmbus_connection.work_queue, but
vmbus_process_offer() runs in the per-channel newchannel->controlwq, so the
two functions are not serialized, at least in theory.

As a result, in vmbus_process_offer(): after the new channel is added into
vmbus_connection.chn_list, but before the channel is completely initialized by
us (we need to create a vmbus device and associate the device with the
channel -- this procedure could fail and we goto err_free_chan and free the
channel directly!), vmbus_onoffer_rescind() can see the new channel, but
doesn't know the channel could be freed in another place at the same time.

BTW, when vmbus_process_offer() -> vmbus_device_create() fails, we
goto err_free_chan without removing the new channel from
vmbus_connection.chn_list?

As another result : in vmbus_process_offer(), in the case
vmbus_device_register() fails, we'll run "list_del(&newchannel->listentry) and
unlock vmbus_connection.channel_lock" -- just after these 2 lines, at this time,
vmbus_onoffer_rescind() -> relid2channel() can return NULL, and we'll miss the
rescind message, i.e., we fail to send the CHANNELMSG_RELID_RELEASED
message to the host.

Of course, these corner cases should be rare since the rescind message is really
rare today, but it may not be true in future: we can have a VMsock-like
guest/host communication channel and many dynamic channels can be
relatively frequently created and rescinded.

Please let me know if my understanding is correct.
If yes, I'm wondering if we can remove the per-channel workqueue and run all the
offer/rescind works in the global vmbus_connection.work_queue? This can realize
the thorough serialization. :-)

> > from you that is trying to solve the concurrent execution of offer and rescind
> > and looking at the code I cannot see how this can occur).
> >
> > As part of handling the rescind message, we will just set the channel state to
> indicate that the offer is rescinded (we can add
> > the rescind state to the channel states already defined and this will be done
> under the protection of the channel lock).
> > The cleanup of the channel and sending of the RELID release message will
> only be done in the context of the driver as part of
> > driver remove function. I think this should be doable in a way that does not
> penalize the normal path. If it is ok with you, I will
> > try to put together a patch along the lines I have described here.
> >
>
> Yes, if we consider rescind event as a very rare event we can avoid
> freeing channels, but if (in some conditions) it happens frequently
> we'll have significant memory leakage.
>
> We can also free them with something like schedule_deyalyed_work with
> e.g. 10 second delay after removing it from all lists so probability of
> hitting a crash will me very low, I seriously doubt we will ever hit it.
>
> Please let me know what you think is better. In case we follow 'never
> free' or 'delayed free' approach I'll extract and send separately PATCH
> 2/4 from my series to address 'loosing rescind offer' issue pointed out
> by Dexuan.
>
> Vitaly

IMO it's not a good idea to 'never free' a channel, considering the future
VMsock-like feature.

Thanks,
-- Dexuan

2015-02-05 15:52:57

by KY Srinivasan

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path



> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:[email protected]]
> Sent: Thursday, February 5, 2015 2:10 AM
> To: KY Srinivasan
> Cc: [email protected]; Haiyang Zhang; linux-
> [email protected]; Dexuan Cui; Jason Wang
> Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>
> KY Srinivasan <[email protected]> writes:
>
> >> -----Original Message-----
> >> From: Vitaly Kuznetsov [mailto:[email protected]]
> >> Sent: Tuesday, February 3, 2015 9:01 AM
> >> To: KY Srinivasan; [email protected]
> >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason
> >> Wang
> >> Subject: [PATCH 0/4] Drivers: hv: Further protection for the rescind
> >> path
> >>
> >> This series is a continuation of the "Drivers: hv: vmbus: serialize
> >> Offer and Rescind offer". I'm trying to address a number of
> >> theoretically possible issues with rescind offer handling. All these
> >> complications come from the fact that a rescind offer results in
> >> vmbus channel being freed and we must ensure nobody still uses it.
> >> Instead of introducing new locks I suggest we switch channels usage to
> the get/put workflow.
> >>
> >> The main part of the series is [PATCH 1/4] which introduces the
> >> workflow for vmbus channels, all other patches fix different corner
> >> cases using this workflow. I'm not sure all such cases are covered
> >> with this series (probably not), but in case protection is required
> >> in some other places it should become relatively easy to add one.
> >>
> >> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
> >> popped out, however, additional testing would be much appreciated.
> >>
> >> K.Y., Haiyang, I'm not sending this series to netdev@ and linux-scsi@
> >> as it is supposed to be applied as a whole, please resend these
> >> patches with your sign-offs when (and if) we're done with reviews.
> Thanks!
> >
> > Vitaly,
> >
> > Thanks for looking into this issue. While today, rescind offer results
> > in the freeing of the channel, I don't think that is required. By not
> > freeing up the channel in the rescind path, we can have a safe way to
> > access the channel and that does not have to involve taking a
> > reference on the channel every time you access it - the get/put
> > workflow in your patch set. As part of the network performance
> > improvement work, I had eliminated all locks in the receive path by setting
> up per-cpu data structures for mapping the relid to channel etc. These set of
> patches introduces locking/atomic operations in performance critical code
> paths to deal with an event that is truly rare - the channel getting rescinded.
>
> It is possible to eliminate all locks/atomic operations from performance
> critical pyth in my patch series by following Dexuan's suggestion - we'll get
> the channel in vmbus_open and put it in vmbus_close (and on processing
> offer/rescind offer) this won't affect performance. I'm in the middle of
> testing this approach.
>
> >
> > All channel messages are handled in a single work context:
> >
> > vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel
> > messages [offer, rescind etc.]
> >
> > So, the rescind message cannot be processed while we are processing
> > the offer message and since an offer cannot be rescinded before it is
> > offered, offer and rescind are naturally serialized (I think I have
> > patchset in my queue from you that is trying to solve the concurrent
> execution of offer and rescind and looking at the code I cannot see how this
> can occur).
> >
> > As part of handling the rescind message, we will just set the channel
> > state to indicate that the offer is rescinded (we can add the rescind state to
> the channel states already defined and this will be done under the protection
> of the channel lock).
> > The cleanup of the channel and sending of the RELID release message
> > will only be done in the context of the driver as part of driver
> > remove function. I think this should be doable in a way that does not
> penalize the normal path. If it is ok with you, I will try to put together a patch
> along the lines I have described here.
> >
>
> Yes, if we consider rescind event as a very rare event we can avoid freeing
> channels, but if (in some conditions) it happens frequently we'll have
> significant memory leakage.
>
Rescind offer is rare; but I am not suggesting that we would leak memory in this case.
What I am suggesting is that we can place the burden where it should be - do all
the cleanup in vmbus_close() driven by the remove function.

K. Y

2015-02-05 22:47:11

by KY Srinivasan

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path



> -----Original Message-----
> From: Dexuan Cui
> Sent: Thursday, February 5, 2015 4:45 AM
> To: Vitaly Kuznetsov; KY Srinivasan
> Cc: [email protected]; Haiyang Zhang; linux-
> [email protected]; Jason Wang
> Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>
> > -----Original Message-----
> > From: Vitaly Kuznetsov [mailto:[email protected]]
> > Sent: Thursday, February 5, 2015 18:10 PM
> > To: KY Srinivasan
> > Cc: [email protected]; Haiyang Zhang;
> > [email protected]; Dexuan Cui; Jason Wang
> > Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the
> > rescind path
> >
> > KY Srinivasan <[email protected]> writes:
> >
> > >> -----Original Message-----
> > >> From: Vitaly Kuznetsov [mailto:[email protected]]
> > >> Sent: Tuesday, February 3, 2015 9:01 AM
> > >> To: KY Srinivasan; [email protected]
> > >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason
> > >> Wang
> > >> Subject: [PATCH 0/4] Drivers: hv: Further protection for the
> > >> rescind path
> > >>
> > >> This series is a continuation of the "Drivers: hv: vmbus: serialize
> > >> Offer and Rescind offer". I'm trying to address a number of
> > >> theoretically possible issues with rescind offer handling. All
> > >> these complications come from the fact that a rescind offer results
> > >> in vmbus channel being freed and we must ensure nobody still uses
> > >> it. Instead of introducing new locks I suggest we switch channels usage
> to the get/put workflow.
> > >>
> > >> The main part of the series is [PATCH 1/4] which introduces the
> > >> workflow for vmbus channels, all other patches fix different corner
> > >> cases using this workflow. I'm not sure all such cases are covered
> > >> with this series (probably not), but in case protection is required
> > >> in some other places it should become relatively easy to add one.
> > >>
> > >> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
> > >> popped out, however, additional testing would be much appreciated.
> > >>
> > >> K.Y., Haiyang, I'm not sending this series to netdev@ and
> > >> linux-scsi@ as it is supposed to be applied as a whole, please
> > >> resend these patches with your sign-offs when (and if) we're done with
> reviews. Thanks!
> > >
> > > Vitaly,
> > >
> > > Thanks for looking into this issue. While today, rescind offer
> > > results in the
> > freeing of the channel, I don't think
> > > that is required. By not freeing up the channel in the rescind path,
> > > we can have
> > a safe way to access the channel and
> > > that does not have to involve taking a reference on the channel
> > > every time you
> > access it - the get/put workflow in your
> > > patch set. As part of the network performance improvement work, I
> > > had
> > eliminated all locks in the receive path by setting
> > > up per-cpu data structures for mapping the relid to channel etc.
> > > These set of
> > patches introduces locking/atomic operations
> > > in performance critical code paths to deal with an event that is
> > > truly rare - the channel getting rescinded.
> >
> > It is possible to eliminate all locks/atomic operations from
> > performance critical pyth in my patch series by following Dexuan's
> > suggestion - we'll get the channel in vmbus_open and put it in
> > vmbus_close (and on processing offer/rescind offer) this won't affect
> > performance. I'm in the middle of testing this approach.
> >
> > >
> > > All channel messages are handled in a single work context:
> > > vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel
> > messages [offer, rescind etc.]
> This is true.
>
> > >
> > > So, the rescind message cannot be processed while we are processing
> > > the
> > offer message and since an offer
> > > cannot be rescinded before it is offered, offer and rescind are
> > > naturally serialized (I think I have patchset in my queue
> IMO this may not be true.
> The cause is:
> (I'm using the latest linux-next repo, which includes Vitaly's patch
> "Drivers: hv: vmbus: serialize Offer and Rescind offer".)
>
> vmbus_onoffer_rescind() runs in vmbus_connection.work_queue, but
> vmbus_process_offer() runs in the per-channel newchannel->controlwq, so
> the two functions are not serialized, at least in theory.
>
> As a result, in vmbus_process_offer(): after the new channel is added into
> vmbus_connection.chn_list, but before the channel is completely initialized
> by us (we need to create a vmbus device and associate the device with the
> channel -- this procedure could fail and we goto err_free_chan and free the
> channel directly!), vmbus_onoffer_rescind() can see the new channel, but
> doesn't know the channel could be freed in another place at the same time.
>
> BTW, when vmbus_process_offer() -> vmbus_device_create() fails, we goto
> err_free_chan without removing the new channel from
> vmbus_connection.chn_list?
>
> As another result : in vmbus_process_offer(), in the case
> vmbus_device_register() fails, we'll run "list_del(&newchannel->listentry)
> and unlock vmbus_connection.channel_lock" -- just after these 2 lines, at
> this time,
> vmbus_onoffer_rescind() -> relid2channel() can return NULL, and we'll miss
> the rescind message, i.e., we fail to send the
> CHANNELMSG_RELID_RELEASED message to the host.

This is the way the code is; but it does not have to be. Here is a patch that implements what I have
been trying to describe. This is a rough proposal; there is additional (obvious) cleanup that I have not done.
This is on top of the patches that have been submitted to Greg.

Signed-off-by: K. Y. Srinivasan <[email protected]>
---
drivers/hv/channel.c | 8 +++++
drivers/hv/channel_mgmt.c | 63 +++++++++++++-------------------------------
drivers/hv/hv_util.c | 2 +-
drivers/hv/vmbus_drv.c | 24 ++++++++++++-----
include/linux/hyperv.h | 1 +
5 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index bf0cf8f..4a21cd8 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -501,6 +501,14 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
put_cpu();
}

+ /*
+ * If the channel has been rescinded; process device removal.
+ */
+ if (channel->rescind) {
+ hv_process_device_removal(channel);
+ return 0;
+ }
+
/* Send a closing message */

msg = &channel->close_msg.msg;
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 0ba6b5c..a208256 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -207,27 +207,11 @@ static void percpu_channel_deq(void *arg)
list_del(&channel->percpu_list);
}

-/*
- * vmbus_process_rescind_offer -
- * Rescind the offer by initiating a device removal
- */
-static void vmbus_process_rescind_offer(struct work_struct *work)
+void hv_process_device_removal(struct vmbus_channel *channel)
{
- struct vmbus_channel *channel = container_of(work,
- struct vmbus_channel,
- work);
+ struct vmbus_channel_relid_released msg;
unsigned long flags;
struct vmbus_channel *primary_channel;
- struct vmbus_channel_relid_released msg;
- struct device *dev;
-
- if (channel->device_obj) {
- dev = get_device(&channel->device_obj->device);
- if (dev) {
- vmbus_device_unregister(channel->device_obj);
- put_device(dev);
- }
- }

memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
msg.child_relid = channel->offermsg.child_relid;
@@ -256,6 +240,7 @@ static void vmbus_process_rescind_offer(struct work_struct *work)
free_channel(channel);
}

+
void vmbus_free_channels(void)
{
struct vmbus_channel *channel;
@@ -270,11 +255,8 @@ void vmbus_free_channels(void)
* vmbus_process_offer - Process the offer by creating a channel/device
* associated with this offer
*/
-static void vmbus_process_offer(struct work_struct *work)
+static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
- struct vmbus_channel *newchannel = container_of(work,
- struct vmbus_channel,
- work);
struct vmbus_channel *channel;
bool fnew = true;
bool enq = false;
@@ -340,7 +322,7 @@ static void vmbus_process_offer(struct work_struct *work)
if (channel->sc_creation_callback != NULL)
channel->sc_creation_callback(newchannel);

- goto done_init_rescind;
+ goto done;
}

goto err_free_chan;
@@ -381,15 +363,10 @@ static void vmbus_process_offer(struct work_struct *work)
kfree(newchannel->device_obj);
goto err_free_chan;
}
-done_init_rescind:
- spin_lock_irqsave(&newchannel->lock, flags);
- /* The next possible work is rescind handling */
- INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
- /* Check if rescind offer was already received */
- if (newchannel->rescind)
- queue_work(newchannel->controlwq, &newchannel->work);
- spin_unlock_irqrestore(&newchannel->lock, flags);
+
+done:
return;
+
err_free_chan:
free_channel(newchannel);
}
@@ -515,8 +492,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
newchannel->monitor_grp = (u8)offer->monitorid / 32;
newchannel->monitor_bit = (u8)offer->monitorid % 32;

- INIT_WORK(&newchannel->work, vmbus_process_offer);
- queue_work(newchannel->controlwq, &newchannel->work);
+ vmbus_process_offer(newchannel);
}

/*
@@ -529,6 +505,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
struct vmbus_channel_rescind_offer *rescind;
struct vmbus_channel *channel;
unsigned long flags;
+ struct device *dev;

rescind = (struct vmbus_channel_rescind_offer *)hdr;
channel = relid2channel(rescind->child_relid);
@@ -539,18 +516,16 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)

spin_lock_irqsave(&channel->lock, flags);
channel->rescind = true;
- /*
- * channel->work.func != vmbus_process_rescind_offer means we are still
- * processing offer request and the rescind offer processing should be
- * postponed. It will be done at the very end of vmbus_process_offer()
- * as rescind flag is being checked there.
- */
- if (channel->work.func == vmbus_process_rescind_offer)
- /* work is initialized for vmbus_process_rescind_offer() from
- * vmbus_process_offer() where the channel got created */
- queue_work(channel->controlwq, &channel->work);
-
spin_unlock_irqrestore(&channel->lock, flags);
+
+ if (channel->device_obj) {
+ dev = get_device(&channel->device_obj->device);
+ if (dev) {
+ vmbus_device_unregister(channel->device_obj);
+ put_device(dev);
+ }
+ }
+
}

/*
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index c5be773..7994ec2 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -380,9 +380,9 @@ static int util_remove(struct hv_device *dev)
{
struct hv_util_service *srv = hv_get_drvdata(dev);

- vmbus_close(dev->channel);
if (srv->util_deinit)
srv->util_deinit();
+ vmbus_close(dev->channel);
kfree(srv->recv_buffer);

return 0;
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 3cd44ae..64627c4 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -505,15 +505,25 @@ static int vmbus_probe(struct device *child_device)
*/
static int vmbus_remove(struct device *child_device)
{
- struct hv_driver *drv = drv_to_hv_drv(child_device->driver);
+ struct hv_driver *drv;
struct hv_device *dev = device_to_hv_device(child_device);

- if (drv->remove)
- drv->remove(dev);
- else
- pr_err("remove not set for driver %s\n",
- dev_name(child_device));
-
+ if (child_device->driver) {
+ drv = drv_to_hv_drv(child_device->driver);
+ if (drv->remove) {
+ drv->remove(dev);
+ } else {
+ pr_err("remove not set for driver %s\n",
+ dev_name(child_device));
+ hv_process_device_removal(dev->channel);
+ }
+ } else {
+ /*
+ * We don't have a driver for this device; deal with the
+ * rescind message.
+ */
+ hv_process_device_removal(dev->channel);
+ }
return 0;
}

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index b079abf..9658bfe 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1226,6 +1226,7 @@ void hv_kvp_onchannelcallback(void *);
int hv_vss_init(struct hv_util_service *);
void hv_vss_deinit(void);
void hv_vss_onchannelcallback(void *);
+void hv_process_device_removal(struct vmbus_channel *channel);

extern struct resource *hyperv_mmio;

--
1.7.4.1

2015-02-06 14:53:47

by Dexuan Cui

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path

> -----Original Message-----
> From: KY Srinivasan
> Sent: Friday, February 6, 2015 6:47 AM
> To: Dexuan Cui; Vitaly Kuznetsov
> Cc: [email protected]; Haiyang Zhang; [email protected];
> Jason Wang
> Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
> > -----Original Message-----
> > From: Dexuan Cui
> > Sent: Thursday, February 5, 2015 4:45 AM
> > To: Vitaly Kuznetsov; KY Srinivasan
> > Cc: [email protected]; Haiyang Zhang; linux-
> > [email protected]; Jason Wang
> > Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
> >
> > > -----Original Message-----
> > > From: Vitaly Kuznetsov [mailto:[email protected]]
> > > Sent: Thursday, February 5, 2015 18:10 PM
> > > To: KY Srinivasan
> > > Cc: [email protected]; Haiyang Zhang;
> > > [email protected]; Dexuan Cui; Jason Wang
> > > Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the
> > > rescind path
> > >
> > > KY Srinivasan <[email protected]> writes:
> > >
> > > >> -----Original Message-----
> > > >> From: Vitaly Kuznetsov [mailto:[email protected]]
> > > >> Sent: Tuesday, February 3, 2015 9:01 AM
> > > >> To: KY Srinivasan; [email protected]
> > > >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui; Jason
> > > >> Wang
> > > >> Subject: [PATCH 0/4] Drivers: hv: Further protection for the
> > > >> rescind path
> > > >>
> > > >> This series is a continuation of the "Drivers: hv: vmbus: serialize
> > > >> Offer and Rescind offer". I'm trying to address a number of
> > > >> theoretically possible issues with rescind offer handling. All
> > > >> these complications come from the fact that a rescind offer results
> > > >> in vmbus channel being freed and we must ensure nobody still uses
> > > >> it. Instead of introducing new locks I suggest we switch channels usage
> > to the get/put workflow.
> > > >>
> > > >> The main part of the series is [PATCH 1/4] which introduces the
> > > >> workflow for vmbus channels, all other patches fix different corner
> > > >> cases using this workflow. I'm not sure all such cases are covered
> > > >> with this series (probably not), but in case protection is required
> > > >> in some other places it should become relatively easy to add one.
> > > >>
> > > >> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and nothing
> > > >> popped out, however, additional testing would be much appreciated.
> > > >>
> > > >> K.Y., Haiyang, I'm not sending this series to netdev@ and
> > > >> linux-scsi@ as it is supposed to be applied as a whole, please
> > > >> resend these patches with your sign-offs when (and if) we're done with
> > reviews. Thanks!
> > > >
> > > > Vitaly,
> > > >
> > > > Thanks for looking into this issue. While today, rescind offer
> > > > results in the
> > > freeing of the channel, I don't think
> > > > that is required. By not freeing up the channel in the rescind path,
> > > > we can have
> > > a safe way to access the channel and
> > > > that does not have to involve taking a reference on the channel
> > > > every time you
> > > access it - the get/put workflow in your
> > > > patch set. As part of the network performance improvement work, I
> > > > had
> > > eliminated all locks in the receive path by setting
> > > > up per-cpu data structures for mapping the relid to channel etc.
> > > > These set of
> > > patches introduces locking/atomic operations
> > > > in performance critical code paths to deal with an event that is
> > > > truly rare - the channel getting rescinded.
> > >
> > > It is possible to eliminate all locks/atomic operations from
> > > performance critical pyth in my patch series by following Dexuan's
> > > suggestion - we'll get the channel in vmbus_open and put it in
> > > vmbus_close (and on processing offer/rescind offer) this won't affect
> > > performance. I'm in the middle of testing this approach.
> > >
> > > >
> > > > All channel messages are handled in a single work context:
> > > > vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various channel
> > > messages [offer, rescind etc.]
> > This is true.
> >
> > > >
> > > > So, the rescind message cannot be processed while we are processing
> > > > the
> > > offer message and since an offer
> > > > cannot be rescinded before it is offered, offer and rescind are
> > > > naturally serialized (I think I have patchset in my queue
> > IMO this may not be true.
> > The cause is:
> > (I'm using the latest linux-next repo, which includes Vitaly's patch
> > "Drivers: hv: vmbus: serialize Offer and Rescind offer".)
> >
> > vmbus_onoffer_rescind() runs in vmbus_connection.work_queue, but
> > vmbus_process_offer() runs in the per-channel newchannel->controlwq, so
> > the two functions are not serialized, at least in theory.
> >
> > As a result, in vmbus_process_offer(): after the new channel is added into
> > vmbus_connection.chn_list, but before the channel is completely initialized
> > by us (we need to create a vmbus device and associate the device with the
> > channel -- this procedure could fail and we goto err_free_chan and free the
> > channel directly!), vmbus_onoffer_rescind() can see the new channel, but
> > doesn't know the channel could be freed in another place at the same time.
> >
> > BTW, when vmbus_process_offer() -> vmbus_device_create() fails, we goto
> > err_free_chan without removing the new channel from
> > vmbus_connection.chn_list?
> >
> > As another result : in vmbus_process_offer(), in the case
> > vmbus_device_register() fails, we'll run "list_del(&newchannel->listentry)
> > and unlock vmbus_connection.channel_lock" -- just after these 2 lines, at
> > this time,
> > vmbus_onoffer_rescind() -> relid2channel() can return NULL, and we'll miss
> > the rescind message, i.e., we fail to send the
> > CHANNELMSG_RELID_RELEASED message to the host.
>
> This is the way the code is; but it does not have to be. Here is a patch that
> implements what I have
> been trying to describe. This is a rough proposal; there is additional (obvious)
> cleanup that I have not done.
> This is on top of the patches that have been submitted to Greg.
>
> Signed-off-by: K. Y. Srinivasan <[email protected]>

Hi KY,
The patch is great!

It effectively removes the per-channel workqueue and uses the global
vmbus_connection.work_queue to implement natural serialization.
It simplifies all the things a lot!

Two small suggestions:

1) hv_process_device_removal() -> hv_process_channel_removal().

2) I think we can revert the patch "Drivers: hv: vmbus: serialize Offer and Rescind offer"
first and then this change will be smaller and clearer. :-)

-- Dexuan


> ---
> drivers/hv/channel.c | 8 +++++
> drivers/hv/channel_mgmt.c | 63 +++++++++++++-------------------------------
> drivers/hv/hv_util.c | 2 +-
> drivers/hv/vmbus_drv.c | 24 ++++++++++++-----
> include/linux/hyperv.h | 1 +
> 5 files changed, 46 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
> index bf0cf8f..4a21cd8 100644
> --- a/drivers/hv/channel.c
> +++ b/drivers/hv/channel.c
> @@ -501,6 +501,14 @@ static int vmbus_close_internal(struct vmbus_channel
> *channel)
> put_cpu();
> }
>
> +/*
> + * If the channel has been rescinded; process device removal.
> + */
> +if (channel->rescind) {
> +hv_process_device_removal(channel);
> +return 0;
> +}
> +
> /* Send a closing message */
>
> msg = &channel->close_msg.msg;
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 0ba6b5c..a208256 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -207,27 +207,11 @@ static void percpu_channel_deq(void *arg)
> list_del(&channel->percpu_list);
> }
>
> -/*
> - * vmbus_process_rescind_offer -
> - * Rescind the offer by initiating a device removal
> - */
> -static void vmbus_process_rescind_offer(struct work_struct *work)
> +void hv_process_device_removal(struct vmbus_channel *channel)
> {
> -struct vmbus_channel *channel = container_of(work,
> - struct vmbus_channel,
> - work);
> +struct vmbus_channel_relid_released msg;
> unsigned long flags;
> struct vmbus_channel *primary_channel;
> -struct vmbus_channel_relid_released msg;
> -struct device *dev;
> -
> -if (channel->device_obj) {
> -dev = get_device(&channel->device_obj->device);
> -if (dev) {
> -vmbus_device_unregister(channel->device_obj);
> -put_device(dev);
> -}
> -}
>
> memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
> msg.child_relid = channel->offermsg.child_relid;
> @@ -256,6 +240,7 @@ static void vmbus_process_rescind_offer(struct
> work_struct *work)
> free_channel(channel);
> }
>
> +
> void vmbus_free_channels(void)
> {
> struct vmbus_channel *channel;
> @@ -270,11 +255,8 @@ void vmbus_free_channels(void)
> * vmbus_process_offer - Process the offer by creating a channel/device
> * associated with this offer
> */
> -static void vmbus_process_offer(struct work_struct *work)
> +static void vmbus_process_offer(struct vmbus_channel *newchannel)
> {
> -struct vmbus_channel *newchannel = container_of(work,
> -struct vmbus_channel,
> -work);
> struct vmbus_channel *channel;
> bool fnew = true;
> bool enq = false;
> @@ -340,7 +322,7 @@ static void vmbus_process_offer(struct work_struct
> *work)
> if (channel->sc_creation_callback != NULL)
> channel->sc_creation_callback(newchannel);
>
> -goto done_init_rescind;
> +goto done;
> }
>
> goto err_free_chan;
> @@ -381,15 +363,10 @@ static void vmbus_process_offer(struct work_struct
> *work)
> kfree(newchannel->device_obj);
> goto err_free_chan;
> }
> -done_init_rescind:
> -spin_lock_irqsave(&newchannel->lock, flags);
> -/* The next possible work is rescind handling */
> -INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
> -/* Check if rescind offer was already received */
> -if (newchannel->rescind)
> -queue_work(newchannel->controlwq, &newchannel->work);
> -spin_unlock_irqrestore(&newchannel->lock, flags);
> +
> +done:
> return;
> +
> err_free_chan:
> free_channel(newchannel);
> }
> @@ -515,8 +492,7 @@ static void vmbus_onoffer(struct
> vmbus_channel_message_header *hdr)
> newchannel->monitor_grp = (u8)offer->monitorid / 32;
> newchannel->monitor_bit = (u8)offer->monitorid % 32;
>
> -INIT_WORK(&newchannel->work, vmbus_process_offer);
> -queue_work(newchannel->controlwq, &newchannel->work);
> +vmbus_process_offer(newchannel);
> }
>
> /*
> @@ -529,6 +505,7 @@ static void vmbus_onoffer_rescind(struct
> vmbus_channel_message_header *hdr)
> struct vmbus_channel_rescind_offer *rescind;
> struct vmbus_channel *channel;
> unsigned long flags;
> +struct device *dev;
>
> rescind = (struct vmbus_channel_rescind_offer *)hdr;
> channel = relid2channel(rescind->child_relid);
> @@ -539,18 +516,16 @@ static void vmbus_onoffer_rescind(struct
> vmbus_channel_message_header *hdr)
>
> spin_lock_irqsave(&channel->lock, flags);
> channel->rescind = true;
> -/*
> - * channel->work.func != vmbus_process_rescind_offer means we are still
> - * processing offer request and the rescind offer processing should be
> - * postponed. It will be done at the very end of vmbus_process_offer()
> - * as rescind flag is being checked there.
> - */
> -if (channel->work.func == vmbus_process_rescind_offer)
> -/* work is initialized for vmbus_process_rescind_offer() from
> - * vmbus_process_offer() where the channel got created */
> -queue_work(channel->controlwq, &channel->work);
> -
> spin_unlock_irqrestore(&channel->lock, flags);
> +
> +if (channel->device_obj) {
> +dev = get_device(&channel->device_obj->device);
> +if (dev) {
> +vmbus_device_unregister(channel->device_obj);
> +put_device(dev);
> +}
> +}
> +
> }
>
> /*
> diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
> index c5be773..7994ec2 100644
> --- a/drivers/hv/hv_util.c
> +++ b/drivers/hv/hv_util.c
> @@ -380,9 +380,9 @@ static int util_remove(struct hv_device *dev)
> {
> struct hv_util_service *srv = hv_get_drvdata(dev);
>
> -vmbus_close(dev->channel);
> if (srv->util_deinit)
> srv->util_deinit();
> +vmbus_close(dev->channel);
> kfree(srv->recv_buffer);
>
> return 0;
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 3cd44ae..64627c4 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -505,15 +505,25 @@ static int vmbus_probe(struct device *child_device)
> */
> static int vmbus_remove(struct device *child_device)
> {
> -struct hv_driver *drv = drv_to_hv_drv(child_device->driver);
> +struct hv_driver *drv;
> struct hv_device *dev = device_to_hv_device(child_device);
>
> -if (drv->remove)
> -drv->remove(dev);
> -else
> -pr_err("remove not set for driver %s\n",
> -dev_name(child_device));
> -
> +if (child_device->driver) {
> +drv = drv_to_hv_drv(child_device->driver);
> +if (drv->remove) {
> +drv->remove(dev);
> +} else {
> +pr_err("remove not set for driver %s\n",
> +dev_name(child_device));
> +hv_process_device_removal(dev->channel);
> +}
> +} else {
> +/*
> + * We don't have a driver for this device; deal with the
> + * rescind message.
> + */
> +hv_process_device_removal(dev->channel);
> +}
> return 0;
> }
>
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index b079abf..9658bfe 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1226,6 +1226,7 @@ void hv_kvp_onchannelcallback(void *);
> int hv_vss_init(struct hv_util_service *);
> void hv_vss_deinit(void);
> void hv_vss_onchannelcallback(void *);
> +void hv_process_device_removal(struct vmbus_channel *channel);
>
> extern struct resource *hyperv_mmio;
>
> --
> 1.7.4.1
>

2015-02-07 16:27:20

by KY Srinivasan

[permalink] [raw]
Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path



> -----Original Message-----
> From: Dexuan Cui
> Sent: Friday, February 6, 2015 6:54 AM
> To: KY Srinivasan; Vitaly Kuznetsov
> Cc: [email protected]; Haiyang Zhang; linux-
> [email protected]; Jason Wang
> Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the rescind path
>
> > -----Original Message-----
> > From: KY Srinivasan
> > Sent: Friday, February 6, 2015 6:47 AM
> > To: Dexuan Cui; Vitaly Kuznetsov
> > Cc: [email protected]; Haiyang Zhang;
> > [email protected]; Jason Wang
> > Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the
> > rescind path
> > > -----Original Message-----
> > > From: Dexuan Cui
> > > Sent: Thursday, February 5, 2015 4:45 AM
> > > To: Vitaly Kuznetsov; KY Srinivasan
> > > Cc: [email protected]; Haiyang Zhang; linux-
> > > [email protected]; Jason Wang
> > > Subject: RE: [PATCH 0/4] Drivers: hv: Further protection for the
> > > rescind path
> > >
> > > > -----Original Message-----
> > > > From: Vitaly Kuznetsov [mailto:[email protected]]
> > > > Sent: Thursday, February 5, 2015 18:10 PM
> > > > To: KY Srinivasan
> > > > Cc: [email protected]; Haiyang Zhang;
> > > > [email protected]; Dexuan Cui; Jason Wang
> > > > Subject: Re: [PATCH 0/4] Drivers: hv: Further protection for the
> > > > rescind path
> > > >
> > > > KY Srinivasan <[email protected]> writes:
> > > >
> > > > >> -----Original Message-----
> > > > >> From: Vitaly Kuznetsov [mailto:[email protected]]
> > > > >> Sent: Tuesday, February 3, 2015 9:01 AM
> > > > >> To: KY Srinivasan; [email protected]
> > > > >> Cc: Haiyang Zhang; [email protected]; Dexuan Cui;
> > > > >> Jason Wang
> > > > >> Subject: [PATCH 0/4] Drivers: hv: Further protection for the
> > > > >> rescind path
> > > > >>
> > > > >> This series is a continuation of the "Drivers: hv: vmbus:
> > > > >> serialize Offer and Rescind offer". I'm trying to address a
> > > > >> number of theoretically possible issues with rescind offer
> > > > >> handling. All these complications come from the fact that a
> > > > >> rescind offer results in vmbus channel being freed and we must
> > > > >> ensure nobody still uses it. Instead of introducing new locks I
> > > > >> suggest we switch channels usage
> > > to the get/put workflow.
> > > > >>
> > > > >> The main part of the series is [PATCH 1/4] which introduces the
> > > > >> workflow for vmbus channels, all other patches fix different
> > > > >> corner cases using this workflow. I'm not sure all such cases
> > > > >> are covered with this series (probably not), but in case
> > > > >> protection is required in some other places it should become
> relatively easy to add one.
> > > > >>
> > > > >> I did some sanity testing with CONFIG_DEBUG_LOCKDEP=y and
> > > > >> nothing popped out, however, additional testing would be much
> appreciated.
> > > > >>
> > > > >> K.Y., Haiyang, I'm not sending this series to netdev@ and
> > > > >> linux-scsi@ as it is supposed to be applied as a whole, please
> > > > >> resend these patches with your sign-offs when (and if) we're
> > > > >> done with
> > > reviews. Thanks!
> > > > >
> > > > > Vitaly,
> > > > >
> > > > > Thanks for looking into this issue. While today, rescind offer
> > > > > results in the
> > > > freeing of the channel, I don't think
> > > > > that is required. By not freeing up the channel in the rescind
> > > > > path, we can have
> > > > a safe way to access the channel and
> > > > > that does not have to involve taking a reference on the channel
> > > > > every time you
> > > > access it - the get/put workflow in your
> > > > > patch set. As part of the network performance improvement work,
> > > > > I had
> > > > eliminated all locks in the receive path by setting
> > > > > up per-cpu data structures for mapping the relid to channel etc.
> > > > > These set of
> > > > patches introduces locking/atomic operations
> > > > > in performance critical code paths to deal with an event that is
> > > > > truly rare - the channel getting rescinded.
> > > >
> > > > It is possible to eliminate all locks/atomic operations from
> > > > performance critical pyth in my patch series by following Dexuan's
> > > > suggestion - we'll get the channel in vmbus_open and put it in
> > > > vmbus_close (and on processing offer/rescind offer) this won't
> > > > affect performance. I'm in the middle of testing this approach.
> > > >
> > > > >
> > > > > All channel messages are handled in a single work context:
> > > > > vmbus_on_msg_dpc() -> vmbus_onmessage_work()-> Various
> channel
> > > > messages [offer, rescind etc.]
> > > This is true.
> > >
> > > > >
> > > > > So, the rescind message cannot be processed while we are
> > > > > processing the
> > > > offer message and since an offer
> > > > > cannot be rescinded before it is offered, offer and rescind are
> > > > > naturally serialized (I think I have patchset in my queue
> > > IMO this may not be true.
> > > The cause is:
> > > (I'm using the latest linux-next repo, which includes Vitaly's patch
> > > "Drivers: hv: vmbus: serialize Offer and Rescind offer".)
> > >
> > > vmbus_onoffer_rescind() runs in vmbus_connection.work_queue, but
> > > vmbus_process_offer() runs in the per-channel newchannel->controlwq,
> > > so the two functions are not serialized, at least in theory.
> > >
> > > As a result, in vmbus_process_offer(): after the new channel is
> > > added into vmbus_connection.chn_list, but before the channel is
> > > completely initialized by us (we need to create a vmbus device and
> > > associate the device with the channel -- this procedure could fail
> > > and we goto err_free_chan and free the channel directly!),
> > > vmbus_onoffer_rescind() can see the new channel, but doesn't know
> the channel could be freed in another place at the same time.
> > >
> > > BTW, when vmbus_process_offer() -> vmbus_device_create() fails, we
> > > goto err_free_chan without removing the new channel from
> > > vmbus_connection.chn_list?
> > >
> > > As another result : in vmbus_process_offer(), in the case
> > > vmbus_device_register() fails, we'll run
> > > "list_del(&newchannel->listentry) and unlock
> > > vmbus_connection.channel_lock" -- just after these 2 lines, at this
> > > time,
> > > vmbus_onoffer_rescind() -> relid2channel() can return NULL, and
> > > we'll miss the rescind message, i.e., we fail to send the
> > > CHANNELMSG_RELID_RELEASED message to the host.
> >
> > This is the way the code is; but it does not have to be. Here is a
> > patch that implements what I have been trying to describe. This is a
> > rough proposal; there is additional (obvious) cleanup that I have not
> > done.
> > This is on top of the patches that have been submitted to Greg.
> >
> > Signed-off-by: K. Y. Srinivasan <[email protected]>
>
> Hi KY,
> The patch is great!
>
> It effectively removes the per-channel workqueue and uses the global
> vmbus_connection.work_queue to implement natural serialization.
> It simplifies all the things a lot!
>
> Two small suggestions:
>
> 1) hv_process_device_removal() -> hv_process_channel_removal().
>
> 2) I think we can revert the patch "Drivers: hv: vmbus: serialize Offer and
> Rescind offer"
> first and then this change will be smaller and clearer. :-)

Thanks Dexuan. I will submit a cleaned up version of this patch. Even if we cannot map
The relid to a channel, we can post the relid release message back to the host. I will make
this adjustment and send the patch out.

Thank you,

K. Y
>
> -- Dexuan
>
>
> > ---
> > drivers/hv/channel.c | 8 +++++
> > drivers/hv/channel_mgmt.c | 63 +++++++++++++-----------------------------
> --
> > drivers/hv/hv_util.c | 2 +-
> > drivers/hv/vmbus_drv.c | 24 ++++++++++++-----
> > include/linux/hyperv.h | 1 +
> > 5 files changed, 46 insertions(+), 52 deletions(-)
> >
> > diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index
> > bf0cf8f..4a21cd8 100644
> > --- a/drivers/hv/channel.c
> > +++ b/drivers/hv/channel.c
> > @@ -501,6 +501,14 @@ static int vmbus_close_internal(struct
> > vmbus_channel
> > *channel)
> > put_cpu();
> > }
> >
> > +/*
> > + * If the channel has been rescinded; process device removal.
> > + */
> > +if (channel->rescind) {
> > +hv_process_device_removal(channel);
> > +return 0;
> > +}
> > +
> > /* Send a closing message */
> >
> > msg = &channel->close_msg.msg;
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index 0ba6b5c..a208256 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -207,27 +207,11 @@ static void percpu_channel_deq(void *arg)
> > list_del(&channel->percpu_list); }
> >
> > -/*
> > - * vmbus_process_rescind_offer -
> > - * Rescind the offer by initiating a device removal
> > - */
> > -static void vmbus_process_rescind_offer(struct work_struct *work)
> > +void hv_process_device_removal(struct vmbus_channel *channel)
> > {
> > -struct vmbus_channel *channel = container_of(work,
> > - struct vmbus_channel,
> > - work);
> > +struct vmbus_channel_relid_released msg;
> > unsigned long flags;
> > struct vmbus_channel *primary_channel; -struct
> > vmbus_channel_relid_released msg; -struct device *dev;
> > -
> > -if (channel->device_obj) {
> > -dev = get_device(&channel->device_obj->device);
> > -if (dev) {
> > -vmbus_device_unregister(channel->device_obj);
> > -put_device(dev);
> > -}
> > -}
> >
> > memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
> > msg.child_relid = channel->offermsg.child_relid; @@ -256,6 +240,7 @@
> > static void vmbus_process_rescind_offer(struct
> > work_struct *work)
> > free_channel(channel);
> > }
> >
> > +
> > void vmbus_free_channels(void)
> > {
> > struct vmbus_channel *channel;
> > @@ -270,11 +255,8 @@ void vmbus_free_channels(void)
> > * vmbus_process_offer - Process the offer by creating a channel/device
> > * associated with this offer
> > */
> > -static void vmbus_process_offer(struct work_struct *work)
> > +static void vmbus_process_offer(struct vmbus_channel *newchannel)
> > {
> > -struct vmbus_channel *newchannel = container_of(work, -struct
> > vmbus_channel, -work); struct vmbus_channel *channel; bool fnew =
> > true; bool enq = false; @@ -340,7 +322,7 @@ static void
> > vmbus_process_offer(struct work_struct
> > *work)
> > if (channel->sc_creation_callback != NULL)
> > channel->sc_creation_callback(newchannel);
> >
> > -goto done_init_rescind;
> > +goto done;
> > }
> >
> > goto err_free_chan;
> > @@ -381,15 +363,10 @@ static void vmbus_process_offer(struct
> > work_struct
> > *work)
> > kfree(newchannel->device_obj);
> > goto err_free_chan;
> > }
> > -done_init_rescind:
> > -spin_lock_irqsave(&newchannel->lock, flags);
> > -/* The next possible work is rescind handling */
> > -INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
> > -/* Check if rescind offer was already received */ -if
> > (newchannel->rescind) -queue_work(newchannel->controlwq,
> > &newchannel->work); -spin_unlock_irqrestore(&newchannel->lock, flags);
> > +
> > +done:
> > return;
> > +
> > err_free_chan:
> > free_channel(newchannel);
> > }
> > @@ -515,8 +492,7 @@ static void vmbus_onoffer(struct
> > vmbus_channel_message_header *hdr) newchannel->monitor_grp =
> > (u8)offer->monitorid / 32; newchannel->monitor_bit =
> > (u8)offer->monitorid % 32;
> >
> > -INIT_WORK(&newchannel->work, vmbus_process_offer);
> > -queue_work(newchannel->controlwq, &newchannel->work);
> > +vmbus_process_offer(newchannel);
> > }
> >
> > /*
> > @@ -529,6 +505,7 @@ static void vmbus_onoffer_rescind(struct
> > vmbus_channel_message_header *hdr) struct
> vmbus_channel_rescind_offer
> > *rescind; struct vmbus_channel *channel; unsigned long flags;
> > +struct device *dev;
> >
> > rescind = (struct vmbus_channel_rescind_offer *)hdr; channel =
> > relid2channel(rescind->child_relid);
> > @@ -539,18 +516,16 @@ static void vmbus_onoffer_rescind(struct
> > vmbus_channel_message_header *hdr)
> >
> > spin_lock_irqsave(&channel->lock, flags); channel->rescind = true;
> > -/*
> > - * channel->work.func != vmbus_process_rescind_offer means we are
> > still
> > - * processing offer request and the rescind offer processing should
> > be
> > - * postponed. It will be done at the very end of
> > vmbus_process_offer()
> > - * as rescind flag is being checked there.
> > - */
> > -if (channel->work.func == vmbus_process_rescind_offer)
> > -/* work is initialized for vmbus_process_rescind_offer() from
> > - * vmbus_process_offer() where the channel got created */
> > -queue_work(channel->controlwq, &channel->work);
> > -
> > spin_unlock_irqrestore(&channel->lock, flags);
> > +
> > +if (channel->device_obj) {
> > +dev = get_device(&channel->device_obj->device);
> > +if (dev) {
> > +vmbus_device_unregister(channel->device_obj);
> > +put_device(dev);
> > +}
> > +}
> > +
> > }
> >
> > /*
> > diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index
> > c5be773..7994ec2 100644
> > --- a/drivers/hv/hv_util.c
> > +++ b/drivers/hv/hv_util.c
> > @@ -380,9 +380,9 @@ static int util_remove(struct hv_device *dev) {
> > struct hv_util_service *srv = hv_get_drvdata(dev);
> >
> > -vmbus_close(dev->channel);
> > if (srv->util_deinit)
> > srv->util_deinit();
> > +vmbus_close(dev->channel);
> > kfree(srv->recv_buffer);
> >
> > return 0;
> > diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index
> > 3cd44ae..64627c4 100644
> > --- a/drivers/hv/vmbus_drv.c
> > +++ b/drivers/hv/vmbus_drv.c
> > @@ -505,15 +505,25 @@ static int vmbus_probe(struct device
> *child_device)
> > */
> > static int vmbus_remove(struct device *child_device) { -struct
> > hv_driver *drv = drv_to_hv_drv(child_device->driver);
> > +struct hv_driver *drv;
> > struct hv_device *dev = device_to_hv_device(child_device);
> >
> > -if (drv->remove)
> > -drv->remove(dev);
> > -else
> > -pr_err("remove not set for driver %s\n", -dev_name(child_device));
> > -
> > +if (child_device->driver) {
> > +drv = drv_to_hv_drv(child_device->driver);
> > +if (drv->remove) {
> > +drv->remove(dev);
> > +} else {
> > +pr_err("remove not set for driver %s\n", dev_name(child_device));
> > +hv_process_device_removal(dev->channel);
> > +}
> > +} else {
> > +/*
> > + * We don't have a driver for this device; deal with the
> > + * rescind message.
> > + */
> > +hv_process_device_removal(dev->channel);
> > +}
> > return 0;
> > }
> >
> > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > b079abf..9658bfe 100644
> > --- a/include/linux/hyperv.h
> > +++ b/include/linux/hyperv.h
> > @@ -1226,6 +1226,7 @@ void hv_kvp_onchannelcallback(void *); int
> > hv_vss_init(struct hv_util_service *); void hv_vss_deinit(void);
> > void hv_vss_onchannelcallback(void *);
> > +void hv_process_device_removal(struct vmbus_channel *channel);
> >
> > extern struct resource *hyperv_mmio;
> >
> > --
> > 1.7.4.1
> >
>