2018-03-28 00:51:11

by Long Li

Subject: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

From: Long Li <[email protected]>

Netvsc has a function to calculate the percentage of the ring buffer
that is available to write. This function is also useful for storvsc and
other vmbus devices.

Define a similar function in vmbus to be used by other vmbus devices.

Signed-off-by: Long Li <[email protected]>
---
drivers/hv/ring_buffer.c | 2 ++
include/linux/hyperv.h | 12 ++++++++++++
2 files changed, 14 insertions(+)

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 8699bb969e7e..3c836c099a8f 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -227,6 +227,8 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer->feature_bits.value = 1;

ring_info->ring_size = page_cnt << PAGE_SHIFT;
+ ring_info->ring_size_div10_reciprocal =
+ reciprocal_value(ring_info->ring_size / 10);
ring_info->ring_datasize = ring_info->ring_size -
sizeof(struct hv_ring_buffer);

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2048f3c3b68a..eb7204851089 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -36,6 +36,7 @@
#include <linux/device.h>
#include <linux/mod_devicetable.h>
#include <linux/interrupt.h>
+#include <linux/reciprocal_div.h>

#define MAX_PAGE_BUFFER_COUNT 32
#define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */
@@ -121,6 +122,7 @@ struct hv_ring_buffer {
struct hv_ring_buffer_info {
struct hv_ring_buffer *ring_buffer;
u32 ring_size; /* Include the shared header */
+ struct reciprocal_value ring_size_div10_reciprocal;
spinlock_t ring_lock;

u32 ring_datasize; /* < ring_size */
@@ -155,6 +157,16 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi)
return write;
}

+static inline u32 hv_get_avail_to_write_percent(
+ const struct hv_ring_buffer_info *rbi)
+{
+ u32 avail_write = hv_get_bytes_to_write(rbi);
+
+ return reciprocal_divide(
+ (avail_write << 3) + (avail_write << 1),
+ rbi->ring_size_div10_reciprocal);
+}
+
/*
* VMBUS version is 32 bit entity broken up into
* two 16 bit quantities: major_number. minor_number.
--
2.14.1
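
For reference, the shifts in hv_get_avail_to_write_percent() compute
avail_write * 10, and the precomputed reciprocal divides by ring_size / 10,
so the result is approximately avail_write * 100 / ring_size. A minimal
userspace sketch of the equivalent arithmetic (the function and variable
names below are illustrative only, not part of the patch):

    #include <stdio.h>

    /* Plain-C equivalent of the percentage math; the kernel code uses
     * reciprocal_divide() with a precomputed reciprocal to avoid a
     * runtime division on the I/O path. */
    static unsigned int avail_to_write_percent(unsigned int avail_write,
                                               unsigned int ring_size)
    {
            return (avail_write * 10) / (ring_size / 10);   /* ~0..100 */
    }

    int main(void)
    {
            /* e.g. 16 KiB free in a 40 KiB ring -> 40 */
            printf("%u\n", avail_to_write_percent(16 * 1024, 40 * 1024));
            return 0;
    }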



2018-03-28 00:52:03

by Long Li

Subject: [Resend Patch 2/3] Netvsc: Use the vmbus function to calculate ring buffer percentage

From: Long Li <[email protected]>

In Vmbus, we have defined a function to calculate the percentage of the
ring buffer that is available to write.

Use that function and remove netvsc's private version.

Signed-off-by: Long Li <[email protected]>
---
drivers/net/hyperv/hyperv_net.h | 1 -
drivers/net/hyperv/netvsc.c | 17 +++--------------
drivers/net/hyperv/netvsc_drv.c | 3 ---
3 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index cd538d5a7986..a0199ab13d67 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -189,7 +189,6 @@ struct netvsc_device;
struct net_device_context;

extern u32 netvsc_ring_bytes;
-extern struct reciprocal_value netvsc_ring_reciprocal;

struct netvsc_device *netvsc_device_add(struct hv_device *device,
const struct netvsc_device_info *info);
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 0265d703eb03..8af0069e4d8c 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -31,7 +31,6 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
-#include <linux/reciprocal_div.h>

#include <asm/sync_bitops.h>

@@ -590,17 +589,6 @@ void netvsc_device_remove(struct hv_device *device)
#define RING_AVAIL_PERCENT_HIWATER 20
#define RING_AVAIL_PERCENT_LOWATER 10

-/*
- * Get the percentage of available bytes to write in the ring.
- * The return value is in range from 0 to 100.
- */
-static u32 hv_ringbuf_avail_percent(const struct hv_ring_buffer_info *ring_info)
-{
- u32 avail_write = hv_get_bytes_to_write(ring_info);
-
- return reciprocal_divide(avail_write * 100, netvsc_ring_reciprocal);
-}
-
static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
u32 index)
{
@@ -649,7 +637,8 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
wake_up(&net_device->wait_drain);

if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
- (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
+ (hv_get_avail_to_write_percent(&channel->outbound) >
+ RING_AVAIL_PERCENT_HIWATER ||
queue_sends < 1)) {
netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
ndev_ctx->eth_stats.wake_queue++;
@@ -757,7 +746,7 @@ static inline int netvsc_send_pkt(
struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
u64 req_id;
int ret;
- u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
+ u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);

nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
if (skb)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index faea0be18924..b0b1c2fd2b7b 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -35,7 +35,6 @@
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/netpoll.h>
-#include <linux/reciprocal_div.h>

#include <net/arp.h>
#include <net/route.h>
@@ -55,7 +54,6 @@ static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
unsigned int netvsc_ring_bytes __ro_after_init;
-struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;

static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
NETIF_MSG_LINK | NETIF_MSG_IFUP |
@@ -2186,7 +2184,6 @@ static int __init netvsc_drv_init(void)
ring_size);
}
netvsc_ring_bytes = ring_size * PAGE_SIZE;
- netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);

ret = vmbus_driver_register(&netvsc_drv);
if (ret)
--
2.14.1


2018-03-28 00:52:25

by Long Li

Subject: [Resend Patch 3/3] Storvsc: Select channel based on available percentage of ring buffer to write

From: Long Li <[email protected]>

This is a best-effort estimate of how busy the ring buffer of a channel
is, based on the percentage of the buffer that is available to write. It
is still possible that, at the time of the actual ring buffer write, the
space is no longer available because other processes may be writing at
the same time.

Selecting a channel based on how full it is reduces the chance that a
ring buffer write will fail and avoids overloading a single channel.

Now it's possible that storvsc can use a smaller ring buffer size
(e.g. 40k bytes) to take advantage of cache locality.

Signed-off-by: Long Li <[email protected]>
---
drivers/scsi/storvsc_drv.c | 62 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index a2ec0bc9e9fa..b1a87072b3ab 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -395,6 +395,12 @@ MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");

module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
+
+static int ring_avail_percent_lowater = 10;
+module_param(ring_avail_percent_lowater, int, S_IRUGO);
+MODULE_PARM_DESC(ring_avail_percent_lowater,
+ "Select a channel if available ring size > this in percent");
+
/*
* Timeout in seconds for all devices managed by this driver.
*/
@@ -1285,9 +1291,9 @@ static int storvsc_do_io(struct hv_device *device,
{
struct storvsc_device *stor_device;
struct vstor_packet *vstor_packet;
- struct vmbus_channel *outgoing_channel;
+ struct vmbus_channel *outgoing_channel, *channel;
int ret = 0;
- struct cpumask alloced_mask;
+ struct cpumask alloced_mask, other_numa_mask;
int tgt_cpu;

vstor_packet = &request->vstor_packet;
@@ -1301,22 +1307,53 @@ static int storvsc_do_io(struct hv_device *device,
/*
* Select an an appropriate channel to send the request out.
*/
-
if (stor_device->stor_chns[q_num] != NULL) {
outgoing_channel = stor_device->stor_chns[q_num];
- if (outgoing_channel->target_cpu == smp_processor_id()) {
+ if (outgoing_channel->target_cpu == q_num) {
/*
* Ideally, we want to pick a different channel if
* available on the same NUMA node.
*/
cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
cpumask_of_node(cpu_to_node(q_num)));
- for_each_cpu_wrap(tgt_cpu, &alloced_mask,
- outgoing_channel->target_cpu + 1) {
- if (tgt_cpu != outgoing_channel->target_cpu) {
- outgoing_channel =
- stor_device->stor_chns[tgt_cpu];
- break;
+
+ for_each_cpu_wrap(tgt_cpu, &alloced_mask, q_num + 1) {
+ if (tgt_cpu == q_num)
+ continue;
+ channel = stor_device->stor_chns[tgt_cpu];
+ if (hv_get_avail_to_write_percent(
+ &channel->outbound)
+ > ring_avail_percent_lowater) {
+ outgoing_channel = channel;
+ goto found_channel;
+ }
+ }
+
+ /*
+ * All the other channels on the same NUMA node are
+ * busy. Try to use the channel on the current CPU
+ */
+ if (hv_get_avail_to_write_percent(
+ &outgoing_channel->outbound)
+ > ring_avail_percent_lowater)
+ goto found_channel;
+
+ /*
+ * If we reach here, all the channels on the current
+ * NUMA node are busy. Try to find a channel in
+ * other NUMA nodes
+ */
+ cpumask_andnot(&other_numa_mask,
+ &stor_device->alloced_cpus,
+ cpumask_of_node(cpu_to_node(q_num)));
+
+ for_each_cpu(tgt_cpu, &other_numa_mask) {
+ channel = stor_device->stor_chns[tgt_cpu];
+ if (hv_get_avail_to_write_percent(
+ &channel->outbound)
+ > ring_avail_percent_lowater) {
+ outgoing_channel = channel;
+ goto found_channel;
}
}
}
@@ -1324,7 +1361,7 @@ static int storvsc_do_io(struct hv_device *device,
outgoing_channel = get_og_chn(stor_device, q_num);
}

-
+found_channel:
vstor_packet->flags |= REQUEST_COMPLETION_FLAG;

vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) -
@@ -1733,7 +1770,8 @@ static int storvsc_probe(struct hv_device *device,
}

scsi_driver.can_queue = (max_outstanding_req_per_channel *
- (max_sub_channels + 1));
+ (max_sub_channels + 1)) *
+ (100 - ring_avail_percent_lowater) / 100;

host = scsi_host_alloc(&scsi_driver,
sizeof(struct hv_host_device));
--
2.14.1
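
In outline, the selection order this patch introduces is: another channel on
the same NUMA node with free ring space above the low water mark, then the
default channel for the queue, then a channel on another NUMA node, and
finally the default channel even if it looks full. A simplified userspace
sketch of that priority order follows; pick_channel(), percent_free[] and
same_node[] are illustrative names, not driver code (the driver iterates
cpumasks and calls hv_get_avail_to_write_percent(), and only runs this
search when the default channel's target CPU matches the submitting queue):

    #include <stdio.h>
    #include <stdbool.h>

    #define LOWATER 10      /* ring_avail_percent_lowater default */

    /* percent_free[i] stands in for hv_get_avail_to_write_percent() on
     * channel i's outbound ring; same_node[i] marks channels whose target
     * CPU is on the request's NUMA node; def is the default channel. */
    static int pick_channel(const unsigned int percent_free[],
                            const bool same_node[], int nr_chns, int def)
    {
            int i;

            /* 1. Another same-node channel with room above the low water mark. */
            for (i = 0; i < nr_chns; i++)
                    if (i != def && same_node[i] && percent_free[i] > LOWATER)
                            return i;

            /* 2. The default channel for this queue, if it has room. */
            if (percent_free[def] > LOWATER)
                    return def;

            /* 3. Any channel on another NUMA node with room. */
            for (i = 0; i < nr_chns; i++)
                    if (!same_node[i] && percent_free[i] > LOWATER)
                            return i;

            /* 4. Everything looks busy; fall back to the default channel. */
            return def;
    }

    int main(void)
    {
            /* Channel 0 is the default and nearly full; channel 1 is on the
             * same node with plenty of room, so it gets picked. */
            unsigned int free_pct[3] = { 5, 60, 80 };
            bool same_node[3] = { true, true, false };

            printf("%d\n", pick_channel(free_pct, same_node, 3, 0)); /* 1 */
            return 0;
    }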


2018-03-28 22:05:04

by Martin K. Petersen

Subject: Re: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage


Long,

> Netvsc has a function to calculate the percentage of the ring buffer
> that is available to write. This function is also useful for storvsc and
> other vmbus devices.

What is the submission strategy for this series? Do you expect it to go
through net or scsi? If the latter, I'll need an ack from davem.

--
Martin K. Petersen Oracle Linux Engineering

2018-03-28 22:44:04

by Long Li

Subject: RE: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

> Subject: Re: [Resend Patch 1/3] Vmbus: Add function to report available ring
> buffer to write in total ring size percentage
>
>
> Long,
>
> > Netvsc has a function to calculate the percentage of the ring buffer
> > that is available to write. This function is also useful for storvsc
> > and other vmbus devices.
>
> What is the submission strategy for this series? Do you expect it to go
> through net or scsi? If the latter, I'll need an ack from davem.

Martin,

I hope this patch set goes through SCSI, because its purpose is to improve storvsc.

If this strategy is not possible, I can resubmit the 1st two patches to net, and the 3rd patch to scsi after the 1st two are merged.

Long

>
> --
> Martin K. Petersen Oracle Linux Engineering

2018-03-28 23:24:20

by Stephen Hemminger

Subject: RE: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

Maybe best through Greg's char-misc tree since it has generic hv code and sometimes updates both network and scsi.

Alternatively, put common code through one tree, and hold off the network device change till next release.

-----Original Message-----
From: Long Li
Sent: Wednesday, March 28, 2018 3:43 PM
To: Martin K. Petersen <[email protected]>; Long Li <[email protected]>
Cc: KY Srinivasan <[email protected]>; Haiyang Zhang <[email protected]>; Stephen Hemminger <[email protected]>; James E . J . Bottomley <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]
Subject: RE: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage

> Subject: Re: [Resend Patch 1/3] Vmbus: Add function to report available ring
> buffer to write in total ring size percentage
>
>
> Long,
>
> > Netvsc has a function to calculate the percentage of the ring buffer
> > that is available to write. This function is also useful for storvsc
> > and other vmbus devices.
>
> What is the submission strategy for this series? Do you expect it to go
> through net or scsi? If the latter, I'll need an ack from davem.

Martin,

I hope this patch set goes through SCSI, because its purpose is to improve storvsc.

If this strategy is not possible, I can resubmit the 1st two patches to net, and the 3rd patch to scsi after the 1st two are merged.

Long

>
> --
> Martin K. Petersen Oracle Linux Engineering

2018-04-10 03:03:31

by Martin K. Petersen

Subject: Re: [Resend Patch 1/3] Vmbus: Add function to report available ring buffer to write in total ring size percentage


Long,

> I hope this patch set goes through SCSI, because its purpose is to
> improve storvsc.
>
> If this strategy is not possible, I can resubmit the 1st two patches to
> net, and the 3rd patch to scsi after the 1st two are merged.

Applied to my staging tree for 4.18/scsi-queue. Thanks!

--
Martin K. Petersen Oracle Linux Engineering

2018-04-13 23:29:30

by Michael Kelley (EOSG)

Subject: RE: [Resend Patch 3/3] Storvsc: Select channel based on available percentage of ring buffer to write

> -----Original Message-----
> From: [email protected] <[email protected]> On Behalf
> Of Long Li
> Sent: Tuesday, March 27, 2018 5:49 PM
> To: KY Srinivasan <[email protected]>; Haiyang Zhang <[email protected]>; Stephen
> Hemminger <[email protected]>; James E . J . Bottomley <[email protected]>;
> Martin K . Petersen <[email protected]>; [email protected]; linux-
> [email protected]; [email protected]; [email protected]
> Cc: Long Li <[email protected]>
> Subject: [Resend Patch 3/3] Storvsc: Select channel based on available percentage of ring
> buffer to write
>
> From: Long Li <[email protected]>
>
> This is a best-effort estimate of how busy the ring buffer of a channel
> is, based on the percentage of the buffer that is available to write. It
> is still possible that, at the time of the actual ring buffer write, the
> space is no longer available because other processes may be writing at
> the same time.
>
> Selecting a channel based on how full it is reduces the chance that a
> ring buffer write will fail and avoids overloading a single channel.
>
> Now it's possible that storvsc can use a smaller ring buffer size
> (e.g. 40k bytes) to take advantage of cache locality.
>
> Signed-off-by: Long Li <[email protected]>
> ---
> drivers/scsi/storvsc_drv.c | 62 +++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 50 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index a2ec0bc9e9fa..b1a87072b3ab 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -395,6 +395,12 @@ MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");
>
> module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
> MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
> +
> +static int ring_avail_percent_lowater = 10;

Reserving 10% of each ring buffer by default seems like more than is needed
in the storvsc driver. That would be about 4Kbytes for the 40K ring buffer
you suggest, and even more for a ring buffer of 128K. Each outgoing record
is only about 344 bytes (I'd have to check exactly). With the new channel
selection algorithm below, the only time we use a channel that is already
below the low water mark is when no channel could be found that is above
the low water mark. There could be a case of two or more threads deciding
that a channel is above the low water mark at the same time and both
choosing it, but that's likely to be rare. So it seems like we could set the
default low water mark to 5 percent or even 3 percent, which will let more
of the ring buffer be used, and let a channel be assigned according to the
algorithm, rather than falling through to the default because all channels
appear to be "full".
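(As a rough check of these figures, taking the ~344-byte record size above
as given: 10% of a 40K ring is 4096 bytes, or roughly a dozen outstanding
records, while 10% of a 128K ring is about 13K, or roughly 38 records.)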

> +module_param(ring_avail_percent_lowater, int, S_IRUGO);
> +MODULE_PARM_DESC(ring_avail_percent_lowater,
> + "Select a channel if available ring size > this in percent");
> +
> /*
> * Timeout in seconds for all devices managed by this driver.
> */
> @@ -1285,9 +1291,9 @@ static int storvsc_do_io(struct hv_device *device,
> {
> struct storvsc_device *stor_device;
> struct vstor_packet *vstor_packet;
> - struct vmbus_channel *outgoing_channel;
> + struct vmbus_channel *outgoing_channel, *channel;
> int ret = 0;
> - struct cpumask alloced_mask;
> + struct cpumask alloced_mask, other_numa_mask;
> int tgt_cpu;
>
> vstor_packet = &request->vstor_packet;
> @@ -1301,22 +1307,53 @@ static int storvsc_do_io(struct hv_device *device,
> /*
> * Select an an appropriate channel to send the request out.
> */
> -
> if (stor_device->stor_chns[q_num] != NULL) {
> outgoing_channel = stor_device->stor_chns[q_num];
> - if (outgoing_channel->target_cpu == smp_processor_id()) {
> + if (outgoing_channel->target_cpu == q_num) {
> /*
> * Ideally, we want to pick a different channel if
> * available on the same NUMA node.
> */
> cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
> cpumask_of_node(cpu_to_node(q_num)));
> - for_each_cpu_wrap(tgt_cpu, &alloced_mask,
> - outgoing_channel->target_cpu + 1) {
> - if (tgt_cpu != outgoing_channel->target_cpu) {
> - outgoing_channel =
> - stor_device->stor_chns[tgt_cpu];
> - break;
> +
> + for_each_cpu_wrap(tgt_cpu, &alloced_mask, q_num + 1) {
> + if (tgt_cpu == q_num)
> + continue;
> + channel = stor_device->stor_chns[tgt_cpu];
> + if (hv_get_avail_to_write_percent(
> + &channel->outbound)
> + > ring_avail_percent_lowater) {
> + outgoing_channel = channel;
> + goto found_channel;
> + }
> + }
> +
> + /*
> + * All the other channels on the same NUMA node are
> + * busy. Try to use the channel on the current CPU
> + */
> + if (hv_get_avail_to_write_percent(
> + &outgoing_channel->outbound)
> + > ring_avail_percent_lowater)
> + goto found_channel;
> +
> + /*
> + * If we reach here, all the channels on the current
> + * NUMA node are busy. Try to find a channel in
> + * other NUMA nodes
> + */
> + cpumask_andnot(&other_numa_mask,
> + &stor_device->alloced_cpus,
> + cpumask_of_node(cpu_to_node(q_num)));
> +
> + for_each_cpu(tgt_cpu, &other_numa_mask) {
> + channel = stor_device->stor_chns[tgt_cpu];
> + if (hv_get_avail_to_write_percent(
> + &channel->outbound)
> + > ring_avail_percent_lowater) {
> + outgoing_channel = channel;
> + goto found_channel;
> }
> }
> }
> @@ -1324,7 +1361,7 @@ static int storvsc_do_io(struct hv_device *device,
> outgoing_channel = get_og_chn(stor_device, q_num);
> }
>
> -
> +found_channel:
> vstor_packet->flags |= REQUEST_COMPLETION_FLAG;
>
> vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) -
> @@ -1733,7 +1770,8 @@ static int storvsc_probe(struct hv_device *device,
> }
>
> scsi_driver.can_queue = (max_outstanding_req_per_channel *
> - (max_sub_channels + 1));
> + (max_sub_channels + 1)) *
> + (100 - ring_avail_percent_lowater) / 100;

A minor nit, but the use of parentheses here is inconsistent. There's a
set of parens around the first two expressions to explicitly code the
associativity, but not a set to encompass the third term, which must
be processed before the fourth one is. C does multiplication and
division with left to right associativity, so the result is as intended.
But if we're depending on C's default associativity, then that set of
parens around the first two expressions really isn't needed, and one
wonders why they are there.
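
For illustration, the two consistent forms would be either relying on
left-to-right evaluation throughout, or parenthesizing the full product
before the division (hypothetical rewrites of the line in question, not
part of the patch):

    scsi_driver.can_queue = max_outstanding_req_per_channel *
                            (max_sub_channels + 1) *
                            (100 - ring_avail_percent_lowater) / 100;

    scsi_driver.can_queue = (max_outstanding_req_per_channel *
                             (max_sub_channels + 1) *
                             (100 - ring_avail_percent_lowater)) / 100;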

Michael

>
> host = scsi_host_alloc(&scsi_driver,
> sizeof(struct hv_host_device));
> --
> 2.14.1


2018-04-13 23:41:28

by Michael Kelley (EOSG)

Subject: RE: [Resend Patch 2/3] Netvsc: Use the vmbus function to calculate ring buffer percentage

> -----Original Message-----
> From: [email protected] <[email protected]> On Behalf
> Of Long Li
> Sent: Tuesday, March 27, 2018 5:49 PM
> To: KY Srinivasan <[email protected]>; Haiyang Zhang <[email protected]>; Stephen
> Hemminger <[email protected]>; James E . J . Bottomley <[email protected]>;
> Martin K . Petersen <[email protected]>; [email protected]; linux-
> [email protected]; [email protected]; [email protected]
> Cc: Long Li <[email protected]>
> Subject: [Resend Patch 2/3] Netvsc: Use the vmbus function to calculate ring buffer percentage
>
> From: Long Li <[email protected]>
>
> In Vmbus, we have defined a function to calculate the percentage of the
> ring buffer that is available to write.
>
> Use that function and remove netvsc's private version.
>
> Signed-off-by: Long Li <[email protected]>

Reviewed-by: Michael Kelley <[email protected]>

> ---
> drivers/net/hyperv/hyperv_net.h | 1 -
> drivers/net/hyperv/netvsc.c | 17 +++--------------
> drivers/net/hyperv/netvsc_drv.c | 3 ---
> 3 files changed, 3 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index cd538d5a7986..a0199ab13d67 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -189,7 +189,6 @@ struct netvsc_device;
> struct net_device_context;
>
> extern u32 netvsc_ring_bytes;
> -extern struct reciprocal_value netvsc_ring_reciprocal;
>
> struct netvsc_device *netvsc_device_add(struct hv_device *device,
> const struct netvsc_device_info *info);
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 0265d703eb03..8af0069e4d8c 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -31,7 +31,6 @@
> #include <linux/vmalloc.h>
> #include <linux/rtnetlink.h>
> #include <linux/prefetch.h>
> -#include <linux/reciprocal_div.h>
>
> #include <asm/sync_bitops.h>
>
> @@ -590,17 +589,6 @@ void netvsc_device_remove(struct hv_device *device)
> #define RING_AVAIL_PERCENT_HIWATER 20
> #define RING_AVAIL_PERCENT_LOWATER 10
>
> -/*
> - * Get the percentage of available bytes to write in the ring.
> - * The return value is in range from 0 to 100.
> - */
> -static u32 hv_ringbuf_avail_percent(const struct hv_ring_buffer_info *ring_info)
> -{
> - u32 avail_write = hv_get_bytes_to_write(ring_info);
> -
> - return reciprocal_divide(avail_write * 100, netvsc_ring_reciprocal);
> -}
> -
> static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
> u32 index)
> {
> @@ -649,7 +637,8 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
> wake_up(&net_device->wait_drain);
>
> if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
> - (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
> + (hv_get_avail_to_write_percent(&channel->outbound) >
> + RING_AVAIL_PERCENT_HIWATER ||
> queue_sends < 1)) {
> netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
> ndev_ctx->eth_stats.wake_queue++;
> @@ -757,7 +746,7 @@ static inline int netvsc_send_pkt(
> struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
> u64 req_id;
> int ret;
> - u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
> + u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);
>
> nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
> if (skb)
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index faea0be18924..b0b1c2fd2b7b 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -35,7 +35,6 @@
> #include <linux/slab.h>
> #include <linux/rtnetlink.h>
> #include <linux/netpoll.h>
> -#include <linux/reciprocal_div.h>
>
> #include <net/arp.h>
> #include <net/route.h>
> @@ -55,7 +54,6 @@ static unsigned int ring_size __ro_after_init = 128;
> module_param(ring_size, uint, S_IRUGO);
> MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
> unsigned int netvsc_ring_bytes __ro_after_init;
> -struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;
>
> static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
> NETIF_MSG_LINK | NETIF_MSG_IFUP |
> @@ -2186,7 +2184,6 @@ static int __init netvsc_drv_init(void)
> ring_size);
> }
> netvsc_ring_bytes = ring_size * PAGE_SIZE;
> - netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
>
> ret = vmbus_driver_register(&netvsc_drv);
> if (ret)
> --
> 2.14.1


2018-04-14 00:28:06

by Long Li

Subject: RE: [Resend Patch 3/3] Storvsc: Select channel based on available percentage of ring buffer to write

> Subject: RE: [Resend Patch 3/3] Storvsc: Select channel based on available
> percentage of ring buffer to write
>
> > -----Original Message-----
> > From: [email protected]
> > <[email protected]> On Behalf Of Long Li
> > Sent: Tuesday, March 27, 2018 5:49 PM
> > To: KY Srinivasan <[email protected]>; Haiyang Zhang
> > <[email protected]>; Stephen Hemminger
> <[email protected]>;
> > James E . J . Bottomley <[email protected]>; Martin K . Petersen
> > <[email protected]>; [email protected]; linux-
> > [email protected]; [email protected];
> > [email protected]
> > Cc: Long Li <[email protected]>
> > Subject: [Resend Patch 3/3] Storvsc: Select channel based on available
> > percentage of ring buffer to write
> >
> > From: Long Li <[email protected]>
> >
> > This is a best-effort estimate of how busy the ring buffer of a
> > channel is, based on the percentage of the buffer that is available
> > to write. It is still possible that, at the time of the actual ring
> > buffer write, the space is no longer available because other
> > processes may be writing at the same time.
> >
> > Selecting a channel based on how full it is reduces the chance that
> > a ring buffer write will fail and avoids overloading a single
> > channel.
> >
> > Now it's possible that storvsc can use a smaller ring buffer size
> > (e.g. 40k bytes) to take advantage of cache locality.
> >
> > Signed-off-by: Long Li <[email protected]>
> > ---
> > drivers/scsi/storvsc_drv.c | 62
> > +++++++++++++++++++++++++++++++++++++---------
> > 1 file changed, 50 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> > index a2ec0bc9e9fa..b1a87072b3ab 100644
> > --- a/drivers/scsi/storvsc_drv.c
> > +++ b/drivers/scsi/storvsc_drv.c
> > @@ -395,6 +395,12 @@ MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");
> >
> > module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
> > MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
> > +
> > +static int ring_avail_percent_lowater = 10;
>
> Reserving 10% of each ring buffer by default seems like more than is needed
> in the storvsc driver. That would be about 4Kbytes for the 40K ring buffer
> you suggest, and even more for a ring buffer of 128K. Each outgoing record is
> only about 344 bytes (I'd have to check exactly). With the new channel
> selection algorithm below, the only time we use a channel that is already
> below the low water mark is when no channel could be found that is above
> the low water mark. There could be a case of two or more threads deciding
> that a channel is above the low water mark at the same time and both
> choosing it, but that's likely to be rare. So it seems like we could set the

It's not rare for two processes to check the same channel at the same time when running a multi-process I/O workload. The CPU-to-channel mapping is not 1:1.

> default low water mark to 5 percent or even 3 percent, which will let more of
> the ring buffer be used, and let a channel be assigned according to the
> algorithm, rather than falling through to the default because all channels
> appear to be "full".

It's not really about how big the ring buffer is; even a ring buffer of infinite size won't help performance if writes keep queuing on it while other ring buffers are nearly empty. It's more about utilizing the multiple ring buffers in a reasonable and balanced way. Testing shows 10 is a good choice, while 3 is prone to returning BUSY and triggering block layer retries.

>
> > +module_param(ring_avail_percent_lowater, int, S_IRUGO);
> > +MODULE_PARM_DESC(ring_avail_percent_lowater,
> > + "Select a channel if available ring size > this in percent");
> > +
> > /*
> > * Timeout in seconds for all devices managed by this driver.
> > */
> > @@ -1285,9 +1291,9 @@ static int storvsc_do_io(struct hv_device *device,
> > {
> > struct storvsc_device *stor_device;
> > struct vstor_packet *vstor_packet;
> > - struct vmbus_channel *outgoing_channel;
> > + struct vmbus_channel *outgoing_channel, *channel;
> > int ret = 0;
> > - struct cpumask alloced_mask;
> > + struct cpumask alloced_mask, other_numa_mask;
> > int tgt_cpu;
> >
> > vstor_packet = &request->vstor_packet;
> > @@ -1301,22 +1307,53 @@ static int storvsc_do_io(struct hv_device *device,
> > /*
> > * Select an an appropriate channel to send the request out.
> > */
> > -
> > if (stor_device->stor_chns[q_num] != NULL) {
> > outgoing_channel = stor_device->stor_chns[q_num];
> > - if (outgoing_channel->target_cpu == smp_processor_id()) {
> > + if (outgoing_channel->target_cpu == q_num) {
> > /*
> > * Ideally, we want to pick a different channel if
> > * available on the same NUMA node.
> > */
> > cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
> > cpumask_of_node(cpu_to_node(q_num)));
> > - for_each_cpu_wrap(tgt_cpu, &alloced_mask,
> > - outgoing_channel->target_cpu + 1) {
> > - if (tgt_cpu != outgoing_channel->target_cpu) {
> > - outgoing_channel =
> > - stor_device->stor_chns[tgt_cpu];
> > - break;
> > +
> > + for_each_cpu_wrap(tgt_cpu, &alloced_mask, q_num + 1) {
> > + if (tgt_cpu == q_num)
> > + continue;
> > + channel = stor_device->stor_chns[tgt_cpu];
> > + if (hv_get_avail_to_write_percent(
> > + &channel->outbound)
> > + > ring_avail_percent_lowater) {
> > + outgoing_channel = channel;
> > + goto found_channel;
> > + }
> > + }
> > +
> > + /*
> > + * All the other channels on the same NUMA node are
> > + * busy. Try to use the channel on the current CPU
> > + */
> > + if (hv_get_avail_to_write_percent(
> > + &outgoing_channel->outbound)
> > + > ring_avail_percent_lowater)
> > + goto found_channel;
> > +
> > + /*
> > + * If we reach here, all the channels on the current
> > + * NUMA node are busy. Try to find a channel in
> > + * other NUMA nodes
> > + */
> > + cpumask_andnot(&other_numa_mask,
> > + &stor_device->alloced_cpus,
> > + cpumask_of_node(cpu_to_node(q_num)));
> > +
> > + for_each_cpu(tgt_cpu, &other_numa_mask) {
> > + channel = stor_device->stor_chns[tgt_cpu];
> > + if (hv_get_avail_to_write_percent(
> > + &channel->outbound)
> > + > ring_avail_percent_lowater) {
> > + outgoing_channel = channel;
> > + goto found_channel;
> > }
> > }
> > }
> > @@ -1324,7 +1361,7 @@ static int storvsc_do_io(struct hv_device *device,
> > outgoing_channel = get_og_chn(stor_device, q_num);
> > }
> >
> > -
> > +found_channel:
> > vstor_packet->flags |= REQUEST_COMPLETION_FLAG;
> >
> > vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) -
> > @@ -1733,7 +1770,8 @@ static int storvsc_probe(struct hv_device *device,
> > }
> >
> > scsi_driver.can_queue = (max_outstanding_req_per_channel *
> > - (max_sub_channels + 1));
> > + (max_sub_channels + 1)) *
> > + (100 - ring_avail_percent_lowater) / 100;
>
> A minor nit, but the use of parentheses here is inconsistent. There's a set of
> parens around the first two expressions to explicitly code the associativity,
> but not a set to encompass the third term, which must be processed before
> the fourth one is. C does multiplication and division with left to right
> associativity, so the result is as intended.
> But if we're depending on C's default associativity, then that set of parens
> around the first two expressions really isn't needed, and one wonders why
> they are there.
>
> Michael
>
> >
> > host = scsi_host_alloc(&scsi_driver,
> > sizeof(struct hv_host_device));
> > --
> > 2.14.1