Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753971AbdHUOgC (ORCPT ); Mon, 21 Aug 2017 10:36:02 -0400 Received: from mail-eopbgr670112.outbound.protection.outlook.com ([40.107.67.112]:42976 "EHLO CAN01-TO1-obe.outbound.protection.outlook.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1753412AbdHUOf6 (ORCPT ); Mon, 21 Aug 2017 10:35:58 -0400 Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=sbates@raithlin.com; From: sbates@raithlin.com To: axboe@kernel.dk, linux-block@vger.kernel.org, linux-kernel@vger.kernel.org Cc: osandov@fb.com, damien.lemoal@wdc.com, Stephen Bates Subject: [PATCH] blk-mq: Improvements to the hybrid polling sleep time calculation Date: Mon, 21 Aug 2017 08:35:34 -0600 Message-Id: <1503326134-3862-1-git-send-email-sbates@raithlin.com> X-Mailer: git-send-email 2.7.4 MIME-Version: 1.0 Content-Type: text/plain X-Originating-IP: [70.65.224.121] X-ClientProxiedBy: MWHPR11CA0027.namprd11.prod.outlook.com (10.175.56.141) To YTOPR01MB0619.CANPRD01.PROD.OUTLOOK.COM (10.166.150.7) X-MS-PublicTrafficType: Email X-MS-Office365-Filtering-Correlation-Id: 379167f6-f6bf-4745-7311-08d4e8a1eec1 X-Microsoft-Antispam: UriScan:;BCL:0;PCL:0;RULEID:(300000500095)(300135000095)(300000501095)(300135300095)(22001)(300000502095)(300135100095)(2017030254152)(2017082002075)(300000503095)(300135400095)(201703131423075)(201702281549075)(300000504095)(300135200095)(300000505095)(300135600095)(300000506095)(300135500095);SRVR:YTOPR01MB0619; X-Microsoft-Exchange-Diagnostics: 
1;YTOPR01MB0619;3:Dq2OA9SZtU/T1jbcuokWUylqTMltqD5PTGdmKsJlDV4rGskGFXyMQd0u4tk842/La5Xy5+66SH98hbpFnfLIKEHLRcwIFEeZ2Zum28p/byl5Vr0lg6xbGXSaFtm5NJTIxN8r/PikdQXlvMY5LN0ZiZVN6SxbOmkALDfKe0vin0HdUksQiYeuCOJrJna3OZO7RiexkvUGA2CpKvMOt44VxOpt1tU7KoEA6OwaKLvGjnjHJTA5XDI2BvH/eIJpDYmY;25:AxzzUBNqoOY+38HkiRz/HqQic4rTfYpQb6M9MEzJgDS5lW962yLNGZ5WF0hY3GO4+i0WFq6CbR+Wb0IJKIw08ygiKtvoavkcF1JCU0qjsJnHVN594cwB0O24s6MmIEkUDCMhtRSUerOzFRG2AihSZDIuWnVRNmpm1qTMO90dZnBZl8i50rocGRkdQogqt90N/vXx9FValjNKApbAP4xlwR6zA2qFBBPdf0EU5mp6KshrbW7B67thApe8KCEq2o3Pocrl3u2Yj6Zcxx5ilvIPr1d8SJzlEVkKTR+0EGQDgbCodXYiW+ClTGE34WkUKhXo42waK0sQJHkC81z2/ftUlw==;31:Bx0znaPNt4cks9EZkX0+qdxLbuVsd4cPYXcpb8ZmXC0IxrE+QQ7NPcdNb7vy4JZwWFp4rJ8i6/6pX4zsrYleEQ1K5gtuk+m6ltEWlYzPOY/NpeQyAS5KIS96OZCgeporr0CGeAtBCRCnx+2kXGQkiKPCZfMsbZCiSKkTQrWz1v4C/ddYKO4zkyPvdRE1PAShEpDjnopKAZCAz3pfqg2Ei6PXn6xQKO3jGqUHww3kL/Y= X-MS-TrafficTypeDiagnostic: YTOPR01MB0619: X-Exchange-Antispam-Report-Test: UriScan:; X-Microsoft-Antispam-PRVS: X-Exchange-Antispam-Report-CFA-Test: BCL:0;PCL:0;RULEID:(100000700101)(100105000095)(100000701101)(100105300095)(100000702101)(100105100095)(6040450)(2401047)(8121501046)(5005006)(93006095)(93001095)(100000703101)(100105400095)(10201501046)(3002001)(6041248)(20161123562025)(20161123560025)(2016111802025)(20161123558100)(20161123564025)(20161123555025)(201703131423075)(201702281528075)(201703061421075)(201703061406153)(6043046)(6072148)(201708071742011)(100000704101)(100105200095)(100000705101)(100105500095);SRVR:YTOPR01MB0619;BCL:0;PCL:0;RULEID:(100000800101)(100110000095)(100000801101)(100110300095)(100000802101)(100110100095)(100000803101)(100110400095)(100000804101)(100110200095)(100000805101)(100110500095);SRVR:YTOPR01MB0619; X-Microsoft-Exchange-Diagnostics: 
1;YTOPR01MB0619;4:TGpjx4O+cmmoNYZhqIQZPDXdFQVqznARV8APi0L0TcPEALqhGXU+lIjWzTLXwkcodh/xykoLHL424Oj789C5JQ1XcZx3OwjNPJGdB1VK+1F7l2TtX4f2GTKFHD3YOPpr3CcepOf/eUWTbScTCUOYCr3GCTbgral1TW7RQ6tJD4q1DUMQgcJXKKI3s6puYj6O7130YI9SGXTN29vj5149ejHmbJwZPQ6FsJle2D4WiOJLkosJ7uWFddIcmIzn93S5 X-Forefront-PRVS: 040655413E X-Forefront-Antispam-Report: SFV:NSPM;SFS:(10019020)(4630300001)(7370300001)(6069001)(6009001)(39830400002)(199003)(189002)(42186005)(107886003)(97736004)(478600001)(68736007)(189998001)(6486002)(105586002)(106356001)(53936002)(101416001)(86362001)(110136004)(2906002)(50986999)(33646002)(5003940100001)(25786009)(47776003)(66066001)(6666003)(7350300001)(4326008)(36756003)(81156014)(8676002)(305945005)(81166006)(6512007)(48376002)(6116002)(9686003)(5660300001)(50466002)(7736002)(85782001)(3846002)(50226002)(85772001);DIR:OUT;SFP:1102;SCL:1;SRVR:YTOPR01MB0619;H:localhost.localdomain;FPR:;SPF:None;PTR:InfoNoRecords;A:1;MX:1;LANG:en; X-Microsoft-Exchange-Diagnostics: =?us-ascii?Q?1;YTOPR01MB0619;23:ez7WFt5/nTSPbb9Q6fKtcZTYmjdu7+mTlZHSxULvs?= =?us-ascii?Q?imnXu7YCZph8k4J/Mb336+CwcUL1H/+aM10SDoRxpaoai2ZF+AqgW6JcQOEu?= =?us-ascii?Q?oLEi8hnfQrfYw9VZf2LYhgoKSVmK5KEJvfiLVIY+7kfkf3ZM3+XJIJt76I3O?= =?us-ascii?Q?WPc5FWsqfVCfcKis4820ZVjqPBboO4+ZADwAhU2Sh44zVoNeDuav10n9lVhj?= =?us-ascii?Q?xQkYY4lKa0/g9VNoIEGCborxWLuUhtJbVIlGLLU2QuvGug4A3mdyhPb0KQHF?= =?us-ascii?Q?uwIyHqw2ZUP/18XhT2G3DtToIzsuA48DmvfVlUkhHv6kA6AxjqODTOzHagR6?= =?us-ascii?Q?zV5jgdhagMNKPxkD+m39SVcK7WVHFFWvnxT0YInLAeQJolgDnILWZLbj0BLz?= =?us-ascii?Q?CQ2kUK0NEkPW5aI+6hdwOhgVFnhVerYymYbSda5u+CGxwU4+NfPCxWMRtNMS?= =?us-ascii?Q?Mm4qY4z5/GmU1dX2QC5Smiylsc8yW5E2DtoZQwQjJgHRmzbPfq1l0SMeMv9m?= =?us-ascii?Q?urO9izWbIB1j1JJ+lAPjXgonHyS+t68PL4fclIhLp8WKJ5CK0oLizaUKHhGO?= =?us-ascii?Q?Vl8Rzrjyu+Xml/inkmVoL+YMQLXrdGdvUq1iqXQDR9Ym8XTkVHXHb0tnppKz?= =?us-ascii?Q?jpJUpIfay+c/HLgMWOi8tFQ2CS1O1N6ofklyhrDhCxlT+VdXjUK2BhClP4MS?= =?us-ascii?Q?nrVPF+dViDVSBo8iZDr2AEN3OKwRSJ/yW8GcqyoGsUUDq1MHCI2JTtRLA6MU?= 
=?us-ascii?Q?lkvzNIHANJyUG8Mx7vFb36Nh/C6342LKQFnN5HsoReCcx0yeIz7UmPFJLJrp?= =?us-ascii?Q?e4EXCuu4LZ1yGTOJoV/JvZ44+RUCrL6fcOb8hO/p8KmDyOikeCItrZa/2ReC?= =?us-ascii?Q?z7BFjGgR0hc2QDvpeD9wX33k1Il/sMc4KZ+lTjYr38OToNP5eHc+cfmTot4a?= =?us-ascii?Q?OomaT37TrPrdRbCbC8rIv/+FE619IULvTICTloJm89NZN4QFHD2N4s/5r5Lo?= =?us-ascii?Q?/bUi3z2k1EHQDgjnAIZRsRBRbO8aMkOmQGYPxRF4HF1RE5VQXT3y72+lwGU6?= =?us-ascii?Q?NYqZ67kt4YGO8KD05F12U1ut7KLJXuNJyeYKNanxc6CEJhmbbepLncO5Myp+?= =?us-ascii?Q?wRR41JorpY=3D?= X-Microsoft-Exchange-Diagnostics: 1;YTOPR01MB0619;6:9XVi2up9Y59j8voVuZQGOJEGwhN2JD33sywoYoRnRrmQpRgM44NFmhmufxCyL9bQ0zVL8qcxDoJ9giqJV9mAWcs/Hs3L1EBeN0hTxdSxR1luk18nLJA8Q+vkzPBsHV1dixBkRxUzEyAzUrDMQveIRPweMVAbaSy1/bUgJb47HoZ+bujK1sCv2rPwRtsMLhz+6iqSdtEjFHBDJzSiAjBM/F86FJugTyAMUJeQJBAE8P3cL/aViCmNA7SLEc45BHWyFfuZkGHSS1yRCJz7EEiqtM+Wh/6U4Ls1w0ZiU+6V0cpP95AvVNdm1vFPgK64YTm7RCY3OFqIH9hKNjSiJLCuOA==;5:lKOcWesT+JFyf8R3hj8di2FZhMkRXDF+IweoGMQbT6vVo/70Y0bnWXQPv0ggJfetbNT3jwPJcQ2F0ZFgOx7OyImGfKt7zk6D0tYGplPalfQAcs1QlhrNKcvz2qiaCZTkoG+vc1bgq/p1gQ+lTufO7g==;24:mSo74PkcrlIprJ+/KsYd/pre376cP8zqVhEgihy2LeOesaqBb9SIKhyf1h5XXwFCPcmryJw89DaRRi3qy5oTn1nZs8IjgKyBzmDq4oMoCKc=;7:TXbhcJZTQocPXy1BkrnHTSN/II1umEfAarLeyX1irnye4P6J5KJgzj5YbW2rlJ0BxOjpwMutu+DTE9poOnf/B5VrRD8w/hm9gL8nAdCHtGbMvOKpxKQOEgNQ9KYr514A3rtf6hnc0BK1xEtZmQ5hDzxbZn05rbABHosU6i+3t+ErQSIBXYKCtssLxFwiX8EvXwAM26pPygOgXozBB4S2cb/bfVre7yzImy3r8sAzJR8= SpamDiagnosticOutput: 1:99 SpamDiagnosticMetadata: NSPM X-OriginatorOrg: raithlin.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 21 Aug 2017 14:35:55.0678 (UTC) X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted X-MS-Exchange-Transport-CrossTenantHeadersStamped: YTOPR01MB0619 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 4541 Lines: 155 From: Stephen Bates Hybrid polling currently uses half the average completion time as an estimate of how long to poll for. 
We can improve upon this by noting that polling before the minimum completion time makes no sense. Add a sysfs entry to use this fact to improve CPU utilization in certain cases. At the same time the minimum is a bit too long to sleep for since we must factor in OS wake time for the thread. For now allow the user to set this via a second sysfs entry (in nanoseconds). Testing this patch on Intel Optane SSDs showed that using the minimum rather than half reduced CPU utilization from 59% to 38%. Tuning this via the wake time adjustment allowed us to trade CPU load for latency. For example:

io_poll  delay  hyb_use_min  adjust  latency  CPU load
1        -1     N/A          N/A     8.4      100%
1        0      0            N/A     8.4      57%
1        0      1            0       10.3     34%
1        9      1            1000    9.9      37%
1        0      1            2000    8.4      47%
1        0      1            10000   8.4      100%

Ideally we will extend this to auto-calculate the wake time rather than have it set by the user. Signed-off-by: Stephen Bates --- block/blk-mq.c | 10 +++++++++ block/blk-sysfs.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 3 +++ 3 files changed, 71 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index f84d145..f453a35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2739,6 +2739,16 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, if (q->poll_stat[bucket].nr_samples) ret = (q->poll_stat[bucket].mean + 1) / 2; + if (q->poll_hyb_use_min) + ret = max(ret, (unsigned long)q->poll_stat[bucket].min); + + if (q->poll_hyb_adjust) { + if (ret >= q->poll_hyb_adjust) + ret -= q->poll_hyb_adjust; + else + return 0; + } + return ret; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 27aceab..51e5853 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -395,6 +395,50 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, return count; } +static ssize_t queue_poll_hyb_use_min_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%d\n", q->poll_hyb_use_min); +} + +static ssize_t queue_poll_hyb_use_min_store(struct
request_queue *q, + const char *page, size_t count) +{ + int err, val; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; + + q->poll_hyb_use_min = val; + + return count; +} + +static ssize_t queue_poll_hyb_adjust_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%d\n", q->poll_hyb_adjust); +} + +static ssize_t queue_poll_hyb_adjust_store(struct request_queue *q, + const char *page, size_t count) +{ + int err, val; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; + + q->poll_hyb_adjust = val; + + return count; +} + static ssize_t queue_poll_show(struct request_queue *q, char *page) { return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); @@ -661,6 +705,18 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) .store = queue_poll_delay_store, }; +static struct queue_sysfs_entry queue_poll_hyb_use_min_entry = { + .attr = {.name = "io_poll_hyb_use_min", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_hyb_use_min_show, + .store = queue_poll_hyb_use_min_store, +}; + +static struct queue_sysfs_entry queue_poll_hyb_adjust_entry = { + .attr = {.name = "io_poll_hyb_adjust", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_hyb_adjust_show, + .store = queue_poll_hyb_adjust_store, +}; + static struct queue_sysfs_entry queue_wc_entry = { .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, .show = queue_wc_show, @@ -719,6 +775,8 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_poll_hyb_use_min_entry.attr, + &queue_poll_hyb_adjust_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f45f157..97b46ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h 
@@ -527,6 +527,9 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + int poll_hyb_use_min; + int poll_hyb_adjust; + struct blk_stat_callback *poll_cb; struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; -- 1.9.1