Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754838AbeAIQrC (ORCPT + 1 other); Tue, 9 Jan 2018 11:47:02 -0500 Received: from mail-eopbgr60119.outbound.protection.outlook.com ([40.107.6.119]:62080 "EHLO EUR01-DB5-obe.outbound.protection.outlook.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1751348AbeAIQq7 (ORCPT ); Tue, 9 Jan 2018 11:46:59 -0500 Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=aryabinin@virtuozzo.com; Subject: Re: [PATCH] lib/strscpy: remove word-at-a-time optimization. To: Andrew Morton , Linus Torvalds Cc: linux-kernel@vger.kernel.org, Kees Cook , Eryu Guan , Alexander Potapenko , Chris Metcalf , David Laight , Dmitry Vyukov , stable@vger.kernel.org References: <20180109163745.3692-1-aryabinin@virtuozzo.com> From: Andrey Ryabinin Message-ID: <50fcfba8-fc16-b4a1-d117-24ebbe959c0c@virtuozzo.com> Date: Tue, 9 Jan 2018 19:47:05 +0300 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.5.2 MIME-Version: 1.0 In-Reply-To: <20180109163745.3692-1-aryabinin@virtuozzo.com> Content-Type: multipart/mixed; boundary="------------C94E214A5541822E728FB5B1" Content-Language: en-US X-Originating-IP: [195.214.232.6] X-ClientProxiedBy: HE1PR05CA0282.eurprd05.prod.outlook.com (2603:10a6:3:fc::34) To HE1PR08MB2826.eurprd08.prod.outlook.com (2603:10a6:7:2e::25) X-MS-PublicTrafficType: Email X-MS-Office365-Filtering-Correlation-Id: 6bc0a73f-3a4c-4fdb-7926-08d5578098b5 X-Microsoft-Antispam: UriScan:;BCL:0;PCL:0;RULEID:(4534020)(4602075)(7168020)(4627115)(201703031133081)(201702281549075)(5600026)(4604075)(2017052603307)(7153060)(49563074)(7193020);SRVR:HE1PR08MB2826; X-Microsoft-Exchange-Diagnostics: 
1;HE1PR08MB2826;3:09UA3K+XC+ZqCUAcBa5sYEiq3skff8ydwxWGtR1oV0yjBBpy+RE35Ax5RYrKHZ1JC1NhCfDWJq5EoxQaI4fRZOqP0xCy3KxQ3qT3nRpJsceNTSL1DNZHg5vCildnuI4YCQrFqroRytBX14B3N1++numN+ABI1+VxdsGFmyGEquLe+EOyRnhzWkzIzmgNIhACq9Xg7nc509UEyS1bqhiI8GSjyt/cs2ONG6uXSug3+QHln+cIuFOl4d/DotPDeUxX;25:XvjIoPGsZc8SMBflfe0SRB9xyRJf2ySRT8uvJ5JJZrJcDfFvMlsDY1sp2nWasd/OVtqgL/uSRtUXP+RtguVAjaqWlkdrFvEsMyk5D03sgadsdchD/6+cv8Ij9ijc8RKur9/QKtrznfIUCsUCNZNgcEZTKJ8XwkDDh0ofxTElWlMYuHIOr8AOxZoPgicoTRt1k+sP7PS+a6zjNSOWYQFtf0GL4dUY8LbDBhUWC8BgcXZ0ZJifGCjJk65zSnCyif08ulai0XvsvCkngdbA4HFqM9aSFyE1CmU614SLVt3bUp4BhmCuf0rSWb8eQLlUDt1q0Fv1bKBk0pWHL0M7IavM/A==;31:xKkquUj3gLM5JehjlsWOYg8WCUFNeF1hHWriayUV6CoPrObTDUT/a4ettn633ZyLKOp+LsavOyRpJTMy6paOnZqoII7oqNd5H7pu3QteyttFG7pKTqlJ2lDM3plSB4kB2KZS+jW3NjNg5nQtzNPFYkkLmNOmpwHS3aSx0SWvArw2Hjoch/wfAuXxhXXmDgOuaDdKrFdThQq+L77kpENgxjtEZ2QeK9Cn2nzAJlhHhCA= X-MS-TrafficTypeDiagnostic: HE1PR08MB2826: X-Microsoft-Exchange-Diagnostics: 1;HE1PR08MB2826;20:PpXCe9fg41UiMqyNdsLs/0OPQGiKjQli+91y1hgefUcm2MF5wBNHW8XOtKdyCzTJ/ek+NsXFYnya6UwhHe7GsNhIqh3NyoXgbpf6RGg/9fxopp6cvnE0Fd6uS0xHgXlDj8t2BiIB+5Jm7kK336HtNvO7PSfbiE2XGBGwxR7Y4XTULSUbuTNk5yRaIaM99QGH38NF7x+9kjc71foT+QrL8C6bAHs6d2mHWKh5vbiveq9UAG4OwNTX7eIPOQRBF1Wdk1Sv3+2NsRbC+TmYxkD/kpvxuUCoBuK4Thbk2O52CX4qkL8ikfEV/DDoul66m4XJlWGHTC/Kl4DxJ4FMKJUXCEto2pw+nFEPrV2vTXUl8W0EenF6QdlrLDNkHbsBfhx+KNatDw6kd5m8PcdFG0cHHkViV9dPbBZmi+le80ub10s=;4:OtDYeSv9AY1JqHNl/GFGNjcAByLTyQWUyIIqvfT6T4m+cJejw6kE49zlBSHd7C9lHy4JEY4WsDvB5XVN8ZXZJarrqF3z4Ui6w3MIeqdhlA6sMyyyqFs9xzMDNemiEjq/gR2Qx8gjacTNxW+NcLqNE1yj2G7g6UFx65yqnGjlesWdJGo5t6MqVeRAxwR7/BNVPsV+zo7HXf77mJLLiF5qFMUBxVDFppHOvfo/tnMP/6QdTlLsQJJ+eavKOCfFAsdwOAfldi1+5aZC8qmVytGmYg== X-Microsoft-Antispam-PRVS: X-Exchange-Antispam-Report-Test: UriScan:; X-Exchange-Antispam-Report-CFA-Test: 
BCL:0;PCL:0;RULEID:(102415395)(6040470)(2401047)(5005006)(8121501046)(3002001)(10201501046)(3231023)(944501075)(93006095)(93001095)(6041268)(20161123562045)(20161123560045)(20161123558120)(201703131423095)(201702281528075)(20161123555045)(201703061421075)(201703061406153)(20161123564045)(6072148)(201708071742011);SRVR:HE1PR08MB2826;BCL:0;PCL:0;RULEID:(100000803101)(100110400095);SRVR:HE1PR08MB2826; X-Forefront-PRVS: 0547116B72 X-Forefront-Antispam-Report: SFV:NSPM;SFS:(10019020)(6049001)(376002)(366004)(39850400004)(39380400002)(396003)(346002)(199004)(189003)(7736002)(86362001)(105586002)(230783001)(76176011)(97736004)(6116002)(5890100001)(55236004)(4610100001)(33964004)(2476003)(31686004)(3846002)(83506002)(106356001)(305945005)(68736007)(8676002)(81166006)(52116002)(81156014)(568964002)(64126003)(36756003)(386003)(270700001)(16526018)(8936002)(6486002)(110136005)(53936002)(84326002)(229853002)(6246003)(54906003)(65826007)(25786009)(65806001)(77096006)(2906002)(6666003)(7416002)(16586007)(316002)(16576012)(5660300001)(2950100002)(37036004)(66066001)(65956001)(58126008)(4326008)(478600001)(31696002);DIR:OUT;SFP:1102;SCL:1;SRVR:HE1PR08MB2826;H:[172.16.25.12];FPR:;SPF:None;PTR:InfoNoRecords;A:1;MX:1;LANG:en; X-Microsoft-Exchange-Diagnostics: =?us-ascii?Q?1;HE1PR08MB2826;23:5EspxDG+rmNQx9h8sq2WdO0edfeKxvG66d1I4TK3L?= =?us-ascii?Q?UdaC4b1gZe5rS2HirPzDqIiEz7YTYRQmnPpMXpqksVT9MQZuUxNU7GL3UdkU?= =?us-ascii?Q?WK7niKpXd/ifTfYZ1vko+4EUABFz4VFaiR9FR7K48J5A6wjWgKKcSRc9JvHa?= =?us-ascii?Q?c2U9QxYEEg92wmU1/4llUQoKD8nzqP+rrucIn0pdTOnS+LIDUUCgf2GhI/go?= =?us-ascii?Q?kGW7FqvmvanxpjLSxW/NOcS9Tonal3i5m+1nSS/eELz3q/2JHPjwvF3HK7A3?= =?us-ascii?Q?iqGYjr2kFHzdD1+BBhB1Ffx6i9Ij58ckT+JClZFMowzW/kX5z2zsAAqKbd+n?= =?us-ascii?Q?IXsCL2snrcXbsFLupi5YxWoy+SHs4XA1qrrC9u/D/4+Lnkgqe1mUOAQjcqNT?= =?us-ascii?Q?PonsPn1tskiWBlHrZyjzYJcfxP3aaj96Di6n2OK3FqNMboSWcGb3nvV4pyM/?= =?us-ascii?Q?5H/w5+T6XBgS6JFn8yo5IKgynNtO6YpReMzHESzdHlK2Ar1zZwmuBq5Unjtd?= 
=?us-ascii?Q?89WeRJ62E5swf659+oFhD2VKe4pMC1QzmRQDToaqT6ooYpt8rkyjRKQONU9E?= =?us-ascii?Q?Sam15R5yT57ZoJceuSuaybbcakaZX08ovl5O6lVNZqdGn1mZB6CiKMIQ7RiG?= =?us-ascii?Q?fQPNgkxou2l+89W594L4ApbgfWSKCkBhpHVo63sTIfDAChPdLHl41h6NqEau?= =?us-ascii?Q?VxVa7Fs9fWtwmqpNu4bdfHQ0ZB6CcZ70P1BxB8qR1rS93ldlzwenP2himn6Q?= =?us-ascii?Q?deUnVkW8Zc0SxsHBBsJ74eMmq6zK+Kg4/6TLjCBG24MvhTCsK9EGKAKL9Fsp?= =?us-ascii?Q?czDmKrS/XcprzmIMG5Dco0U1FhfY9AHIPJQ7fgq2oQeMW/YpIud0cc7nibYL?= =?us-ascii?Q?0skIHJI2aCmpKHDmllpV/XrdpcLBsQVNqRcyPGecp1TpiAR73zk0wtftkTtv?= =?us-ascii?Q?LkJpqxP5Utu9/fkuFbrzV9YBZbBTvtaw1XK08D375v9W2o7GE8vZz57qY2+S?= =?us-ascii?Q?Zm+M0apmqbEIdjiKQTa4W26Eas8WcBkJdeHguXLulhkCDrkWolu5nl6uxALn?= =?us-ascii?Q?A+Ettt7CVxLf1W5Nhumh4mtiJ3ELIyh5od1UyN+ast6A94Q05lrX7JATMX66?= =?us-ascii?Q?ybMDqvkbDTXDhk04eU2IePUcLDwu1C2+OlrIVwEZIZlULvuH/iJcadFgPPWx?= =?us-ascii?Q?/DD2RHP3mgslEMDs8HQ9q3dYghxzjl+sjg7frX9es7zPHsERJRnYRX65wSr4?= =?us-ascii?Q?BEsHw+u1IFSNd8+ujHrsw4m54zenzsOS8Ht6YWfcl0EvPZqZ6TM1aQrqaPJP?= =?us-ascii?Q?eESKYcqtxPAEucwz69jYF3fdzf1qubHA3RT5mrZYBBFAEMo/enteRIVynXVL?= =?us-ascii?Q?dtA076EIjTsm218nUSYUUvmngcz3vFF2Z0aBPUEPakZ15sqSjRlG2OCBhdms?= =?us-ascii?Q?TEefOmMgFTXlpPyV0SicLGlFX+/Yk5b1VDaTiunkwmA2L4+VueE?= X-Microsoft-Exchange-Diagnostics: 
1;HE1PR08MB2826;6:Cbuu3XJBpRUI550HSpJ2FY6+f9aoNz2QVUz3I/RVgY9EVPENs6nM1cu8hmFdgreCBPWd85szbdU06FlhZoOPp6yaIxBA6szsFx1+wOSgUGXVB55oIVJ5cW7jseszKR2mt15dvuMsPvn4Rv/qo42/QkmCz4xgoDzJ1FQYBJ+weYh9a5i7VwqGQLSRbFrCvAJBNpbHXCXltTKyOFqD8ocu5kJL2O1i9NJ9eEGsNgOWHfNhZYeOLkZutIqGXqfWYsD5nUlq0hTYgYl5lnA4+x/P2hhSMMW+i42d+qE9mh1wxF4tBrymLoZL5MdHCUY90F5fWdnafI/5UJO1gE4WKTmHtY7o6HDkE0RgzEAmLl2iX4w=;5:LPb8AOLMcBpMF5JsZkqkeVYPv6X69THqlnZH0AmvHdvJK9SQvdlRBOTRmE6gSMInAThbTBA1iohfwbp8VZqf/C/osaSCs0SHndrV5qurCozQqBdEfv5cM92SQ0Ib1d/VmekLrRPXQBbHqtAtIb566riKqMtkkBqXE28Vxn2otvg=;24:Z1dZ5iPiUUGcgkjdn58vAXy09IRzu+PX1If6sNvdRbhs8/wJyXz4hWA5wR/Se8E4xIPbw4DUW+2AB9UvlrSPrI74IVVa8Pn7J6nEuxhGbY8=;7:A75YmyT3MIH4Y+ybSnnO5W2jBizCfDBSzr2WFhVwEC8lVd902elv5c+Ts5iVCB7WY66InYv8kP99r+aFMf2NVZ62rO+H8PLrP6/1DU587KEMBazw6qd3BszM+/viwtbDRk+/ws7Zrp/+XGTL74TjtmpBH7H7LXHPSjVqIwXdW3fMu4W2afQoXmSK891v707lurkU8vkoOHGi18rM+qKGurg0hUMGjvvDfwCRY2WLVM0Qbbx0BJH2VDxFL0+3wMe7 SpamDiagnosticOutput: 1:99 SpamDiagnosticMetadata: NSPM X-Microsoft-Exchange-Diagnostics: 1;HE1PR08MB2826;20:jsIbsGirOsSi5QDqWArKUCj4Tw41roPPfzdGYjvhk1VvGMk/xtUwPAyp/94hpffrA/3RJzNet9zoP5FWxqHZ7oC33T4eFli4cpIPGl+r4HJybcOEKDYhIQvn4yIpc8cDSO/yvCuDRJ9H3eWIdJJnTmGACw5r6I4PypwNm+MADZ8= X-OriginatorOrg: virtuozzo.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 09 Jan 2018 16:46:56.8815 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: 6bc0a73f-3a4c-4fdb-7926-08d5578098b5 X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted X-MS-Exchange-CrossTenant-Id: 0bc7f26d-0264-416e-a6fc-8352af79c58f X-MS-Exchange-Transport-CrossTenantHeadersStamped: HE1PR08MB2826 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Return-Path: This is a multi-part message in MIME format. --------------C94E214A5541822E728FB5B1 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit Attached user space program I used to see the difference. 
Usage: gcc -O2 -o strscpy strscpy_test.c ./strscpy {b|w} src_str_len count src_str_len - length of the source string, between 1 and 4096 count - how many strscpy() calls to execute. Also I've noticed something strange. I'm not sure why, but certain src_len values (e.g. 30) drive the branch predictor crazy, causing worse than usual results for byte-at-a-time copy: $ perf stat ./strscpy b 29 10000000 Performance counter stats for './strscpy b 29 10000000': 165.354974 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 48 page-faults:u # 0.290 K/sec 640,475,981 cycles:u # 3.873 GHz 2,500,090,080 instructions:u # 3.90 insn per cycle 640,017,126 branches:u # 3870.565 M/sec 1,589 branch-misses:u # 0.00% of all branches 0.165568346 seconds time elapsed Performance counter stats for './strscpy b 30 10000000': 250.835659 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 46 page-faults:u # 0.183 K/sec 974,528,780 cycles:u # 3.885 GHz 2,580,090,165 instructions:u # 2.65 insn per cycle 660,017,211 branches:u # 2631.273 M/sec 14,488,234 branch-misses:u # 2.20% of all branches 0.251147341 seconds time elapsed Performance counter stats for './strscpy b 31 10000000': 176.598368 task-clock:u (msec) # 0.997 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 46 page-faults:u # 0.260 K/sec 681,367,948 cycles:u # 3.858 GHz 2,660,090,092 instructions:u # 3.90 insn per cycle 680,017,138 branches:u # 3850.642 M/sec 1,817 branch-misses:u # 0.00% of all branches 0.177150181 seconds time elapsed --------------C94E214A5541822E728FB5B1 Content-Type: text/x-csrc; name="strscpy_test.c" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="strscpy_test.c" #include <stdlib.h> #include <string.h> #define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) #define E2BIG -1 #define PAGE_SIZE 4096 struct word_at_a_time { const unsigned long
one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } static inline long count_masked_bytes(unsigned long mask) { return mask*0x0001020304050608ul >> 56; } static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) static inline unsigned long find_zero(unsigned long mask) { return count_masked_bytes(mask); } __attribute__((noinline)) int strscpy_word(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; size_t max = count; long res = 0; if (count == 0) return -E2BIG; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* * If src is unaligned, don't cross a page boundary, * since we don't know if the next page is mapped. */ if ((long)src & (sizeof(long) - 1)) { size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); if (limit < max) max = limit; } #else /* If src or dest is unaligned, don't do word-at-a-time. 
*/ if (((long) dest | (long) src) & (sizeof(long) - 1)) max = 0; #endif while (max >= sizeof(unsigned long)) { unsigned long c, data; c = *(unsigned long *)(src+res); if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); *(unsigned long *)(dest+res) = c & zero_bytemask(data); return res + find_zero(data); } *(unsigned long *)(dest+res) = c; res += sizeof(unsigned long); count -= sizeof(unsigned long); max -= sizeof(unsigned long); } while (count) { char c; c = src[res]; dest[res] = c; if (!c) return res; res++; count--; } /* Hit buffer length without finding a NUL; force NUL-termination. */ if (res) dest[res-1] = '\0'; return -E2BIG; } __attribute__((noinline)) int strscpy_byte(char *dest, const char *src, int count) { int res = 0; while (count) { char c; c = src[res]; dest[res] = c; if (!c) return res; res++; count--; } /* Hit buffer length without finding a NUL; force NUL-termination. */ if (res) dest[res-1] = '\0'; return -E2BIG; } char dest[4096] __attribute__((aligned(4096))); char src[4096] __attribute__((aligned(4096))); int main(int argc, char **argv) { unsigned long long i; unsigned long src_len; unsigned long count; if (argc < 4) return -1; src_len = atoi(argv[2]); count = atoi(argv[3]); memset(src, 1, src_len); if (argv[1][0] == 'w') { for (i = 0; i < count; i++) { strscpy_word(dest, src, sizeof(dest)); } } else if (argv[1][0] == 'b') { for (i = 0; i < count; i++) { strscpy_byte(dest, src, sizeof(dest)); } } return 0; } --------------C94E214A5541822E728FB5B1--