From: Oliver Swede <oli.swede@arm.com>
To: catalin.marinas@arm.com, will@kernel.org
Cc: robin.murphy@arm.com, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH v5 08/14] arm64: Import latest optimization of memcpy
Date: Mon, 14 Sep 2020 15:09:52 +0000
Message-Id: <20200914150958.2200-9-oli.swede@arm.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20200914150958.2200-1-oli.swede@arm.com>
References: <20200914150958.2200-1-oli.swede@arm.com>

From: Sam Tebbs

Import the latest memcpy implementation into memcpy and the
copy_{from,to,in}_user routines.

The user copy routines are built in two forms, one for when UAO is
enabled and one for when it is disabled, with the choice between them
made by a runtime patch. This avoids executing the long runs of NOPs
that are otherwise emitted when UAO is disabled.

The project containing optimized implementations of various library
functions has been renamed from 'cortex-strings' to 'optimized-routines';
the new upstream source is string/aarch64/memcpy.S as of commit
4c175c8be12 in https://github.com/ARM-software/optimized-routines.

Signed-off-by: Sam Tebbs
[ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
Signed-off-by: Robin Murphy
[ os: import newer memcpy algorithm, update commit message ]
Signed-off-by: Oliver Swede
---
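Note to reviewers (illustration only, not part of the patch): the copy
strategy of the imported routine can be summarised by the rough C model
below. The function and variable names are hypothetical; the L(...)
labels in the comments refer to the labels in copy_template.S.

#include <stddef.h>
#include <stdint.h>

/* Rough behavioural model of the imported copy routine. */
static void copy_model(unsigned char *dst, const unsigned char *src,
                       size_t n)
{
        if (n == 0 || dst == src)               /* L(copy0) */
                return;

        if (n <= 128) {
                /*
                 * Small/medium paths (L(copy16)..L(copy128)): every load
                 * is issued before any store, using a few overlapping
                 * fixed-size accesses from both ends of the buffer, so
                 * overlapping buffers are handled naturally. Modelled
                 * here with a bounce buffer.
                 */
                unsigned char tmp[128];
                size_t i;

                for (i = 0; i < n; i++)
                        tmp[i] = src[i];
                for (i = 0; i < n; i++)
                        dst[i] = tmp[i];
                return;
        }

        /*
         * L(copy_long): if dst lies inside [src, src + n) a forward copy
         * would overwrite source bytes that have not been read yet, so
         * copy backwards (L(copy_long_backwards)); otherwise run the
         * forward 64-byte-per-iteration loop (L(loop64)).
         */
        if ((uintptr_t)dst - (uintptr_t)src < n) {
                size_t i;

                for (i = n; i-- > 0;)
                        dst[i] = src[i];
        } else {
                size_t i;

                for (i = 0; i < n; i++)
                        dst[i] = src[i];
        }
}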
 arch/arm64/include/asm/alternative.h |  36 ---
 arch/arm64/lib/copy_from_user.S      | 113 ++++++--
 arch/arm64/lib/copy_in_user.S        | 129 +++++++--
 arch/arm64/lib/copy_template.S       | 375 +++++++++++++++------------
 arch/arm64/lib/copy_template_user.S  |  24 ++
 arch/arm64/lib/copy_to_user.S        | 112 ++++++--
 arch/arm64/lib/copy_user_fixup.S     |  14 +
 arch/arm64/lib/memcpy.S              |  47 ++--
 8 files changed, 557 insertions(+), 293 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template_user.S
 create mode 100644 arch/arm64/lib/copy_user_fixup.S

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 619db9b4c9d5..581bacacc1bc 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -230,36 +230,6 @@ alternative_endif
 * unprivileged instructions, and USER() only works for single instructions.
 */
 #ifdef CONFIG_ARM64_UAO
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: ldp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- ldtr \reg1, [\addr];
- ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
- .macro uao_stp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: stp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- sttr \reg1, [\addr];
- sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
 .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
 alternative_if_not ARM64_HAS_UAO
 8888: \inst \reg, [\addr], \post_inc;
@@ -272,12 +242,6 @@ alternative_endif
 _asm_extable 8888b,\l;
 .endm
 #else
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
- .endm
- .macro uao_stp l, reg1, reg2, addr, post_inc
- USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
- .endm
 .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
 USER(\l, \inst \reg, [\addr], \post_inc)
 .endm
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 0f8a3a9e3795..86945e84c009 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -19,50 +19,111 @@
 * Returns:
 * x0 - bytes not copied
 */
+ .macro ldrb1 reg, ptr, offset=0
+ 8888: ldtrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1 reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldrb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: ldtrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_reg reg, ptr, offset
+ strb \reg, [\ptr, \offset]
+ .endm
- .macro ldrb1 reg, ptr, val
- uao_user_alternative 9998f, ldrb, ldtrb, \reg,
\ptr, \val + .macro ldr1 reg, ptr, offset=0 + 8888: ldtr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro strb1 reg, ptr, val - strb \reg, [\ptr], \val + .macro str1 reg, ptr, offset=0 + str \reg, [\ptr, \offset] .endm - .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + .macro ldp1 regA, regB, ptr, offset=0 + 8888: ldtr \regA, [\ptr, \offset] + 8889: ldtr \regB, [\ptr, \offset + 8] + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; .endm - .macro strh1 reg, ptr, val - strh \reg, [\ptr], \val + .macro stp1 regA, regB, ptr, offset=0 + stp \regA, \regB, [\ptr, \offset] .endm - .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + .macro ldp1_pre regA, regB, ptr, offset + 8888: ldtr \regA, [\ptr, \offset] + 8889: ldtr \regB, [\ptr, \offset + 8] + add \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; .endm - .macro str1 reg, ptr, val - str \reg, [\ptr], \val + .macro stp1_pre regA, regB, ptr, offset + stp \regA, \regB, [\ptr, \offset]! .endm - .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + .macro ldrb1_nuao reg, ptr, offset=0 + 8888: ldrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro stp1 reg1, reg2, ptr, val - stp \reg1, \reg2, [\ptr], \val + .macro strb1_nuao reg, ptr, offset=0 + strb \reg, [\ptr, \offset] + .endm + + .macro ldrb1_nuao_reg reg, ptr, offset=0 + 8888: ldrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro strb1_nuao_reg reg, ptr, offset=0 + strb \reg, [\ptr, \offset] + .endm + + .macro ldr1_nuao reg, ptr, offset=0 + 8888: ldr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro str1_nuao reg, ptr, offset=0 + str \reg, [\ptr, \offset] + .endm + + .macro ldp1_nuao regA, regB, ptr, offset=0 + 8888: ldp \regA, \regB, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro stp1_nuao regA, regB, ptr, offset=0 + stp \regA, \regB, [\ptr, \offset] + .endm + + .macro ldp1_pre_nuao regA, regB, ptr, offset + 8888: ldp \regA, \regB, [\ptr, \offset]! + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro stp1_pre_nuao regA, regB, ptr, offset + stp \regA, \regB, [\ptr, \offset]! 
+ .endm + + .macro copy_exit + b .Luaccess_finish .endm -end .req x5 SYM_FUNC_START(__arch_copy_from_user) - add end, x0, x2 -#include "copy_template.S" - mov x0, #0 // Nothing to copy +#include "copy_template_user.S" +.Luaccess_finish: + mov x0, #0 ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) - - .section .fixup,"ax" - .align 2 -9998: sub x0, end, dst // bytes not copied - ret - .previous +#include "copy_user_fixup.S" diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 80e37ada0ee1..77dfccc618b6 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -21,50 +21,129 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 reg, ptr, val - uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val + .macro ldrb1 reg, ptr, offset=0 + 8888: ldtrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + .macro strb1 reg, ptr, offset=0 + 8888: sttrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + .macro ldrb1_reg reg, ptr, offset + add \ptr, \ptr, \offset + 8888: ldtrb \reg, [\ptr] + sub \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; .endm - .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + .macro strb1_reg reg, ptr, offset + add \ptr, \ptr, \offset + 8888: sttrb \reg, [\ptr] + sub \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + .macro ldr1 reg, ptr, offset=0 + 8888: ldtr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + .macro str1 reg, ptr, offset=0 + 8888: sttr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + .macro ldp1 regA, regB, ptr, offset=0 + 8888: ldtr \regA, [\ptr, \offset] + 8889: ldtr \regB, [\ptr, \offset + 8] + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; .endm - .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + .macro stp1 regA, regB, ptr, offset=0 + 8888: sttr \regA, [\ptr, \offset] + 8889: sttr \regB, [\ptr, \offset + 8] + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; .endm -end .req x5 + .macro ldp1_pre regA, regB, ptr, offset + 8888: ldtr \regA, [\ptr, \offset] + 8889: ldtr \regB, [\ptr, \offset + 8] + add \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; + .endm + + .macro stp1_pre regA, regB, ptr, offset + 8888: sttr \regA, [\ptr, \offset] + 8889: sttr \regB, [\ptr, \offset + 8] + add \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; + .endm + + .macro ldrb1_nuao reg, ptr, offset=0 + 8888: ldrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro strb1_nuao reg, ptr, offset=0 + 8888: strb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldrb1_nuao_reg reg, ptr, offset=0 + 8888: ldrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro strb1_nuao_reg reg, ptr, offset=0 + 8888: strb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldr1_nuao reg, ptr, 
offset=0 + 8888: ldr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro str1_nuao reg, ptr, offset=0 + 8888: str \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldp1_nuao regA, regB, ptr, offset=0 + 8888: ldp \regA, \regB, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro stp1_nuao regA, regB, ptr, offset=0 + 8888: stp \regA, \regB, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldp1_pre_nuao regA, regB, ptr, offset + 8888: ldp \regA, \regB, [\ptr, \offset]! + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro stp1_pre_nuao regA, regB, ptr, offset + 8888: stp \regA, \regB, [\ptr, \offset]! + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro copy_exit + b .Luaccess_finish + .endm SYM_FUNC_START(__arch_copy_in_user) - add end, x0, x2 -#include "copy_template.S" +#include "copy_template_user.S" +.Luaccess_finish: mov x0, #0 ret SYM_FUNC_END(__arch_copy_in_user) EXPORT_SYMBOL(__arch_copy_in_user) - - .section .fixup,"ax" - .align 2 -9998: sub x0, end, dst // bytes not copied - ret - .previous +#include "copy_user_fixup.S" diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S index 488df234c49a..90b5f63ff227 100644 --- a/arch/arm64/lib/copy_template.S +++ b/arch/arm64/lib/copy_template.S @@ -1,13 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2012 Linaro Limited. All rights reserved. + * Copyright (c) 2015 ARM Ltd. All rights reserved. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ + * This code is based on work originally authored by Linaro, + * found at: * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * https://github.com/ARM-software/optimized-routines */ @@ -21,161 +20,209 @@ * Returns: * x0 - dest */ -dstin .req x0 -src .req x1 -count .req x2 -tmp1 .req x3 -tmp1w .req w3 -tmp2 .req x4 -tmp2w .req w4 -dst .req x6 - -A_l .req x7 -A_h .req x8 -B_l .req x9 -B_h .req x10 -C_l .req x11 -C_h .req x12 -D_l .req x13 -D_h .req x14 - - mov dst, dstin - cmp count, #16 - /*When memory length is less than 16, the accessed are not aligned.*/ - b.lo .Ltiny15 - - neg tmp2, src - ands tmp2, tmp2, #15/* Bytes to reach alignment. */ - b.eq .LSrcAligned - sub count, count, tmp2 - /* - * Copy the leading memory data from src to dst in an increasing - * address order.By this way,the risk of overwriting the source - * memory data is eliminated when the distance between src and - * dst is less than 16. The memory accesses here are alignment. - */ - tbz tmp2, #0, 1f - ldrb1 tmp1w, src, #1 - strb1 tmp1w, dst, #1 -1: - tbz tmp2, #1, 2f - ldrh1 tmp1w, src, #2 - strh1 tmp1w, dst, #2 -2: - tbz tmp2, #2, 3f - ldr1 tmp1w, src, #4 - str1 tmp1w, dst, #4 -3: - tbz tmp2, #3, .LSrcAligned - ldr1 tmp1, src, #8 - str1 tmp1, dst, #8 - -.LSrcAligned: - cmp count, #64 - b.ge .Lcpy_over64 - /* - * Deal with small copies quickly by dropping straight into the - * exit block. - */ -.Ltail63: - /* - * Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. 
- */ - ands tmp1, count, #0x30 - b.eq .Ltiny15 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp1 A_l, A_h, src, #16 - stp1 A_l, A_h, dst, #16 -1: - ldp1 A_l, A_h, src, #16 - stp1 A_l, A_h, dst, #16 -2: - ldp1 A_l, A_h, src, #16 - stp1 A_l, A_h, dst, #16 -.Ltiny15: - /* - * Prefer to break one ldp/stp into several load/store to access - * memory in an increasing address order,rather than to load/store 16 - * bytes from (src-16) to (dst-16) and to backward the src to aligned - * address,which way is used in original cortex memcpy. If keeping - * the original memcpy process here, memmove need to satisfy the - * precondition that src address is at least 16 bytes bigger than dst - * address,otherwise some source data will be overwritten when memove - * call memcpy directly. To make memmove simpler and decouple the - * memcpy's dependency on memmove, withdrew the original process. - */ - tbz count, #3, 1f - ldr1 tmp1, src, #8 - str1 tmp1, dst, #8 -1: - tbz count, #2, 2f - ldr1 tmp1w, src, #4 - str1 tmp1w, dst, #4 -2: - tbz count, #1, 3f - ldrh1 tmp1w, src, #2 - strh1 tmp1w, dst, #2 -3: - tbz count, #0, .Lexitfunc - ldrb1 tmp1w, src, #1 - strb1 tmp1w, dst, #1 - - b .Lexitfunc - -.Lcpy_over64: - subs count, count, #128 - b.ge .Lcpy_body_large - /* - * Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. - */ - ldp1 A_l, A_h, src, #16 - stp1 A_l, A_h, dst, #16 - ldp1 B_l, B_h, src, #16 - ldp1 C_l, C_h, src, #16 - stp1 B_l, B_h, dst, #16 - stp1 C_l, C_h, dst, #16 - ldp1 D_l, D_h, src, #16 - stp1 D_l, D_h, dst, #16 - - tst count, #0x3f - b.ne .Ltail63 - b .Lexitfunc - - /* - * Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. - */ - .p2align L1_CACHE_SHIFT -.Lcpy_body_large: - /* pre-get 64 bytes data. */ - ldp1 A_l, A_h, src, #16 - ldp1 B_l, B_h, src, #16 - ldp1 C_l, C_h, src, #16 - ldp1 D_l, D_h, src, #16 -1: - /* - * interlace the load of next 64 bytes data block with store of the last - * loaded 64 bytes data. - */ - stp1 A_l, A_h, dst, #16 - ldp1 A_l, A_h, src, #16 - stp1 B_l, B_h, dst, #16 - ldp1 B_l, B_h, src, #16 - stp1 C_l, C_h, dst, #16 - ldp1 C_l, C_h, src, #16 - stp1 D_l, D_h, dst, #16 - ldp1 D_l, D_h, src, #16 - subs count, count, #64 - b.ge 1b - stp1 A_l, A_h, dst, #16 - stp1 B_l, B_h, dst, #16 - stp1 C_l, C_h, dst, #16 - stp1 D_l, D_h, dst, #16 - - tst count, #0x3f - b.ne .Ltail63 -.Lexitfunc: + #define dstin x0 + #define src x1 + #define count x2 + #define dst x3 + #define srcend x4 + #define dstend x5 + #define A_l x6 + #define A_lw w6 + #define A_h x7 + #define B_l x8 + #define B_lw w8 + #define B_h x9 + #define C_l x10 + #define C_lw w10 + #define C_h x11 + #define D_l x12 + #define D_h x13 + #define E_l x14 + #define E_h x15 + #define F_l x16 + #define F_h x17 + #define G_l count + #define G_h dst + #define H_l src + #define H_h srcend + #define tmp1 x14 + + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp1 A_l, A_h, src + ldp1 D_l, D_h, srcend, -16 + stp1 A_l, A_h, dstin + stp1 D_l, D_h, dstend, -16 + copy_exit + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr1 A_l, src + ldr1 A_h, srcend, -8 + str1 A_l, dstin + str1 A_h, dstend, -8 + copy_exit + + .p2align 3 + /* Copy 4-7 bytes. 
*/ +L(copy8): + tbz count, 2, L(copy4) + ldr1 A_lw, src + ldr1 B_lw, srcend, -4 + str1 A_lw, dstin + str1 B_lw, dstend, -4 + copy_exit + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb1 A_lw, src + ldrb1 C_lw, srcend, -1 + ldrb1_reg B_lw, src, tmp1 + strb1 A_lw, dstin + strb1_reg B_lw, dstin, tmp1 + strb1 C_lw, dstend, -1 +L(copy0): + copy_exit + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp1 A_l, A_h, src + ldp1 B_l, B_h, src, 16 + ldp1 C_l, C_h, srcend, -32 + ldp1 D_l, D_h, srcend, -16 + cmp count, 64 + b.hi L(copy128) + stp1 A_l, A_h, dstin + stp1 B_l, B_h, dstin, 16 + stp1 C_l, C_h, dstend, -32 + stp1 D_l, D_h, dstend, -16 + copy_exit + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp1 E_l, E_h, src, 32 + ldp1 F_l, F_h, src, 48 + cmp count, 96 + b.ls L(copy96) + ldp1 G_l, G_h, srcend, -64 + ldp1 H_l, H_h, srcend, -48 + stp1 G_l, G_h, dstend, -64 + stp1 H_l, H_h, dstend, -48 +L(copy96): + stp1 A_l, A_h, dstin + stp1 B_l, B_h, dstin, 16 + stp1 E_l, E_h, dstin, 32 + stp1 F_l, F_h, dstin, 48 + stp1 C_l, C_h, dstend, -32 + stp1 D_l, D_h, dstend, -16 + copy_exit + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp1 D_l, D_h, src + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp1 A_l, A_h, src, 16 + stp1 D_l, D_h, dstin + ldp1 B_l, B_h, src, 32 + ldp1 C_l, C_h, src, 48 + ldp1_pre D_l, D_h, src, 64 + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp1 A_l, A_h, dst, 16 + ldp1 A_l, A_h, src, 16 + stp1 B_l, B_h, dst, 32 + ldp1 B_l, B_h, src, 32 + stp1 C_l, C_h, dst, 48 + ldp1 C_l, C_h, src, 48 + stp1_pre D_l, D_h, dst, 64 + ldp1_pre D_l, D_h, src, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp1 E_l, E_h, srcend, -64 + stp1 A_l, A_h, dst, 16 + ldp1 A_l, A_h, srcend, -48 + stp1 B_l, B_h, dst, 32 + ldp1 B_l, B_h, srcend, -32 + stp1 C_l, C_h, dst, 48 + ldp1 C_l, C_h, srcend, -16 + stp1 D_l, D_h, dst, 64 + stp1 E_l, E_h, dstend, -64 + stp1 A_l, A_h, dstend, -48 + stp1 B_l, B_h, dstend, -32 + stp1 C_l, C_h, dstend, -16 + copy_exit + + .p2align 4 + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp1 D_l, D_h, srcend, -16 + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp1 A_l, A_h, srcend, -16 + stp1 D_l, D_h, dstend, -16 + ldp1 B_l, B_h, srcend, -32 + ldp1 C_l, C_h, srcend, -48 + ldp1_pre D_l, D_h, srcend, -64 + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp1 A_l, A_h, dstend, -16 + ldp1 A_l, A_h, srcend, -16 + stp1 B_l, B_h, dstend, -32 + ldp1 B_l, B_h, srcend, -32 + stp1 C_l, C_h, dstend, -48 + ldp1 C_l, C_h, srcend, -48 + stp1_pre D_l, D_h, dstend, -64 + ldp1_pre D_l, D_h, srcend, -64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp1 G_l, G_h, src, 48 + stp1 A_l, A_h, dstend, -16 + ldp1 A_l, A_h, src, 32 + stp1 B_l, B_h, dstend, -32 + ldp1 B_l, B_h, src, 16 + stp1 C_l, C_h, dstend, -48 + ldp1 C_l, C_h, src + stp1 D_l, D_h, dstend, -64 + stp1 G_l, G_h, dstin, 48 + stp1 A_l, A_h, dstin, 32 + stp1 B_l, B_h, dstin, 16 + stp1 C_l, C_h, dstin + copy_exit diff --git a/arch/arm64/lib/copy_template_user.S b/arch/arm64/lib/copy_template_user.S new file mode 100644 index 000000000000..3db24dcdab05 --- /dev/null +++ b/arch/arm64/lib/copy_template_user.S @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#define L(l) .L ## l + + alternative_if_not ARM64_HAS_UAO + b L(copy_non_uao) + alternative_else_nop_endif +#include "copy_template.S" + +#define ldp1 ldp1_nuao +#define ldp1_pre ldp1_pre_nuao +#define stp1 stp1_nuao +#define stp1_pre stp1_pre_nuao +#define ldr1 ldr1_nuao +#define str1 str1_nuao +#define ldrb1 ldrb1_nuao +#define strb1 strb1_nuao +#define ldrb1_reg ldrb1_nuao_reg +#define strb1_reg strb1_nuao_reg + +L(copy_non_uao): +#undef L +#define L(l) .Lnuao ## l +#include "copy_template.S" diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 4ec59704b8f2..6b4742cac083 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -19,49 +19,111 @@ * Returns: * x0 - bytes not copied */ - .macro ldrb1 reg, ptr, val - ldrb \reg, [\ptr], \val + .macro ldrb1 reg, ptr, offset=0 + ldrb \reg, [\ptr, \offset] .endm - .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + .macro strb1 reg, ptr, offset=0 + 8888: sttrb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldrh1 reg, ptr, val - ldrh \reg, [\ptr], \val + .macro ldrb1_reg reg, ptr, offset + ldrb \reg, [\ptr, \offset] .endm - .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + .macro strb1_reg reg, ptr, offset + add \ptr, \ptr, \offset + 8888: sttrb \reg, [\ptr] + sub \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldr1 reg, ptr, val - ldr \reg, [\ptr], \val + .macro ldr1 reg, ptr, offset=0 + ldr \reg, [\ptr, \offset] .endm - .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + .macro str1 reg, ptr, offset=0 + 8888: sttr \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; .endm - .macro ldp1 reg1, reg2, ptr, val - ldp \reg1, \reg2, [\ptr], \val + .macro ldp1 regA, regB, ptr, offset=0 + ldp \regA, \regB, [\ptr, \offset] .endm - .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + .macro stp1 regA, regB, ptr, offset=0 + 8888: sttr \regA, [\ptr, \offset] + 8889: sttr \regB, [\ptr, \offset + 8] + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; + .endm + + .macro ldp1_pre regA, regB, ptr, offset + ldp \regA, \regB, [\ptr, \offset]! 
+ .endm + + .macro stp1_pre regA, regB, ptr, offset + 8888: sttr \regA, [\ptr, \offset] + 8889: sttr \regB, [\ptr, \offset + 8] + add \ptr, \ptr, \offset + _asm_extable_faultaddr 8888b,9998f; + _asm_extable_faultaddr 8889b,9998f; + .endm + + .macro ldrb1_nuao reg, ptr, offset=0 + ldrb \reg, [\ptr, \offset] + .endm + + .macro strb1_nuao reg, ptr, offset=0 + 8888: strb \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldrb1_nuao_reg reg, ptr, offset=0 + ldrb \reg, [\ptr, \offset] + .endm + + .macro strb1_nuao_reg reg, ptr, offset=0 + strb \reg, [\ptr, \offset] + .endm + + .macro ldr1_nuao reg, ptr, offset=0 + ldr \reg, [\ptr, \offset] + .endm + + .macro str1_nuao reg, ptr, offset=0 + 8888: str \reg, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro ldp1_nuao regA, regB, ptr, offset=0 + ldp \regA, \regB, [\ptr, \offset] + .endm + + .macro ldp1_pre_nuao regA, regB, ptr, offset + ldp \regA, \regB, [\ptr, \offset]! + .endm + + .macro stp1_nuao regA, regB, ptr, offset=0 + 8888: stp \regA, \regB, [\ptr, \offset] + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro stp1_pre_nuao regA, regB, ptr, offset + 8888: stp \regA, \regB, [\ptr, \offset]! + _asm_extable_faultaddr 8888b,9998f; + .endm + + .macro copy_exit + b .Luaccess_finish .endm -end .req x5 SYM_FUNC_START(__arch_copy_to_user) - add end, x0, x2 -#include "copy_template.S" +#include "copy_template_user.S" +.Luaccess_finish: mov x0, #0 ret SYM_FUNC_END(__arch_copy_to_user) EXPORT_SYMBOL(__arch_copy_to_user) - - .section .fixup,"ax" - .align 2 -9998: sub x0, end, dst // bytes not copied - ret - .previous +#include "copy_user_fixup.S" diff --git a/arch/arm64/lib/copy_user_fixup.S b/arch/arm64/lib/copy_user_fixup.S new file mode 100644 index 000000000000..32fae9e2e799 --- /dev/null +++ b/arch/arm64/lib/copy_user_fixup.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +addr .req x15 +.section .fixup,"ax" +.align 2 +9998: + // If it falls in the src range then it was a load that failed, + // otherwise it was a store + cmp addr, src + ccmp addr, srcend, #0x0, ge + csel x0, srcend, dstend, lt + sub x0, x0, addr + ret + diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index e0bf83d556f2..c24925aef236 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -24,43 +24,56 @@ * Returns: * x0 - dest */ - .macro ldrb1 reg, ptr, val - ldrb \reg, [\ptr], \val + #define L(l) .L ## l + + .macro ldrb1 reg, ptr, offset=0 + ldrb \reg, [\ptr, \offset] + .endm + + .macro strb1 reg, ptr, offset=0 + strb \reg, [\ptr, \offset] + .endm + + .macro ldr1 reg, ptr, offset=0 + ldr \reg, [\ptr, \offset] .endm - .macro strb1 reg, ptr, val - strb \reg, [\ptr], \val + .macro str1 reg, ptr, offset=0 + str \reg, [\ptr, \offset] .endm - .macro ldrh1 reg, ptr, val - ldrh \reg, [\ptr], \val + .macro ldp1 regA, regB, ptr, offset=0 + ldp \regA, \regB, [\ptr, \offset] .endm - .macro strh1 reg, ptr, val - strh \reg, [\ptr], \val + .macro stp1 regA, regB, ptr, offset=0 + stp \regA, \regB, [\ptr, \offset] .endm - .macro ldr1 reg, ptr, val - ldr \reg, [\ptr], \val + .macro ldrb1_reg reg, ptr, offset + ldrb1 \reg, \ptr, \offset .endm - .macro str1 reg, ptr, val - str \reg, [\ptr], \val + .macro strb1_reg reg, ptr, offset + strb1 \reg, \ptr, \offset .endm - .macro ldp1 reg1, reg2, ptr, val - ldp \reg1, \reg2, [\ptr], \val + .macro ldp1_pre regA, regB, ptr, offset + ldp \regA, \regB, [\ptr, \offset]! 
.endm - .macro stp1 reg1, reg2, ptr, val - stp \reg1, \reg2, [\ptr], \val + .macro stp1_pre regA, regB, ptr, offset + stp \regA, \regB, [\ptr, \offset]! + .endm + + .macro copy_exit + ret .endm .weak memcpy SYM_FUNC_START_ALIAS(__memcpy) SYM_FUNC_START_PI(memcpy) #include "copy_template.S" - ret SYM_FUNC_END_PI(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_END_ALIAS(__memcpy) -- 2.17.1
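Postscript (illustration only, not part of the patch): the shared fixup
added in copy_user_fixup.S derives the "bytes not copied" return value
from the faulting address alone. A rough C equivalent of that calculation
is below; the function name and parameters are hypothetical and mirror
the src/srcend/dstend registers used by the copy template.

#include <stddef.h>
#include <stdint.h>

/*
 * A fault address inside [src, srcend) means a load (ldtr/ldp) faulted,
 * so the uncopied tail is measured against srcend; any other fault
 * address came from a store (sttr/stp) and is measured against dstend.
 * This mirrors the cmp/ccmp/csel/sub sequence in copy_user_fixup.S.
 */
static size_t bytes_not_copied(uintptr_t fault, uintptr_t src,
                               uintptr_t srcend, uintptr_t dstend)
{
        if (fault >= src && fault < srcend)
                return srcend - fault;
        return dstend - fault;
}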