Received: by 2002:ac0:a5b6:0:0:0:0:0 with SMTP id m51-v6csp880774imm; Thu, 31 May 2018 11:01:35 -0700 (PDT) X-Google-Smtp-Source: ADUXVKJMYfojhnKD6pwHQVQq3vbrR/+mFDRwYoggzvfwt1mhbidMuH9nTpgrDl63pfb4AmKqEQnQ X-Received: by 2002:a65:51c4:: with SMTP id i4-v6mr6231993pgq.190.1527789695119; Thu, 31 May 2018 11:01:35 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1527789695; cv=none; d=google.com; s=arc-20160816; b=h7EAOfrHROoR18iDk3hPW+oE/8frnHHBM7Rjvcq7xxAqW7GE5eHFe2g5v+9EHS3kv4 4qBSK8Z1XEtEuAdY2hvvHCEx0BymQ8A3NGDNwe6nPCyzZfxSJ1+/WA/kwlVaLGWZkTmr gpQjuRUM5W69qnUCIP8pEcRSX6+EBSI186g8YMAT2eH8UkYUqzhRcHacakyyLG0j+O6C tTcSVHmBCLE9EmKyP0tunqWqh8QbRkFM98GXNbHx7sJvzbJsfjZB9E63RA98iIWKi0Uc S4dDu6HSihsVLBupo+tTRcPSoVsRPESsrvf7VlT+g/sYLZn8lrCXlgcxHsgviMOXHCjg qXfw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=list-id:precedence:sender:references:in-reply-to:message-id:date :subject:cc:to:from:arc-authentication-results; bh=n5cNsHHtgQGwY+fmDp5I9Zn/g2xVmerjQamHsT9Sb/g=; b=Vb7orkSvQrZzyRLPS2tOFhFD04pZFe8JmLSe0FaZIcTQkN70zCsxPP3EdvXnxearF1 qYY+NFdF2EpWd1KYgt3D4ONpnVp9dy3wOa2B+I2t/reRUTxfZCvI60vMHNBDMWgRfhN7 3PYiLEFK+0vgV44Er70kJaEIbPx5Nyv/3AdnbYM5ku4sKVlMEpGro/si4qV6hlTC3nc0 IKsAhLH1pgSinH9T29Gn6uUtYPD0jGnlm8YGb8WelyJStnTxCrRzSkguEQmAAzpwICnC jzHz4GghBwfdBLkHCfafqIlMVhnOTeVP38O3oPDXfehIZ/M3j8FP41m7nK3H9WvmeoVP Y6Nw== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) smtp.mailfrom=linux-kernel-owner@vger.kernel.org; dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=intel.com Return-Path: Received: from vger.kernel.org (vger.kernel.org. 
[209.132.180.67]) by mx.google.com with ESMTP id bh5-v6si36191651plb.320.2018.05.31.11.01.20; Thu, 31 May 2018 11:01:35 -0700 (PDT) Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; Authentication-Results: mx.google.com; spf=pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) smtp.mailfrom=linux-kernel-owner@vger.kernel.org; dmarc=fail (p=NONE sp=NONE dis=NONE) header.from=intel.com Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932838AbeEaSAI (ORCPT + 99 others); Thu, 31 May 2018 14:00:08 -0400 Received: from mga01.intel.com ([192.55.52.88]:18561 "EHLO mga01.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755824AbeEaR7F (ORCPT ); Thu, 31 May 2018 13:59:05 -0400 X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga001.jf.intel.com ([10.7.209.18]) by fmsmga101.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 31 May 2018 10:58:59 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.49,463,1520924400"; d="scan'208";a="60725540" Received: from chang-linux-2.sc.intel.com ([10.3.52.139]) by orsmga001.jf.intel.com with ESMTP; 31 May 2018 10:58:59 -0700 From: "Chang S. Bae" To: Andy Lutomirski , "H . Peter Anvin" , Thomas Gleixner , Ingo Molnar Cc: Andi Kleen , Dave Hansen , Markus T Metzger , "Ravi V . Shankar" , "Chang S . 
Bae" , linux-kernel@vger.kernel.org Subject: [PATCH V2 12/15] x86/fsgsbase/64: Use per-CPU base as GS base on paranoid_entry Date: Thu, 31 May 2018 10:58:42 -0700 Message-Id: <1527789525-8857-13-git-send-email-chang.seok.bae@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1527789525-8857-1-git-send-email-chang.seok.bae@intel.com> References: <1527789525-8857-1-git-send-email-chang.seok.bae@intel.com> Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org FSGSBASE allows fast access to the GS base. With FSGSBASE enabled, the per-CPU base is always copied to the GS base on paranoid entry, and the GS base value that was current at entry is restored on exit. Currently, userspace can't modify the GS base, and the kernel's convention is that a negative GS base means it is a kernel value and a positive GS base means it is a user value. But, with FSGSBASE enabled, userspace can put arbitrary data in there. This behavior is unchanged by this patch, so the FSGSBASE path does not rely on the sign convention: it saves the current GS base and installs the per-CPU base unconditionally. The per-CPU base is looked up in the __per_cpu_offset table using the CPU number, which is either read from the (per-CPU) segment limit or obtained with the RDPID instruction. A GAS-compatible RDPID macro is included. Suggested-by: H. Peter Anvin Signed-off-by: Chang S. 
Bae Cc: Andi Kleen Cc: Andy Lutomirski Cc: Dave Hansen Cc: Thomas Gleixner Cc: Ingo Molnar --- arch/x86/entry/entry_64.S | 74 +++++++++++++++++++++++++++++++++-------- arch/x86/include/asm/fsgsbase.h | 57 +++++++++++++++++++++++++++++++ arch/x86/include/asm/inst.h | 15 +++++++++ 3 files changed, 132 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3166b96..cfac4c0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include #include "calling.h" @@ -954,10 +956,14 @@ ENTRY(\sym) addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid + /* + * With FSGSBASE, original GS base is stored in rbx + * Without FSGSBASE, expect "no swapgs" flag in ebx + */ jmp paranoid_exit .else + /* expect "no swapgs" flag in ebx */ jmp error_exit .endif @@ -1168,26 +1174,57 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. + * + * When FSGSBASE enabled, current GS base is always copied to rbx. + * + * Without FSGSBASE, SWAPGS is needed when entering from userspace. + * A positive GS base means it is a user value and a negative GS + * base means it is a kernel value. + * + * Return: + * With FSGSBASE, rbx has current GS base. + * Without that, + * ebx=0: need SWAPGS on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: + /* + * As long as this PTI macro doesn't depend on kernel GS base, + * we can do it early. 
This is because FIND_PERCPU_BASE + * references data in kernel space. + */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* + * Read GS base by RDGSBASE. Kernel GS base is found + * from the per-CPU offset table with CPU number. + */ + ALTERNATIVE "jmp .Lparanoid_entry_no_fsgsbase", "",\ + X86_FEATURE_FSGSBASE + RDGSBASE %rbx + FIND_PERCPU_BASE %rax + WRGSBASE %rax + ret + +.Lparanoid_entry_no_fsgsbase: + movl $1, %ebx + /* + * FSGSBASE is not in use, so depend on the kernel-enforced + * convention that a negative GS base indicates a kernel value. + */ + READ_MSR_GSBASE save_reg=%edx + testl %edx, %edx /* negative -> in kernel */ + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + xorl %ebx, %ebx ret END(paranoid_entry) @@ -1201,12 +1238,21 @@ END(paranoid_entry) * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * On entry, + * With FSGSBASE, + * rbx is original GS base that needs to be restored on the exit + * Without that, + * ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG + ALTERNATIVE "jmp .Lparanoid_exit_no_fsgsbase", "nop",\ + X86_FEATURE_FSGSBASE + WRGSBASE %rbx + jmp .Lparanoid_exit_no_swapgs; +.Lparanoid_exit_no_fsgsbase: testl %ebx, %ebx /* swapgs needed? 
*/ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ @@ -1217,7 +1263,7 @@ ENTRY(paranoid_exit) TRACE_IRQS_IRETQ_DEBUG RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 903c7a0..3a5e1ec 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -107,6 +107,63 @@ void write_inactive_gsbase(unsigned long gsbase); MODRM 0xd0 wrgsbase_opd 1 .endm +#if CONFIG_SMP + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %GS for accessing per-CPU data, but we are setting up + * %GS here and obviously can not use %GS itself to access per-CPU data. + */ +.macro FIND_PERCPU_BASE_RDPID reg:req + RDPID \reg + + /* + * CPU number is written before IST initialization. Later, + * processor id is (also) written during vDSO initialization, + * with 12 bits for the CPU and 8 bits for the node. + */ + andq $PERCPU_CPU_MASK, \reg + /* + * Kernel GS base is looked up from the __per_cpu_offset list with + * the CPU number (processor id). 
+ */ + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +.macro FIND_PERCPU_BASE_SEG_LIMIT reg:req + /* CPU number is found from the limit of PER_CPU entry in GDT */ + movq $__PER_CPU_SEG, \reg + lsl \reg, \reg + + /* Same as FIND_PERCPU_BASE_RDPID */ + andq $PERCPU_CPU_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +.macro FIND_PERCPU_BASE reg:req + ALTERNATIVE \ + "FIND_PERCPU_BASE_SEG_LIMIT \reg", \ + "FIND_PERCPU_BASE_RDPID \reg", \ + X86_FEATURE_RDPID +.endm + +#else + +.macro FIND_PERCPU_BASE reg:req + /* Tracking the base offset value */ + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ + +.macro READ_MSR_GSBASE save_reg:req + movl $MSR_GS_BASE, %ecx + /* Read MSR specified by %ecx into %edx:%eax */ + rdmsr + .ifnc \save_reg, %edx + movl %edx, \save_reg + .endif +.endm #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796d..d063841 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,6 +306,21 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm + +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd + .else + R32_NUM rdpid_opd \opd + .endif + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 + .endif + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif -- 2.7.4