2015-08-30 08:38:17

by Michael S. Tsirkin

Subject: [PATCH 1/2] x86/bitops: implement __test_bit

One little-known side effect of test_bit is that it acts as
a kind of compiler barrier, since its pointer parameter
is volatile.

It seems risky to change the semantics of test_bit, so let's just
add __test_bit (matching __set_bit and __clear_bit), which does
not imply such a barrier.

It will be used by kvm on x86, where it shaves several bytes off
the binary size. A small win, but it comes at no cost, so why not.
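
For illustration only (not part of the patch), a minimal userspace sketch of
the difference; the helper names and the bitmap below are made up. The only
change is dropping the volatile qualifier, which is what lets the compiler
cache or hoist the load:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* like test_bit(): volatile pointer, every call re-reads memory */
static inline int test_bit_sketch(long nr, const volatile unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* like the proposed __test_bit(): plain pointer, the compiler may
 * cache or hoist the load across calls */
static inline int __test_bit_sketch(long nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	unsigned long map[1] = { 0 };

	map[0] |= 1UL << 3;	/* non-atomic set, like __set_bit(3, map) */
	printf("%d %d\n",
	       test_bit_sketch(3, map), __test_bit_sketch(3, map));
	return 0;
}

For callers that poll a bit in a loop, the volatile qualifier is what keeps
the load inside the loop, which is why changing test_bit itself is risky.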

Signed-off-by: Michael S. Tsirkin <[email protected]>
---

x86 maintainers - please indicate whether you are OK with
adding this to arch/x86/include/asm/bitops.h.
An alternative is to add it to kvm/x86 only.
It might also be worth adding this to all architectures,
though I haven't explored that much.

arch/x86/include/asm/bitops.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..9229334 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,6 +323,24 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
}

+static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+{
+ return ((1UL << (nr & (BITS_PER_LONG-1))) &
+ (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
+}
+
+static inline int __variable_test_bit(long nr, const unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("bt %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*addr), "Ir" (nr));
+
+ return oldbit;
+}
+
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -330,6 +348,13 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
* @addr: Address to start counting from
*/
static int test_bit(int nr, const volatile unsigned long *addr);
+
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int __test_bit(int nr, const volatile unsigned long *addr);
#endif

#define test_bit(nr, addr) \
@@ -337,6 +362,11 @@ static int test_bit(int nr, const volatile unsigned long *addr);
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))

+#define __test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? __constant_test_bit((nr), (addr)) \
+ : __variable_test_bit((nr), (addr)))
+
/**
* __ffs - find first set bit in word
* @word: The word to search
--
MST


2015-08-30 08:38:43

by Michael S. Tsirkin

Subject: [PATCH 2/2] kvm/x86: use __test_bit

Let the compiler do slightly better optimizations by using
the non-volatile __test_bit in all cases where
the values are set with the non-volatile __set_bit and
__clear_bit.

I left test_bit in place where the mask is set with
the atomic set_bit/clear_bit, for symmetry.
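
Schematically, the pairing rule this patch follows (a sketch, not a hunk from
the series; the bitmap name is made up):

#include <linux/types.h>
#include <linux/bitops.h>

static DECLARE_BITMAP(example_map, 64);	/* hypothetical bitmap */

static void example(void)
{
	/* values written with the non-atomic helpers are read with
	 * the non-volatile __test_bit() */
	__set_bit(1, example_map);
	if (__test_bit(1, example_map))
		__clear_bit(1, example_map);

	/* values written with the atomic helpers keep the volatile
	 * test_bit() */
	set_bit(2, example_map);
	if (test_bit(2, example_map))
		clear_bit(2, example_map);
}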

This shaves about 150 bytes of text off the KVM modules:

before:
134868 2997 8372 146237 23b3d arch/x86/kvm/kvm-intel.ko
343129 47640 441 391210 5f82a arch/x86/kvm/kvm.ko
after:
134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko
343017 47640 441 391098 5f7ba arch/x86/kvm/kvm.ko

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
arch/x86/kvm/ioapic.h | 2 +-
arch/x86/kvm/kvm_cache_regs.h | 6 +++---
arch/x86/kvm/ioapic.c | 2 +-
arch/x86/kvm/pmu_intel.c | 2 +-
arch/x86/kvm/vmx.c | 18 +++++++++---------
arch/x86/kvm/x86.c | 2 +-
6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4..3b58d41 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -102,7 +102,7 @@ static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
+ return __test_bit(vector, ioapic->handled_vectors);
}

void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee..21ef6d6 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,7 +9,7 @@
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ if (!__test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, reg);

return vcpu->arch.regs[reg];
@@ -38,7 +38,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */

- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);

@@ -68,7 +68,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)

static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ if (!__test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
kvm_x86_ops->decache_cr3(vcpu);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f791..bf2afa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -117,7 +117,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
return;

new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
- old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ old_val = __test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);

if (new_val == old_val)
return;
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index ab38af4..fb20a0f 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -98,7 +98,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);

- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+ return __test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
}

static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c117703..ed44026 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2025,7 +2025,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;

- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ if (!__test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
rflags = vmcs_readl(GUEST_RFLAGS);
if (to_vmx(vcpu)->rmode.vm86_active) {
@@ -3478,7 +3478,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty))
return;

@@ -3513,7 +3513,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ if (!__test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
*/
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x800 / f))
/* write-low */
__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x400 / f))
/* read-high */
__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
/* write-high */
__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

@@ -8316,9 +8316,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->nested.sync_shadow_vmcs = false;
}

- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (__test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (__test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

cr4 = cr4_read_shadow();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5ef2560..c8015fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -555,7 +555,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;

- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
return true;

--
MST

2015-08-31 06:05:54

by Ingo Molnar

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit


* Michael S. Tsirkin <[email protected]> wrote:

> +static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> + "sbb %0,%0"
> + : "=r" (oldbit)
> + : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}

Color me confused, why use assembly for this at all?

Why not just use C for testing the bit (i.e. turn __constant_test_bit() into
__test_bit()) - that would also allow the compiler to propagate the result,
potentially more optimally than we can do it via SBB...

Thanks,

Ingo

2015-08-31 06:13:43

by H. Peter Anvin

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit

Presumably because gcc can't generate bt... whether or not it is worth it is another matter.

On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
>
>* Michael S. Tsirkin <[email protected]> wrote:
>
>> +static __always_inline int __constant_test_bit(long nr, const
>unsigned long *addr)
>> +{
>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>> +}
>> +
>> +static inline int __variable_test_bit(long nr, const unsigned long
>*addr)
>> +{
>> + int oldbit;
>> +
>> + asm volatile("bt %2,%1\n\t"
>> + "sbb %0,%0"
>> + : "=r" (oldbit)
>> + : "m" (*addr), "Ir" (nr));
>> +
>> + return oldbit;
>> +}
>
>Color me confused, why use assembly for this at all?
>
>Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>into
>__test_bit()) - that would also allow the compiler to propagate the
>result,
>potentially more optimally than we can do it via SBB...
>
>Thanks,
>
> Ingo

--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2015-08-31 07:56:35

by Michael S. Tsirkin

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit

On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
>
> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
> >
> >* Michael S. Tsirkin <[email protected]> wrote:
> >
> >> +static __always_inline int __constant_test_bit(long nr, const
> >unsigned long *addr)
> >> +{
> >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >> +}
> >> +
> >> +static inline int __variable_test_bit(long nr, const unsigned long
> >*addr)
> >> +{
> >> + int oldbit;
> >> +
> >> + asm volatile("bt %2,%1\n\t"
> >> + "sbb %0,%0"
> >> + : "=r" (oldbit)
> >> + : "m" (*addr), "Ir" (nr));
> >> +
> >> + return oldbit;
> >> +}
> >
> >Color me confused, why use assembly for this at all?
> >
> >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >into
> >__test_bit()) - that would also allow the compiler to propagate the
> >result,
> >potentially more optimally than we can do it via SBB...
> >
> >Thanks,
> >
> > Ingo

Exactly:


Disassembly of section .text:

00000000 <__variable_test_bit>:
__variable_test_bit():
0: 8b 54 24 08 mov 0x8(%esp),%edx
4: 8b 44 24 04 mov 0x4(%esp),%eax
8: 0f a3 02 bt %eax,(%edx)
b: 19 c0 sbb %eax,%eax
d: c3 ret
e: 66 90 xchg %ax,%ax

00000010 <__constant_test_bit>:
__constant_test_bit():
10: 8b 4c 24 04 mov 0x4(%esp),%ecx
14: 8b 44 24 08 mov 0x8(%esp),%eax
18: 89 ca mov %ecx,%edx
1a: c1 fa 04 sar $0x4,%edx
1d: 8b 04 90 mov (%eax,%edx,4),%eax
20: d3 e8 shr %cl,%eax
22: 83 e0 01 and $0x1,%eax
25: c3 ret


That's also probably why we still have variable_test_bit
for test_bit too. It's best to be consistent with that - do you agree?
Or would you rather drop that too?

--
MST

2015-08-31 07:59:53

by Ingo Molnar

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit


* Michael S. Tsirkin <[email protected]> wrote:

> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> >
> > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
> > >
> > >* Michael S. Tsirkin <[email protected]> wrote:
> > >
> > >> +static __always_inline int __constant_test_bit(long nr, const
> > >unsigned long *addr)
> > >> +{
> > >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > >> +}
> > >> +
> > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > >*addr)
> > >> +{
> > >> + int oldbit;
> > >> +
> > >> + asm volatile("bt %2,%1\n\t"
> > >> + "sbb %0,%0"
> > >> + : "=r" (oldbit)
> > >> + : "m" (*addr), "Ir" (nr));
> > >> +
> > >> + return oldbit;
> > >> +}
> > >
> > >Color me confused, why use assembly for this at all?
> > >
> > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > >into
> > >__test_bit()) - that would also allow the compiler to propagate the
> > >result,
> > >potentially more optimally than we can do it via SBB...
> > >
> > >Thanks,
> > >
> > > Ingo
>
> Exactly:
>
>
> Disassembly of section .text:
>
> 00000000 <__variable_test_bit>:
> __variable_test_bit():
> 0: 8b 54 24 08 mov 0x8(%esp),%edx
> 4: 8b 44 24 04 mov 0x4(%esp),%eax
> 8: 0f a3 02 bt %eax,(%edx)
> b: 19 c0 sbb %eax,%eax
> d: c3 ret
> e: 66 90 xchg %ax,%ax
>
> 00000010 <__constant_test_bit>:
> __constant_test_bit():
> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> 14: 8b 44 24 08 mov 0x8(%esp),%eax
> 18: 89 ca mov %ecx,%edx
> 1a: c1 fa 04 sar $0x4,%edx
> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> 20: d3 e8 shr %cl,%eax
> 22: 83 e0 01 and $0x1,%eax
> 25: c3 ret

But that's due to the forced interface of generating a return code. Please compare
it at an inlined usage site, where GCC is free to do the comparison directly and
use the result in flags.
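
For instance, a call site of the kind meant here (a sketch, not code from the
series; flags_word and consume() are made-up names):

#include <linux/bitops.h>	/* for the proposed __test_bit() */

extern unsigned long flags_word;	/* hypothetical bitmap word */
void consume(void);			/* hypothetical consumer */

void example(void)
{
	/*
	 * With a constant bit number and the plain-C bit test inlined
	 * here, GCC can test the bit and branch on the flags directly
	 * (a TESTB plus a JNE, as in the vmx_get_rflags() disassembly
	 * later in this thread); the asm BT+SBB variant first
	 * materializes a 0/-1 result in a register and then tests that
	 * register.
	 */
	if (__test_bit(3, &flags_word))
		consume();
}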

Thanks,

Ingo

2015-08-31 08:15:14

by yalin wang

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit


> On Aug 31, 2015, at 15:59, Ingo Molnar <[email protected]> wrote:
>
>
> * Michael S. Tsirkin <[email protected]> wrote:
>
>> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
>>> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
>>>
>>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
>>>>
>>>> * Michael S. Tsirkin <[email protected]> wrote:
>>>>
>>>>> +static __always_inline int __constant_test_bit(long nr, const
>>>> unsigned long *addr)
>>>>> +{
>>>>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
>>>>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>>>>> +}
>>>>> +
>>>>> +static inline int __variable_test_bit(long nr, const unsigned long
>>>> *addr)
>>>>> +{
>>>>> + int oldbit;
>>>>> +
>>>>> + asm volatile("bt %2,%1\n\t"
>>>>> + "sbb %0,%0"
>>>>> + : "=r" (oldbit)
>>>>> + : "m" (*addr), "Ir" (nr));
>>>>> +
>>>>> + return oldbit;
>>>>> +}
>>>>
>>>> Color me confused, why use assembly for this at all?
>>>>
>>>> Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>>>> into
>>>> __test_bit()) - that would also allow the compiler to propagate the
>>>> result,
>>>> potentially more optimally than we can do it via SBB...
>>>>
>>>> Thanks,
>>>>
>>>> Ingo
>>
>> Exactly:
>>
>>
>> Disassembly of section .text:
>>
>> 00000000 <__variable_test_bit>:
>> __variable_test_bit():
>> 0: 8b 54 24 08 mov 0x8(%esp),%edx
>> 4: 8b 44 24 04 mov 0x4(%esp),%eax
>> 8: 0f a3 02 bt %eax,(%edx)
>> b: 19 c0 sbb %eax,%eax
>> d: c3 ret
>> e: 66 90 xchg %ax,%ax
>>
>> 00000010 <__constant_test_bit>:
>> __constant_test_bit():
>> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
>> 14: 8b 44 24 08 mov 0x8(%esp),%eax
>> 18: 89 ca mov %ecx,%edx
>> 1a: c1 fa 04 sar $0x4,%edx
>> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
>> 20: d3 e8 shr %cl,%eax
>> 22: 83 e0 01 and $0x1,%eax
>> 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please compare
> it at an inlined usage site, where GCC is free to do the comparison directly and
> use the result in flags.
Just curious:
it seems __variable_test_bit() uses fewer instructions,
so why not always use __variable_test_bit() and remove the __constant_test_bit() version?

Thanks



2015-08-31 08:15:33

by Ingo Molnar

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit


* Ingo Molnar <[email protected]> wrote:

> > Disassembly of section .text:
> >
> > 00000000 <__variable_test_bit>:
> > __variable_test_bit():
> > 0: 8b 54 24 08 mov 0x8(%esp),%edx
> > 4: 8b 44 24 04 mov 0x4(%esp),%eax
> > 8: 0f a3 02 bt %eax,(%edx)
> > b: 19 c0 sbb %eax,%eax
> > d: c3 ret
> > e: 66 90 xchg %ax,%ax
> >
> > 00000010 <__constant_test_bit>:
> > __constant_test_bit():
> > 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> > 14: 8b 44 24 08 mov 0x8(%esp),%eax
> > 18: 89 ca mov %ecx,%edx
> > 1a: c1 fa 04 sar $0x4,%edx
> > 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> > 20: d3 e8 shr %cl,%eax
> > 22: 83 e0 01 and $0x1,%eax
> > 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please
> compare it at an inlined usage site, where GCC is free to do the comparison
> directly and use the result in flags.

So I was thinking about the patch below on top of yours.

But it turns out GCC indeed generates worse code even under the best of
circumstances. For example, the nested_vmx_disable_intercept_for_msr() change:

@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept
*/
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

before (i.e. your series):

ffffffff818b1082: 89 d0 mov %edx,%eax
ffffffff818b1084: 48 0f a3 07 bt %rax,(%rdi)
ffffffff818b1088: 45 19 c0 sbb %r8d,%r8d
ffffffff818b108b: 45 85 c0 test %r8d,%r8d
ffffffff818b108e: 75 04 jne ffffffff818b1094 <nested_vmx_disable_intercept_for_msr+0x43>

after (with my 'optimization' patch applied):

ffffffff818b1091: 89 d0 mov %edx,%eax
ffffffff818b1093: 49 89 c0 mov %rax,%r8
ffffffff818b1096: 49 c1 f8 06 sar $0x6,%r8
ffffffff818b109a: 4e 8b 04 c7 mov (%rdi,%r8,8),%r8
ffffffff818b109e: 49 0f a3 d0 bt %rdx,%r8
ffffffff818b10a2: 72 04 jb ffffffff818b10a8 <nested_vmx_disable_intercept_for_msr+0x48>

So GCC, when left to its own devices, generates one more instruction and 4 more
bytes. Why does GCC do that? Why doesn't it use BT directly and branch on the flag, i.e.
something like [pseudocode]:

ffffffff818b1082: 89 d0 mov %edx,%eax
ffffffff818b1084: 48 0f a3 07 bt %rax,(%rdi)
ffffffff818b108e: 75 04 jne ffffffff818b1094 <nested_vmx_disable_intercept_for_msr+0x43>

?

In any case I take back my objection:

Acked-by: Ingo Molnar <[email protected]>

Thanks,

Ingo


---
arch/x86/include/asm/bitops.h | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)

Index: tip/arch/x86/include/asm/bitops.h
===================================================================
--- tip.orig/arch/x86/include/asm/bitops.h
+++ tip/arch/x86/include/asm/bitops.h
@@ -323,24 +323,12 @@ static inline int variable_test_bit(long
return oldbit;
}

-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}

-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*addr), "Ir" (nr));
-
- return oldbit;
-}
-
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -362,11 +350,6 @@ static int __test_bit(int nr, const vola
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))

-#define __test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? __constant_test_bit((nr), (addr)) \
- : __variable_test_bit((nr), (addr)))
-
/**
* __ffs - find first set bit in word
* @word: The word to search

2015-08-31 08:19:56

by Ingo Molnar

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit


* yalin wang <[email protected]> wrote:

>
> > On Aug 31, 2015, at 15:59, Ingo Molnar <[email protected]> wrote:
> >
> >
> > * Michael S. Tsirkin <[email protected]> wrote:
> >
> >> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> >>> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> >>>
> >>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
> >>>>
> >>>> * Michael S. Tsirkin <[email protected]> wrote:
> >>>>
> >>>>> +static __always_inline int __constant_test_bit(long nr, const
> >>>> unsigned long *addr)
> >>>>> +{
> >>>>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >>>>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >>>>> +}
> >>>>> +
> >>>>> +static inline int __variable_test_bit(long nr, const unsigned long
> >>>> *addr)
> >>>>> +{
> >>>>> + int oldbit;
> >>>>> +
> >>>>> + asm volatile("bt %2,%1\n\t"
> >>>>> + "sbb %0,%0"
> >>>>> + : "=r" (oldbit)
> >>>>> + : "m" (*addr), "Ir" (nr));
> >>>>> +
> >>>>> + return oldbit;
> >>>>> +}
> >>>>
> >>>> Color me confused, why use assembly for this at all?
> >>>>
> >>>> Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >>>> into
> >>>> __test_bit()) - that would also allow the compiler to propagate the
> >>>> result,
> >>>> potentially more optimally than we can do it via SBB...
> >>>>
> >>>> Thanks,
> >>>>
> >>>> Ingo
> >>
> >> Exactly:
> >>
> >>
> >> Disassembly of section .text:
> >>
> >> 00000000 <__variable_test_bit>:
> >> __variable_test_bit():
> >> 0: 8b 54 24 08 mov 0x8(%esp),%edx
> >> 4: 8b 44 24 04 mov 0x4(%esp),%eax
> >> 8: 0f a3 02 bt %eax,(%edx)
> >> b: 19 c0 sbb %eax,%eax
> >> d: c3 ret
> >> e: 66 90 xchg %ax,%ax
> >>
> >> 00000010 <__constant_test_bit>:
> >> __constant_test_bit():
> >> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> >> 14: 8b 44 24 08 mov 0x8(%esp),%eax
> >> 18: 89 ca mov %ecx,%edx
> >> 1a: c1 fa 04 sar $0x4,%edx
> >> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> >> 20: d3 e8 shr %cl,%eax
> >> 22: 83 e0 01 and $0x1,%eax
> >> 25: c3 ret
> >
> > But that's due to the forced interface of generating a return code. Please compare
> > it at an inlined usage site, where GCC is free to do the comparison directly and
> > use the result in flags.
> Just curious:
> it seems __variable_test_bit() uses fewer instructions,
> so why not always use __variable_test_bit() and remove the __constant_test_bit() version?

It's an artifact of testing it in isolation.

For constant bit positions GCC is able to do a fairly good job:

ffffffff8103d2a0 <vmx_get_rflags>:
ffffffff8103d2a0: f6 87 4a 02 00 00 08 testb $0x8,0x24a(%rdi)
...
ffffffff8103d2ab: 75 39 jne ffffffff8103d2e6 <vmx_get_rflags+0x46>


with just two instructions: a TESTB plus a JNE that consumes the flags result.

Using variable_test_bit() forces the result into a register, which is suboptimal.

Thanks,

Ingo

2015-08-31 11:19:15

by Michael S. Tsirkin

Subject: Re: [PATCH 1/2] x86/bitops: implement __test_bit

On Mon, Aug 31, 2015 at 09:59:47AM +0200, Ingo Molnar wrote:
>
> * Michael S. Tsirkin <[email protected]> wrote:
>
> > On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > > Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> > >
> > > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <[email protected]> wrote:
> > > >
> > > >* Michael S. Tsirkin <[email protected]> wrote:
> > > >
> > > >> +static __always_inline int __constant_test_bit(long nr, const
> > > >unsigned long *addr)
> > > >> +{
> > > >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > >> +}
> > > >> +
> > > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > > >*addr)
> > > >> +{
> > > >> + int oldbit;
> > > >> +
> > > >> + asm volatile("bt %2,%1\n\t"
> > > >> + "sbb %0,%0"
> > > >> + : "=r" (oldbit)
> > > >> + : "m" (*addr), "Ir" (nr));
> > > >> +
> > > >> + return oldbit;
> > > >> +}
> > > >
> > > >Color me confused, why use assembly for this at all?
> > > >
> > > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > > >into
> > > >__test_bit()) - that would also allow the compiler to propagate the
> > > >result,
> > > >potentially more optimally than we can do it via SBB...
> > > >
> > > >Thanks,
> > > >
> > > > Ingo
> >
> > Exactly:
> >
> >
> > Disassembly of section .text:
> >
> > 00000000 <__variable_test_bit>:
> > __variable_test_bit():
> > 0: 8b 54 24 08 mov 0x8(%esp),%edx
> > 4: 8b 44 24 04 mov 0x4(%esp),%eax
> > 8: 0f a3 02 bt %eax,(%edx)
> > b: 19 c0 sbb %eax,%eax
> > d: c3 ret
> > e: 66 90 xchg %ax,%ax
> >
> > 00000010 <__constant_test_bit>:
> > __constant_test_bit():
> > 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> > 14: 8b 44 24 08 mov 0x8(%esp),%eax
> > 18: 89 ca mov %ecx,%edx
> > 1a: c1 fa 04 sar $0x4,%edx
> > 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> > 20: d3 e8 shr %cl,%eax
> > 22: 83 e0 01 and $0x1,%eax
> > 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please compare
> it at an inlined usage site, where GCC is free to do the comparison directly and
> use the result in flags.
>
> Thanks,
>
> Ingo

I applied this patch on top of mine:


diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9229334..2aed985 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,24 +323,17 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
}

-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}

-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*addr), "Ir" (nr));
-
- return oldbit;
-}
-
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -348,13 +341,6 @@ static inline int __variable_test_bit(long nr, const unsigned long *addr)
* @addr: Address to start counting from
*/
static int test_bit(int nr, const volatile unsigned long *addr);
-
-/**
- * __test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int __test_bit(int nr, const volatile unsigned long *addr);
#endif

#define test_bit(nr, addr) \
@@ -362,10 +348,6 @@ static int __test_bit(int nr, const volatile unsigned long *addr);
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))

-#define __test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? __constant_test_bit((nr), (addr)) \
- : __variable_test_bit((nr), (addr)))

/**
* __ffs - find first set bit in word


And the code size went up:

134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko

342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko

I tried removing __always_inline; this had no effect.

--
MST