This series speeds up GFN to memslot lookup by:
 * introducing an LRU cache, which improves lookup time for
   same-slot workloads (typically boot time of Windows and Linux guests)
 * switching to binary search for GFN to memslot lookup,
   improving lookup time with a large number of memory slots
Igor Mammedov (5):
kvm: update_memslots: drop not needed check for the same number of
pages
kvm: update_memslots: drop not needed check for the same slot
kvm: search_memslots: add simple LRU memslot caching
kvm: change memslot sorting rule from size to GFN
kvm: optimize GFN to memslot lookup with large slots amount
include/linux/kvm_host.h | 28 +++++++++++++++++++++++-----
virt/kvm/kvm_main.c | 46 ++++++++++++++++++++++++++--------------------
2 files changed, 49 insertions(+), 25 deletions(-)
--
1.8.3.1
In a typical guest boot workload only 2-3 memslots are used
extensively, and most lookups repeatedly hit the same memslot.

Adding an LRU cache improves the average lookup time from
46 to 28 cycles (~40%) for this workload.
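
As an illustration (not part of the patch), a minimal standalone C model of
the cached lookup, with invented struct names, a plain int instead of
atomic_t, and a fixed array size; the real change is the diff below:

/* Userspace sketch of the LRU-cached GFN lookup; illustration only. */
#include <assert.h>
#include <stddef.h>

typedef unsigned long long gfn_t;

struct memslot {
    gfn_t base_gfn;
    unsigned long npages;
};

struct memslots {
    struct memslot slots[32];
    int nslots;
    int lru_slot;   /* index of the most recently matched slot */
};

static struct memslot *lookup(struct memslots *s, gfn_t gfn)
{
    struct memslot *m = &s->slots[s->lru_slot];

    /* Fast path: the slot that matched last time usually matches again. */
    if (gfn >= m->base_gfn && gfn < m->base_gfn + m->npages)
        return m;

    /* Slow path: linear scan, remembering the hit for next time. */
    for (int i = 0; i < s->nslots; i++) {
        m = &s->slots[i];
        if (gfn >= m->base_gfn && gfn < m->base_gfn + m->npages) {
            s->lru_slot = i;
            return m;
        }
    }
    return NULL;
}

int main(void)
{
    struct memslots s = {
        .slots = { { 0x1000, 0x200 }, { 0x0, 0x80 } },
        .nslots = 2,
    };

    assert(lookup(&s, 0x10) == &s.slots[1]);    /* slow path, caches index 1 */
    assert(s.lru_slot == 1);
    assert(lookup(&s, 0x20) == &s.slots[1]);    /* fast path via the cache */
    return 0;
}
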
Signed-off-by: Igor Mammedov <[email protected]>
---
include/linux/kvm_host.h | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 231dd94..1a37144 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -353,6 +353,7 @@ struct kvm_memslots {
struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
/* The mapping table from slot id to the index in memslots[]. */
short id_to_index[KVM_MEM_SLOTS_NUM];
+ atomic_t lru_slot;
};
struct kvm {
@@ -790,12 +791,19 @@ static inline void kvm_guest_exit(void)
static inline struct kvm_memory_slot *
search_memslots(struct kvm_memslots *slots, gfn_t gfn)
{
- struct kvm_memory_slot *memslot;
+ int slot = atomic_read(&slots->lru_slot);
+ struct kvm_memory_slot *memslot = &slots->memslots[slot];
+
+ if (gfn >= memslot->base_gfn &&
+ gfn < memslot->base_gfn + memslot->npages)
+ return memslot;
kvm_for_each_memslot(memslot, slots)
if (gfn >= memslot->base_gfn &&
- gfn < memslot->base_gfn + memslot->npages)
+ gfn < memslot->base_gfn + memslot->npages) {
+ atomic_set(&slots->lru_slot, memslot - slots->memslots);
return memslot;
+ }
return NULL;
}
--
1.8.3.1
The current linear search doesn't scale well when a large
number of memslots is used and the looked-up slot is not
near the beginning of the memslots array.

Taking into account that memslots don't overlap, it's
possible to switch the sorting order of the memslots array
from 'npages' to 'base_gfn' and use binary search for
memslot lookup by GFN.

As a result of switching to binary search, lookup times
are reduced with a large number of memslots.
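
A rough standalone sketch of the same idea, with the array kept sorted by
base_gfn in descending order as the patch does; names and values are made
up for the example, the real code is in the diff below:

/* Userspace sketch: binary search for the GFN's slot in an array of
 * non-overlapping slots sorted by base_gfn, largest first. */
#include <assert.h>
#include <stddef.h>

typedef unsigned long long gfn_t;

struct memslot {
    gfn_t base_gfn;
    unsigned long npages;
};

static struct memslot *gfn_to_slot(struct memslot *slots, int used, gfn_t gfn)
{
    int start = 0, end = used;

    /* Find the leftmost slot whose base_gfn is <= gfn. */
    while (start < end) {
        int mid = start + (end - start) / 2;

        if (gfn >= slots[mid].base_gfn)
            end = mid;
        else
            start = mid + 1;
    }

    if (start < used &&
        gfn >= slots[start].base_gfn &&
        gfn < slots[start].base_gfn + slots[start].npages)
        return &slots[start];
    return NULL;
}

int main(void)
{
    struct memslot slots[] = {          /* descending base_gfn, no overlap */
        { 0x100000, 0x40000 },
        { 0x000c00, 0x00100 },
        { 0x000000, 0x000a0 },
    };

    assert(gfn_to_slot(slots, 3, 0x120000) == &slots[0]);
    assert(gfn_to_slot(slots, 3, 0x000c80) == &slots[1]);
    assert(gfn_to_slot(slots, 3, 0x000b00) == NULL);    /* falls in a gap */
    return 0;
}
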
The following is a table of search_memslots() cycles
during WS2008R2 guest boot.
                              boot,          boot + ~10 min
                              mostly same    of using it,
                              slot lookup    randomized lookup
                    max       average        average
                    cycles    cycles         cycles

 13 slots         : 1450      28             30

 13 slots         : 1400      30             40
 (binary search)

 117 slots        : 13000     30             460

 117 slots        : 2000      35             180
 (binary search)
Signed-off-by: Igor Mammedov <[email protected]>
---
include/linux/kvm_host.h | 34 ++++++++++++++++++++++------------
virt/kvm/kvm_main.c | 8 +++++++-
2 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a37144..193bca6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -354,6 +354,7 @@ struct kvm_memslots {
/* The mapping table from slot id to the index in memslots[]. */
short id_to_index[KVM_MEM_SLOTS_NUM];
atomic_t lru_slot;
+ int used_slots;
};
struct kvm {
@@ -791,19 +792,28 @@ static inline void kvm_guest_exit(void)
static inline struct kvm_memory_slot *
search_memslots(struct kvm_memslots *slots, gfn_t gfn)
{
+ int start = 0, end = slots->used_slots;
int slot = atomic_read(&slots->lru_slot);
- struct kvm_memory_slot *memslot = &slots->memslots[slot];
-
- if (gfn >= memslot->base_gfn &&
- gfn < memslot->base_gfn + memslot->npages)
- return memslot;
-
- kvm_for_each_memslot(memslot, slots)
- if (gfn >= memslot->base_gfn &&
- gfn < memslot->base_gfn + memslot->npages) {
- atomic_set(&slots->lru_slot, memslot - slots->memslots);
- return memslot;
- }
+ struct kvm_memory_slot *memslots = slots->memslots;
+
+ if (gfn >= memslots[slot].base_gfn &&
+ gfn < memslots[slot].base_gfn + memslots[slot].npages)
+ return &memslots[slot];
+
+ while (start < end) {
+ slot = start + (end - start) / 2;
+
+ if (gfn >= memslots[slot].base_gfn)
+ end = slot;
+ else
+ start = slot + 1;
+ }
+
+ if (gfn >= memslots[start].base_gfn &&
+ gfn < memslots[start].base_gfn + memslots[start].npages) {
+ atomic_set(&slots->lru_slot, start);
+ return &memslots[start];
+ }
return NULL;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 162817f..759af659 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,8 +679,14 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
WARN_ON(mslots[i].id != id);
- if (!new->npages)
+ if (!new->npages) {
new->base_gfn = 0;
+ if (mslots[i].npages)
+ slots->used_slots--;
+ } else {
+ if (!mslots[i].npages)
+ slots->used_slots++;
+ }
while (i < KVM_MEM_SLOTS_NUM - 1 &&
new->base_gfn <= mslots[i + 1].base_gfn) {
--
1.8.3.1
This will allow using binary search for GFN -> memslot
lookups, reducing lookup cost with a large number of slots.
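
For illustration, a simplified userspace model of the resulting re-sort,
with made-up types and the id_to_index bookkeeping omitted; the actual
change is in the diff below:

/* Sketch: after one slot changes, restore descending base_gfn order by
 * shifting the changed slot from its old index; empty slots (npages == 0)
 * stay at the tail.  Illustration only. */
#include <assert.h>

#define NSLOTS 4

struct memslot {
    unsigned long long base_gfn;
    unsigned long npages;
};

static void resort(struct memslot *mslots, int i, struct memslot new_slot)
{
    /* Shift toward the tail while the new base_gfn is not larger... */
    while (i < NSLOTS - 1 && new_slot.base_gfn <= mslots[i + 1].base_gfn) {
        if (!mslots[i + 1].npages)      /* ...but stop at empty slots */
            break;
        mslots[i] = mslots[i + 1];
        i++;
    }
    /* Shift toward the head while the new base_gfn is larger. */
    while (i > 0 && new_slot.base_gfn > mslots[i - 1].base_gfn) {
        mslots[i] = mslots[i - 1];
        i--;
    }
    mslots[i] = new_slot;
}

int main(void)
{
    struct memslot s[NSLOTS] = {
        { 0x3000, 0x100 }, { 0x2000, 0x100 }, { 0x1000, 0x100 }, { 0, 0 },
    };

    /* Re-register the slot at index 2 at base_gfn 0x4000: it moves first. */
    resort(s, 2, (struct memslot){ 0x4000, 0x100 });
    assert(s[0].base_gfn == 0x4000 && s[1].base_gfn == 0x3000 &&
           s[2].base_gfn == 0x2000);
    return 0;
}
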
Signed-off-by: Igor Mammedov <[email protected]>
---
virt/kvm/kvm_main.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 278ed65..162817f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -666,10 +666,10 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
}
/*
- * Insert memslot and re-sort memslots based on their size,
- * so the larger slots will get better fit. Sorting algorithm
- * takes advantage of having initially sorted array and
- * known changed memslot position.
+ * Insert memslot and re-sort memslots based on their GFN,
+ * so binary search could be used to lookup GFN.
+ * Sorting algorithm takes advantage of having initially
+ * sorted array and known changed memslot position.
*/
static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *new)
@@ -679,14 +679,19 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
WARN_ON(mslots[i].id != id);
+ if (!new->npages)
+ new->base_gfn = 0;
+
while (i < KVM_MEM_SLOTS_NUM - 1 &&
- new->npages < mslots[i + 1].npages) {
+ new->base_gfn <= mslots[i + 1].base_gfn) {
+ if (!mslots[i + 1].npages)
+ break;
mslots[i] = mslots[i + 1];
slots->id_to_index[mslots[i].id] = i;
i++;
}
while (i > 0 &&
- new->npages > mslots[i - 1].npages) {
+ new->base_gfn > mslots[i - 1].base_gfn) {
mslots[i] = mslots[i - 1];
slots->id_to_index[mslots[i].id] = i;
i--;
--
1.8.3.1
If the number of pages hasn't changed, the sorting algorithm
will do nothing (neither shift loop's condition can hold when
the size is unchanged), so there is no need for an extra check
to avoid entering the sorting logic.
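
A tiny standalone check of that claim, under the pre-existing
descending-npages ordering (invented values, illustration only):

/* With the array sorted by npages in descending order, an unchanged npages
 * means neither shift loop in update_memslots() can take a single step,
 * so the "same number of pages" guard is redundant.  Illustration only. */
#include <assert.h>

struct memslot { unsigned long npages; };

int main(void)
{
    struct memslot mslots[] = { { 400 }, { 300 }, { 200 }, { 100 } };
    int i = 1;                          /* slot being updated in place */
    unsigned long new_npages = 300;     /* same size as before */

    /* Down-shift condition: mslots[i + 1].npages <= mslots[i].npages,
     * so it can never exceed an unchanged new_npages. */
    assert(!(new_npages < mslots[i + 1].npages));
    /* Up-shift condition: mslots[i - 1].npages >= mslots[i].npages,
     * so an unchanged new_npages can never exceed it. */
    assert(!(new_npages > mslots[i - 1].npages));
    return 0;
}
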
Signed-off-by: Igor Mammedov <[email protected]>
---
virt/kvm/kvm_main.c | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b45330..407277b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,21 +679,19 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
WARN_ON(mslots[i].id != id);
- if (new->npages != mslots[i].npages) {
- if (new->npages < mslots[i].npages) {
- while (i < KVM_MEM_SLOTS_NUM - 1 &&
- new->npages < mslots[i + 1].npages) {
- mslots[i] = mslots[i + 1];
- slots->id_to_index[mslots[i].id] = i;
- i++;
- }
- } else {
- while (i > 0 &&
- new->npages > mslots[i - 1].npages) {
- mslots[i] = mslots[i - 1];
- slots->id_to_index[mslots[i].id] = i;
- i--;
- }
+ if (new->npages < mslots[i].npages) {
+ while (i < KVM_MEM_SLOTS_NUM - 1 &&
+ new->npages < mslots[i + 1].npages) {
+ mslots[i] = mslots[i + 1];
+ slots->id_to_index[mslots[i].id] = i;
+ i++;
+ }
+ } else {
+ while (i > 0 &&
+ new->npages > mslots[i - 1].npages) {
+ mslots[i] = mslots[i - 1];
+ slots->id_to_index[mslots[i].id] = i;
+ i--;
}
}
--
1.8.3.1
The UP/DOWN shift loops will shift the array in the needed
direction and stop at the place where the new slot should be
inserted, regardless of the old slot size, so no separate
branch on the old size is needed.
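
An illustrative userspace model of the merged loops (invented types and
sizes; the id_to_index updates are omitted):

/* Sketch: the two shift loops back to back move slot i to wherever its new
 * npages belongs, whether the slot grew or shrank, so no if/else on the old
 * size is needed.  Illustration only. */
#include <assert.h>

#define NSLOTS 5

struct memslot { int id; unsigned long npages; };

static void place(struct memslot *mslots, int i, struct memslot new_slot)
{
    while (i < NSLOTS - 1 && new_slot.npages < mslots[i + 1].npages) {
        mslots[i] = mslots[i + 1];      /* shift toward the tail */
        i++;
    }
    while (i > 0 && new_slot.npages > mslots[i - 1].npages) {
        mslots[i] = mslots[i - 1];      /* shift toward the head */
        i--;
    }
    mslots[i] = new_slot;
}

int main(void)
{
    struct memslot s[NSLOTS] = {
        { 0, 500 }, { 1, 400 }, { 2, 300 }, { 3, 200 }, { 4, 100 },
    };

    place(s, 3, (struct memslot){ 3, 450 });    /* grew: moves toward the head */
    assert(s[1].id == 3 && s[1].npages == 450);

    place(s, 0, (struct memslot){ 0, 150 });    /* shrank: moves toward the tail */
    assert(s[3].id == 0 && s[3].npages == 150);
    return 0;
}
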
Signed-off-by: Igor Mammedov <[email protected]>
---
virt/kvm/kvm_main.c | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 407277b..278ed65 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,20 +679,17 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
WARN_ON(mslots[i].id != id);
- if (new->npages < mslots[i].npages) {
- while (i < KVM_MEM_SLOTS_NUM - 1 &&
- new->npages < mslots[i + 1].npages) {
- mslots[i] = mslots[i + 1];
- slots->id_to_index[mslots[i].id] = i;
- i++;
- }
- } else {
- while (i > 0 &&
- new->npages > mslots[i - 1].npages) {
- mslots[i] = mslots[i - 1];
- slots->id_to_index[mslots[i].id] = i;
- i--;
- }
+ while (i < KVM_MEM_SLOTS_NUM - 1 &&
+ new->npages < mslots[i + 1].npages) {
+ mslots[i] = mslots[i + 1];
+ slots->id_to_index[mslots[i].id] = i;
+ i++;
+ }
+ while (i > 0 &&
+ new->npages > mslots[i - 1].npages) {
+ mslots[i] = mslots[i - 1];
+ slots->id_to_index[mslots[i].id] = i;
+ i--;
}
mslots[i] = *new;
--
1.8.3.1
On 01/12/2014 18:29, Igor Mammedov wrote:
> This series speeds up GFN to memslot lookup by:
> * introducing an LRU cache, which improves lookup time for
>   same-slot workloads (typically boot time of Windows and Linux guests)
> * switching to binary search for GFN to memslot lookup,
>   improving lookup time with a large number of memory slots
>
> Igor Mammedov (5):
> kvm: update_memslots: drop not needed check for the same number of
> pages
> kvm: update_memslots: drop not needed check for the same slot
> kvm: search_memslots: add simple LRU memslot caching
> kvm: change memslot sorting rule from size to GFN
> kvm: optimize GFN to memslot lookup with large slots amount
>
> include/linux/kvm_host.h | 28 +++++++++++++++++++++++-----
> virt/kvm/kvm_main.c | 46 ++++++++++++++++++++++++++--------------------
> 2 files changed, 49 insertions(+), 25 deletions(-)
>
Applied patches 1-3 for now, I'm not in the mood for proving that the
binary search is correct. :)
Paolo
On Mon, 01 Dec 2014 18:38:34 +0100
Paolo Bonzini <[email protected]> wrote:
>
>
> On 01/12/2014 18:29, Igor Mammedov wrote:
> > This series speeds up GFN to memslot lookup by:
> > * introducing an LRU cache, which improves lookup time for
> >   same-slot workloads (typically boot time of Windows and Linux guests)
> > * switching to binary search for GFN to memslot lookup,
> >   improving lookup time with a large number of memory slots
> >
> > Igor Mammedov (5):
> > kvm: update_memslots: drop not needed check for the same number of
> > pages
> > kvm: update_memslots: drop not needed check for the same slot
> > kvm: search_memslots: add simple LRU memslot caching
> > kvm: change memslot sorting rule from size to GFN
> > kvm: optimize GFN to memslot lookup with large slots amount
> >
> > include/linux/kvm_host.h | 28 +++++++++++++++++++++++-----
> > virt/kvm/kvm_main.c | 46 ++++++++++++++++++++++++++--------------------
> > 2 files changed, 49 insertions(+), 25 deletions(-)
> >
>
> Applied patches 1-3 for now, I'm not in the mood for proving that the
> binary search is correct. :)
The following write-up could help with improving the proving mood :)
http://community.topcoder.com/tc?module=Static&d1=tutorials&d2=binarySearch
>
> Paolo
On 02/12/2014 08:57, Igor Mammedov wrote:
>> On 01/12/2014 18:29, Igor Mammedov wrote:
>>> This series speeds up GFN to memslot lookup by:
>>> * introducing an LRU cache, which improves lookup time for
>>>   same-slot workloads (typically boot time of Windows and Linux guests)
>>> * switching to binary search for GFN to memslot lookup,
>>>   improving lookup time with a large number of memory slots
>>>
>>> Igor Mammedov (5):
>>> kvm: update_memslots: drop not needed check for the same number of
>>> pages
>>> kvm: update_memslots: drop not needed check for the same slot
>>> kvm: search_memslots: add simple LRU memslot caching
>>> kvm: change memslot sorting rule from size to GFN
>>> kvm: optimize GFN to memslot lookup with large slots amount
>>>
>>> include/linux/kvm_host.h | 28 +++++++++++++++++++++++-----
>>> virt/kvm/kvm_main.c | 46 ++++++++++++++++++++++++++--------------------
>>> 2 files changed, 49 insertions(+), 25 deletions(-)
>>>
>>
>> Applied patches 1-3 for now, I'm not in the mood for proving that the
>> binary search is correct. :)
Looks good, thanks. Gleb, any objections?
Paolo
2014-12-01 17:29+0000, Igor Mammedov:
> The current linear search doesn't scale well when a large
> number of memslots is used and the looked-up slot is not
> near the beginning of the memslots array.
>
> Taking into account that memslots don't overlap, it's
> possible to switch the sorting order of the memslots array
> from 'npages' to 'base_gfn' and use binary search for
> memslot lookup by GFN.
>
> As a result of switching to binary search, lookup times
> are reduced with a large number of memslots.
>
> The following is a table of search_memslots() cycles
> during WS2008R2 guest boot.
>
>                               boot,          boot + ~10 min
>                               mostly same    of using it,
>                               slot lookup    randomized lookup
>                     max       average        average
>                     cycles    cycles         cycles
>
>  13 slots         : 1450      28             30
>
>  13 slots         : 1400      30             40
>  (binary search)
>
>  117 slots        : 13000     30             460
>
>  117 slots        : 2000      35             180
>  (binary search)
>
> Signed-off-by: Igor Mammedov <[email protected]>
> ---
Fast ... it looks like we don't even want to transform the list-in-array
into a tree-in-array to have multiplication instead of division.
Reviewed-by: Radim Krčmář <[email protected]>
(Actually, all patches.)
> include/linux/kvm_host.h | 34 ++++++++++++++++++++++------------
> virt/kvm/kvm_main.c | 8 +++++++-
> 2 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1a37144..193bca6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -354,6 +354,7 @@ struct kvm_memslots {
> /* The mapping table from slot id to the index in memslots[]. */
> short id_to_index[KVM_MEM_SLOTS_NUM];
> atomic_t lru_slot;
> + int used_slots;
> };
>
> struct kvm {
> @@ -791,19 +792,28 @@ static inline void kvm_guest_exit(void)
> static inline struct kvm_memory_slot *
> search_memslots(struct kvm_memslots *slots, gfn_t gfn)
> {
> + int start = 0, end = slots->used_slots;
> int slot = atomic_read(&slots->lru_slot);
> - struct kvm_memory_slot *memslot = &slots->memslots[slot];
> -
> - if (gfn >= memslot->base_gfn &&
> - gfn < memslot->base_gfn + memslot->npages)
> - return memslot;
> -
> - kvm_for_each_memslot(memslot, slots)
> - if (gfn >= memslot->base_gfn &&
> - gfn < memslot->base_gfn + memslot->npages) {
> - atomic_set(&slots->lru_slot, memslot - slots->memslots);
> - return memslot;
> - }
> + struct kvm_memory_slot *memslots = slots->memslots;
> +
> + if (gfn >= memslots[slot].base_gfn &&
> + gfn < memslots[slot].base_gfn + memslots[slot].npages)
> + return &memslots[slot];
> +
> + while (start < end) {
> + slot = start + (end - start) / 2;
> +
> + if (gfn >= memslots[slot].base_gfn)
(Even though division is costly, I think that checking here whether 'slot'
is the one we want wouldn't help very much.)
> + end = slot;
> + else
> + start = slot + 1;
> + }
> +
> + if (gfn >= memslots[start].base_gfn &&
> + gfn < memslots[start].base_gfn + memslots[start].npages) {
> + atomic_set(&slots->lru_slot, start);
> + return &memslots[start];
> + }
>
> return NULL;
> }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 162817f..759af659 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -679,8 +679,14 @@ static void update_memslots(struct kvm_memslots *slots,
> struct kvm_memory_slot *mslots = slots->memslots;
>
> WARN_ON(mslots[i].id != id);
> - if (!new->npages)
> + if (!new->npages) {
> new->base_gfn = 0;
> + if (mslots[i].npages)
> + slots->used_slots--;
> + } else {
> + if (!mslots[i].npages)
> + slots->used_slots++;
> + }
>
> while (i < KVM_MEM_SLOTS_NUM - 1 &&
> new->base_gfn <= mslots[i + 1].base_gfn) {
> --
> 1.8.3.1
>
On 02/12/2014 18:33, Radim Krčmář wrote:
>> > + while (start < end) {
>> > + slot = start + (end - start) / 2;
>> > +
>> > + if (gfn >= memslots[slot].base_gfn)
> (Even though division is costly, I think that checking here whether 'slot'
> is the one we want wouldn't help very much.)
>
Division by an unsigned is just a right shift. Division by a signed
integer is a right shift + conditional move. We can change / 2 to an
explicit >> 1, or change start and end to unsigned, or both.
Paolo
2014-12-02 19:45+0100, Paolo Bonzini:
> On 02/12/2014 18:33, Radim Krčmář wrote:
> >> > + while (start < end) {
> >> > + slot = start + (end - start) / 2;
> >> > +
> >> > + if (gfn >= memslots[slot].base_gfn)
> > (Even though division is costly, I think that checking here whether 'slot'
> > is the one we want wouldn't help very much.)
> >
>
> Division by an unsigned is just a right shift. Division by a signed
> integer is a right shift + conditional move. We can change / 2 to an
> explicit >> 1, or change start and end to unsigned, or both.
My bad, no respectable optimizer would miss that.
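
For reference, a small sketch of the variants mentioned above (illustration
only, not part of the posted patches):

#include <stddef.h>

/* As posted: signed operands, so the compiler has to allow for a negative
 * (end - start) and emits a fix-up on top of the shift for "/ 2". */
static inline int mid_signed_div(int start, int end)
{
    return start + (end - start) / 2;
}

/* Explicit shift: same result for the non-negative ranges used here. */
static inline int mid_signed_shift(int start, int end)
{
    return start + ((end - start) >> 1);
}

/* Unsigned operands: "/ 2" is already a plain right shift. */
static inline size_t mid_unsigned_div(size_t start, size_t end)
{
    return start + (end - start) / 2;
}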