Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753274AbdGJBtA (ORCPT ); Sun, 9 Jul 2017 21:49:00 -0400 Received: from mga06.intel.com ([134.134.136.31]:24495 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753253AbdGJBs6 (ORCPT ); Sun, 9 Jul 2017 21:48:58 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.40,337,1496127600"; d="scan'208";a="1193388572" From: Aubrey Li To: tglx@linutronix.de, peterz@infradead.org, len.brown@intel.com, rjw@rjwysocki.net, ak@linux.intel.com, tim.c.chen@linux.intel.com, arjan@linux.intel.com, paulmck@linux.vnet.ibm.com, yang.zhang.wz@gmail.com Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Aubrey Li Subject: [RFC PATCH v1 03/11] cpuidle: introduce cpuidle governor for idle prediction Date: Mon, 10 Jul 2017 09:38:33 +0800 Message-Id: <1499650721-5928-4-git-send-email-aubrey.li@intel.com> X-Mailer: git-send-email 2.7.4 In-Reply-To: <1499650721-5928-1-git-send-email-aubrey.li@intel.com> References: <1499650721-5928-1-git-send-email-aubrey.li@intel.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 6902 Lines: 222 From: Aubrey Li Two factors taken from menu governor for idle prediction in the cpu idle governor: - Energy break even point - Repeatable interval detector Based on the actual known "next timer event" time, and coordinate with the algorithm in the above two factors, we'll make a prediction of how long we'll be idle --- drivers/cpuidle/cpuidle.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cpuidle.h | 4 ++ 2 files changed, 169 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2706be7..0be7f75 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -179,6 +181,169 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) } #endif /* CONFIG_SUSPEND */ +static inline int which_bucket(unsigned int duration, + unsigned long nr_iowaiters) +{ + int bucket = 0; + + /* + * We keep two groups of stats; one with no + * IO pending, one without. + * This allows us to calculate + * E(duration)|iowait + */ + if (nr_iowaiters) + bucket = BUCKETS/2; + + if (duration < 10) + return bucket; + if (duration < 100) + return bucket + 1; + if (duration < 1000) + return bucket + 2; + if (duration < 10000) + return bucket + 3; + if (duration < 100000) + return bucket + 4; + return bucket + 5; +} + +/* + * Try detecting repeating patterns by keeping track of the last 8 + * intervals, and checking if the standard deviation of that set + * of points is below a threshold. If it is... then use the + * average of these 8 points as the estimated value. + */ +static unsigned int get_typical_interval(struct cpuidle_device *dev) +{ + struct cpuidle_governor_stat *gov_stat = + (struct cpuidle_governor_stat *)&(dev->gov_stat); + int i, divisor; + unsigned int max, thresh, avg; + uint64_t sum, variance; + + thresh = UINT_MAX; /* Discard outliers above this value */ + +again: + + /* First calculate the average of past intervals */ + max = 0; + sum = 0; + divisor = 0; + for (i = 0; i < INTERVALS; i++) { + unsigned int value = gov_stat->intervals[i]; + + if (value <= thresh) { + sum += value; + divisor++; + if (value > max) + max = value; + } + } + if (divisor == INTERVALS) + avg = sum >> INTERVAL_SHIFT; + else + avg = div_u64(sum, divisor); + + /* Then try to determine variance */ + variance = 0; + for (i = 0; i < INTERVALS; i++) { + unsigned int value = gov_stat->intervals[i]; + + if (value <= thresh) { + int64_t diff = (int64_t)value - avg; + + variance += diff * diff; + } + } + if (divisor == INTERVALS) + variance >>= INTERVAL_SHIFT; + else + do_div(variance, divisor); + + /* + * The typical interval is obtained when standard deviation is + * small (stddev <= 20 us, variance <= 400 us^2) or standard + * deviation is small compared to the average interval (avg > + * 6*stddev, avg^2 > 36*variance). The average is smaller than + * UINT_MAX aka U32_MAX, so computing its square does not + * overflow a u64. We simply reject this candidate average if + * the standard deviation is greater than 715 s (which is + * rather unlikely). + * + * Use this result only if there is no timer to wake us up sooner. + */ + if (likely(variance <= U64_MAX/36)) { + if ((((u64)avg*avg > variance*36) && + (divisor * 4 >= INTERVALS * 3)) || variance <= 400) { + return avg; + } + } + + /* + * If we have outliers to the upside in our distribution, discard + * those by setting the threshold to exclude these outliers, then + * calculate the average and standard deviation again. Once we get + * down to the bottom 3/4 of our samples, stop excluding samples. + * + * This can deal with workloads that have long pauses interspersed + * with sporadic activity with a bunch of short pauses. + */ + if ((divisor * 4) <= INTERVALS * 3) + return UINT_MAX; + + thresh = max - 1; + goto again; +} + +/** + * cpuidle_predict - predict the next idle duration, in micro-second. + * There are two factors in the cpu idle governor, taken from menu: + * 1) Energy break even point + * 2) Repeatable interval detector + * + * Based on the actual known "next timer event" time, and coordinate with + * the algorithm in the two factors, we'll make a prediction of how long + * we'll be idle + */ +unsigned int cpuidle_predict(void) +{ + struct cpuidle_device *dev = cpuidle_get_device(); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + struct cpuidle_governor_stat *gov_stat; + unsigned int expected_interval; + unsigned long nr_iowaiters, cpu_load; + + if (need_resched()) + return 0; + + if (cpuidle_not_available(drv, dev)) + return -ENODEV; + + gov_stat = (struct cpuidle_governor_stat *)&(dev->gov_stat); + + gov_stat->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); + get_iowait_load(&nr_iowaiters, &cpu_load); + gov_stat->bucket = which_bucket(gov_stat->next_timer_us, nr_iowaiters); + + /* + * Force the result of multiplication to be 64 bits even if both + * operands are 32 bits. + * Make sure to round up for half microseconds. + */ + gov_stat->predicted_us = DIV_ROUND_CLOSEST_ULL( + (uint64_t)gov_stat->next_timer_us * + gov_stat->correction_factor[gov_stat->bucket], + RESOLUTION * DECAY); + + expected_interval = get_typical_interval(dev); + expected_interval = min(expected_interval, gov_stat->next_timer_us); + + gov_stat->predicted_us = min(gov_stat->predicted_us, expected_interval); + + return gov_stat->predicted_us; +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f17a53b..b9964ec 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -164,6 +164,7 @@ extern int cpuidle_select(struct cpuidle_driver *drv, extern int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index); extern void cpuidle_reflect(struct cpuidle_device *dev, int index); +extern unsigned int cpuidle_predict(void); extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); @@ -198,6 +199,9 @@ static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) {return -ENODEV; } static inline void cpuidle_reflect(struct cpuidle_device *dev, int index) { } +static inline unsigned int cpuidle_predict(struct cpuidle_device *dev, + struct cpuidle_driver *drv) +{return -ENODEV; } static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } -- 2.7.4